In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as pplt
import seaborn as sns
import pandas as pd
import numpy as np
import pickle
from gtfparse import read_gtf

In [None]:
# variants for visualization
variants = pd.read_excel('02_output_analysis/vars_facts_together.xlsx')
# cDNA and protein sequences
# NOTE(review): read_pickle unpickles arbitrary objects - only load files from a trusted source.
var_seq = pd.read_pickle('00_dataframes/cDNA_protSeq_first_vars')

# Attach the sequence columns to every variant. how='left' keeps variants whose
# 'shortcut_v' has no matching 'shortcut' in var_seq (their sequence columns stay NaN).
seq_columns = ['shortcut', 'protein_seq', 'termin_prot', 'ref_cDNA', 'ref_protein_seq']
variants = variants.merge(var_seq[seq_columns], how='left',
                          left_on='shortcut_v', right_on='shortcut')

#### UniProt domains

In [None]:
# genes for UniProt ID
# One gene symbol per line, de-duplicated and sorted alphabetically.
with open('03_for_cDNA/uniprot_ids.txt', 'w') as o:
    o.writelines(symbol + '\n' for symbol in sorted(set(variants['HUGO_Symbol'])))

In [None]:
# Gene IDs for uniprot as DataFrame
# Read the ID-mapping sheet, drop the columns that are not used later and give
# the query column (exported under the mapping job's name) the name 'HUGO_Symbol'.
exc_uniprot = (
    pd.read_excel('03_for_cDNA/gene_id_uniprot.xlsx')
    .drop(columns=['Entry name', 'Status', 'Gene names', 'Organism'])
    .rename(columns={'yourlist:M20211028F248CABF64506F29A91F8037F07B67D1258226A': 'HUGO_Symbol'})
)

In [None]:
# gff format of UniProt outcome with domains
# Names given to the tab-separated columns of the file; only a subset is loaded.
gff_columns = ['Entry', 'where_from', 'item', 'start_pos', 'end_pos', 'o0', 'o1', 'o2', 'ids', 'o3']
gff_used_columns = ['Entry', 'item', 'start_pos', 'end_pos', 'ids']
# Feature types (values of the 'item' column) that are kept for the figures.
uniprot_feature_types = [
    'Domain', 'Nucleotide binding', 'Region', 'Active site', 'Binding site', 'Zinc finger',
    'Site', 'Metal binding', 'Motif', 'Compositional bias', 'Signal peptide', 'Propeptide',
    'Topological domain', 'Transmembrane', 'DNA binding', 'Transit peptide',
    'Initiator methionine',
]

# Lines starting with '#' are treated as comments and skipped.
gff = pd.read_csv('03_for_cDNA/uniprot_proteins.gff', sep='\t', comment='#',
                  names=gff_columns, usecols=gff_used_columns)
gff = gff[gff['item'].isin(uniprot_feature_types)]

In [None]:
# merged gene name and domains of UniProt
# Right join: every row of the gene/UniProt mapping is kept; entries without any
# retained GFF feature get NaN coordinates and are removed just below.
uni_prot = pd.merge(gff, exc_uniprot, on='Entry', how='right').drop_duplicates()
# .copy() detaches the filtered rows, so the column assignments below modify an
# independent frame instead of a slice (avoids pandas' SettingWithCopyWarning).
uni_prot = uni_prot[uni_prot['start_pos'].notnull()].copy()
# Positions may have become float through the NaN rows of the join; store them as integers again.
uni_prot = uni_prot.astype({'start_pos': int, 'end_pos': int, 'Length': int})
# info_domain: text after the first '=' of the attribute string, cut at the next '=' or ';'.
uni_prot['info_domain'] = uni_prot['ids'].str.split('=', expand=True)[1].str.split(';', expand=True)[0]
uni_prot = uni_prot[['HUGO_Symbol', 'Entry', 'start_pos', 'end_pos', 'item', 'info_domain', 'Length',
                     'Protein names', 'ids']]
uni_prot = uni_prot.sort_values(by=['HUGO_Symbol', 'start_pos', 'end_pos']).reset_index(drop=True)

#### Nonsense-mediated decay line

In [None]:
# GTF file with sequence information
# NOTE(review): the steps below use pandas indexing, so read_gtf is assumed to
# return a pandas DataFrame - confirm for the installed gtfparse version.
gtf_df = read_gtf('GRCh37_sequence_information.gtf')
# only those reference transcripts within variant file
# (the part of transcript_id after the first '.' is dropped before matching)
gtf_df['transcript_id'] = gtf_df['transcript_id'].str.split('.',expand=True)[0]
gtf_keep = (gtf_df['transcript_id'].isin(set(variants['RefSeq accession']))
            & (gtf_df['feature']=='CDS')
            & (gtf_df['seqname']!='NW_003871056.3'))
# .copy() so that the exon_number assignment below writes to an independent
# frame rather than to a filtered slice (avoids pandas' SettingWithCopyWarning).
gtf_df = gtf_df[gtf_keep].copy()
gtf_df['exon_number'] = gtf_df['exon_number'].astype(int)

In [None]:
# nonsense mediated decay after 50 bp before last exon-intron boundary
def nmd_pos(gene, gtf=None, offset=50):
    """Locate the nonsense-mediated-decay (NMD) boundary of a gene in coding coordinates.

    The boundary is placed ``offset`` bases before the end of the penultimate
    CDS segment (the row whose ``exon_number`` is the gene's maximum minus one).
    The returned cDNA position is the summed length of all CDS segments with a
    smaller ``exon_number`` plus the part of the penultimate segment that lies
    before the boundary. If the penultimate segment is not longer than
    ``offset``, nothing of it is counted.

    Parameters
    ----------
    gene : value compared against the ``gene`` column.
    gtf : DataFrame, optional
        CDS rows with columns ``gene``, ``exon_number``, ``start`` and ``end``.
        Defaults to the notebook-level ``gtf_df``.
    offset : int, optional
        Distance (in bases) between the boundary and the end of the
        penultimate segment. Default 50.

    Returns
    -------
    tuple of int
        ``(len_cdna, round(len_cdna / 3))`` - boundary in bases and in codons.

    Raises
    ------
    ValueError
        If the gene has no rows, has several rows with the same
        ``exon_number``, or has no row for the penultimate ``exon_number``
        (e.g. a single CDS segment).
    """
    if gtf is None:
        gtf = gtf_df

    cds = gtf[gtf['gene'] == gene]
    if cds.empty:
        raise ValueError('no CDS rows found for gene %r' % (gene,))
    if cds['exon_number'].duplicated().any():
        raise ValueError('gene %r has several CDS rows with the same exon_number' % (gene,))

    # GTF start/end are 1-based and inclusive, so a segment spans end - start + 1 bases.
    exon_len = pd.Series((cds['end'] - cds['start'] + 1).to_numpy(),
                         index=cds['exon_number'].to_numpy())

    nmd_exon = exon_len.index.max() - 1
    if nmd_exon not in exon_len.index:
        raise ValueError('gene %r has no CDS row with exon_number %d (penultimate exon)'
                         % (gene, nmd_exon))

    # Everything upstream of the penultimate segment counts in full.
    len_cdna = int(exon_len[exon_len.index < nmd_exon].sum())

    # In cDNA coordinates the strand does not matter: on either strand the part
    # of the penultimate segment before the boundary is its length minus offset.
    len_exon = int(exon_len.loc[nmd_exon])
    if len_exon > offset:
        len_cdna += len_exon - offset

    return (len_cdna, round(len_cdna/3))

In [None]:
# One [gene, NMD position in cDNA, NMD position in protein] record per distinct gene symbol.
nmd_lst = [[gene, *nmd_pos(gene)] for gene in variants['HUGO_Symbol'].unique()]

nmd_df = pd.DataFrame(nmd_lst, columns=['gene', 'NMD_cDNA', 'NMD_protein'])

# Persist the table next to the notebook.
nmd_df.to_pickle('DataFrame_nonsense_mediated_decay')

#### Visualization of protein domains

In [None]:
def _seq_window(seq, start, end):
    """Return seq[start:end], or '' when seq cannot be sliced (e.g. NaN left by a merge without match)."""
    try:
        return seq[start:end]
    except TypeError:
        return ''


# ind = index, variants = DataFrame of variants to analyze, uni_prot = DataFrame of protein domain information
def prot_domains(ind, variants, uni_prot, nmd=None):
    """Draw and save the protein-domain figure of one variant.

    The figure shows one horizontal bar per UniProt feature of the variant's
    gene, a green line at the changed amino acid, a shaded span behind it
    (green, or magenta for 'probably frameshift'), a red 'stop' line, a blue
    'NMD' line and a short alignment of the altered and reference protein
    sequence around the change. It is written to
    '02_output_analysis/00_domain_figs/dom_<shortcut_v>.png' and left open.

    Parameters
    ----------
    ind : index label of the variant's row in ``variants``.
    variants : DataFrame
        Read columns: HUGO_Symbol, shortcut_v, prot_change_pos, aa_next_pos,
        ter_p_len, ref_p_len, protein_seq, ref_protein_seq.
    uni_prot : DataFrame
        Read columns: HUGO_Symbol, start_pos, end_pos, info_domain.
    nmd : DataFrame, optional
        Columns ``gene`` and ``NMD_protein``; defaults to the notebook-level
        ``nmd_df``.

    Returns
    -------
    None. Prints 'index not in dataframe' if ``ind`` is not a label of
    ``variants`` and 'no change in protein sequence' if ``prot_change_pos`` is
    not an integer; nothing is drawn in either case.

    Raises
    ------
    ValueError
        If ``nmd`` does not hold exactly one row for the variant's gene.
    """
    if ind not in variants.index:
        print('index not in dataframe')
        return

    # The check above is label-based, so the row is fetched by label as well.
    row = variants.loc[ind]
    if isinstance(row, pd.DataFrame):
        # duplicated index label: use the first matching row
        row = row.iloc[0]

    prot_change_pos = row['prot_change_pos']
    # Non-integer entries (e.g. the string 'no_change') mean there is nothing to draw.
    if isinstance(prot_change_pos, bool) or not isinstance(prot_change_pos, (int, np.integer)):
        print('no change in protein sequence')
        return

    prot = row['HUGO_Symbol']
    shortcut = row['shortcut_v']
    aa_next_pos = row['aa_next_pos']
    termin_pos = row['ter_p_len']
    max_pos = row['ref_p_len']
    ips_prot = uni_prot[uni_prot['HUGO_Symbol']==prot]

    if nmd is None:
        nmd = nmd_df
    nmd_rows = nmd.loc[nmd['gene']==prot, 'NMD_protein']
    if len(nmd_rows) != 1:
        raise ValueError('expected exactly one NMD entry for gene %r, found %d' % (prot, len(nmd_rows)))
    nmd_aa = int(nmd_rows.iloc[0])

    # Alignment window seq[pos-10 : pos+9]. The start is clamped at 0 because a
    # negative start would count from the end of the sequence.
    win_start = max(prot_change_pos-10, 0)
    win_end = prot_change_pos+9
    seq_alt = _seq_window(row['protein_seq'], win_start, win_end)
    seq_ref = _seq_window(row['ref_protein_seq'], win_start, win_end)
    seq_match = ''.join('|' if a==r else ' ' for a,r in zip(seq_alt, seq_ref))

    starts = ips_prot['start_pos']
    ends = ips_prot['end_pos']
    ids = ips_prot['info_domain']
    # One y level per feature, spread over the axis height; no features -> no bars.
    n_domains = len(ips_prot)
    ys = np.arange(0.01,1,1/n_domains) if n_domains else []

    fig, ax1 = plt.subplots(figsize=(14, 6))

    ax1.set_ylim([0,1])
    ax1.set_xlim([0,max_pos])
    ax1.get_yaxis().set_visible(False)
    ax1.set_xlabel('amino acid position', fontsize=12)
    ax1.xaxis.set_label_coords(0.5, -0.1)

    ax1.set_title('Protein domains of %s (%s)' %(prot, shortcut), fontsize = 18, fontweight = 'bold',
                  loc = 'center')

    # Feature bars, each labelled to the right of the axis.
    for s,e,y,t in zip(starts, ends, ys, ids):
        ax1.add_patch(pplt.Rectangle((s, y), e-s, 0.01, fc = 'black', ec = 'black', alpha = 1))
        ax1.text(max_pos+5, y, t, size = 13, ha='left')

    # Position of the amino-acid change.
    ax1.add_patch(pplt.Rectangle((prot_change_pos, 0), 0.01, 1, fc = 'green', ec = 'green', alpha = 1))
    ax1.text(x=prot_change_pos, y=-0.10, s='aa substitution', ha='center', c='green')

    # Shaded span behind the change and position of the stop line.
    # aa_next_pos == 'error' draws neither.
    stop_pos = None
    if aa_next_pos == 'probably frameshift':
        ax1.add_patch(pplt.Rectangle((prot_change_pos, 0), termin_pos-prot_change_pos, 1,
                                     fc = 'magenta', ec = 'magenta', alpha = 0.5))
        stop_pos = termin_pos
    elif aa_next_pos != 'error':
        ax1.add_patch(pplt.Rectangle((prot_change_pos, 0), aa_next_pos-prot_change_pos, 1,
                                     fc = 'green', ec = 'green', alpha = 0.5))
        stop_pos = termin_pos+(aa_next_pos-prot_change_pos)
    if stop_pos is not None:
        ax1.add_patch(pplt.Rectangle((stop_pos, 0), 0.01, 1, fc = 'red', ec = 'red', alpha = 1))
        ax1.text(x=stop_pos, y=-0.07, s='stop', ha='center', c='red')

    # NMD boundary and the three-line alignment (altered / match marks / reference).
    ax1.add_patch(pplt.Rectangle((nmd_aa, 0), 0.01, 1, fc = 'blue', ec = 'blue', alpha = 1))
    ax1.text(x=nmd_aa, y=-0.13, s='NMD', ha='center', c='blue')
    ax1.text(x=1, y=0.9, s=str(seq_alt), size = 15, ha='left', family='monospace')
    ax1.text(x=1, y=0.85, s=seq_match, size = 15, ha='left', family='monospace')
    ax1.text(x=1, y=0.8, s=str(seq_ref), size = 15, ha='left', family='monospace')
    # Leave room on the right for the feature labels.
    fig.subplots_adjust(bottom = 0.12, top = 0.94, left = 0.01, right = 0.6)
    fig.savefig('02_output_analysis/00_domain_figs/dom_%s.png'%shortcut)

In [None]:
# Example run: draw the figure for the variant with index 0 of `variants`,
# using the UniProt feature table `uni_prot`.
prot_domains(ind=0, variants=variants, uni_prot=uni_prot)