In [1]:
import pandas as pd
from Bio.SeqUtils import seq1
import re
import requests
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")


In [2]:
def fetch_sequence(uniprot_id, timeout=30):
    """Download the protein sequence for a UniProt accession as a plain string.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession, e.g. "P04637".
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Without a
        timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    str or None
        The sequence with the FASTA header and all line breaks removed, or
        None when the request fails or the response is not a FASTA record.
    """
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors/timeouts follow the same "return None" contract as HTTP errors.
        print(f"Failed to fetch sequence for {uniprot_id}: {exc}")
        return None

    if not response.ok:
        print(f"Failed to fetch sequence for {uniprot_id} (HTTP {response.status_code})")
        return None

    # splitlines() copes with both LF and CRLF line endings.
    lines = response.text.splitlines()
    if not lines or not lines[0].startswith('>'):
        # Guard against e.g. an HTML error page being mistaken for a sequence.
        print(f"Failed to fetch sequence for {uniprot_id}: response is not in FASTA format")
        return None

    # Skip the header line and concatenate the wrapped sequence lines.
    sequence = ''.join(line.strip() for line in lines[1:])
    if not sequence:
        print(f"Failed to fetch sequence for {uniprot_id}: empty sequence")
        return None
    return sequence

In [3]:
# Load the per-residue table (one row per PDB chain residue, with its
# UniProt residue and network_score; see the preview below).
residue_data = pd.read_csv('../final_data.csv')
# (disabled) optional filter to a single gene:
# residue_data = residue_data[residue_data['assoc_gene'] == gene]
residue_data.head()

Unnamed: 0,gene_symbol,pdb_id,chain,uniprot_id,assoc_gene,resolution,res_num,pdb_res,uniprot_res,network_score,outside_range,residue_match
0,EGFR,3POZ,A,P00533,EGFR,1.5,701,Q,Q,-3.791355,False,True
1,EGFR,3POZ,A,P00533,EGFR,1.5,702,A,A,-2.634055,False,True
2,EGFR,3POZ,A,P00533,EGFR,1.5,703,L,L,-1.231055,False,True
3,EGFR,3POZ,A,P00533,EGFR,1.5,704,L,L,0.452193,False,True
4,EGFR,3POZ,A,P00533,EGFR,1.5,705,R,R,-1.086163,False,True


In [4]:
# Combined structure/chain identifier (e.g. '3POZ_A'), used later to group plots per chain.
residue_data['pdb_chain'] = residue_data['pdb_id'] + '_' + residue_data['chain']

In [5]:
def check_sequence(mut_df, uniprot_id, aa_col, score_col, convert_to_seq1=True):
    """Extract (wild-type residue, position, score) records and check them against UniProt.

    Parameters
    ----------
    mut_df : pandas.DataFrame
        One row per variant.
    uniprot_id : str
        UniProt accession whose sequence is fetched via ``fetch_sequence``.
    aa_col : str
        Column holding the mutation string, formatted as
        ``<originalAA><residueNum><mutatedAA>`` (e.g. 'Arg248Trp', or 'R248W'
        when ``convert_to_seq1`` is False).
    score_col : str
        Column holding the functional score of each variant.
    convert_to_seq1 : bool, optional
        If True, the original residue is converted from three-letter to
        one-letter code with ``seq1`` before comparison.

    Returns
    -------
    pandas.DataFrame or None
        Unique rows with columns ``mutagenesis_aa``, ``res_num`` and
        ``mutagenesis_functional_score``, ordered by ``res_num``. None is
        returned when the reference sequence could not be fetched.

    Notes
    -----
    Missing values and entries containing '=' are skipped. Strings that do
    not match the expected format are reported and skipped. Sequence
    mismatches are only printed (once per position/residue pair); the
    corresponding rows are still returned.
    """
    sequence = fetch_sequence(uniprot_id)
    if not sequence:
        print("Failed to fetch sequence")
        return None

    # Matches e.g. 'R248W' or 'Arg248Trp': letters, digits, letters.
    pattern = re.compile(r'([A-Za-z]+)(\d+)([A-Za-z]+)')

    records = []
    for raw_mutation, score in zip(mut_df[aa_col], mut_df[score_col]):
        if pd.isna(raw_mutation):
            continue
        mutation = str(raw_mutation)
        if not mutation or '=' in mutation:
            continue

        match = pattern.match(mutation)
        if match is None:
            print("Error in row:", mutation)
            continue

        original_aa, residue_number, _ = match.groups()
        if convert_to_seq1:
            original_aa = seq1(original_aa)
        records.append((original_aa, int(residue_number), score))

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # (stable) sort by residue number gives a reproducible row order.
    unique_records = sorted(dict.fromkeys(records), key=lambda rec: rec[1])
    tmp = pd.DataFrame(unique_records, columns=['mutagenesis_aa', 'res_num', 'mutagenesis_functional_score'])

    # Compare each distinct (position, residue) pair with the reference once,
    # rather than once per score row.
    sites = tmp[['res_num', 'mutagenesis_aa']].drop_duplicates()
    for res_num, expected_aa in sites.itertuples(index=False, name=None):
        res_num = int(res_num)
        if res_num > len(sequence):
            # Rows are ordered by position, so every remaining one is out of range too.
            print(f"Residue number {res_num} exceeds sequence length. Stopping...")
            break
        if res_num < 1:
            # Positions are 1-based; 0 would otherwise wrap around to the last residue.
            print(f"Residue number {res_num} is not a valid 1-based position. Skipping...")
            continue
        found_aa = sequence[res_num - 1]
        if found_aa != expected_aa:
            print("Mismatch at", res_num, "| Expected:", expected_aa, "| Found:", found_aa)

    return tmp

In [6]:
# Accumulator for per-residue mutagenesis scores; each gene section below appends its rows.
all_mut_df = pd.DataFrame()

## TP53

In [7]:
# Gene symbol and the UniProt accession used to fetch its reference sequence.
gene = "TP53"
uniprot_id = "P04637"

In [8]:
# Load the TP53 variant scores and strip the 'p.' prefix from the HGVS
# protein notation (e.g. 'p.Ter394Tyr' -> 'Ter394Tyr').
mut_df = pd.read_csv('data/urn_mavedb_00000068-a-1_scores.csv')
# The vectorised .str accessor leaves missing or prefix-less entries as NaN
# instead of raising, which a per-row lambda would do on the first NaN.
mut_df['aa'] = mut_df['hgvs_pro'].str.split('.').str[1]
mut_df.head()

Unnamed: 0,accession,hgvs_nt,hgvs_splice,hgvs_pro,score,aa
0,urn:mavedb:00000068-a-1#17,,,p.Ter394Tyr,0.02998,Ter394Tyr
1,urn:mavedb:00000068-a-1#24,,,p.Ter394Pro,-0.560503,Ter394Pro
2,urn:mavedb:00000068-a-1#18,,,p.Ter394Trp,0.062089,Ter394Trp
3,urn:mavedb:00000068-a-1#25,,,p.Ter394Asn,-0.291063,Ter394Asn
4,urn:mavedb:00000068-a-1#20,,,p.Ter394Thr,-0.398481,Ter394Thr


In [9]:
# Validate the variants against the UniProt sequence, then collapse them to a
# single mean score per residue position (grouping is by res_num only).
checked_df = check_sequence(mut_df, uniprot_id, aa_col='aa', score_col='score', convert_to_seq1=True)
mean_by_residue = checked_df.groupby('res_num')['mutagenesis_functional_score'].mean()
mut_df = mean_by_residue.reset_index().assign(gene=gene)
mut_df.head()

Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Mismatch at 72 | Expected: R | Found: P
Residue number 394 exceeds sequence length. Stopping...


Unnamed: 0,res_num,mutagenesis_functional_score,gene
0,1,-0.37166,TP53
1,2,-0.572327,TP53
2,3,-0.548656,TP53
3,4,-0.364501,TP53
4,5,-0.526651,TP53


In [10]:
# Replace (rather than duplicate) any rows already stored for this gene, so
# re-running the cell does not inflate all_mut_df and the later merge.
if 'gene' in all_mut_df.columns:
    all_mut_df = all_mut_df[~all_mut_df['gene'].isin(mut_df['gene'].unique())]
all_mut_df = pd.concat([all_mut_df, mut_df])

In [11]:
# Inspect the accumulated scores (only TP53 rows have been added at this point).
all_mut_df

Unnamed: 0,res_num,mutagenesis_functional_score,gene
0,1,-0.371660,TP53
1,2,-0.572327,TP53
2,3,-0.548656,TP53
3,4,-0.364501,TP53
4,5,-0.526651,TP53
...,...,...,...
389,390,-0.974713,TP53
390,391,-0.418890,TP53
391,392,-0.398974,TP53
392,393,-0.471366,TP53


## EGFR

In [12]:
# Load the per-mutation EGFR sheet ('All'): one row per substitution with its
# position and Z-score (see the preview below).
mut_df = pd.read_excel('data/41467_2024_45594_MOESM4_ESM.xlsx', sheet_name='All')
mut_df.head()

Unnamed: 0,Amino Acid,EGFR Position,Z-score,Intracellular_Extracellular,Domain,p-value
0,I643C,643,6.600609,Extracellular,domain IV,0.194558
1,K642C,642,6.541242,Extracellular,domain IV,0.140137
2,Q218C,218,6.47343,Extracellular,domain II,0.154892
3,E709W,709,6.271532,Intracellular,kinase,0.165503
4,S229C,229,6.162138,Extracellular,domain II,0.063936


In [13]:
# Gene symbol and the UniProt accession used to fetch its reference sequence.
gene = "EGFR"
uniprot_id = "P00533"

In [14]:
# Sanity-check the wild-type residues against UniProt (one-letter codes, so no
# seq1 conversion) and average the Z-scores per (position, residue).
tmp = (
    check_sequence(mut_df, uniprot_id, 'Amino Acid', 'Z-score', convert_to_seq1=False)
    .groupby(['res_num', 'mutagenesis_aa'])['mutagenesis_functional_score']
    .mean()
    .reset_index()
)
tmp.head()

Unnamed: 0,res_num,mutagenesis_aa,mutagenesis_functional_score
0,2,R,0.223462
1,3,P,0.029544
2,4,S,0.250935
3,5,G,0.156703
4,6,T,0.121446


All good: the sequence check passes, although the per-mutation sheet contains a number of NaN scores. The average Z-score per position is already provided in the 'Average Z-score' sheet. The averages calculated manually above differ from the values in that sheet; we will use the provided sheet and treat it as correct.

In [15]:
# Use the pre-computed per-position averages from the 'Average Z-score' sheet.
avg_sheet = pd.read_excel('data/41467_2024_45594_MOESM4_ESM.xlsx', sheet_name='Average Z-score')
# Build the result frame in one step: adding a column to a column-subset of
# another frame (the previous approach) triggers SettingWithCopyWarning.
mut_df = pd.DataFrame({
    'res_num': avg_sheet['EGFR Position'].astype(int),
    'mutagenesis_functional_score': avg_sheet['Z-score'].astype(float),
    'gene': gene,
})
mut_df.head()

Unnamed: 0,res_num,mutagenesis_functional_score,gene
0,2,0.213915,EGFR
1,3,0.051533,EGFR
2,4,0.227036,EGFR
3,5,0.124016,EGFR
4,6,0.08868,EGFR


In [16]:
# Replace (rather than duplicate) any rows already stored for this gene, so
# re-running the cell does not inflate all_mut_df and the later merge.
if 'gene' in all_mut_df.columns:
    all_mut_df = all_mut_df[~all_mut_df['gene'].isin(mut_df['gene'].unique())]
all_mut_df = pd.concat([all_mut_df, mut_df])

In [17]:
# Inspect the accumulated scores (TP53 and EGFR rows at this point).
all_mut_df

Unnamed: 0,res_num,mutagenesis_functional_score,gene
0,1,-0.371660,TP53
1,2,-0.572327,TP53
2,3,-0.548656,TP53
3,4,-0.364501,TP53
4,5,-0.526651,TP53
...,...,...,...
1203,1206,-0.015149,EGFR
1204,1207,0.391593,EGFR
1205,1208,0.067981,EGFR
1206,1209,0.197518,EGFR


## KRAS

In [18]:
uniprot_id = "P01116"
gene = "KRAS"

mut_df = pd.read_csv('data/urn_mavedb_00000115-a-7_scores.csv')

# Strip the 'p.' prefix from the HGVS protein notation. The vectorised .str
# accessor leaves missing or prefix-less entries as NaN instead of raising.
mut_df['aa'] = mut_df['hgvs_pro'].str.split('.').str[1]
# Validate against the UniProt sequence, then take the mean score per residue
# position (only the mean is computed here, no standard deviation).
mut_df = check_sequence(mut_df, uniprot_id, 'aa', 'score', convert_to_seq1=True)
mut_df = mut_df.groupby('res_num')['mutagenesis_functional_score'].mean().reset_index()
mut_df['gene'] = gene
mut_df.head()


Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 151 | Expected: G | Found: R
Mismatch at 153 | Expected: D | Found: E
Mismatch at 153 | Expected: D | Found: E
Mismatch at 153 | Expected: D | Found: E
Mismatch at 153 | Expected: D | Found: E
Mismatch at 153 | Expected: D | Found: E
Mismatch at 153 

Unnamed: 0,res_num,mutagenesis_functional_score,gene
0,2,0.32102,KRAS
1,3,0.315804,KRAS
2,4,0.375025,KRAS
3,5,1.043912,KRAS
4,6,1.318499,KRAS


In [19]:
# Replace (rather than duplicate) any rows already stored for this gene, so
# re-running the cell does not inflate all_mut_df and the later merge.
if 'gene' in all_mut_df.columns:
    all_mut_df = all_mut_df[~all_mut_df['gene'].isin(mut_df['gene'].unique())]
all_mut_df = pd.concat([all_mut_df, mut_df])

In [20]:
# Inspect the accumulated scores (TP53, EGFR and KRAS rows).
all_mut_df

Unnamed: 0,res_num,mutagenesis_functional_score,gene
0,1,-0.371660,TP53
1,2,-0.572327,TP53
2,3,-0.548656,TP53
3,4,-0.364501,TP53
4,5,-0.526651,TP53
...,...,...,...
182,184,0.496453,KRAS
183,185,0.011041,KRAS
184,186,0.049888,KRAS
185,187,-0.366097,KRAS


# Merge mutagenesis scores into the residue data

In [21]:
# Attach the per-residue mutagenesis score to every structure residue
# (left join on residue number + gene; residues without a score get NaN).
# Columns from a previous run of this cell are dropped first: merging again
# would otherwise create '_x'/'_y' suffixed duplicates and break later cells.
residue_data = (
    residue_data
    .drop(columns=['mutagenesis_functional_score', 'gene'], errors='ignore')
    .merge(all_mut_df, left_on=["res_num", "assoc_gene"], right_on=["res_num", "gene"], how='left')
)
residue_data

Unnamed: 0,gene_symbol,pdb_id,chain,uniprot_id,assoc_gene,resolution,res_num,pdb_res,uniprot_res,network_score,outside_range,residue_match,pdb_chain,mutagenesis_functional_score,gene
0,EGFR,3POZ,A,P00533,EGFR,1.5,701,Q,Q,-3.791355,False,True,3POZ_A,0.186400,EGFR
1,EGFR,3POZ,A,P00533,EGFR,1.5,702,A,A,-2.634055,False,True,3POZ_A,-0.492053,EGFR
2,EGFR,3POZ,A,P00533,EGFR,1.5,703,L,L,-1.231055,False,True,3POZ_A,-0.259270,EGFR
3,EGFR,3POZ,A,P00533,EGFR,1.5,704,L,L,0.452193,False,True,3POZ_A,-0.132664,EGFR
4,EGFR,3POZ,A,P00533,EGFR,1.5,705,R,R,-1.086163,False,True,3POZ_A,-0.116560,EGFR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52394,TP53,8E7A,A,P04637,TP53,1.3,287,E,E,-1.837237,False,True,8E7A_A,-0.055440,TP53
52395,TP53,8E7A,A,P04637,TP53,1.3,288,N,N,-1.837237,False,True,8E7A_A,0.079706,TP53
52396,TP53,8E7A,A,P04637,TP53,1.3,289,L,L,-1.196778,False,True,8E7A_A,0.163353,TP53
52397,TP53,8E7A,A,P04637,TP53,1.3,290,R,R,-1.624046,False,True,8E7A_A,0.093340,TP53


# Plot the data

In [22]:
import plotly.graph_objects as go

In [26]:
# One interactive scatter plot per gene: network score (x) against the
# mutagenesis functional score (y), drawn as small semi-transparent markers.
for gene in ['TP53', 'EGFR', 'KRAS']:
    gene_rows = residue_data[residue_data['gene'] == gene]

    scatter = go.Scatter(
        x=gene_rows['network_score'],
        y=gene_rows['mutagenesis_functional_score'],
        mode='markers',
        marker=dict(color='blue', size=5, opacity=0.4),
        name='Network Score vs. Functional Score',
    )

    fig = go.Figure(data=[scatter])
    fig.update_layout(
        title=f'Network Score vs. Functional Score for {gene} Gene',
        xaxis_title='Network Score',
        yaxis_title='Functional Score',
    )
    fig.show()

## 1. Save per-chain scatter plots to a PDF

In [None]:
import os

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

# Make sure the output directory exists before opening the PDF.
os.makedirs('results', exist_ok=True)

for gene in ['TP53']:

    # One subplot per PDB chain of this gene, laid out in a 3-column grid.
    unique_pdb_chains = residue_data[residue_data['assoc_gene']==gene]['pdb_chain'].unique()
    n = len(unique_pdb_chains)
    if n == 0:
        print(f"No PDB chains found for {gene}; skipping.")
        continue

    # Ceiling division: exactly as many rows as needed (no spare empty row
    # when n is a multiple of 3).
    rows = -(-n // 3)

    # squeeze=False keeps `axes` 2-D even for a single row, so the grid can
    # always be flattened and indexed the same way.
    fig, axes = plt.subplots(rows, 3, figsize=(15, 5*rows), squeeze=False)
    flat_axes = axes.flatten()

    for ax, pdb_chain in zip(flat_axes, unique_pdb_chains):
        sns.scatterplot(x='network_score', y='mutagenesis_functional_score', data=residue_data[residue_data['pdb_chain']==pdb_chain], ax=ax)
        ax.set_xlabel('Network Score')
        ax.set_ylabel('Mutagenesis Functional Score')
        ax.set_title(f'{pdb_chain}')
        ax.grid(True)

    # Remove any unused subplots in the last row
    for unused_ax in flat_axes[n:]:
        fig.delaxes(unused_ax)

    # set supertitle at the top of the figure
    fig.suptitle(f"Network Score vs Average Mutagenesis Functional Score (Z-score) for {gene}", y=1.1, verticalalignment='top', fontsize = 15)

    fig.tight_layout()

    # The context manager closes the PDF even if saving fails.
    # bbox_inches='tight' keeps the supertitle (placed above the axes area at
    # y=1.1) inside the saved page instead of clipping it.
    with PdfPages(f'results/{gene}_scatterplots.pdf') as pdf_pages:
        pdf_pages.savefig(fig, bbox_inches='tight')

    plt.close(fig)
    # plt.show()

## 2. Interactive dash

In [None]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output, State
import plotly.express as px
import pandas as pd

# Data shared by all Dash callbacks below
df = residue_data

# Get unique genes
unique_genes = df['gene_symbol'].unique()

# Initialize Dash app
app = dash.Dash(__name__)

# Dropdown choices: one entry per gene; 'All PDBs' followed by every PDB chain
gene_options = [{'label': gene, 'value': gene} for gene in unique_genes]
pdb_options = [{'label': 'All PDBs', 'value': 'all'}]
pdb_options += [{'label': pdb_id, 'value': pdb_id} for pdb_id in df['pdb_chain'].unique()]

gene_dropdown = dcc.Dropdown(
    id='gene-dropdown',
    options=gene_options,
    value='TP53',  # Default value set to 'TP53'
    clearable=False,
)
pdb_dropdown = dcc.Dropdown(
    id='pdb-dropdown',
    options=pdb_options,
    multi=True,
)

# Define layout: gene selector, PDB-chain selector, then the scatter plot
app.layout = html.Div([
    gene_dropdown,
    html.Br(),
    pdb_dropdown,
    dcc.Graph(id='scatter-plot'),
])

# Define callback to update PDB dropdown based on selected gene
@app.callback(
    Output('pdb-dropdown', 'value'),
    [Input('gene-dropdown', 'value')]
)
def update_pdb_value(selected_gene):
    """Select 'all' plus every PDB chain of the chosen gene in the PDB dropdown."""
    gene_rows = df[df['gene_symbol'] == selected_gene]
    chains_for_gene = gene_rows['pdb_chain'].unique()
    return ['all', *chains_for_gene]

# Define callback to update scatter plot
@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('gene-dropdown', 'value'),
     Input('pdb-dropdown', 'value')]
)
def update_plot(selected_gene, selected_pdb_ids):
    """Build the network-score vs. functional-score scatter for the current selection.

    Parameters
    ----------
    selected_gene : str
        Value of the gene dropdown.
    selected_pdb_ids : list of str or None
        Values of the PDB dropdown; may contain the special entry 'all'.

    Returns
    -------
    plotly figure
        Scatter of ``network_score`` against ``mutagenesis_functional_score``.
        With 'all' selected, every chain of the gene is shown; otherwise only
        the listed PDB chains are shown.
    """
    # The PDB dropdown is declared without an initial value, so it may be None
    # here; treat that as "nothing selected" instead of raising a TypeError.
    selected_pdb_ids = selected_pdb_ids or []
    show_all = 'all' in selected_pdb_ids

    if show_all:
        filtered_df = df[df['gene_symbol'] == selected_gene]
    else:
        filtered_df = df[df['pdb_chain'].isin(selected_pdb_ids)]

    title = "Network score distribution against mutagenesis functional score"
    if show_all:
        full_title = f'{title} (All PDB chains selected)'
    else:
        full_title = f'{title} ({len(selected_pdb_ids)} PDB chains selected)'

    fig = px.scatter(filtered_df, x='network_score', y='mutagenesis_functional_score',
                     hover_data=['pdb_id', 'chain', 'res_num', 'pdb_res', 'uniprot_res'],
                     labels={'mutagenesis_functional_score': 'Functional Score from saturation mutagenesis',
                             'network_score': 'Network Score',
                             'residue_match': 'Residue Match',
                             'pdb_id': 'PDB ID',
                             'chain': 'Chain',
                             'res_num': 'Residue Number',
                             'pdb_res': 'PDB Residue',
                             'uniprot_res': 'UniProt Residue'},
                     title=full_title)

    fig.update_xaxes(title_text='Network Score')
    fig.update_yaxes(title_text='Mutagenesis Functional Score')
    fig.update_traces(opacity=.4)
    return fig

# Run the app
if __name__ == '__main__':
    # Newer Dash releases replace `run_server` with `run`; prefer `run` when it
    # exists and fall back to `run_server` on older installations.
    run_app = app.run if hasattr(app, 'run') else app.run_server
    run_app(debug=True)
