# Explore Nextclade Mutations

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib as plt
import pickle

In [None]:
# NOTE: `plt` is bound to the top-level `matplotlib` package by the imports
# cell (not to `matplotlib.pyplot`); `rcParams` is reachable through it.
plt.rcParams['figure.figsize'] = [7, 7]  # default figure size (width, height)
plt.rcParams['figure.dpi'] = 100 # 200 e.g. is really fine, but slower

In [None]:
# load nextclade counts
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source (e.g. produced by this project's own pipeline).
with open("results/nextclade.counts.pkl", 'rb') as f:
    nextclade_counts = pickle.load(f)

## Inspect nextclade counts

In [None]:
type(nextclade_counts)

In [None]:
list(nextclade_counts.keys())[0:3]

In [None]:
nextclade_counts[list(nextclade_counts.keys())[1]]

In [None]:
# Build a data frame from the mapping; orient='index' turns every key into a
# row label, so the values end up in a single unnamed column (0).
nextclade_counts_df = pd.DataFrame.from_dict(nextclade_counts, orient='index')

In [None]:
nextclade_counts_df.head()

## Preprocess Nextclade

In [None]:
def preprocess_nextclade_counts(nextclade_counts_df):
    """Derive protein and amino-acid change columns from the row labels.

    Each index label is split on the first ':' into a ``protein`` part and a
    change part (presumably labels look like ``"S:D614G"`` -- confirm against
    the producer of the counts). The first and last character of the change
    part become ``aa_from`` and ``aa_to``; ``aa_change`` is then overwritten
    with ``"<aa_from>_<aa_to>"``. The column named ``0`` is renamed to
    ``count``.

    Args:
        nextclade_counts_df: DataFrame whose index holds the string labels and
            whose column ``0`` holds the counts. It is not modified.

    Returns:
        A new DataFrame with columns ``count``, ``protein``, ``aa_change``,
        ``aa_from`` and ``aa_to`` (plus any other columns of the input).
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    out = nextclade_counts_df.copy()

    # n=1 caps the split at two parts, so a label containing a second ':'
    # cannot produce extra columns. to_series() also works for a named index.
    parts = out.index.to_series().str.split(':', n=1, expand=True)
    # Assign raw arrays: the rows are already in index order, so no label
    # alignment is needed.
    out['protein'] = parts[0].to_numpy()
    out['aa_change'] = parts[1].to_numpy()

    change = out['aa_change'].astype(str)
    out['aa_from'] = change.str[0]
    out['aa_to'] = change.str[-1]
    out['aa_change'] = out['aa_from'] + '_' + out['aa_to']

    return out.rename(columns={0: 'count'})

In [None]:
nextclade_counts_df = preprocess_nextclade_counts(nextclade_counts_df)

In [None]:
# inspect
nextclade_counts_df.head()

## Nextclade: sum of event occurrences

In [None]:
# Total count per (aa_from, aa_to) pair, summed over every row sharing that pair
summarized_counts = (
    nextclade_counts_df
    .groupby(['aa_from', 'aa_to'])[['count']]
    .sum()
)

In [None]:
summarized_counts.head()

In [None]:
# Sanity check -- OK
nextclade_counts_df[(nextclade_counts_df['aa_to'] == 'E') & (nextclade_counts_df['aa_from'] == '*')]

In [None]:
# Long format: turn the (aa_from, aa_to) row labels back into ordinary columns
summarized_counts_1 = summarized_counts['count'].reset_index()
# Independent copy kept for the later comparison tables
nextclade_comparison_table = summarized_counts_1.copy()
# Wide format: one row per aa_from, one column per aa_to
aa_changes = summarized_counts_1.pivot(
    index='aa_from',
    columns='aa_to',
    values='count',
)

In [None]:
aa_changes

In [None]:
# Plain array of the pivot table; pairs that never occur (NaN) count as 0
aa_changes_np = np.nan_to_num(aa_changes.to_numpy(), nan=0.0)

In [None]:
# Sanity Check
aa_changes_np.max()

In [None]:
# argmax() returns a position in the flattened array; dividing by the number
# of columns yields the row (quotient) and the column (remainder).
argmax_row_num, argmax_col_num = divmod(aa_changes_np.argmax(), aa_changes_np.shape[1])
print(f"Maximal value is at ({argmax_row_num}, {argmax_col_num}), zero-indexed")

In [None]:
# NOTE(review): 17 and 9 are hard-coded, presumably copied from the position
# printed by the argmax cell -- re-check them whenever the input data changes.
aa_changes_np[17,9]

In [None]:
aa_changes.columns[9]

In [None]:
aa_changes.index[17]

In [None]:
# OK
summarized_counts.loc[[("T","I")]]

In [None]:
# Heatmap of the summed counts. Tick labels are taken from the pivot table
# because the NumPy array itself carries no row/column labels.
# NOTE: sns.heatmap returns a matplotlib Axes, so despite its name `fig` is an
# Axes object; that is why labels and title are set through `.set(...)`.
fig = sns.heatmap(aa_changes_np,
            xticklabels=aa_changes.columns,
            yticklabels=aa_changes.index,)

fig.set(xlabel='aa_to',ylabel='aa_from',title='Total aa changes in Nextclade')

## Compare with mutations.tsv

In [None]:
def preprocess_mutations(mutations_tsv):
    """Add gene and amino-acid columns derived from the ``mutation`` column.

    Every ``mutation`` string is split on ':' into ``gene`` and ``aa_change``.
    From ``aa_change`` the first character becomes ``aa_from``, the last
    character ``aa_to`` and everything in between ``aa_position``.

    The frame is modified in place and the very same object is returned.
    """
    split_cols = mutations_tsv['mutation'].str.split(':', expand=True)
    mutations_tsv[['gene', 'aa_change']] = split_cols

    change = mutations_tsv['aa_change'].str
    mutations_tsv['aa_from'] = change.slice(0, 1)
    mutations_tsv['aa_to'] = change.get(-1)
    mutations_tsv['aa_position'] = change.slice(1, -1)
    return mutations_tsv

In [None]:
mutations_tsv = pd.read_table('paper/mutations.tsv')
mutations_proc = preprocess_mutations(mutations_tsv)

In [None]:
mutations_proc.head()

In [None]:
def get_top55_spike(mutations_proc):
    """Return the 55 rows with ``gene == 'S'`` that have the largest 'Δ log R'."""
    is_spike = mutations_proc['gene'] == 'S'
    ranked = mutations_proc[is_spike].sort_values('Δ log R', ascending=False)
    return ranked.iloc[:55]

In [None]:
def get_event_frequencies(mutations_proc):
    """Count how many rows fall into each (aa_from, aa_to) combination.

    Args:
        mutations_proc: DataFrame with ``aa_from`` and ``aa_to`` columns; each
            row contributes 1 to its combination.

    Returns:
        dict with
        - ``'vals'``: 2-D NumPy array of counts (rows = aa_from,
          columns = aa_to); combinations that never occur are 0,
        - ``'cols'``: column labels (aa_to values) of that matrix,
        - ``'rows'``: row labels (aa_from values) of that matrix,
        - ``'prepivot'``: long-format DataFrame with columns ``aa_from``,
          ``aa_to`` and ``count``.
    """
    # size() counts rows per group directly; no helper column of ones needed.
    pre_pivot = (
        mutations_proc.groupby(['aa_from', 'aa_to'])
        .size()
        .reset_index(name='count')
    )
    matrix = pre_pivot.pivot(index='aa_from', columns='aa_to', values='count')

    # `nan` must be given by keyword: the second positional parameter of
    # np.nan_to_num is `copy`, not the replacement value.
    values = np.nan_to_num(matrix.to_numpy(), nan=0.0)

    return {
        'vals': values,
        'cols': matrix.columns,
        'rows': matrix.index,
        'prepivot': pre_pivot,
    }

In [None]:
mutations_1 = get_event_frequencies(mutations_proc)

In [None]:
mutations_tsv_comparison_table = mutations_1['prepivot']

In [None]:
mutations_1

In [None]:
# Heatmap of how many rows fall into each (aa_from, aa_to) combination; tick
# labels come from the 'cols' / 'rows' entries returned next to the matrix.
# NOTE: sns.heatmap returns a matplotlib Axes, so despite its name `fig` is an
# Axes object; that is why labels and title are set through `.set(...)`.
fig = sns.heatmap(mutations_1['vals'],
            xticklabels=mutations_1['cols'],
            yticklabels=mutations_1['rows'],)

fig.set(xlabel='aa_to',ylabel='aa_from',title='Total types of aa changes in Mutations.tsv')

In [None]:
# Obtain Statistical significance and deltas for each event type

In [None]:
mutations_tsv_comparison_table

In [None]:
nextclade_comparison_table

In [None]:
set(mutations_tsv_comparison_table['aa_from'].unique())

In [None]:
def _pair_counts(data, used_from, used_to):
    """Return ({(aa_from, aa_to): summed count}, total count) for the retained residues."""
    subset = data[data['aa_from'].isin(used_from) & data['aa_to'].isin(used_to)]
    counts = subset.groupby(['aa_from', 'aa_to'])['count'].sum().to_dict()
    return counts, subset['count'].sum()


def _binom_pvalue_greater(successes, trials, prob):
    """One-sided ('greater') binomial p-value that works across SciPy versions."""
    if hasattr(stats, 'binomtest'):
        return stats.binomtest(successes, trials, prob, alternative='greater').pvalue
    # Older SciPy releases only ship the legacy function.
    return stats.binom_test(successes, trials, prob, alternative='greater')


def score_mutations(test_data, background_data, exclude_mutations=frozenset({'*'}), verbose=True):
    """Test every (aa_from, aa_to) pair for enrichment against a background.

    Only residues occurring in both tables (separately for ``aa_from`` and
    ``aa_to``) and not listed in ``exclude_mutations`` are considered. For
    each remaining pair the background probability is its background count
    divided by the total background count, and a one-sided binomial test
    (alternative ``'greater'``) is run on the observed count out of the total
    test count. Significance uses a Bonferroni-corrected threshold of
    ``0.05 / number_of_pairs``.

    Args:
        test_data: DataFrame with columns ``aa_from``, ``aa_to``, ``count``.
        background_data: DataFrame with the same three columns.
        exclude_mutations: iterable of residue symbols to leave out
            (default: the ``'*'`` symbol).
        verbose: print one line per tested pair when true.

    Returns:
        DataFrame with columns ``from``, ``to``, ``pval``, ``obs``, ``exp``
        (expected count = background probability * total test count) and
        ``significant``, sorted by ascending ``pval``. It is empty when no
        residue pair is left to test.

    Raises:
        ValueError: if the retained background counts sum to zero, because no
            background probability can be derived from them. SciPy may also
            reject a non-positive total test count.
    """
    excluded = set(exclude_mutations)

    # Residues must be present on both sides to be comparable.
    used_from = (
        set(test_data['aa_from'].unique()) & set(background_data['aa_from'].unique())
    ) - excluded
    used_to = (
        set(test_data['aa_to'].unique()) & set(background_data['aa_to'].unique())
    ) - excluded

    columns = ['from', 'to', 'pval', 'obs', 'exp', 'significant']
    n_tests = len(used_from) * len(used_to)
    if n_tests == 0:
        # Nothing to compare; also avoids dividing alpha by zero below.
        return pd.DataFrame(columns=columns)

    # Summed lookups: one pass per table instead of one filter per pair, and
    # duplicate (from, to) rows are added up rather than treated as absent.
    test_counts, test_sum = _pair_counts(test_data, used_from, used_to)
    bg_counts, bg_sum = _pair_counts(background_data, used_from, used_to)
    if bg_sum == 0:
        raise ValueError("background counts sum to zero; cannot derive probabilities")

    alpha = 0.05
    threshold = alpha / n_tests  # Bonferroni correction

    ret = {name: [] for name in columns}

    # Sorted iteration keeps the row order reproducible between runs.
    for from_mut in sorted(used_from, key=str):
        for to_mut in sorted(used_to, key=str):
            obs_count = test_counts.get((from_mut, to_mut), 0)
            bg_count = bg_counts.get((from_mut, to_mut), 0)

            bg_prob = bg_count / bg_sum

            pval = _binom_pvalue_greater(obs_count, test_sum, bg_prob)
            sign = pval < threshold

            if verbose:
                print(f"{from_mut} -> {to_mut}, obs: {obs_count}/{test_sum} bg_prob: {bg_prob}, pval: {pval}")

            ret['from'].append(from_mut)
            ret['to'].append(to_mut)
            ret['obs'].append(obs_count)
            ret['exp'].append(bg_prob * test_sum)
            ret['pval'].append(pval)
            ret['significant'].append(sign)

    # A stable sort keeps tied p-values in the deterministic iteration order.
    return pd.DataFrame(ret).sort_values('pval', kind='mergesort')

In [None]:
import scipy.stats as stats

In [None]:
# stats.binom_test is no longer available in current SciPy releases; binomtest
# returns a result object whose .pvalue is the equivalent (two-sided) p-value.
stats.binomtest(50, 100, 0.5).pvalue

In [None]:
# Mutations vs Nextclade
r = score_mutations(mutations_tsv_comparison_table, nextclade_comparison_table, verbose = False)

In [None]:
#r.to_csv('score_mutations.csv')

In [None]:
r.head(40)

## Enrichment of top mutations

In [None]:
mutations_tsv_comparison_table.head()

In [None]:
# Use the helper defined earlier in this notebook instead of repeating its
# filter / sort / head chain inline.
top55_spike = get_top55_spike(mutations_proc)

In [None]:
top55_spike_freq = get_event_frequencies(top55_spike)

In [None]:
top55_spike_freq

In [None]:
# NOTE(review): if this runs as a single notebook cell, only the last bare
# expression is displayed -- confirm whether all three tables were meant to be shown.
mutations_tsv_comparison_table
top55_spike_freq['prepivot']
nextclade_comparison_table

# In Summary

## Input to model VS nextclade

In [None]:
r = score_mutations(mutations_tsv_comparison_table, nextclade_comparison_table, verbose = False)

In [None]:
r[(r['from'] == 'V') & (r['to'] == 'F')]

In [None]:
r[(r['from'] == 'A') & (r['to'] == 'V')]

## Top 55 VS Input to Model

In [None]:
r = score_mutations(top55_spike_freq['prepivot'], mutations_tsv_comparison_table, verbose = False)

In [None]:
r[(r['from'] == 'V') & (r['to'] == 'F')]

In [None]:
r[(r['from'] == 'A') & (r['to'] == 'V')]

## Top 55 VS nextclade

In [None]:
r = score_mutations(top55_spike_freq['prepivot'], nextclade_comparison_table, verbose = False)

In [None]:
r[(r['from'] == 'V') & (r['to'] == 'F')]

In [None]:
r[(r['from'] == 'A') & (r['to'] == 'V')]