In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LinearRegression
from scipy import stats
import matplotlib.pyplot as plt
Amino_acid_dict = {'Ala': 'A',
                   'Cys': 'C',
                   'Asp': 'D',
                   'Glu': 'E',
                   'Phe': 'F',
                   'Gly': 'G',
                   'His': 'H',
                   'Ile': 'I',
                   'Lys': 'K',
                   'Leu': 'L',
                   'Met': 'M',
                   'Asn': 'N',                   
                   'Pro': 'P',
                   'Gln': 'Q',
                   'Arg': 'R',
                   'Ser': 'S',
                   'Thr': 'T',
                   'Val': 'V',
                   'Trp': 'W',
                   'Tyr': 'Y',
                   'Ter': '*',
                   'wt':'wt'
                  }

codontab = {
    'TCA': 'S',    # Serina
    'TCC': 'S',    # Serina
    'TCG': 'S',    # Serina
    'TCT': 'S',    # Serina
    'TTC': 'F',    # Fenilalanina
    'TTT': 'F',    # Fenilalanina
    'TTA': 'L',    # Leucina
    'TTG': 'L',    # Leucina
    'TAC': 'Y',    # Tirosina
    'TAT': 'Y',    # Tirosina
    'TAA': '*',    # Stop
    'TAG': '*',    # Stop
    'TGC': 'C',    # Cisteina
    'TGT': 'C',    # Cisteina
    'TGA': '*',    # Stop
    'TGG': 'W',    # Triptofano
    'CTA': 'L',    # Leucina
    'CTC': 'L',    # Leucina
    'CTG': 'L',    # Leucina
    'CTT': 'L',    # Leucina
    'CCA': 'P',    # Prolina
    'CCC': 'P',    # Prolina
    'CCG': 'P',    # Prolina
    'CCT': 'P',    # Prolina
    'CAC': 'H',    # Histidina
    'CAT': 'H',    # Histidina
    'CAA': 'Q',    # Glutamina
    'CAG': 'Q',    # Glutamina
    'CGA': 'R',    # Arginina
    'CGC': 'R',    # Arginina
    'CGG': 'R',    # Arginina
    'CGT': 'R',    # Arginina
    'ATA': 'I',    # Isoleucina
    'ATC': 'I',    # Isoleucina
    'ATT': 'I',    # Isoleucina
    'ATG': 'M',    # Methionina
    'ACA': 'T',    # Treonina
    'ACC': 'T',    # Treonina
    'ACG': 'T',    # Treonina
    'ACT': 'T',    # Treonina
    'AAC': 'N',    # Asparagina
    'AAT': 'N',    # Asparagina
    'AAA': 'K',    # Lisina
    'AAG': 'K',    # Lisina
    'AGC': 'S',    # Serina
    'AGT': 'S',    # Serina
    'AGA': 'R',    # Arginina
    'AGG': 'R',    # Arginina
    'GTA': 'V',    # Valina
    'GTC': 'V',    # Valina
    'GTG': 'V',    # Valina
    'GTT': 'V',    # Valina
    'GCA': 'A',    # Alanina
    'GCC': 'A',    # Alanina
    'GCG': 'A',    # Alanina
    'GCT': 'A',    # Alanina
    'GAC': 'D',    # Acido Aspartico
    'GAT': 'D',    # Acido Aspartico
    'GAA': 'E',    # Acido Glutamico
    'GAG': 'E',    # Acido Glutamico
    'GGA': 'G',    # Glicina
    'GGC': 'G',    # Glicina
    'GGG': 'G',    # Glicina
    'GGT': 'G'     # Glicina
}

In [2]:
df_count = pd.read_csv('../raw_data/YAP1_amino_acid_variant.csv', engine='python', skiprows=4)

site_list = []
score_list = []
double_mutant = {}
single_mutant = {}
for i,j,k,l,m in zip(df_count['hgvs_pro'].tolist(), df_count['101208_c_0'].tolist(), df_count['101208_c_1'].tolist(), df_count['101208_c_2'].tolist(), df_count['101208_c_3'].tolist()):
    r = re.compile("([a-zA-Z]+)([0-9]+)")
    string_list = re.split(r'(\w+)', i)
    string_list = [i for i in string_list if len(i)>6]
    if len(string_list)==1:
        single_mutant[string_list[0]] = [j,k,l,m]
    if len(string_list)==2:
        double_mutant[str(string_list)] = [j,k,l,m]
#     print(string_list, j)

wt_row = df_count.loc[df_count['hgvs_pro'].str.contains('wt')]
total_single_count = []
total_double_count = []
total_ter_count = []
for i in range(4):
    total_single_count.append(sum([float(v[i]) for v in single_mutant.values()])+wt_row['101208_c_'+str(i)].values[0])
    total_double_count.append(sum([float(v[i]) for v in double_mutant.values()])+wt_row['101208_c_'+str(i)].values[0])
    total_ter_count.append(sum([float(v[i]) for v,x in zip(single_mutant.values(), single_mutant.keys()) if x[-3:]=='Ter']))

total_single_count

carryover_rate = [i/j for i,j in zip(total_ter_count, total_single_count)]

single_count_table = pd.DataFrame.from_dict(single_mutant,orient='index').reset_index()
double_count_table = pd.DataFrame.from_dict(double_mutant,orient='index').reset_index()

double_count_table['mutant1'] = [i.strip('][').split(', ')[0][1:-1] for i in double_count_table['index']]
double_count_table['mutant2'] = [i.strip('][').split(', ')[1][1:-1] for i in double_count_table['index']]
double_count_table = double_count_table.drop('index', axis=1)

double_count_table


Unnamed: 0,0,1,2,3,mutant1,mutant2
0,37,17,6,6,Ala11Gly,His23Asp
1,1,1,3,1,Pro5Ser,Ser15Leu
2,33,12,6,12,Arg18Leu,Gln26Glu
3,54,55,29,130,Asp25Asn,Thr29Ser
4,65,45,13,24,Pro5Gln,Asn22Lys
...,...,...,...,...,...,...
20914,37,14,4,10,Pro5Gln,Glu9Ala
20915,34,13,5,2,Met10Val,His23Tyr
20916,42,40,20,106,Ala6Gly,Gln17Arg
20917,14,10,7,6,Ala6Pro,Thr29Met


In [29]:
df_count = pd.read_csv('../raw_data/YAP1_amino_acid_variant.csv', engine='python', skiprows=4)

site_list = []
score_list = []
double_mutant = {}
single_mutant = {}
for i,j,k,l,m in zip(df_count['hgvs_pro'].tolist(), df_count['101208_c_0'].tolist(), df_count['101208_c_1'].tolist(), df_count['101208_c_2'].tolist(), df_count['101208_c_3'].tolist()):
    r = re.compile("([a-zA-Z]+)([0-9]+)")
    string_list = re.split(r'(\w+)', i)
    string_list = [i for i in string_list if len(i)>6]
    if len(string_list)==1:
        single_mutant[string_list[0]] = [j,k,l,m]
    if len(string_list)==2:
        double_mutant[str(string_list)] = [j,k,l,m]
#     print(string_list, j)

wt_row = df_count.loc[df_count['hgvs_pro'].str.contains('wt')]
total_single_count = []
total_double_count = []
total_ter_count = []
for i in range(4):
    total_single_count.append(sum([float(v[i]) for v in single_mutant.values()])+wt_row['101208_c_'+str(i)].values[0])
    total_double_count.append(sum([float(v[i]) for v in double_mutant.values()])+wt_row['101208_c_'+str(i)].values[0])
    total_ter_count.append(sum([float(v[i]) for v,x in zip(single_mutant.values(), single_mutant.keys()) if x[-3:]=='Ter']))

total_single_count

carryover_rate = [i/j for i,j in zip(total_ter_count, total_single_count)]

single_count_table = pd.DataFrame.from_dict(single_mutant,orient='index').reset_index()
double_count_table = pd.DataFrame.from_dict(double_mutant,orient='index').reset_index()

double_count_table['mutant1'] = [i.strip('][').split(', ')[0][1:-1] for i in double_count_table['index']]
double_count_table['mutant2'] = [i.strip('][').split(', ')[1][1:-1] for i in double_count_table['index']]
double_count_table = double_count_table.drop('index', axis=1)

double_count_table

for i in range(4):
    double_count_table[[i]]/=total_double_count[i]
    single_count_table[[i]]/=total_single_count[i]
df2 = {'index':'wt', 
       0: wt_row['101208_c_'+str(0)].values[0]/total_single_count[0],
       1: wt_row['101208_c_'+str(1)].values[0]/total_single_count[1],
       2: wt_row['101208_c_'+str(2)].values[0]/total_single_count[2],
       3: wt_row['101208_c_'+str(3)].values[0]/total_single_count[3]}
single_count_table = single_count_table.append(df2, ignore_index = True)
df2 = {'mutant1':'wt', 
       'mutant2':'wt', 
       0: wt_row['101208_c_'+str(0)].values[0]/total_double_count[0],
       1: wt_row['101208_c_'+str(1)].values[0]/total_double_count[1],
       2: wt_row['101208_c_'+str(2)].values[0]/total_double_count[2],
       3: wt_row['101208_c_'+str(3)].values[0]/total_double_count[3]}
double_count_table = double_count_table.append(df2, ignore_index = True)
# print(double_count_table[3].sum(), single_count_table[1].sum())
for i in range(1,4):
    
    NSCOR_input = [k*(1-carryover_rate[i]) for k in single_count_table[i-1]]
    NSCOR_output = [j - k*carryover_rate[i] for k,j in zip(single_count_table[i-1], single_count_table[i])]
    single_count_table['NSCOR_ratio_'+str(i)] = [j/k for k,j in zip(NSCOR_input, NSCOR_output)]
    NSCOR_input = [k*(1-carryover_rate[i]) for k in double_count_table[i-1]]
    NSCOR_output = [j - k*carryover_rate[i] for k,j in zip(double_count_table[i-1], double_count_table[i])]
    double_count_table['NSCOR_ratio_'+str(i)] = [j/k for k,j in zip(NSCOR_input, NSCOR_output)]
    

single_count_table['projected_0'] = 0

single_count_table['projected_1'] = [np.log2(i) for i in single_count_table['NSCOR_ratio_1'].tolist()]
single_count_table['projected_2'] = [np.log2(i*j) for i,j in zip(single_count_table['NSCOR_ratio_1'].tolist(),
                                                                 single_count_table['NSCOR_ratio_2'].tolist())]
single_count_table['projected_3'] = [np.log2(i*j*k) for i,j,k in zip(single_count_table['NSCOR_ratio_1'].tolist(),
                                                                     single_count_table['NSCOR_ratio_2'].tolist(),
                                                                     single_count_table['NSCOR_ratio_3'].tolist())]
double_count_table['projected_0'] = 0

double_count_table['projected_1'] = [np.log2(i) for i in double_count_table['NSCOR_ratio_1'].tolist()]
double_count_table['projected_2'] = [np.log2(i*j) for i,j in zip(double_count_table['NSCOR_ratio_1'].tolist(),
                                                                 double_count_table['NSCOR_ratio_2'].tolist())]
double_count_table['projected_3'] = [np.log2(i*j*k) for i,j,k in zip(double_count_table['NSCOR_ratio_1'].tolist(),
                                                                     double_count_table['NSCOR_ratio_2'].tolist(),
                                                                     double_count_table['NSCOR_ratio_3'].tolist())]
# print(double_count_table[3].sum(), single_count_table[1].sum())
slope = []
for i in range(single_count_table.shape[0]):
#     print(i)
    X = np.array([1,2,3,4]).reshape(4,1)
    row = single_count_table.iloc[i,:]
    y =  np.array([row['projected_0'], row['projected_1'], row['projected_2'], row['projected_3']]).reshape(4,1)
    reg = LinearRegression().fit(X, y)
    
    if reg.score(X,y)>=0.75:
#         reg = LinearRegression().fit(X, y)
        slope.append(reg.coef_[0][0])
    else:
        slope.append(np.nan)
#     print(row)
single_count_table['slope_v'] = slope

slope = []
for i in range(double_count_table.shape[0]):
    X = np.array([1,2,3,4]).reshape(4,1)
    row = double_count_table.iloc[i,:]
    y =  np.array([row['projected_0'], row['projected_1'], row['projected_2'], row['projected_3']]).reshape(4,1)
    reg = LinearRegression().fit(X, y)
    if reg.score(X,y)>0:
#         reg = LinearRegression().fit(X, y)
        slope.append(reg.coef_[0][0])
    else:
        slope.append(np.nan)

double_count_table['slope_v'] = slope

# double_count_table = double_count_table.dropna()
# single_count_table = single_count_table.dropna()
# print(double_count_table[3].sum(), single_count_table[1].sum())
wt_score = single_count_table.loc[single_count_table['index']=='wt']['slope_v'].values[0]
single_count_table['function_score'] = [2**(i-wt_score) for i in single_count_table['slope_v'].tolist()]
single_count_table

double_count_table['function_score'] = [2**(i-wt_score) for i in double_count_table['slope_v'].tolist()]
double_count_table
# print(double_count_table[3].sum(), single_count_table[1].sum())
predicted_score = []
single_mut_list = single_count_table['index'].tolist()
for i in range(double_count_table.shape[0]-1):
#     print(i)
    mut1 = double_count_table.iloc[i]['mutant1']
    mut2 = double_count_table.iloc[i]['mutant2']
    if mut1 in single_mut_list and mut2 in single_mut_list:
        single_score1 = single_count_table.loc[single_count_table['index']==mut1]['function_score'].values[0]
        single_score2 = single_count_table.loc[single_count_table['index']==mut2]['function_score'].values[0]
        predicted_score.append(single_score1*single_score2)
    else:
        predicted_score.append(np.nan)
predicted_score.append(1)
# print(double_count_table[3].sum(), single_count_table[1].sum())

double_count_table['predicted_score'] = predicted_score
double_count_table['epistasis_paper'] = double_count_table['function_score'] - double_count_table['predicted_score']
double_count_table = double_count_table.fillna(0)
double_count_table


Unnamed: 0,0,1,2,3,mutant1,mutant2,NSCOR_ratio_1,NSCOR_ratio_2,NSCOR_ratio_3,projected_0,projected_1,projected_2,projected_3,slope_v,function_score,predicted_score,epistasis_paper
0,3.440468e-05,0.000021,0.000015,4.484462e-06,Ala11Gly,His23Asp,0.589283,0.695041,0.304669,0,-0.762967,-1.287798,-3.002482,-0.953228,0.454957,0.310687,0.144270
1,9.298563e-07,0.000001,0.000007,7.474104e-07,Pro5Ser,Ser15Leu,1.322325,6.024057,0.097298,0,0.403077,2.993813,-0.367633,0.148784,0.976582,0.000000,0.000000
2,3.068526e-05,0.000015,0.000015,8.968925e-06,Arg18Leu,Gln26Glu,0.459335,0.991097,0.615726,0,-1.122382,-1.135283,-1.834923,-0.551767,0.600928,0.571756,0.029172
3,5.021224e-05,0.000067,0.000070,9.716335e-05,Asp25Asn,Thr29Ser,1.347439,1.046002,1.388005,0,0.430220,0.495105,0.968119,0.296924,1.082190,0.000000,0.000000
4,6.044066e-05,0.000055,0.000031,1.793785e-05,Pro5Gln,Asn22Lys,0.905055,0.566092,0.567871,0,-0.143922,-0.964815,-1.781179,-0.616443,0.574583,0.368192,0.206391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20915,3.161511e-05,0.000016,0.000012,1.494821e-06,Met10Val,His23Tyr,0.484717,0.758807,0.118035,0,-1.044786,-1.442981,-4.525693,-1.397527,0.334367,0.395160,-0.060793
20916,3.905396e-05,0.000049,0.000048,7.922550e-05,Ala6Gly,Gln17Arg,1.257748,0.991097,1.642214,0,0.330843,0.317941,1.033583,0.308785,1.091124,0.000000,0.000000
20917,1.301799e-05,0.000012,0.000017,4.484462e-06,Ala6Pro,Thr29Met,0.934860,1.393734,0.260233,0,-0.097178,0.381778,-1.560349,-0.420209,0.658302,0.561981,0.096321
20918,4.184353e-05,0.000039,0.000010,8.221515e-06,Lys12Glu,Arg34Cys,0.930555,0.236153,0.849019,0,-0.103837,-2.186042,-2.422174,-0.934873,0.460782,0.461159,-0.000377


In [30]:
double_count_table[double_count_table['predicted_score']<0]

Unnamed: 0,0,1,2,3,mutant1,mutant2,NSCOR_ratio_1,NSCOR_ratio_2,NSCOR_ratio_3,projected_0,projected_1,projected_2,projected_3,slope_v,function_score,predicted_score,epistasis_paper


In [31]:
EPISTASIS_FILE = './rep1_-3.txt'
INDEX_FILE     = './index_matrix.csv'
SEQUENCE       = "DVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPR"
AA  = sorted(['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '*'])

with open(EPISTASIS_FILE) as f:
    content = f.readlines()
content = [float(x.strip()) for x in content]
df_index = pd.read_csv(INDEX_FILE, header=None)
df_index.rename(columns = {0 : 'state_1', 1 : 'state_2', 2 : 'mat_index'}, inplace = True)
index_list = df_index['mat_index'].tolist()
selection_list = [content[i] for i in index_list]
df_index['selection_coefficients'] = selection_list
df_index['site_1'] = df_index['state_1']/21 + 2
df_index['site_1'] = df_index['site_1'].astype(int)
df_index['AA_1']   = df_index['state_1']%21
df_index['AA_1']   = [AA[x] for x in df_index['AA_1'].tolist()]
df_index['site_2'] = df_index['state_2']/21 + 2
df_index['site_2'] = df_index['site_2'].astype(int)
df_index['AA_2']   = df_index['state_2']%21
df_index['AA_2']   = [AA[x] for x in df_index['AA_2'].tolist()]

cols = ['site_1', 'AA_1', 'site_2', 'AA_2']
df_index['all_variant'] = df_index[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis = 1)

WT_list = []; axis_1 = []; axis_2 = []
for i in range(len(SEQUENCE)):
    WT_list.append(i * 21 + AA.index(SEQUENCE[i]))
df_WT_epi = df_index[(~df_index['state_1'].isin(WT_list))&(~df_index['state_2'].isin(WT_list))&(df_index['state_2'] != df_index['state_1'])]
df_WT_epi = df_WT_epi.sort_values(['site_1', 'site_2'])

df_WT_epi['absolute_s'] = np.abs(df_WT_epi['selection_coefficients'])
df_WT_epi = df_WT_epi.groupby(['site_1', 'site_2'], as_index=False)['absolute_s'].agg('sum')
df_WT_epi['site_1'] += 8
df_WT_epi['site_2'] += 8
df_WT_epi_ = df_WT_epi.copy()
df_WT_epi_ = df_WT_epi_.rename(columns={"site_1":"site_2", "site_2":"site_1"})
df_WT_epi  = df_WT_epi.append(df_WT_epi_, ignore_index=True, sort=False)
flights = df_WT_epi.pivot("site_1", "site_2", "absolute_s")


df_index['site_1'] = df_index['site_1']-1
df_index['site_2'] = df_index['site_2']-1
df_index

df_index.loc[df_index['all_variant']=='']

double_count_table_ = double_count_table.iloc[:-1]
double_count_table_['site_1'] = [int(i[3:-3])+1 for i in double_count_table_['mutant1'].tolist() ]
double_count_table_['site_2'] = [int(i[3:-3])+1 for i in double_count_table_['mutant2'].tolist() ]
double_count_table_['AA_1'] = [Amino_acid_dict[i[-3:]] for i in double_count_table_['mutant1'].tolist()]
double_count_table_['AA_2'] = [Amino_acid_dict[i[-3:]] for i in double_count_table_['mutant2'].tolist()]
double_count_table_['all_variant'] = [str(i)+'_'+str(j)+'_'+str(k)+'_'+str(l) for i,j,k,l in zip(double_count_table_['site_1'], double_count_table_['AA_1'],double_count_table_['site_2'],double_count_table_['AA_2'])]
double_count_table_['epistasis_paper'] = double_count_table_['function_score'] - double_count_table_['predicted_score']

pd.set_option('display.max_columns', 500)
df_merge = pd.merge(double_count_table_, df_index, how="inner", on=["all_variant"])
df_merge = df_merge.rename(columns={'selection_coefficients':'epistasis_MPL'})
df_merge

df_merge['mu1_single'] = [str(i)+'_'+str(j)+'_'+str(i)+'_'+str(j) for i,j in zip(df_merge['site_1_x'], df_merge['AA_1_x'])]
df_merge['mu2_single'] = [str(i)+'_'+str(j)+'_'+str(i)+'_'+str(j) for i,j in zip(df_merge['site_2_x'], df_merge['AA_2_x'])]
df_temp = df_index[['all_variant', 'selection_coefficients']]
df_temp = df_temp.rename(columns={'all_variant':'mu1_single'})
df_merge = pd.merge(df_temp, df_merge, how="inner", on=["mu1_single"])
df_merge = df_merge.rename(columns={'selection_coefficients':'selection_coefficients_mut1'})
df_temp = df_index[['all_variant', 'selection_coefficients']]
df_temp = df_temp.rename(columns={'all_variant':'mu2_single'})
df_merge = pd.merge(df_temp, df_merge, how="inner", on=["mu2_single"])
df_merge = df_merge.rename(columns={'selection_coefficients':'selection_coefficients_mut2', 
                                    0:"freq_gen0", 1:"freq_gen1", 2:"freq_gen2", 3:"freq_gen3",
                                    "site_1_y": "site_1", "AA_1_y": "AA_1", "site_2_y": "site_2", "AA_2_y": "AA_2"})
df_merge = df_merge.drop(["mu2_single", "mu1_single", "all_variant",
                          "site_1_x", "site_2_x", "AA_1_x", "AA_2_x", 
                          "state_1", "state_2", "mat_index"], axis=1)

df_merge1 = df_merge.copy()
df_merge1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,selection_coefficients_mut2,selection_coefficients_mut1,freq_gen0,freq_gen1,freq_gen2,freq_gen3,mutant1,mutant2,NSCOR_ratio_1,NSCOR_ratio_2,NSCOR_ratio_3,projected_0,projected_1,projected_2,projected_3,slope_v,function_score,predicted_score,epistasis_paper,epistasis_MPL,site_1,AA_1,site_2,AA_2
0,0.012913,0.020034,0.000021,0.000035,0.000010,5.979283e-06,Pro33Thr,Arg34Ser,1.676098,0.262186,0.615726,0,0.745106,-1.186233,-1.885872,-0.758896,0.520559,0.242141,0.278418,0.010507,33,T,34,S
1,0.012913,0.027446,0.000035,0.000035,0.000010,2.989642e-06,Pro33Ser,Arg34Ser,1.001137,0.262186,0.304669,0,0.001639,-1.929700,-3.644384,-1.286449,0.361128,0.244539,0.116589,0.009983,33,S,34,S
2,0.012913,0.032836,0.000037,0.000023,0.000022,1.195857e-05,Pro33Arg,Arg34Ser,0.610358,0.938119,0.546602,0,-0.712272,-0.804430,-1.675866,-0.511976,0.617733,0.309797,0.307936,0.002757,33,R,34,S
3,0.012913,0.018224,0.000027,0.000032,0.000015,6.726694e-06,Pro33Gln,Arg34Ser,1.182036,0.449086,0.460198,0,0.241274,-0.913662,-2.033336,-0.725495,0.532752,0.254773,0.277979,0.009727,33,Q,34,S
4,0.012913,0.024075,0.000033,0.000023,0.000024,7.474104e-07,Pro33Leu,Arg34Ser,0.702381,1.044076,0.024718,0,-0.509674,-0.447448,-5.785738,-1.729499,0.265638,0.231093,0.034545,0.006959,33,L,34,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19152,0.010342,0.022378,0.000086,0.000076,0.000044,3.737052e-05,Pro5Arg,Met10Ile,0.870283,0.568978,0.857659,0,-0.200444,-1.014000,-1.235523,-0.452012,0.643949,0.483363,0.160585,0.017653,5,R,10,I
19153,0.010342,0.004376,0.000079,0.000061,0.000085,1.016478e-04,Ala6Ser,Met10Ile,0.763920,1.393734,1.202291,0,-0.388507,0.090448,0.356234,0.154766,0.980640,0.807257,0.173383,-0.009499,6,S,10,I
19154,0.010342,-0.022180,0.000036,0.000045,0.000075,7.997291e-05,Val2Asp,Met10Ile,1.252780,1.671227,1.067260,0,0.325133,1.066041,1.159953,0.422077,1.180262,1.237021,-0.056760,0.000610,2,D,10,I
19155,0.010342,0.006867,0.000091,0.000066,0.000039,2.840160e-05,Pro5Gln,Met10Ile,0.713452,0.581004,0.732373,0,-0.487113,-1.270492,-1.719843,-0.594291,0.583474,0.558703,0.024770,-0.007054,5,Q,10,I


In [37]:
df_count = pd.read_csv('../raw_data/YAP1_amino_acid_variant.csv', engine='python', skiprows=4)
site_list = []
score_list = []
double_mutant = {}
single_mutant = {}
for i,j,k,l,m in zip(df_count['hgvs_pro'].tolist(), 
                     df_count['110307_c_0'].tolist(), 
                     df_count['110307_c_1'].tolist(), 
                     df_count['110307_c_2'].tolist(), 
                     df_count['110307_c_3'].tolist()):
    r = re.compile("([a-zA-Z]+)([0-9]+)")
    string_list = re.split(r'(\w+)', i)
    string_list = [i for i in string_list if len(i)>6]
    if len(string_list)==1:
        single_mutant[string_list[0]] = [j,k,l,m]
    if len(string_list)==2:
        double_mutant[str(string_list)] = [j,k,l,m]

wt_row = df_count.loc[df_count['hgvs_pro'].str.contains('wt')]
total_single_count = []
total_double_count = []
total_ter_count = []
for i in range(4):
    total_single_count.append(sum([float(v[i]) for v in single_mutant.values()])+wt_row['110307_c_'+str(i)].values[0])
    total_double_count.append(sum([float(v[i]) for v in double_mutant.values()])+wt_row['110307_c_'+str(i)].values[0])
    total_ter_count.append(sum([float(v[i]) for v,x in zip(single_mutant.values(), single_mutant.keys()) if x[-3:]=='Ter']))

carryover_rate = [i/j for i,j in zip(total_ter_count, total_single_count)]

single_count_table = pd.DataFrame.from_dict(single_mutant,orient='index').reset_index()
double_count_table = pd.DataFrame.from_dict(double_mutant,orient='index').reset_index()

double_count_table['mutant1'] = [i.strip('][').split(', ')[0][1:-1] for i in double_count_table['index']]
double_count_table['mutant2'] = [i.strip('][').split(', ')[1][1:-1] for i in double_count_table['index']]
double_count_table = double_count_table.drop('index', axis=1)

double_count_table

for i in range(4):
    double_count_table[[i]]/=total_single_count[i]
    single_count_table[[i]]/=total_double_count[i]
df2 = {'index':'wt', 
       0: wt_row['110307_c_'+str(0)].values[0]/total_single_count[0],
       1: wt_row['110307_c_'+str(1)].values[0]/total_single_count[1],
       2: wt_row['110307_c_'+str(2)].values[0]/total_single_count[2],
       3: wt_row['110307_c_'+str(3)].values[0]/total_single_count[3]}
single_count_table = single_count_table.append(df2, ignore_index = True)
df2 = {'mutant1':'wt', 
       'mutant2':'wt', 
       0: wt_row['110307_c_'+str(0)].values[0]/total_single_count[0],
       1: wt_row['110307_c_'+str(1)].values[0]/total_single_count[1],
       2: wt_row['110307_c_'+str(2)].values[0]/total_single_count[2],
       3: wt_row['110307_c_'+str(3)].values[0]/total_single_count[3]}
double_count_table = double_count_table.append(df2, ignore_index = True)
for i in range(1,4):
    
    NSCOR_input = [k*(1-carryover_rate[i]) for k in single_count_table[i-1]]
    NSCOR_output = [j - k*carryover_rate[i] for k,j in zip(single_count_table[i-1], single_count_table[i])]
    single_count_table['NSCOR_ratio_'+str(i)] = [j/k for k,j in zip(NSCOR_input, NSCOR_output)]
    NSCOR_input = [k*(1-carryover_rate[i]) for k in double_count_table[i-1]]
    NSCOR_output = [j - k*carryover_rate[i] for k,j in zip(double_count_table[i-1], double_count_table[i])]
    double_count_table['NSCOR_ratio_'+str(i)] = [j/k for k,j in zip(NSCOR_input, NSCOR_output)]
    

single_count_table['projected_0'] = 0

single_count_table['projected_1'] = [np.log2(i) for i in single_count_table['NSCOR_ratio_1'].tolist()]
single_count_table['projected_2'] = [np.log2(i*j) for i,j in zip(single_count_table['NSCOR_ratio_1'].tolist(),
                                                                 single_count_table['NSCOR_ratio_2'].tolist())]
single_count_table['projected_3'] = [np.log2(i*j*k) for i,j,k in zip(single_count_table['NSCOR_ratio_1'].tolist(),
                                                                     single_count_table['NSCOR_ratio_2'].tolist(),
                                                                     single_count_table['NSCOR_ratio_3'].tolist())]
double_count_table['projected_0'] = 0

double_count_table['projected_1'] = [np.log2(i) for i in double_count_table['NSCOR_ratio_1'].tolist()]
double_count_table['projected_2'] = [np.log2(i*j) for i,j in zip(double_count_table['NSCOR_ratio_1'].tolist(),
                                                                 double_count_table['NSCOR_ratio_2'].tolist())]
double_count_table['projected_3'] = [np.log2(i*j*k) for i,j,k in zip(double_count_table['NSCOR_ratio_1'].tolist(),
                                                                     double_count_table['NSCOR_ratio_2'].tolist(),
                                                                     double_count_table['NSCOR_ratio_3'].tolist())]

slope = []
for i in range(single_count_table.shape[0]):
#     print(i)
    X = np.array([1,2,3,4]).reshape(4,1)
    row = single_count_table.iloc[i,:]
    y =  np.array([row['projected_0'], row['projected_1'], row['projected_2'], row['projected_3']]).reshape(4,1)
    reg = LinearRegression().fit(X, y)
    
    if reg.score(X,y)>=0.75:
#         reg = LinearRegression().fit(X, y)
        slope.append(reg.coef_[0][0])
    else:
        slope.append(np.nan)
#     print(row)
single_count_table['slope_v'] = slope

slope = []
for i in range(double_count_table.shape[0]):
    X = np.array([1,2,3,4]).reshape(4,1)
    row = double_count_table.iloc[i,:]
    y =  np.array([row['projected_0'], row['projected_1'], row['projected_2'], row['projected_3']]).reshape(4,1)
    reg = LinearRegression().fit(X, y)
    if reg.score(X,y)>0:
#         reg = LinearRegression().fit(X, y)
        slope.append(reg.coef_[0][0])
    else:
        slope.append(np.nan)

double_count_table['slope_v'] = slope

double_count_table = double_count_table.dropna()
single_count_table = single_count_table.dropna()

wt_score = single_count_table.loc[single_count_table['index']=='wt']['slope_v'].values[0]
single_count_table['function_score'] = [2**(i-wt_score) for i in single_count_table['slope_v'].tolist()]
single_count_table

double_count_table['function_score'] = [2**(i-wt_score) for i in double_count_table['slope_v'].tolist()]
double_count_table

predicted_score = []
single_mut_list = single_count_table['index'].tolist()
for i in range(double_count_table.shape[0]-1):
#     print(i)
    mut1 = double_count_table.iloc[i]['mutant1']
    mut2 = double_count_table.iloc[i]['mutant2']
    if mut1 in single_mut_list and mut2 in single_mut_list:
        single_score1 = single_count_table.loc[single_count_table['index']==mut1]['function_score'].values[0]
        single_score2 = single_count_table.loc[single_count_table['index']==mut2]['function_score'].values[0]
        predicted_score.append(single_score1*single_score2)
    else:
        predicted_score.append(np.nan)
predicted_score.append(1)
double_count_table['predicted_score'] = predicted_score
double_count_table['epistasis_paper'] = double_count_table['function_score'] - double_count_table['predicted_score']
double_count_table = double_count_table.dropna()
double_count_table


Unnamed: 0,0,1,2,3,mutant1,mutant2,NSCOR_ratio_1,NSCOR_ratio_2,NSCOR_ratio_3,projected_0,projected_1,projected_2,projected_3,slope_v,function_score,predicted_score,epistasis_paper
0,0.000026,0.000016,0.000012,0.000004,Ala11Gly,His23Asp,0.604466,0.765932,0.311113,0,-0.726266,-1.110978,-2.795466,-0.877111,0.483723,0.364938,0.118785
1,0.000001,0.000002,0.000002,0.000001,Pro5Ser,Ser15Leu,1.401768,1.000881,0.861426,0,0.487248,0.488519,0.273317,0.082122,0.940491,0.672154,0.268337
2,0.000029,0.000022,0.000014,0.000004,Arg18Leu,Gln26Glu,0.742866,0.637012,0.295469,0,-0.428826,-1.079433,-2.838354,-0.916567,0.470673,0.721229,-0.250556
4,0.000044,0.000035,0.000021,0.000009,Pro5Gln,Asn22Lys,0.803792,0.594482,0.402731,0,-0.315106,-1.065400,-2.377511,-0.788283,0.514442,0.449077,0.065365
5,0.000001,0.000002,0.000002,0.000004,Pro3His,Gly16Ser,1.880149,1.051681,1.977170,0,0.910847,0.983545,1.966981,0.597364,1.344182,1.048759,0.295422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20915,0.000028,0.000020,0.000004,0.000002,Met10Val,His23Tyr,0.717027,0.198243,0.365540,0,-0.479901,-2.814557,-4.266457,-1.513403,0.311210,0.490448,-0.179237
20916,0.000044,0.000057,0.000050,0.000057,Ala6Gly,Gln17Arg,1.289555,0.882953,1.145434,0,0.366874,0.187282,0.383176,0.096994,0.950236,1.225103,-0.274867
20917,0.000021,0.000010,0.000009,0.000004,Ala6Pro,Thr29Met,0.470184,0.868802,0.472412,0,-1.088703,-1.291604,-2.373487,-0.732336,0.534784,0.681421,-0.146637
20918,0.000036,0.000029,0.000007,0.000005,Lys12Glu,Arg34Cys,0.805632,0.230862,0.672774,0,-0.311807,-2.426704,-2.998511,-1.111043,0.411317,0.561259,-0.149943


In [38]:
EPISTASIS_FILE = './rep2_-3.txt'
INDEX_FILE     = './index_matrix.csv'
SEQUENCE       = "DVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPR"
AA  = sorted(['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '*'])

with open(EPISTASIS_FILE) as f:
    content = f.readlines()
content = [float(x.strip()) for x in content]
df_index = pd.read_csv(INDEX_FILE, header=None)
df_index.rename(columns = {0 : 'state_1', 1 : 'state_2', 2 : 'mat_index'}, inplace = True)
index_list = df_index['mat_index'].tolist()
selection_list = [content[i] for i in index_list]
df_index['selection_coefficients'] = selection_list
df_index['site_1'] = df_index['state_1']/21 + 2
df_index['site_1'] = df_index['site_1'].astype(int)
df_index['AA_1']   = df_index['state_1']%21
df_index['AA_1']   = [AA[x] for x in df_index['AA_1'].tolist()]
df_index['site_2'] = df_index['state_2']/21 + 2
df_index['site_2'] = df_index['site_2'].astype(int)
df_index['AA_2']   = df_index['state_2']%21
df_index['AA_2']   = [AA[x] for x in df_index['AA_2'].tolist()]

cols = ['site_1', 'AA_1', 'site_2', 'AA_2']
df_index['all_variant'] = df_index[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis = 1)

WT_list = []; axis_1 = []; axis_2 = []
for i in range(len(SEQUENCE)):
    WT_list.append(i * 21 + AA.index(SEQUENCE[i]))
df_WT_epi = df_index[(~df_index['state_1'].isin(WT_list))&(~df_index['state_2'].isin(WT_list))&(df_index['state_2'] != df_index['state_1'])]
df_WT_epi = df_WT_epi.sort_values(['site_1', 'site_2'])

df_WT_epi['absolute_s'] = np.abs(df_WT_epi['selection_coefficients'])
df_WT_epi = df_WT_epi.groupby(['site_1', 'site_2'], as_index=False)['absolute_s'].agg('sum')
df_WT_epi['site_1'] += 8
df_WT_epi['site_2'] += 8
df_WT_epi_ = df_WT_epi.copy()
df_WT_epi_ = df_WT_epi_.rename(columns={"site_1":"site_2", "site_2":"site_1"})
df_WT_epi  = df_WT_epi.append(df_WT_epi_, ignore_index=True, sort=False)
flights = df_WT_epi.pivot("site_1", "site_2", "absolute_s")


df_index['site_1'] = df_index['site_1']-1
df_index['site_2'] = df_index['site_2']-1
df_index

df_index.loc[df_index['all_variant']=='']

double_count_table_ = double_count_table.iloc[:-1]
double_count_table_['site_1'] = [int(i[3:-3])+1 for i in double_count_table_['mutant1'].tolist() ]
double_count_table_['site_2'] = [int(i[3:-3])+1 for i in double_count_table_['mutant2'].tolist() ]
double_count_table_['AA_1'] = [Amino_acid_dict[i[-3:]] for i in double_count_table_['mutant1'].tolist()]
double_count_table_['AA_2'] = [Amino_acid_dict[i[-3:]] for i in double_count_table_['mutant2'].tolist()]
double_count_table_['all_variant'] = [str(i)+'_'+str(j)+'_'+str(k)+'_'+str(l) for i,j,k,l in zip(double_count_table_['site_1'], double_count_table_['AA_1'],double_count_table_['site_2'],double_count_table_['AA_2'])]
double_count_table_['epistasis_paper'] = double_count_table_['function_score'] - double_count_table_['predicted_score']

pd.set_option('display.max_columns', 500)
df_merge = pd.merge(double_count_table_, df_index, how="inner", on=["all_variant"])
df_merge = df_merge.rename(columns={'selection_coefficients':'epistasis_MPL'})
df_merge

df_merge['mu1_single'] = [str(i)+'_'+str(j)+'_'+str(i)+'_'+str(j) for i,j in zip(df_merge['site_1_x'], df_merge['AA_1_x'])]
df_merge['mu2_single'] = [str(i)+'_'+str(j)+'_'+str(i)+'_'+str(j) for i,j in zip(df_merge['site_2_x'], df_merge['AA_2_x'])]
df_temp = df_index[['all_variant', 'selection_coefficients']]
df_temp = df_temp.rename(columns={'all_variant':'mu1_single'})
df_merge = pd.merge(df_temp, df_merge, how="inner", on=["mu1_single"])
df_merge = df_merge.rename(columns={'selection_coefficients':'selection_coefficients_mut1'})
df_temp = df_index[['all_variant', 'selection_coefficients']]
df_temp = df_temp.rename(columns={'all_variant':'mu2_single'})
df_merge = pd.merge(df_temp, df_merge, how="inner", on=["mu2_single"])
df_merge = df_merge.rename(columns={'selection_coefficients':'selection_coefficients_mut2', 
                                    0:"freq_gen0", 1:"freq_gen1", 2:"freq_gen2", 3:"freq_gen3",
                                    "site_1_y": "site_1", "AA_1_y": "AA_1", "site_2_y": "site_2", "AA_2_y": "AA_2"})
df_merge = df_merge.drop(["mu2_single", "mu1_single", "all_variant",
                          "site_1_x", "site_2_x", "AA_1_x", "AA_2_x", 
                          "state_1", "state_2", "mat_index"], axis=1)

df_merge2 = df_merge.copy()
df_merge2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,selection_coefficients_mut2,selection_coefficients_mut1,freq_gen0,freq_gen1,freq_gen2,freq_gen3,mutant1,mutant2,NSCOR_ratio_1,NSCOR_ratio_2,NSCOR_ratio_3,projected_0,projected_1,projected_2,projected_3,slope_v,function_score,predicted_score,epistasis_paper,epistasis_MPL,site_1,AA_1,site_2,AA_2
0,0.010166,0.015774,0.000036,0.000021,0.000017,0.000009,Pro33Thr,Arg34Ser,0.570121,0.832374,0.496036,0,-0.810660,-1.075355,-2.086838,-0.652521,0.565204,0.298245,0.266959,0.006452,33,T,34,S
1,0.010166,0.023210,0.000041,0.000026,0.000010,0.000004,Pro33Ser,Arg34Ser,0.629980,0.371744,0.388080,0,-0.666622,-2.094239,-3.459813,-1.180705,0.391927,0.302289,0.089639,0.008476,33,S,34,S
2,0.010166,0.029457,0.000031,0.000032,0.000012,0.000009,Pro33Arg,Arg34Ser,1.042982,0.381607,0.692099,0,0.060715,-1.329127,-1.860077,-0.697007,0.548041,0.374377,0.173664,0.005866,33,R,34,S
3,0.010166,0.012607,0.000031,0.000019,0.000013,0.000006,Pro33Gln,Arg34Ser,0.587681,0.676860,0.489511,0,-0.766894,-1.329964,-2.360550,-0.764472,0.523003,0.307563,0.215440,0.009286,33,Q,34,S
4,0.010166,0.019384,0.000025,0.000025,0.000014,0.000004,Pro33Leu,Arg34Ser,1.008433,0.557160,0.276604,0,0.012115,-0.831722,-2.685829,-0.890132,0.479377,0.289323,0.190054,0.009317,33,L,34,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13608,0.011916,-0.019060,0.000016,0.000022,0.000033,0.000049,Val2Asp,Ile24Met,1.385272,1.501791,1.502139,0,0.470170,1.056854,1.643872,0.551830,1.302419,1.691565,-0.389146,-0.002562,2,D,24,M
13609,0.011916,0.005118,0.000018,0.000027,0.000022,0.000024,Lys12Gln,Ile24Met,1.532236,0.797682,1.090429,0,0.615638,0.289524,0.414419,0.091714,0.946765,1.051833,-0.105068,0.000590,12,Q,24,M
13610,0.011916,0.005851,0.000032,0.000027,0.000019,0.000009,Pro5Gln,Ile24Met,0.840908,0.697999,0.483513,0,-0.249981,-0.768684,-1.817058,-0.596988,0.587384,0.811246,-0.223862,-0.000969,5,Q,24,M
13611,0.011916,-0.004124,0.000017,0.000020,0.000026,0.000025,Met10Lys,Ile24Met,1.201157,1.280280,0.963520,0,0.264425,0.620884,0.567271,0.205827,1.024693,1.223469,-0.198777,-0.007611,10,K,24,M


In [39]:
df_merge1

Unnamed: 0,selection_coefficients_mut2,selection_coefficients_mut1,freq_gen0,freq_gen1,freq_gen2,freq_gen3,mutant1,mutant2,NSCOR_ratio_1,NSCOR_ratio_2,NSCOR_ratio_3,projected_0,projected_1,projected_2,projected_3,slope_v,function_score,predicted_score,epistasis_paper,epistasis_MPL,site_1,AA_1,site_2,AA_2
0,0.012913,0.020034,0.000020,0.000031,0.000007,4.335804e-06,Pro33Thr,Arg34Ser,1.570969,0.230539,0.581999,0,0.651654,-1.465264,-2.246177,-0.885545,0.476810,0.287980,0.188829,0.010507,33,T,34,S
1,0.012913,0.027446,0.000033,0.000031,0.000007,2.167902e-06,Pro33Ser,Arg34Ser,0.937506,0.230539,0.287805,0,-0.093100,-2.210018,-4.006852,-1.413747,0.330628,0.290811,0.039818,0.009983,33,S,34,S
2,0.012913,0.032836,0.000034,0.000020,0.000017,8.671607e-06,Pro33Arg,Arg34Ser,0.570753,0.829437,0.516622,0,-0.809060,-1.078856,-2.031674,-0.636482,0.566657,0.368164,0.198493,0.002757,33,R,34,S
3,0.012913,0.018224,0.000025,0.000027,0.000011,4.877779e-06,Pro33Gln,Arg34Ser,1.107283,0.396138,0.434902,0,0.147024,-1.188899,-2.390137,-0.850633,0.488489,0.302949,0.185540,0.009727,33,Q,34,S
4,0.012913,0.024075,0.000030,0.000020,0.000019,5.419755e-07,Pro33Leu,Arg34Ser,0.657119,0.923318,0.023032,0,-0.605774,-0.720875,-6.161110,-1.859843,0.242690,0.274872,-0.032182,0.006959,33,L,34,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13629,0.012447,-0.022180,0.000027,0.000018,0.000043,5.040372e-05,Val2Asp,Ile24Met,0.664157,2.397807,1.183176,0,-0.590404,0.671311,0.913976,0.400364,1.162632,1.756747,-0.594116,-0.014236,2,D,24,M
13630,0.012447,0.006328,0.000031,0.000027,0.000024,2.818272e-05,Lys12Gln,Ile24Met,0.885405,0.876377,1.170385,0,-0.175590,-0.365966,-0.138983,-0.060732,0.844575,1.050915,-0.206340,-0.003648,12,Q,24,M
13631,0.012447,0.006867,0.000034,0.000022,0.000017,7.587656e-06,Pro5Gln,Ile24Met,0.634391,0.748967,0.451246,0,-0.656556,-1.073582,-2.221595,-0.708181,0.539184,0.794182,-0.254998,-0.004278,5,Q,24,M
13632,0.012447,-0.003631,0.000015,0.000023,0.000017,2.438890e-05,Met10Lys,Ile24Met,1.613285,0.714219,1.464578,0,0.690001,0.204439,0.754924,0.177921,0.996506,1.220758,-0.224251,-0.003775,10,K,24,M


In [53]:
df1

Unnamed: 0,site_1,AA_1,site_2,AA_2,epistasis_MPL
0,33,T,34,S,0.010507
1,33,S,34,S,0.009983
2,33,R,34,S,0.002757
3,33,Q,34,S,0.009727
4,33,L,34,S,0.006959
...,...,...,...,...,...
13629,2,D,24,M,-0.014236
13630,12,Q,24,M,-0.003648
13631,5,Q,24,M,-0.004278
13632,10,K,24,M,-0.003775


In [73]:
df_epistasis = pd.DataFrame(columns = ["site1", "amino_acid1", 
                                       "site2", "amino_acid2",
                                       "rep_1", "rep_2", "joint"])

df1 = df_merge1[["site_1", "AA_1", "site_2", "AA_2", "epistasis_MPL"]].copy()
df2 = df_merge2[["site_1", "AA_1", "site_2", "AA_2", "epistasis_MPL"]].copy()
df_merged_selection = df1.merge(df2, on=["site_1", "AA_1", "site_2", "AA_2"])
df_merged_selection
df_epistasis["site1"] = df_merged_selection["site_1"]
df_epistasis["site2"] = df_merged_selection["site_2"]
df_epistasis["amino_acid1"] = df_merged_selection["AA_1"]
df_epistasis["amino_acid2"] = df_merged_selection["AA_2"]
df_epistasis["rep_1"] = df_merged_selection["epistasis_MPL_x"]
df_epistasis["rep_2"] = df_merged_selection["epistasis_MPL_y"]
df_epistasis.to_csv("../../output/epistasis/YAP1_popDMS.csv")

df_epistasis



Unnamed: 0,site1,amino_acid1,site2,amino_acid2,rep_1,rep_2,joint
0,33,T,34,S,0.010507,0.006452,
1,33,S,34,S,0.009983,0.008476,
2,33,R,34,S,0.002757,0.005866,
3,33,Q,34,S,0.009727,0.009286,
4,33,L,34,S,0.006959,0.009317,
...,...,...,...,...,...,...,...
13105,2,D,24,M,-0.014236,-0.002562,
13106,12,Q,24,M,-0.003648,0.000590,
13107,5,Q,24,M,-0.004278,-0.000969,
13108,10,K,24,M,-0.003775,-0.007611,


In [75]:
df_epistasis = pd.DataFrame(columns = ["site1", "amino_acid1", 
                                       "site2", "amino_acid2",
                                       "rep_1", "rep_2", "average"])

df1 = df_merge1[["site_1", "AA_1", "site_2", "AA_2", "epistasis_paper"]].copy()
df2 = df_merge2[["site_1", "AA_1", "site_2", "AA_2", "epistasis_paper"]].copy()
df_merged_selection = df1.merge(df2, on=["site_1", "AA_1", "site_2", "AA_2"])
df_merged_selection
df_epistasis["site1"] = df_merged_selection["site_1"]
df_epistasis["site2"] = df_merged_selection["site_2"]
df_epistasis["amino_acid1"] = df_merged_selection["AA_1"]
df_epistasis["amino_acid2"] = df_merged_selection["AA_2"]
df_epistasis["rep_1"] = df_merged_selection["epistasis_paper_x"]
df_epistasis["rep_2"] = df_merged_selection["epistasis_paper_y"]
df_epistasis["average"] = (df_epistasis["rep_1"]+df_epistasis["rep_2"])/2
df_epistasis.to_csv("../../output/epistasis/YAP1_prference.csv")

df_epistasis



Unnamed: 0,site1,amino_acid1,site2,amino_acid2,rep_1,rep_2,average
0,33,T,34,S,0.188829,0.266959,0.227894
1,33,S,34,S,0.039818,0.089639,0.064728
2,33,R,34,S,0.198493,0.173664,0.186078
3,33,Q,34,S,0.185540,0.215440,0.200490
4,33,L,34,S,-0.032182,0.190054,0.078936
...,...,...,...,...,...,...,...
13105,2,D,24,M,-0.594116,-0.389146,-0.491631
13106,12,Q,24,M,-0.206340,-0.105068,-0.155704
13107,5,Q,24,M,-0.254998,-0.223862,-0.239430
13108,10,K,24,M,-0.224251,-0.198777,-0.211514


In [62]:
for i in range(df_epistasis.shape[0]):
    site = df_epistasis.at[i, "site1"]
    if df_epistasis.at[i, "amino_acid1"] == SEQUENCE[site-1]:
        print()
        df_epistasis.at[i, "WT_indicator1"] = True
    else:
        df_epistasis.at[i, "WT_indicator1"] = False
    site = df_epistasis.at[i, "site2"]
    if df_epistasis.at[i, "amino_acid2"] == SEQUENCE[site-1]:
        df_epistasis.at[i, "WT_indicator2"] = True
    else:
        df_epistasis.at[i, "WT_indicator2"] = False
df_epistasis

Unnamed: 0,site1,amino_acid1,WT_indicator1,site2,amino_acid2,WT_indicator2,rep_1,rep_2,joint
0,33,T,False,34,S,False,0.010507,0.006452,
1,33,S,False,34,S,False,0.009983,0.008476,
2,33,R,False,34,S,False,0.002757,0.005866,
3,33,Q,False,34,S,False,0.009727,0.009286,
4,33,L,False,34,S,False,0.006959,0.009317,
...,...,...,...,...,...,...,...,...,...
13105,2,D,False,24,M,False,-0.014236,-0.002562,
13106,12,Q,False,24,M,False,-0.003648,0.000590,
13107,5,Q,False,24,M,False,-0.004278,-0.000969,
13108,10,K,False,24,M,False,-0.003775,-0.007611,


In [79]:
df_epistasis["site2"].min()

2