In [1]:
import pandas as pd
import popDMS
import numpy as np
import scipy.stats as st
# One-letter codes of the 20 amino acids. The transform/merge helpers in this notebook
# select preference columns with `df[AA]`, so this order fixes the amino-acid order
# within each site of every long-form table they produce.
AA  = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I','L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']


# HIV Env BG505, HIV Env BF520, HIV Env BG505 FP16, FP20, VRC34

In [2]:
def transform_FP_VRC(index_file, pref_file, rep_num):
    """Load one replicate's per-mutation table and renumber its sites.

    Parameters
    ----------
    index_file : str
        Comma-delimited table with the columns ``original`` and ``new``.
    pref_file : str
        CSV containing at least ``site``, ``wildtype``, ``mutation`` and
        ``mutdiffsel`` columns.
    rep_num : str
        Name given to the ``mutdiffsel`` column in the result (e.g. ``'rep_1'``).

    Returns
    -------
    pandas.DataFrame
        The rows of ``pref_file`` without the ``wildtype`` column and without
        rows holding missing values. ``site`` is replaced by the integer
        ``original`` label whose ``new`` label equals the old site,
        ``mutation`` is renamed to ``amino_acid`` and ``mutdiffsel`` to
        ``rep_num``.

    Notes
    -----
    Sites that have no match in the ``new`` column are not dropped here, so
    the final integer cast is expected to fail for them.
    """
    numbering = pd.read_csv(index_file, delimiter=',')
    # Both label columns are compared as strings, so numeric and
    # non-numeric site labels are matched the same way.
    numbering['original'] = numbering['original'].astype('string')
    numbering['new'] = numbering['new'].astype('string')
    new_to_original = numbering.set_index('new')['original']

    table = pd.read_csv(pref_file).drop('wildtype', axis=1).dropna()
    table['site'] = table['site'].astype('string')
    table['site'] = table['site'].map(new_to_original)
    table['site'] = table['site'].astype('int')
    return table.rename(columns={'mutation': 'amino_acid', 'mutdiffsel': rep_num})

# FP16, FP20 and VRC34 share one numbering table and the same two-replicate
# recipe, so the targets are listed once and processed in a loop.
index_file = './data/prefs/BG505_to_HXB2_numbering.txt'
merge_keys = ['site', 'amino_acid']

for Target_name, rep_files in (
    # FP16
    ('HIV bnAbs FP16', ('./data/prefs/FP16-02-500ug-rep-1.csv',
                        './data/prefs/FP16-02-500ug-rep-3.csv')),
    # FP20
    ('HIV bnAbs FP20', ('./data/prefs/FP20-01-500ug-rep-1.csv',
                        './data/prefs/FP20-01-500ug-rep-3.csv')),
    # VRC34
    ('HIV bnAbs VRC34', ('./data/prefs/VRC34-33ug-rep-1.csv',
                         './data/prefs/VRC34-33ug-rep-3.csv')),
):
    # The input files are numbered rep-1 and rep-3; in the merged table they
    # are labelled rep_1 and rep_2.
    pref_file, rep_num = rep_files[0], 'rep_1'
    df1 = transform_FP_VRC(index_file, pref_file, rep_num)

    pref_file, rep_num = rep_files[1], 'rep_2'
    df2 = transform_FP_VRC(index_file, pref_file, rep_num)

    # Inner join on (site, amino_acid), then average the replicates and order by site.
    df_merge = pd.merge(df1, df2, on=merge_keys)
    df_merge['average'] = df_merge[['rep_1', 'rep_2']].mean(axis=1)
    df_merge = df_merge.sort_values('site')
    df_merge.to_csv('./outputs/merged_preference/' + Target_name + '.csv.gz', index=False, compression='gzip')


# HIV Env BG505, HIV Env BF520, BF520 human host, rhesus host

In [3]:
def transform_host(index_file, pref_file, rep_num):
    """Turn one wide preference table into long form with renumbered sites.

    Parameters
    ----------
    index_file : str
        Comma-delimited table with the columns ``original`` and ``new``.
    pref_file : str
        CSV with a ``site`` column and one column per entry of ``AA``.
    rep_num : str
        Name of the value column in the result (e.g. ``'rep_1'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``site``, ``amino_acid`` and ``rep_num``; 20 rows per kept
        site, with amino acids in ``AA`` order. Rows with missing values are
        dropped, and so are sites that have no match in the ``new`` column.
    """
    numbering = pd.read_csv(index_file, delimiter=',')
    numbering['original'] = numbering['original'].astype('string')
    numbering['new'] = numbering['new'].astype('string')
    new_to_original = numbering.set_index('new')['original']

    wide = pd.read_csv(pref_file).dropna()
    wide['site'] = wide['site'].astype('string')
    wide['site'] = wide['site'].map(new_to_original)
    # Unmatched sites become missing after the lookup; discard them before the cast.
    wide = wide.dropna()
    wide['site'] = wide['site'].astype('int')

    sites = wide['site'].tolist()
    # Row-major flattening of wide[AA] yields, per site, the values in AA order,
    # which lines up with the repeated site / cycled amino-acid columns.
    return pd.DataFrame({
        'site': [s for s in sites for _ in range(20)],
        'amino_acid': [aa for _ in sites for aa in AA],
        rep_num: wide[AA].values.flatten(),
    })

merge_keys = ['site', 'amino_acid']

# BG505 and BF520: three replicates each, every strain with its own numbering table.
for Target_name, index_file, rep_files in (
    # BG505
    ('HIV BG505', './data/prefs/BG505_to_HXB2_numbering.txt',
     ('./data/prefs/HIV Env BG505-1_prefs.csv',
      './data/prefs/HIV Env BG505-2_prefs.csv',
      './data/prefs/HIV Env BG505-3_prefs.csv')),
    # BF520
    ('HIV BF520', './data/prefs/BF520c2_to_HXB2.csv',
     ('./data/prefs/HIV Env BF520-1_prefs.csv',
      './data/prefs/HIV Env BF520-2_prefs.csv',
      './data/prefs/HIV Env BF520-3_prefs.csv')),
):
    pref_file, rep_num = rep_files[0], 'rep_1'
    df1 = transform_host(index_file, pref_file, rep_num)

    pref_file, rep_num = rep_files[1], 'rep_2'
    df2 = transform_host(index_file, pref_file, rep_num)

    pref_file, rep_num = rep_files[2], 'rep_3'
    df3 = transform_host(index_file, pref_file, rep_num)

    # Inner joins keep only (site, amino_acid) pairs present in all three replicates.
    df_merge = pd.merge(df1, df2, on=merge_keys)
    df_merge = pd.merge(df_merge, df3, on=merge_keys)
    df_merge['average'] = df_merge[['rep_1', 'rep_2', 'rep_3']].mean(axis=1)
    df_merge = df_merge.sort_values('site')
    df_merge.to_csv('./outputs/merged_preference/' + Target_name + '.csv.gz', index=False, compression='gzip')

# BF520 human host and rhesus host: two replicates each, both using the BF520 numbering table.
# `df3` is not reassigned below and keeps the third BF520 replicate.
index_file = './data/prefs/BF520c2_to_HXB2.csv'
for Target_name, rep_files in (
    # BF520 human host
    ('HIV BF520 human host',
     ('./data/prefs/HIV BF520 human host-1_prefs.csv',
      './data/prefs/HIV BF520 human host-2_prefs.csv')),
    # BF520 rhesus host
    ('HIV BF520 rhesus host',
     ('./data/prefs/HIV BF520 rhesus host-1_prefs.csv',
      './data/prefs/HIV BF520 rhesus host-2_prefs.csv')),
):
    pref_file, rep_num = rep_files[0], 'rep_1'
    df1 = transform_host(index_file, pref_file, rep_num)

    pref_file, rep_num = rep_files[1], 'rep_2'
    df2 = transform_host(index_file, pref_file, rep_num)

    df_merge = pd.merge(df1, df2, on=merge_keys)
    df_merge['average'] = df_merge[['rep_1', 'rep_2']].mean(axis=1)
    df_merge = df_merge.sort_values('site')
    df_merge.to_csv('./outputs/merged_preference/' + Target_name + '.csv.gz', index=False, compression='gzip')


# PR8, Aichi68C

In [3]:
def preference_merge(REP, Target_name, count_file):
    """Combine per-replicate preference tables into one long-form gzip CSV.

    Reads ``./data/prefs/<Target_name>-<k>_prefs.csv`` for ``k = 1 .. REP``,
    takes the columns listed in ``AA`` from each, and writes
    ``./outputs/merged_preference/<Target_name>.csv.gz`` with the columns
    ``site``, ``amino_acid``, ``rep_1`` .. ``rep_<REP>`` and ``average``.

    Parameters
    ----------
    REP : int
        Number of replicate files to merge. Any positive count is supported;
        the file paths are generated from it.
    Target_name : str
        Prefix of the preference files and stem of the output file.
    count_file : str
        CSV whose ``site`` column supplies the site labels, in row order.

    Returns
    -------
    None
        The merged table is only written to disk.

    Notes
    -----
    Preference rows are paired with ``count_file`` sites purely by position;
    any ``site`` column inside the preference files is not consulted. If a
    preference file does not have exactly one row per site, pandas raises on
    the column assignment.
    """
    replicate_ids = range(1, REP + 1)
    rep_list = ['rep_' + str(k) for k in replicate_ids]
    # Derive the paths from REP instead of a fixed three-entry list, so that
    # REP > 3 no longer fails with an IndexError.
    pref_list = ['./data/prefs/' + Target_name + '-' + str(k) + '_prefs.csv' for k in replicate_ids]

    site_list = pd.read_csv(count_file)['site'].tolist()

    # One row per (site, amino acid): each site repeated len(AA) times, AA cycled per site.
    df_pref_merged = pd.DataFrame({
        'site': [site for site in site_list for _ in AA],
        'amino_acid': AA * len(site_list),
    })

    for column, path in zip(rep_list, pref_list):
        # Row-major flattening of the AA columns matches the row layout built above.
        df_pref_merged[column] = pd.read_csv(path)[AA].values.flatten()

    df_pref_merged['average'] = df_pref_merged[rep_list].mean(axis=1)
    df_pref_merged.to_csv('./outputs/merged_preference/' + Target_name + '.csv.gz', index=False, compression='gzip')

# PR8 is merged from three replicate files and Aichi68C from two; site labels
# come from each target's DNA codon-count table.
for REP, Target_name, count_file in (
    (3, 'PR8', './data/raw_data/PR8_DNA_codoncounts.csv'),
    (2, 'Aichi68C', './data/raw_data/Aichi68C_DNA_codoncounts.csv'),
):
    preference_merge(REP, Target_name, count_file)


# Flu WSN, A549, CCL141

In [29]:
def transform_WSN(df, rep_num, WSN):
    """Convert a wide per-site preference table into long form.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with one row per site. When ``WSN`` is false it must
        already have a ``site`` column and one column per entry of ``AA``.
        When ``WSN`` is true it must also contain ``WT_AA`` and
        ``SITE_ENTROPY`` columns, which are discarded.
    rep_num : str
        Name of the value column in the result (e.g. ``'rep_1'``).
    WSN : bool
        If true, drop the two extra columns and normalise the headers.

    Returns
    -------
    pandas.DataFrame
        Columns ``site``, ``amino_acid`` and ``rep_num`` with ``len(AA)`` rows
        per input row, amino acids in ``AA`` order. The caller's ``df`` is not
        modified.
    """
    if WSN:
        df = df.drop(['WT_AA', 'SITE_ENTROPY'], axis=1)
        # `set_axis(..., inplace=True)` was removed in pandas 2.0, so rebind the
        # returned frame instead. The first remaining column becomes 'site' and
        # every other header is reduced to its 4th character (e.g. 'PI_A' -> 'A').
        # NOTE(review): presumably the input headers look like 'PI_<aa>'; confirm.
        df = df.set_axis(['site'] + [label[3] for label in df.columns[1:]], axis=1)

    sites = df['site'].tolist()
    # Row-major flattening of df[AA] gives, per site, the values in AA order,
    # matching the repeated site / cycled amino-acid columns.
    return pd.DataFrame({
        'site': [s for s in sites for _ in AA],
        'amino_acid': AA * len(sites),
        rep_num: df[AA].values.flatten(),
    })

merge_keys = ['site', 'amino_acid']
rep_labels = ['rep_1', 'rep_2', 'rep_3']

# Each entry: target name, field delimiter, whether the WSN header clean-up
# is needed, and the replicate files in order.
for Target_name, field_sep, needs_header_cleanup, rep_paths in (
    ('WSN', '\t', True,
     ('./data/prefs/WSN-1_prefs.txt',
      './data/prefs/WSN-2_prefs.txt',
      './data/prefs/WSN-3_prefs.txt')),
    ('A549', ',', False,
     ('./data/prefs/A549-1_prefs.csv',
      './data/prefs/A549-2_prefs.csv')),
    ('CCL141', ',', False,
     ('./data/prefs/CCL141-1_prefs.csv',
      './data/prefs/CCL141-2_prefs.csv',
      './data/prefs/CCL141-3_prefs.csv')),
):
    rep_frames = []
    for rep_num, rep_path in zip(rep_labels, rep_paths):
        df = pd.read_csv(rep_path, delimiter=field_sep)
        rep_frames.append(transform_WSN(df, rep_num, needs_header_cleanup))

    # Inner-join the replicates one after another on (site, amino_acid).
    df_merge = pd.merge(rep_frames[0], rep_frames[1], on=merge_keys)
    for extra_frame in rep_frames[2:]:
        df_merge = pd.merge(df_merge, extra_frame, on=merge_keys)

    df_merge['average'] = df_merge[rep_labels[:len(rep_frames)]].mean(axis=1)
    df_merge = df_merge.sort_values('site')
    df_merge.to_csv('./outputs/merged_preference/' + Target_name + '.csv.gz', index=False, compression='gzip')

# Keep the per-replicate names bound to the last target's frames (CCL141, three
# replicates), in case other cells still read them.
df1, df2, df3 = rep_frames


In [None]:
# Merge the three Matrix_M1 replicate preference files. Site labels are read from the
# HomM1 codon-count table (note that its file prefix differs from Target_name).
REP=3
Target_name = 'Matrix_M1'
count_file = './data/raw_data/HomM1_DNA_codoncounts.csv'
preference_merge(REP, Target_name, count_file)

In [None]:
# Merge the MS replicate preference files numbered 1 and 2; site labels come from
# the MS codon-count table.
REP=2
Target_name = 'MS'
count_file = './data/raw_data/MS_DNA_codoncounts.csv'
preference_merge(REP, Target_name, count_file)

In [None]:
# Merge the MxA replicate preference files numbered 1 and 2; site labels come from
# the MxA codon-count table.
REP=2
Target_name = 'MxA'
count_file = './data/raw_data/MxA_DNA_codoncounts.csv'
preference_merge(REP, Target_name, count_file)

In [None]:
# Merge the MxAneg replicate preference files numbered 1 and 2; site labels come from
# the MxAneg codon-count table.
REP=2
Target_name = 'MxAneg'
count_file = './data/raw_data/MxAneg_DNA_codoncounts.csv'
preference_merge(REP, Target_name, count_file)

In [25]:
# Preview the A549 selection-coefficient table. This file lives under
# outputs/selection_coefficients and is not written by any cell shown in this notebook.
pd.read_csv('./outputs/selection_coefficients/A549.csv.gz')

Unnamed: 0,site,amino_acid,rep_1,rep_2,rep_3,joint
0,1,A,-0.069898,-0.048427,-0.058717,-0.103474
1,1,R,-0.085384,-0.069942,-0.076981,-0.136801
2,1,N,-0.036199,-0.032016,-0.022438,-0.058073
3,1,D,-0.019406,-0.018494,-0.022437,-0.036630
4,1,C,-0.025904,-0.017354,-0.032003,-0.048657
...,...,...,...,...,...,...
15955,760,T,-0.058473,-0.061532,-0.075577,-0.121429
15956,760,W,0.039764,0.032528,0.044873,0.171268
15957,760,Y,-0.056840,-0.057206,-0.034837,-0.087070
15958,760,V,-0.065433,-0.058801,-0.054878,-0.099144


In [37]:
# Preview the merged CCL141 preference table written by the WSN / A549 / CCL141 cell.
pd.read_csv('./outputs/merged_preference/CCL141.csv.gz')

Unnamed: 0,site,amino_acid,rep_1,rep_2,rep_3,average
0,1,A,0.001849,0.001237,0.005409,0.002832
1,1,V,0.001937,0.004136,0.004994,0.003689
2,1,Y,0.000254,0.001184,0.004126,0.001855
3,1,W,0.001238,0.000548,0.001958,0.001248
4,1,T,0.003965,0.009419,0.006840,0.006741
...,...,...,...,...,...,...
15175,759,R,0.000019,0.000532,0.000119,0.000224
15176,759,A,0.000006,0.002508,0.000096,0.000870
15177,759,Y,0.000211,0.000415,0.000615,0.000414
15178,759,H,0.000466,0.003033,0.000244,0.001248


In [24]:
# Preview the CCL141 DNA codon-count table.
# NOTE: this rebinds the global `count_file` used by the preference_merge cells.
count_file = './data/raw_data/CCL141_DNA_codoncounts.csv'
pd.read_csv(count_file)

Unnamed: 0,site,wildtype,AAA,AAC,AAG,AAT,ACA,ACC,ACG,ACT,...,TCG,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT
0,1,ATG,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GAG,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,AGA,12,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,ATA,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,AAA,153619,2,34,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,756,ATG,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
756,757,GCC,0,0,0,0,0,10,0,0,...,0,0,0,0,0,0,0,0,0,0
757,758,ATC,0,0,0,0,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
758,759,AAT,0,0,0,155651,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Preview the first A549 replicate preference table.
# NOTE(review): `count_file` is reused here for a prefs path, not a codon-count file.
count_file = './data/prefs/A549-1_prefs.csv'
pd.read_csv(count_file)



Unnamed: 0,site,A,C,D,E,F,G,H,I,K,...,M,N,P,Q,R,S,T,V,W,Y
0,1,0.000079,0.000181,0.002725,0.000253,0.000394,4.276792e-05,0.000068,0.004258,0.003104,...,0.982066,0.000086,0.000142,0.000109,0.000080,0.000547,0.002523,0.001803,0.000407,0.000251
1,2,0.000178,0.001716,0.004248,0.141090,0.000032,2.855230e-03,0.005050,0.004157,0.000697,...,0.790964,0.030666,0.000016,0.000017,0.000007,0.000086,0.000018,0.002022,0.001590,0.014586
2,3,0.003569,0.003252,0.003879,0.000150,0.001131,1.682079e-02,0.000149,0.013531,0.039286,...,0.062661,0.015303,0.000009,0.044659,0.744626,0.004987,0.000177,0.004888,0.003137,0.035286
3,4,0.006393,0.004169,0.000111,0.001639,0.000299,2.405719e-02,0.000587,0.186632,0.002530,...,0.011342,0.020429,0.000057,0.033278,0.000143,0.000138,0.686929,0.001594,0.001985,0.000478
4,5,0.006295,0.001481,0.000048,0.010391,0.612576,7.799074e-05,0.004788,0.054126,0.078485,...,0.002668,0.009370,0.000077,0.003633,0.001020,0.157425,0.019032,0.002223,0.001943,0.000505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
754,755,0.000040,0.000015,0.000047,0.000016,0.000058,2.886788e-05,0.000026,0.000010,0.000035,...,0.000120,0.000026,0.000033,0.000275,0.996387,0.000002,0.000008,0.000004,0.001017,0.000067
755,756,0.004510,0.000463,0.000342,0.003837,0.000227,3.124086e-04,0.000216,0.006927,0.041520,...,0.812531,0.001617,0.000024,0.000535,0.023372,0.000056,0.099686,0.003133,0.000062,0.000077
756,757,0.899900,0.000126,0.005277,0.002322,0.000043,5.008420e-02,0.000124,0.000986,0.000167,...,0.002183,0.001140,0.000033,0.008763,0.000570,0.006005,0.013399,0.007808,0.000373,0.000659
757,758,0.000011,0.000008,0.000025,0.000117,0.000176,7.820674e-07,0.000007,0.136557,0.459254,...,0.189562,0.000465,0.000003,0.000029,0.082583,0.126735,0.003068,0.000058,0.000861,0.000010
