In [1]:
import pandas as pd
import popDMS
import numpy as np
import scipy.stats as st
# One-letter amino-acid codes. The order matters: preference columns are
# selected with df[AA] and flattened row by row in exactly this order.
AA = list('ARNDCEQGHILKMFPSTWYV')


# HIV Env BG505, HIV Env BF520, HIV Env BG505 FP16, FP20, VRC34

In [2]:
def transform_FP_VRC(index_file, pref_file, rep_num):
    """Load one replicate table and renumber its sites through a lookup file.

    The table in ``pref_file`` loses its 'wildtype' column and every row that
    contains a missing value. Each remaining 'site' is then replaced by the
    'original' entry of ``index_file`` whose 'new' entry matches it (both sides
    are compared as strings) and converted to int. Finally the 'mutation' and
    'mutdiffsel' columns are renamed to 'amino_acid' and ``rep_num``.

    Parameters
    ----------
    index_file : path of a comma-separated file with 'original' and 'new' columns.
    pref_file : path of a CSV file with at least 'site' and 'wildtype' columns.
    rep_num : column label given to the 'mutdiffsel' values (e.g. 'rep_1').

    Returns
    -------
    pandas.DataFrame with the renumbered 'site' column and renamed columns.

    Notes
    -----
    Sites missing from the lookup are not filtered out here; they map to a
    missing value, which the integer conversion is expected to reject.
    """
    numbering = pd.read_csv(index_file, delimiter=',').astype(
        {'original': 'string', 'new': 'string'}
    )
    new_to_original = numbering.set_index('new')['original']

    prefs = pd.read_csv(pref_file).drop(columns='wildtype').dropna()
    renumbered = prefs['site'].astype('string').map(new_to_original).astype('int')

    return prefs.assign(site=renumbered).rename(
        columns={'mutation': 'amino_acid', 'mutdiffsel': rep_num}
    )

# FP16, FP20, VRC34
# The three selections share one numbering table and the same two-replicate
# merge; only the target name and the pair of input files differ.
index_file = './data/prefs/BG505_to_HXB2_numbering.txt'
bnab_inputs = (
    ('HIV bnAbs FP16',
     './data/prefs/FP16-02-500ug-rep-1.csv',
     './data/prefs/FP16-02-500ug-rep-3.csv'),
    ('HIV bnAbs FP20',
     './data/prefs/FP20-01-500ug-rep-1.csv',
     './data/prefs/FP20-01-500ug-rep-3.csv'),
    ('HIV bnAbs VRC34',
     './data/prefs/VRC34-33ug-rep-1.csv',
     './data/prefs/VRC34-33ug-rep-3.csv'),
)

for Target_name, first_file, second_file in bnab_inputs:
    pref_file, rep_num = first_file, 'rep_1'
    df1 = transform_FP_VRC(index_file, pref_file, rep_num)

    # The file named "rep-3" is stored under the 'rep_2' column.
    pref_file, rep_num = second_file, 'rep_2'
    df2 = transform_FP_VRC(index_file, pref_file, rep_num)

    # Keep only (site, amino_acid) pairs present in both replicates, average
    # them, order by site and write the compressed table.
    df_merge = pd.merge(df1, df2, on=['site', 'amino_acid'])
    df_merge = df_merge.assign(
        average=df_merge[['rep_1', 'rep_2']].mean(axis=1)
    ).sort_values('site')
    df_merge.to_csv('./outputs/merged_preference/' + Target_name + '.csv.gz',
                    index=False, compression='gzip')


# HIV Env BF520 human host, rhesus host

In [3]:
def transform_host(index_file, pref_file, rep_num):
    """Load a wide preference table, renumber its sites and return it in long form.

    Rows of ``pref_file`` with any missing value are dropped. Each 'site' is
    replaced by the 'original' entry of ``index_file`` whose 'new' entry matches
    it (compared as strings); rows whose site has no match are dropped, and the
    remaining sites are converted to int. The amino-acid columns listed in the
    module-level ``AA`` are then flattened row by row.

    Parameters
    ----------
    index_file : path of a comma-separated file with 'original' and 'new' columns.
    pref_file : path of a CSV file with a 'site' column and one column per entry of AA.
    rep_num : label of the value column in the result (e.g. 'rep_1').

    Returns
    -------
    pandas.DataFrame with columns 'site', 'amino_acid' and ``rep_num``; every
    site is repeated 20 times, once per amino acid in AA order.
    """
    numbering = pd.read_csv(index_file, delimiter=',').astype(
        {'original': 'string', 'new': 'string'}
    )
    new_to_original = numbering.set_index('new')['original']

    wide = pd.read_csv(pref_file).dropna()
    # Unmatched sites become missing after the lookup and are removed by the
    # second dropna before the integer conversion.
    wide = wide.assign(site=wide['site'].astype('string').map(new_to_original)).dropna()
    wide = wide.assign(site=wide['site'].astype('int'))

    sites = wide['site'].tolist()
    long_form = pd.DataFrame(columns=['site', 'amino_acid', rep_num])
    long_form['site'] = [site for site in sites for _ in range(20)]
    long_form['amino_acid'] = AA * len(sites)
    long_form[rep_num] = wide[AA].values.flatten()
    return long_form

# BG505, BF520, BF520 human host, BF520 rhesus host
# Each entry: (numbering table, output target name, replicate files in order).
host_inputs = (
    ('./data/prefs/BG505_to_HXB2_numbering.txt', 'HIV BG505',
     ('./data/prefs/HIV Env BG505-1_prefs.csv',
      './data/prefs/HIV Env BG505-2_prefs.csv',
      './data/prefs/HIV Env BG505-3_prefs.csv')),
    ('./data/prefs/BF520c2_to_HXB2.csv', 'HIV BF520',
     ('./data/prefs/HIV Env BF520-1_prefs.csv',
      './data/prefs/HIV Env BF520-2_prefs.csv',
      './data/prefs/HIV Env BF520-3_prefs.csv')),
    ('./data/prefs/BF520c2_to_HXB2.csv', 'HIV BF520 human host',
     ('./data/prefs/HIV BF520 human host-1_prefs.csv',
      './data/prefs/HIV BF520 human host-2_prefs.csv')),
    ('./data/prefs/BF520c2_to_HXB2.csv', 'HIV BF520 rhesus host',
     ('./data/prefs/HIV BF520 rhesus host-1_prefs.csv',
      './data/prefs/HIV BF520 rhesus host-2_prefs.csv')),
)

for index_file, Target_name, pref_files in host_inputs:
    # Load every replicate first; zip stops at the number of files supplied.
    rep_names = []
    rep_frames = []
    for rep_num, pref_file in zip(('rep_1', 'rep_2', 'rep_3'), pref_files):
        rep_names.append(rep_num)
        rep_frames.append(transform_host(index_file, pref_file, rep_num))

    # Join the replicates left to right on (site, amino_acid).
    df_merge = rep_frames[0]
    for rep_frame in rep_frames[1:]:
        df_merge = pd.merge(df_merge, rep_frame, on=['site', 'amino_acid'])

    df_merge = df_merge.assign(
        average=df_merge[rep_names].mean(axis=1)
    ).sort_values('site')
    df_merge.to_csv('./outputs/merged_preference/' + Target_name + '.csv.gz',
                    index=False, compression='gzip')


# PR8, Aichi68C

In [3]:
def preference_merge(REP, Target_name, count_file):
    """Merge the replicate preference tables of one target into a single file.

    Site labels are taken from the 'site' column of ``count_file``. For each
    replicate ``i`` in ``1..REP`` the file
    ``./data/prefs/<Target_name>-<i>_prefs.csv`` is read, its amino-acid
    columns (module-level ``AA`` order) are flattened row by row and stored as
    column ``rep_<i>``. The row-wise mean of the replicate columns is stored as
    'average' and the table is written, gzip-compressed and without the index,
    to ``./outputs/merged_preference/<Target_name>.csv.gz``.

    Parameters
    ----------
    REP : number of replicates to merge. Any positive count is supported; the
        file name is derived from the replicate number instead of a fixed
        three-entry list.
    Target_name : stem shared by the input preference files and the output file.
    count_file : path of a CSV file whose 'site' column lists the sites.

    Returns
    -------
    None. The merged table is only written to disk.

    Notes
    -----
    Rows of each preference file are assumed to be in the same order as the
    sites of ``count_file``; only the total length is enforced (by pandas, on
    column assignment).
    """
    sites = pd.read_csv(count_file)['site'].tolist()
    n_aa = len(AA)

    # Long layout: every site repeated once per amino acid, in AA order.
    df_pref_merged = pd.DataFrame({
        'site': [site for site in sites for _ in range(n_aa)],
        'amino_acid': AA * len(sites),
    })

    rep_list = ['rep_' + str(i) for i in range(1, REP + 1)]
    for i, rep_name in enumerate(rep_list, start=1):
        pref_file = './data/prefs/' + Target_name + '-' + str(i) + '_prefs.csv'
        df_pref_merged[rep_name] = pd.read_csv(pref_file)[AA].values.flatten()

    df_pref_merged['average'] = df_pref_merged[rep_list].mean(axis=1)
    df_pref_merged.to_csv('./outputs/merged_preference/' + Target_name + '.csv.gz',
                          index=False, compression='gzip')

# PR8 has three replicates, Aichi68C two; each call writes one merged table.
for REP, Target_name, count_file in (
    (3, 'PR8', './data/raw_data/PR8_DNA_codoncounts.csv'),
    (2, 'Aichi68C', './data/raw_data/Aichi68C_DNA_codoncounts.csv'),
):
    preference_merge(REP, Target_name, count_file)


# Flu_WSN

In [23]:
def transform_WSN(df, rep_num):
    """Reshape a wide WSN preference table into long (site, amino_acid, value) form.

    The 'WT_AA' and 'SITE_ENTROPY' columns are dropped. The first remaining
    column is relabelled 'site' and every other column is relabelled with the
    fourth character of its current label (presumably labels look like 'PI_A';
    confirm against the input files). The columns listed in the module-level
    ``AA`` are then flattened row by row.

    Parameters
    ----------
    df : pandas.DataFrame holding one replicate; it is not modified.
    rep_num : label of the value column in the result (e.g. 'rep_1').

    Returns
    -------
    pandas.DataFrame with columns 'site', 'amino_acid' and ``rep_num``; every
    site is repeated once per amino acid, in AA order.
    """
    df = df.drop(['WT_AA', 'SITE_ENTROPY'], axis=1)
    # set_axis returns a relabelled frame. The former `inplace=True` argument is
    # rejected by pandas >= 2.0, so the result is reassigned instead.
    df = df.set_axis(['site'] + [label[3] for label in df.columns[1:]], axis=1)

    sites = df['site'].tolist()
    n_aa = len(AA)
    return pd.DataFrame({
        'site': [site for site in sites for _ in range(n_aa)],
        'amino_acid': AA * len(sites),
        rep_num: df[AA].values.flatten(),
    })

Target_name = 'WSN'

# Read each tab-separated replicate file and reshape it to long form.
wsn_frames = []
for rep_num, wsn_path in (
    ('rep_1', './data/prefs/WSN-1_prefs.txt'),
    ('rep_2', './data/prefs/WSN-2_prefs.txt'),
    ('rep_3', './data/prefs/WSN-3_prefs.txt'),
):
    df = pd.read_csv(wsn_path, delimiter='\t')
    wsn_frames.append(transform_WSN(df, rep_num))
df1, df2, df3 = wsn_frames

# Join the replicates on (site, amino_acid), average them, order by site and
# write the compressed table.
df_merge = pd.merge(
    pd.merge(df1, df2, on=['site', 'amino_acid']),
    df3,
    on=['site', 'amino_acid'],
)
df_merge = df_merge.assign(
    average=df_merge[['rep_1', 'rep_2', 'rep_3']].mean(axis=1)
).sort_values('site')
df_merge.to_csv('./outputs/merged_preference/' + Target_name + '.csv.gz',
                index=False, compression='gzip')


# pd.read_csv('./data/prefs/TpoR_MPL_prefs.csv.gz')
# df

In [None]:
# Matrix_M1: three replicates; site labels come from the HomM1 codon-count file.
REP, Target_name = 3, 'Matrix_M1'
count_file = './data/raw_data/HomM1_DNA_codoncounts.csv'
preference_merge(REP, Target_name, count_file)

In [None]:
# A549: three replicates; site labels come from the A549 codon-count file.
REP, Target_name = 3, 'A549'
count_file = './data/raw_data/A549_DNA_codoncounts.csv'
preference_merge(REP, Target_name, count_file)

In [None]:
# CCL141: three replicates; site labels come from the CCL141 codon-count file.
REP, Target_name = 3, 'CCL141'
count_file = './data/raw_data/CCL141_DNA_codoncounts.csv'
preference_merge(REP, Target_name, count_file)

In [None]:
# MS: two replicates; site labels come from the MS codon-count file.
REP, Target_name = 2, 'MS'
count_file = './data/raw_data/MS_DNA_codoncounts.csv'
preference_merge(REP, Target_name, count_file)

In [None]:
# MxA: two replicates; site labels come from the MxA codon-count file.
REP, Target_name = 2, 'MxA'
count_file = './data/raw_data/MxA_DNA_codoncounts.csv'
preference_merge(REP, Target_name, count_file)

In [None]:
# MxAneg: two replicates; site labels come from the MxAneg codon-count file.
REP, Target_name = 2, 'MxAneg'
count_file = './data/raw_data/MxAneg_DNA_codoncounts.csv'
preference_merge(REP, Target_name, count_file)

In [22]:
# Load the merged WSN preference table from the output folder and display it for inspection.
pd.read_csv('./outputs/merged_preference/WSN.csv.gz')

Unnamed: 0,site,amino_acid,rep_1,rep_2,rep_3,average
0,2,A,0.009085,0.000758,0.113307,0.041050
1,2,V,0.064900,0.017342,0.034190,0.038811
2,2,Y,0.009500,0.005090,0.054999,0.023197
3,2,W,0.007614,0.008877,0.008929,0.008473
4,2,T,0.085162,0.221281,0.095259,0.133901
...,...,...,...,...,...,...
11275,565,R,0.016207,0.000556,0.003558,0.006774
11276,565,A,0.091113,0.081396,0.002253,0.058254
11277,565,Y,0.007314,0.002776,0.007182,0.005757
11278,565,H,0.009102,0.001989,0.011795,0.007629


In [4]:
# Display the PR8 codon-count table; its 'site' column is what preference_merge
# uses as the list of sites for this target.
count_file = './data/raw_data/PR8_DNA_codoncounts.csv'
pd.read_csv(count_file)

Unnamed: 0,site,wildtype,AAA,AAC,AAG,AAT,ACA,ACC,ACG,ACT,...,TCG,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT
0,1,ATG,0,0,93,0,0,0,131,0,...,0,0,0,0,0,0,0,0,61,0
1,2,GCG,0,0,0,0,0,0,63,0,...,924,0,0,0,0,0,0,0,0,0
2,3,TCT,0,0,0,0,0,0,0,32,...,7,965832,0,0,0,28,0,0,0,69
3,4,CAA,805,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,GGC,0,0,0,0,0,0,0,0,...,0,0,0,252,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,494,GAG,0,0,27,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
494,495,GAG,0,0,37,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
495,496,TAC,1,38,0,0,0,0,0,0,...,0,0,0,67,0,0,0,46,0,0
496,497,GAC,0,87,0,0,0,0,2,0,...,0,0,0,1,0,0,0,0,0,0


In [8]:
# Display the WSN codon-count table for inspection (note: this rebinds count_file).
count_file = './data/raw_data/WSN_DNA_codoncounts.csv'
pd.read_csv(count_file)



Unnamed: 0,site,wildtype,AAA,AAC,AAG,AAT,ACA,ACC,ACG,ACT,...,TCG,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT
0,1,ATG,0,0,12,4,1,0,36,0,...,0,0,0,0,0,0,0,0,13,0
1,2,AAG,30,2,313434,21,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0
2,3,GCA,0,0,0,0,44,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,AAA,340722,11,28,15,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,CTA,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,13,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,561,TGC,0,0,0,0,0,0,0,0,...,0,0,31,310283,6,36,0,40,0,0
561,562,AGA,17,0,0,0,2,0,0,0,...,0,0,18,0,0,0,0,0,0,0
562,563,ATA,10,0,0,0,26,0,0,0,...,0,0,0,0,0,0,21,0,0,0
563,564,TGC,0,0,0,0,0,0,0,0,...,0,0,52,431896,26,83,0,49,0,0
