In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
# One-letter codes of the 20 amino acids. The ORDER is significant: the list is
# used both to select the preference columns (df[AA]) and to label the flattened
# values, so the two must stay in step. Note that 'E' precedes 'Q' here.
AA  = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I','L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
# Three-letter -> one-letter amino-acid codes; 'Ter' maps to the stop symbol '*'.
# Used to translate the substituted residue parsed from 'hgvs_pro' strings.
Amino_acid_dict = {'Ala': 'A',
                   'Arg': 'R',
                   'Asn': 'N',
                   'Asp': 'D',
                   'Cys': 'C',
                   'Gln': 'Q',
                   'Glu': 'E',
                   'Gly': 'G',
                   'His': 'H',
                   'Ile': 'I',
                   'Leu': 'L',
                   'Lys': 'K',
                   'Met': 'M',
                   'Phe': 'F',
                   'Pro': 'P',
                   'Ser': 'S',
                   'Thr': 'T',
                   'Trp': 'W',
                   'Tyr': 'Y',
                   'Val': 'V',
                   'Ter': '*'
                  }
# Lookup tables for codon2aa, keyed by the first two nucleotides of the codon.
_AMBIGUOUS_BASES = ('W', 'S', 'M', 'K', 'R', 'Y')
_PYRIMIDINE_THIRD = ('T', 'C', 'Y')
_PURINE_THIRD = ('A', 'G', 'R')

# Prefixes whose amino acid does not depend on the third nucleotide at all.
_FOURFOLD_PREFIXES = {
    ('T', 'C'): 'S',
    ('C', 'T'): 'L',
    ('C', 'C'): 'P',
    ('C', 'G'): 'R',
    ('A', 'C'): 'T',
    ('G', 'T'): 'V',
    ('G', 'C'): 'A',
    ('G', 'G'): 'G',
}

# Prefixes that split on the third nucleotide: (T/C/Y result, A/G/R result).
_SPLIT_PREFIXES = {
    ('T', 'T'): ('F', 'L'),
    ('T', 'A'): ('Y', '*'),
    ('C', 'A'): ('H', 'Q'),
    ('A', 'A'): ('N', 'K'),
    ('A', 'G'): ('S', 'R'),
    ('G', 'A'): ('D', 'E'),
}

# The two irregular prefixes get an explicit third-nucleotide table each.
_IRREGULAR_PREFIXES = {
    ('T', 'G'): {'T': 'C', 'C': 'C', 'Y': 'C', 'A': '*', 'G': 'W'},
    ('A', 'T'): {'T': 'I', 'C': 'I', 'Y': 'I', 'A': 'I', 'M': 'I', 'W': 'I', 'G': 'M'},
}


def codon2aa(c, noq=False):
    """Translate one codon into its one-letter amino-acid character.

    Parameters
    ----------
    c : sequence of three upper-case nucleotide characters
        Gaps are written as '-'.
    noq : bool, optional
        When true, a partially gapped codon yields '-' instead of '?'.

    Returns
    -------
    str
        The amino-acid letter, '*' for a stop codon, '-' for a fully gapped
        codon, '?' (or '-' with ``noq``) for a partially gapped codon, and 'X'
        whenever the amino acid cannot be determined (ambiguous first/second
        nucleotide, unresolvable third nucleotide, or any unrecognised symbol).
    """
    # Gap handling first: all three missing -> gap, only some missing -> unknown.
    if c[0] == '-' and c[1] == '-' and c[2] == '-':
        return '-'
    if c[0] == '-' or c[1] == '-' or c[2] == '-':
        return '-' if noq else '?'

    # An ambiguity code in either of the first two positions cannot be resolved.
    if c[0] in _AMBIGUOUS_BASES or c[1] in _AMBIGUOUS_BASES:
        return 'X'

    prefix = (c[0], c[1])
    third = c[2]

    if prefix in _FOURFOLD_PREFIXES:
        return _FOURFOLD_PREFIXES[prefix]

    if prefix in _SPLIT_PREFIXES:
        pyrimidine_aa, purine_aa = _SPLIT_PREFIXES[prefix]
        if third in _PYRIMIDINE_THIRD:
            return pyrimidine_aa
        if third in _PURINE_THIRD:
            return purine_aa
        return 'X'

    if prefix in _IRREGULAR_PREFIXES:
        return _IRREGULAR_PREFIXES[prefix].get(third, 'X')

    # Anything else (e.g. 'N' or lower-case input) is undetermined.
    return 'X'



# Output directory: every merged preference table below is written here as '<Target_name>.csv.gz'.
PREF_MERGED_DIR = './output/merged_preference/'

# HIV Env BG505, BF520, BF520 human host, BF520 rhesus host

In [2]:
def transform_host(index_file, pref_file, rep_num):
    """Renumber the sites of a preference file and reshape it to long format.

    Parameters
    ----------
    index_file : str
        Comma-separated file with the columns 'new' and 'original'; a site
        found in 'new' is replaced by the matching 'original' value.
    pref_file : str
        CSV file with a 'site' column and one column per amino acid in ``AA``.
    rep_num : str
        Name given to the value column of the result (e.g. 'rep_1').

    Returns
    -------
    pandas.DataFrame
        Columns 'site', 'amino_acid' and ``rep_num``, with 20 rows per kept
        site. Rows containing missing values and sites absent from the
        numbering file are dropped.
    """
    # Compare site labels as strings so both files use the same key type.
    numbering = pd.read_csv(index_file, delimiter=',')
    numbering = numbering.astype({'original': 'string', 'new': 'string'})
    new_to_original = numbering.set_index('new')['original']

    prefs = pd.read_csv(pref_file).dropna()
    renumbered_site = prefs['site'].astype('string').map(new_to_original)
    # Sites without a counterpart in the numbering file become <NA> and are removed.
    prefs = prefs.assign(site=renumbered_site).dropna()
    prefs = prefs.astype({'site': 'int'})

    kept_sites = prefs['site'].tolist()
    long_prefs = pd.DataFrame(columns=['site', 'amino_acid', rep_num])
    long_prefs['site'] = [site for site in kept_sites for _ in range(20)]
    long_prefs['amino_acid'] = AA * len(kept_sites)
    # Row-major flattening matches the (site, amino acid) order built above.
    long_prefs[rep_num] = prefs[AA].values.flatten()
    return long_prefs

# Merge the per-replicate preference files of every HIV Env data set into one
# long-format table (renumbered through the index file) and save it gzipped.
# Each entry: (Target_name, index_file, prefix of the replicate files, number of replicates).
HIV_HOST_DATASETS = [
    ('HIV BG505', './data/prefs/BG505_to_HXB2_numbering.txt', 'HIV Env BG505', 3),
    ('HIV BF520', './data/prefs/BF520c2_to_HXB2.csv', 'HIV Env BF520', 3),
    ('HIV BF520 human host', './data/prefs/BF520c2_to_HXB2.csv', 'HIV BF520 human host', 2),
    ('HIV BF520 rhesus host', './data/prefs/BF520c2_to_HXB2.csv', 'HIV BF520 rhesus host', 2),
]

for Target_name, index_file, pref_prefix, n_replicates in HIV_HOST_DATASETS:
    rep_columns = []
    df_merge = None
    for replicate in range(1, n_replicates + 1):
        pref_file = './data/prefs/' + pref_prefix + '-' + str(replicate) + '_prefs.csv'
        rep_num = 'rep_' + str(replicate)
        df_rep = transform_host(index_file, pref_file, rep_num)
        # pd.merge defaults to an inner join, so only (site, amino_acid) pairs
        # present in every replicate survive.
        if df_merge is None:
            df_merge = df_rep
        else:
            df_merge = pd.merge(df_merge, df_rep, on=['site', 'amino_acid'])
        rep_columns.append(rep_num)

    df_merge['average'] = df_merge[rep_columns].mean(axis=1)
    df_merge = df_merge.sort_values('site')
    df_merge.to_csv(PREF_MERGED_DIR + Target_name + '.csv.gz', index=False, compression='gzip')


# PR8, Aichi68C, MS, MxA, MxAneg, FP16, FP20, VRC34

In [3]:
def preference_merge(REP, Target_name, count_file):
    """Merge the replicate preference files of one target into a long table.

    Reads './data/prefs/<Target_name>-<k>_prefs.csv' for k = 1..REP, flattens
    the amino-acid columns (order given by ``AA``) and writes the result to
    PREF_MERGED_DIR + Target_name + '.csv.gz'.

    Parameters
    ----------
    REP : int
        Number of replicates; any positive count is supported because the file
        names are derived from it.
    Target_name : str
        Prefix of the replicate files and stem of the output file.
    count_file : str
        CSV file whose 'site' column supplies the site labels. The preference
        files must have one row per site of this file, in the same order;
        otherwise the column assignment fails on a length mismatch.

    Returns
    -------
    None
        The merged table is only written to disk.
    """
    rep_list = ['rep_' + str(replicate) for replicate in range(1, REP + 1)]
    site_list = pd.read_csv(count_file)['site'].tolist()
    n_amino_acids = len(AA)

    # One row per (site, amino acid), sites outermost - the same order that a
    # row-major flatten of the wide preference table produces.
    df_pref_merged = pd.DataFrame({
        'site': [site for site in site_list for _ in range(n_amino_acids)],
        'amino_acid': AA * len(site_list),
    })

    for replicate, rep_name in enumerate(rep_list, start=1):
        pref_file = './data/prefs/' + Target_name + '-' + str(replicate) + '_prefs.csv'
        df_pref = pd.read_csv(pref_file)
        df_pref_merged[rep_name] = df_pref[AA].values.flatten()

    df_pref_merged['average'] = df_pref_merged[rep_list].mean(axis=1)
    df_pref_merged.to_csv(PREF_MERGED_DIR + Target_name + '.csv.gz', index=False, compression='gzip')

# One (REP, Target_name, count_file) entry per data set handled by preference_merge.
PREFERENCE_MERGE_RUNS = [
    (3, 'PR8', './data/raw_data/PR8_DNA_codoncounts.csv'),
    (2, 'Aichi68C', './data/raw_data/Aichi68C_DNA_codoncounts.csv'),
    (2, 'MS', './data/raw_data/MS_DNA_codoncounts.csv'),
    (2, 'MxA', './data/raw_data/MxA_DNA_codoncounts.csv'),
    (2, 'MxAneg', './data/raw_data/MxAneg_DNA_codoncounts.csv'),
    (2, 'HIV bnAbs FP16', './data/raw_data/FP16_DNA_codoncounts.csv'),
    (2, 'HIV bnAbs FP20', './data/raw_data/FP20_DNA_codoncounts.csv'),
    (2, 'HIV bnAbs VRC34', './data/raw_data/VRC34_DNA_codoncounts.csv'),
]

# Same order of calls as before; each call writes one merged table.
for REP, Target_name, count_file in PREFERENCE_MERGE_RUNS:
    preference_merge(REP, Target_name, count_file)


# Flu WSN, A549, CCL141, Matrix_M1, ZIKV, Perth2009

In [4]:
def transform_WSN(df, rep_num, WSN):
    """Reshape a wide per-site preference table into long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'site' column and one column per amino acid in ``AA``.
        When ``WSN`` is true the table must instead carry 'WT_AA' and
        'SITE_ENTROPY' columns (dropped here); its first remaining column is
        relabelled 'site' and every other column is relabelled with the
        fourth character of its name.
    rep_num : str
        Name given to the value column of the result (e.g. 'rep_1').
    WSN : bool
        Whether ``df`` uses the WSN column layout described above.

    Returns
    -------
    pandas.DataFrame
        Columns 'site', 'amino_acid' and ``rep_num``, one row per
        (site, amino acid) pair, sites in input order.
    """
    if WSN:
        df = df.drop(['WT_AA', 'SITE_ENTROPY'], axis=1)
        # set_axis(..., inplace=True) was removed in pandas 2.0; rebinding the
        # returned frame behaves the same on old and new versions and leaves
        # the caller's frame untouched.
        df = df.set_axis(['site'] + [name[3] for name in df.columns[1:]], axis=1)

    sites = df['site'].tolist()
    n_amino_acids = len(AA)

    df_pref = pd.DataFrame(columns=['site', 'amino_acid', rep_num])
    df_pref['site'] = [site for site in sites for _ in range(n_amino_acids)]
    df_pref['amino_acid'] = AA * len(sites)
    # Row-major flattening matches the (site, amino acid) order built above.
    df_pref[rep_num] = df[AA].values.flatten()
    return df_pref

# Merge the replicate preference files of each data set into one long-format
# table with a per-row replicate average, and save it gzipped.
# Each entry: (Target_name, file extension, delimiter, WSN column layout?, number of replicates).
WSN_STYLE_DATASETS = [
    ('WSN', '.txt', '\t', True, 3),
    ('A549', '.csv', ',', False, 2),
    ('CCL141', '.csv', ',', False, 3),
    ('Matrix_M1', '.csv', ',', False, 3),
    ('ZIKV', '.csv', ',', False, 3),
    ('Perth2009', '.csv', ',', False, 4),
]

for Target_name, extension, delimiter, is_wsn_layout, n_replicates in WSN_STYLE_DATASETS:
    rep_columns = ['rep_' + str(replicate) for replicate in range(1, n_replicates + 1)]
    df_merge = None
    for replicate, rep_num in enumerate(rep_columns, start=1):
        df = pd.read_csv('./data/prefs/' + Target_name + '-' + str(replicate) + '_prefs' + extension,
                         delimiter=delimiter)
        df_rep = transform_WSN(df, rep_num, is_wsn_layout)
        # pd.merge defaults to an inner join, so only (site, amino_acid) pairs
        # present in every replicate survive.
        if df_merge is None:
            df_merge = df_rep
        else:
            df_merge = pd.merge(df_merge, df_rep, on=['site', 'amino_acid'])

    df_merge['average'] = df_merge[rep_columns].mean(axis=1)
    df_merge = df_merge.sort_values('site')
    df_merge.to_csv(PREF_MERGED_DIR + Target_name + '.csv.gz', index=False, compression='gzip')


# TpoR, TpoR_S505N, Ube4b, YAP1, E3, Y2H_1, Y2H_2

In [5]:
def Transfor_MaveDB(Target_name, pref_column, pref_file):
    """Convert a score table keyed by 'hgvs_pro' into the merged long format.

    Each 'hgvs_pro' string is split around its digit runs. A row is kept only
    when the split yields exactly three parts (text, site number, text) and
    none of the parts is '?'; the last part is translated to a one-letter code
    through ``Amino_acid_dict``. The result is written to
    PREF_MERGED_DIR + Target_name + '.csv.gz'.

    Parameters
    ----------
    Target_name : str
        Stem of the output file.
    pref_column : list of str
        Score columns to keep; they become 'rep_1', 'rep_2', ... in this order.
    pref_file : str
        CSV file with an 'hgvs_pro' column and the columns in ``pref_column``.

    Returns
    -------
    None
        The merged table is only written to disk.

    Raises
    ------
    KeyError
        If a kept row ends in a code that is not a key of ``Amino_acid_dict``.
    """
    df = pd.read_csv(pref_file)
    rep_columns = ['rep_' + str(k + 1) for k in range(len(pref_column))]

    kept_positions = []
    site_list = []
    amino_acid = []
    for position, hgvs in enumerate(df['hgvs_pro'].tolist()):
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        parser = re.split(r'(\d+)', hgvs)
        if len(parser) == 3 and '?' not in parser:
            kept_positions.append(position)
            site_list.append(int(parser[1]))
            amino_acid.append(Amino_acid_dict[parser[-1]])

    df_merge = pd.DataFrame({'site': site_list, 'amino_acid': amino_acid})
    # Pull each score column once, by position, instead of cell by cell.
    for rep_name, column in zip(rep_columns, pref_column):
        df_merge[rep_name] = df[column].iloc[kept_positions].to_numpy()

    df_merge['average'] = df_merge[rep_columns].mean(axis=1)
    df_merge = df_merge.sort_values('site')
    df_merge.to_csv(PREF_MERGED_DIR + Target_name + '.csv.gz', index = False, compression = 'gzip')
    

# One (Target_name, pref_file, pref_column) entry per converted data set.
MAVEDB_RUNS = [
    ('TpoR', './data/prefs/TpoR_prefs.csv.gz',
     ['score_Replicate_A',
      'score_Replicate_B',
      'score_Replicate_C',
      'score_Replicate_D',
      'score_Replicate_E',
      'score_Replicate_F.1']),
    ('TpoR_S505N', './data/prefs/TpoR_S505N_prefs.csv.gz',
     ['score_Replicate_1',
      'score_Replicate_2',
      'score_Replicate_3',
      'score_Replicate_4',
      'score_Replicate_5',
      'score_Replicate_6.1']),
    ('Ube4b', './data/prefs/Ube4b_prefs.csv.gz',
     ['score_Rep_2',
      'score_Rep_3']),
    ('YAP1', './data/prefs/YAP1_prefs.csv.gz',
     ['score_101208',
      'score_110307']),
    ('E3', './data/prefs/E3_prefs.csv',
     ['score_PlusE2NewRep3',
      'score_PlusE2NewRep4',
      'score_PlusE2NewRep5',
      'score_PlusE2Rep3',
      'score_PlusE2Rep4',
      'score_PlusE2Rep5']),
    # Y2H_1 and Y2H_2 read different score columns of the same file.
    ('Y2H_1', './data/prefs/Y2H_prefs.csv',
     ['score_Y2H_1_Rep1',
      'score_Y2H_1_Rep2',
      'score_Y2H_1_Rep3']),
    ('Y2H_2', './data/prefs/Y2H_prefs.csv',
     ['score_Y2H_2_Rep1',
      'score_Y2H_2_Rep2',
      'score_Y2H_2_Rep3']),
]

# Same order of calls as before; each call writes one merged table.
for Target_name, pref_file, pref_column in MAVEDB_RUNS:
    Transfor_MaveDB(Target_name, pref_column, pref_file)

# DBR1

In [6]:
# DBR1 ships as a single table: drop incomplete rows and unused columns, rename
# to the merged-table schema, then add the replicate average.
df = (
    pd.read_csv('./data/prefs/DBR1_prefs.csv.gz')
    .dropna()   # applied before the column drop, so a gap in ANY column removes the row
    .drop(['Sequence', 'Reference AA', 'Variant Class'], axis=1)
    .rename(columns={
        'Affected codon': 'site',
        'Substituted AA': 'amino_acid',
        'Day 11 log2 enrichment score (replicate 1)': 'rep_1',
        'Day 11 log2 enrichment score (replicate 2)': 'rep_2',
    })
)
replicate_mean = df[['rep_1', 'rep_2']].mean(axis=1)
df = df.assign(average=replicate_mean).astype({'site': 'int'})
df_merge = df.sort_values('site')
df_merge.to_csv(PREF_MERGED_DIR+'DBR1.csv.gz', index = False, compression = 'gzip')


In [7]:
def wildtype(codon_file, selection_file, output_file, replicates):
    """Gauge selection coefficients so that the wild type of every site is zero.

    The wild-type codon of each site (column 'wildtype' of ``codon_file``) is
    translated with ``codon2aa`` and matched to the rows of ``selection_file``
    on ('site', 'amino_acid'). For each of the last ``replicates + 1`` columns
    of the merged table, a '<column>_gauged' column is added holding the value
    minus the wild-type value of the same site.

    Parameters
    ----------
    codon_file : str
        CSV file with the columns 'site' and 'wildtype'.
    selection_file : str
        CSV file with 'site', 'amino_acid' and the coefficient columns; the
        coefficient columns must be the last ``replicates + 1`` columns.
    output_file : str
        Destination of the gzip-compressed CSV.
    replicates : int
        Number of replicate columns; one further trailing column is gauged too
        ('joint' in the outputs shown in this notebook).

    Returns
    -------
    pandas.DataFrame or None
        The gauged table, or None (after printing 'gauge existed') when a
        column name of the selection file already contains 'gauged'.

    Raises
    ------
    IndexError
        If a site of the selection file has no wild-type row.
    """
    df = pd.read_csv(codon_file)
    df['WT_aa'] = [codon2aa(codon) for codon in df['wildtype'].tolist()]
    df['WT_indicator'] = True
    df_WT = df[['site', 'WT_aa', 'WT_indicator']].rename(columns={'WT_aa': 'amino_acid'})
    df_selection = pd.read_csv(selection_file)

    # Nothing to do when the selection file has been gauged before.
    if any('gauged' in str(column) for column in df_selection.columns):
        print('gauge existed')
        return None

    # Right join: keep every selection row (in its order) and flag the wild type.
    df_merge = pd.merge(df_WT, df_selection, on=['site', 'amino_acid'], how='right')
    # Rows without a wild-type match carry NaN here; comparing with True turns
    # the column into a clean boolean mask.
    df_merge['WT_indicator'] = df_merge['WT_indicator'].eq(True)
    column_name = df_merge.columns[-(replicates + 1):].tolist()

    # One wild-type row per site (the first, should duplicates exist).
    wt_rows = df_merge[df_merge['WT_indicator']].drop_duplicates('site')
    wt_sites = set(wt_rows['site'])
    missing_sites = [site for site in df_merge['site'].unique() if site not in wt_sites]
    if missing_sites:
        raise IndexError('no wild-type row in the selection file for site(s): '
                         + ', '.join(str(site) for site in missing_sites))

    # Subtract the wild-type value by site label rather than by row position,
    # so the result is right even if the rows of a site are not contiguous.
    wt_by_site = wt_rows.set_index('site')
    for name in column_name:
        df_merge[name + '_gauged'] = df_merge[name] - df_merge['site'].map(wt_by_site[name])

    df_merge.to_csv(output_file, compression = 'gzip', index = False)
    print('gauge done')
    return df_merge


# Gauge the ZIKV selection coefficients (3 replicates). The call is the last
# expression of the cell, so the returned table is shown as the cell output.
codon_file = './data/raw_data/ZIKV_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/ZIKV.csv.gz'
output_file = './output/gauged_selection_coefficients/ZIKV.csv.gz'
replicates = 3
wildtype(codon_file, selection_file, output_file, replicates)


gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,joint_gauged
0,1,A,False,-0.009139,-1.203026e-02,-8.203877e-03,-0.025103,-0.226252,-0.236142,-0.222815,-0.487661
1,1,R,False,-0.037421,-3.088728e-02,-2.985570e-02,-0.081780,-0.254535,-0.254999,-0.244466,-0.544338
2,1,N,False,-0.011831,-9.553025e-03,-1.172133e-02,-0.028050,-0.228944,-0.233665,-0.226332,-0.490608
3,1,D,False,-0.003844,-3.454097e-03,-3.031401e-03,-0.008886,-0.220957,-0.227566,-0.217642,-0.471444
4,1,C,False,-0.006201,-9.180511e-03,-3.895432e-03,-0.016547,-0.223314,-0.233292,-0.218506,-0.479104
...,...,...,...,...,...,...,...,...,...,...,...
10579,504,T,False,-0.017051,-1.609082e-02,-1.713299e-02,-0.041580,-0.189193,-0.157002,-0.174478,-0.435361
10580,504,W,False,-0.001415,-1.735121e-07,-1.789024e-03,-0.002801,-0.173557,-0.140912,-0.159134,-0.396582
10581,504,Y,False,-0.001415,-1.812894e-03,-2.320765e-07,-0.002808,-0.173557,-0.142724,-0.157346,-0.396589
10582,504,V,False,-0.003488,1.104854e-03,3.992824e-03,0.006708,-0.175630,-0.139806,-0.153353,-0.387073


In [8]:
# Gauge the HIV Env BG505 selection coefficients (3 replicates); the returned table is the cell output.
codon_file = './data/raw_data/BG505_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/HIV Env BG505.csv.gz'
output_file = './output/gauged_selection_coefficients/HIV Env BG505.csv.gz'
replicates = 3
wildtype(codon_file, selection_file, output_file, replicates)


gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,joint_gauged
0,30,A,True,-0.015366,-0.042260,0.012669,0.015350,0.000000,0.000000,0.000000,0.000000
1,30,R,False,-0.021677,-0.006807,-0.002920,-0.015414,-0.006311,0.035453,-0.015589,-0.030764
2,30,N,False,-0.021327,-0.016567,-0.014036,-0.040368,-0.005960,0.025694,-0.026705,-0.055718
3,30,D,False,-0.022502,0.024385,0.013555,0.020483,-0.007135,0.066645,0.000886,0.005133
4,30,C,False,-0.010532,-0.018857,-0.020996,-0.042158,0.004834,0.023403,-0.033665,-0.057508
...,...,...,...,...,...,...,...,...,...,...,...
14065,699,T,False,-0.049961,0.011372,0.004885,-0.020309,-0.127939,0.080636,-0.040203,-0.069744
14066,699,W,False,-0.017808,0.002355,-0.013677,-0.026750,-0.095787,0.071619,-0.058764,-0.076185
14067,699,Y,False,0.011237,-0.018501,-0.023079,-0.028164,-0.066741,0.050763,-0.068166,-0.077599
14068,699,V,False,-0.028438,0.050798,-0.015883,0.011675,-0.106416,0.120062,-0.060970,-0.037761


In [9]:
# Gauge the HIV Env BF520 selection coefficients (3 replicates); the returned table is the cell output.
codon_file = './data/raw_data/BF520_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/HIV Env BF520.csv.gz'
output_file = './output/gauged_selection_coefficients/HIV Env BF520.csv.gz'
replicates = 3
wildtype(codon_file, selection_file, output_file, replicates)


gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,joint_gauged
0,30,A,True,-1.438800e-01,-1.944390e-01,-2.441822e-01,-2.950637e-01,0.000000,0.000000,0.000000,0.000000
1,30,R,False,-1.012725e-07,-2.262431e-07,-7.253101e-08,-4.000466e-07,0.143880,0.194439,0.244182,0.295063
2,30,N,False,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.143880,0.194439,0.244182,0.295064
3,30,D,False,-5.315121e-07,-3.976840e-07,-5.682080e-07,-1.497404e-06,0.143879,0.194439,0.244182,0.295062
4,30,C,False,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.143880,0.194439,0.244182,0.295064
...,...,...,...,...,...,...,...,...,...,...,...
13897,691,T,False,2.789343e-03,1.615925e-02,3.356764e-02,4.721649e-02,-0.052368,0.031392,0.055965,0.029068
13898,691,W,False,-3.017728e-03,-1.076386e-02,-4.209948e-03,-1.539347e-02,-0.058175,0.004469,0.018188,-0.033542
13899,691,Y,False,-1.249844e-02,-6.414506e-04,9.570118e-03,-2.401218e-03,-0.067656,0.014591,0.031968,-0.020550
13900,691,V,False,1.755803e-02,7.601935e-03,-1.405762e-02,1.332714e-02,-0.037600,0.022835,0.008340,-0.004821


In [10]:
# Gauge the HIV BF520 human-host selection coefficients (2 replicates); the returned table is the cell output.
codon_file = './data/raw_data/human_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/HIV BF520 human host.csv.gz'
output_file = './output/gauged_selection_coefficients/HIV BF520 human host.csv.gz'
replicates = 2
wildtype(codon_file, selection_file, output_file, replicates)


gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,joint,rep_1_gauged,rep_2_gauged,joint_gauged
0,30,A,True,-1.252172e-01,-4.717869e-02,-1.171115e-01,0.000000,0.000000,0.000000
1,30,R,False,-1.367643e-07,-2.106226e-07,-3.473869e-07,0.125217,0.047178,0.117111
2,30,N,False,0.000000e+00,0.000000e+00,0.000000e+00,0.125217,0.047179,0.117112
3,30,D,False,-4.674272e-07,-5.533146e-07,-1.020742e-06,0.125217,0.047178,0.117111
4,30,C,False,0.000000e+00,0.000000e+00,0.000000e+00,0.125217,0.047179,0.117112
...,...,...,...,...,...,...,...,...,...
14065,699,T,False,-8.224614e-09,0.000000e+00,-8.224614e-09,-0.064921,-0.102062,-0.129689
14066,699,W,False,-3.853823e-03,-2.121521e-06,-3.727067e-03,-0.068775,-0.102064,-0.133416
14067,699,Y,False,-1.057307e-06,-1.156271e-06,-2.213577e-06,-0.064922,-0.102063,-0.129691
14068,699,V,False,-8.171748e-09,0.000000e+00,-8.171748e-09,-0.064921,-0.102062,-0.129689


In [11]:
# Gauge the HIV BF520 rhesus-host selection coefficients (2 replicates); the returned table is the cell output.
codon_file = './data/raw_data/rhesus_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/HIV BF520 rhesus host.csv.gz'
output_file = './output/gauged_selection_coefficients/HIV BF520 rhesus host.csv.gz'
replicates = 2
wildtype(codon_file, selection_file, output_file, replicates)


gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,joint,rep_1_gauged,rep_2_gauged,joint_gauged
0,30,A,True,-4.105970e-02,-5.908830e-03,-4.336264e-02,0.000000,0.000000,0.000000
1,30,R,False,-1.367643e-08,-2.106226e-08,-3.473869e-08,0.041060,0.005909,0.043363
2,30,N,False,0.000000e+00,0.000000e+00,0.000000e+00,0.041060,0.005909,0.043363
3,30,D,False,-4.674272e-08,-5.533146e-08,-1.020742e-07,0.041060,0.005909,0.043363
4,30,C,False,0.000000e+00,0.000000e+00,0.000000e+00,0.041060,0.005909,0.043363
...,...,...,...,...,...,...,...,...,...
14065,699,T,False,-8.224614e-10,0.000000e+00,-8.224614e-10,-0.007742,0.004884,-0.002695
14066,699,W,False,-3.974721e-04,-2.121521e-07,-3.986902e-04,-0.008139,0.004883,-0.003094
14067,699,Y,False,-1.057307e-07,-1.156271e-07,-2.213577e-07,-0.007742,0.004884,-0.002695
14068,699,V,False,-8.171748e-10,0.000000e+00,-8.171748e-10,-0.007742,0.004884,-0.002695


In [12]:
# Gauge the HIV bnAbs VRC34 selection coefficients (2 replicates); the returned table is the cell output.
codon_file = './data/raw_data/VRC34_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/HIV bnAbs VRC34.csv.gz'
output_file = './output/gauged_selection_coefficients/HIV bnAbs VRC34.csv.gz'
replicates = 2
wildtype(codon_file, selection_file, output_file, replicates)


gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,joint,rep_1_gauged,rep_2_gauged,joint_gauged
0,30,A,True,-0.088512,-0.046735,-0.104449,0.000000,0.000000,0.000000
1,30,R,False,-0.004322,-0.000337,-0.005837,0.084191,0.046398,0.098611
2,30,N,False,-0.004048,-0.004190,-0.008656,0.084465,0.042545,0.095792
3,30,D,False,0.002715,0.005175,0.007046,0.091227,0.051910,0.111494
4,30,C,False,-0.003383,0.000357,-0.003288,0.085129,0.047092,0.101161
...,...,...,...,...,...,...,...,...,...
14065,699,T,False,-0.006062,0.001052,-0.004876,-0.015892,-0.000213,-0.015090
14066,699,W,False,-0.004869,-0.002079,-0.006927,-0.014699,-0.003344,-0.017141
14067,699,Y,False,-0.001773,0.006015,0.004309,-0.011604,0.004750,-0.005904
14068,699,V,False,-0.012105,-0.002908,-0.014847,-0.021935,-0.004174,-0.025060


In [13]:
# Gauge the HIV bnAbs FP16 selection coefficients (2 replicates); the returned table is the cell output.
codon_file = './data/raw_data/FP16_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/HIV bnAbs FP16.csv.gz'
output_file = './output/gauged_selection_coefficients/HIV bnAbs FP16.csv.gz'
replicates = 2
wildtype(codon_file, selection_file, output_file, replicates)


gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,joint,rep_1_gauged,rep_2_gauged,joint_gauged
0,30,A,True,-0.080650,-0.029013,-0.083075,0.000000,0.000000,0.000000
1,30,R,False,-0.004672,-0.015205,-0.020216,0.075979,0.013808,0.062859
2,30,N,False,-0.002804,-0.007088,-0.010122,0.077846,0.021925,0.072954
3,30,D,False,0.000402,0.001237,0.001124,0.081052,0.030250,0.084200
4,30,C,False,-0.004945,-0.002690,-0.007698,0.075705,0.026323,0.075378
...,...,...,...,...,...,...,...,...,...
14065,699,T,False,-0.002445,0.004679,0.002244,-0.000913,-0.000063,-0.000979
14066,699,W,False,-0.001201,0.000165,-0.001017,0.000331,-0.004577,-0.004241
14067,699,Y,False,-0.001784,0.004086,0.002305,-0.000251,-0.000655,-0.000919
14068,699,V,False,0.006809,-0.006085,0.000860,0.008341,-0.010827,-0.002364


In [14]:
# Gauge the HIV bnAbs FP20 selection coefficients (2 replicates); the returned table is the cell output.
codon_file = './data/raw_data/FP20_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/HIV bnAbs FP20.csv.gz'
output_file = './output/gauged_selection_coefficients/HIV bnAbs FP20.csv.gz'
replicates = 2
wildtype(codon_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,joint,rep_1_gauged,rep_2_gauged,joint_gauged
0,30,A,True,-0.038100,0.003191,-0.022484,0.000000,0.000000,0.000000
1,30,R,False,-0.001999,-0.009691,-0.011616,0.036101,-0.012882,0.010868
2,30,N,False,-0.005244,-0.001791,-0.007104,0.032856,-0.004982,0.015380
3,30,D,False,0.008634,-0.001572,0.007000,0.046734,-0.004763,0.029485
4,30,C,False,-0.005706,0.007516,0.001726,0.032394,0.004326,0.024210
...,...,...,...,...,...,...,...,...,...
14065,699,T,False,-0.001084,0.005489,0.004472,-0.021047,0.023266,0.001964
14066,699,W,False,-0.003872,-0.000047,-0.003920,-0.023834,0.017730,-0.006428
14067,699,Y,False,0.002603,0.003388,0.006024,-0.017359,0.021165,0.003516
14068,699,V,False,-0.001602,0.005857,0.004215,-0.021564,0.023633,0.001707


In [15]:
# Gauge the Perth2009 selection coefficients (4 replicates); the returned table is the cell output.
codon_file = './data/raw_data/Perth2009_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/Perth2009.csv.gz'
output_file = './output/gauged_selection_coefficients/Perth2009.csv.gz'
replicates = 4
wildtype(codon_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,rep_4,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,rep_4_gauged,joint_gauged
0,1,A,False,-0.228206,-0.169474,-0.189958,0.087909,-0.222629,-0.388529,-0.659860,-0.603675,-0.141804,-0.563405
1,1,R,False,0.121223,-0.041648,0.090308,-0.112834,0.051511,-0.039100,-0.532035,-0.323409,-0.342546,-0.289265
2,1,N,False,0.077750,-0.009723,0.234759,0.005293,0.185005,-0.082573,-0.500109,-0.178959,-0.224419,-0.155770
3,1,D,False,0.034013,0.304593,-0.032153,0.189254,0.301517,-0.126310,-0.185794,-0.445871,-0.040459,-0.039259
4,1,C,False,0.110994,0.100525,0.058474,0.080342,0.207113,-0.049329,-0.389861,-0.355244,-0.149371,-0.133662
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11902,567,T,False,-0.043499,-0.096243,-0.053968,-0.102841,-0.117557,-0.891262,-1.183119,-1.176356,-1.163957,-1.406100
11903,567,W,False,0.095643,0.219418,0.039788,0.161092,0.466630,-0.752119,-0.867458,-1.082599,-0.900024,-0.821914
11904,567,Y,False,0.004296,-0.024369,-0.023013,-0.036342,-0.004656,-0.843466,-1.111244,-1.145401,-1.097457,-1.293199
11905,567,V,False,-0.124261,-0.106221,-0.014119,-0.081844,-0.122425,-0.972023,-1.193096,-1.136507,-1.142959,-1.410968


In [16]:
# Gauge the CCL141 selection coefficients (3 replicates); the returned table is the cell output.
codon_file = './data/raw_data/CCL141_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/CCL141.csv.gz'
output_file = './output/gauged_selection_coefficients/CCL141.csv.gz'
replicates = 3
wildtype(codon_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,joint_gauged
0,1,A,False,-0.061280,-0.049304,-0.047489,-0.088229,-0.772519,-0.711617,-0.725481,-1.150491
1,1,R,False,-0.089901,-0.073764,-0.085586,-0.159624,-0.801139,-0.736077,-0.763578,-1.221886
2,1,N,False,-0.037012,-0.031161,-0.020695,-0.058393,-0.748251,-0.693474,-0.698687,-1.120655
3,1,D,False,-0.025307,-0.019428,-0.023522,-0.047132,-0.736545,-0.681741,-0.701514,-1.109394
4,1,C,False,-0.026486,-0.018230,-0.033550,-0.053815,-0.737725,-0.680543,-0.711542,-1.116077
...,...,...,...,...,...,...,...,...,...,...,...
15955,760,T,False,-0.059244,-0.059847,-0.076727,-0.120981,-0.895433,-0.815671,-0.852086,-1.293460
15956,760,W,False,0.026390,0.041105,0.056984,0.178988,-0.809799,-0.714718,-0.718375,-0.993491
15957,760,Y,False,-0.055756,-0.042967,-0.042822,-0.079030,-0.891946,-0.798790,-0.818181,-1.251509
15958,760,V,False,-0.071218,-0.066983,-0.057044,-0.115528,-0.907408,-0.822807,-0.832403,-1.288006


In [17]:
# A549 dataset, 3 replicates. wildtype() is defined outside this cell; the
# call is the cell's last expression, so its return value is what gets displayed.
codon_file = './data/raw_data/A549_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/A549.csv.gz'
output_file = './output/gauged_selection_coefficients/A549.csv.gz'
replicates = 3
wildtype(codon_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,joint_gauged
0,1,A,False,-0.069898,-0.048427,-0.058717,-0.103474,-0.809507,-0.775116,-0.797766,-1.255812
1,1,R,False,-0.085384,-0.069942,-0.076981,-0.136801,-0.824993,-0.796632,-0.816030,-1.289138
2,1,N,False,-0.036199,-0.032016,-0.022438,-0.058073,-0.775808,-0.758706,-0.761487,-1.210411
3,1,D,False,-0.019406,-0.018494,-0.022437,-0.036630,-0.759015,-0.745184,-0.761486,-1.188968
4,1,C,False,-0.025904,-0.017354,-0.032003,-0.048657,-0.765513,-0.744044,-0.771052,-1.200994
...,...,...,...,...,...,...,...,...,...,...,...
15955,760,T,False,-0.058473,-0.061532,-0.075577,-0.121429,-0.909837,-0.782275,-0.869323,-1.290837
15956,760,W,False,0.039764,0.032528,0.044873,0.171268,-0.811600,-0.688215,-0.748873,-0.998140
15957,760,Y,False,-0.056840,-0.057206,-0.034837,-0.087070,-0.908204,-0.777949,-0.828583,-1.256478
15958,760,V,False,-0.065433,-0.058801,-0.054878,-0.099144,-0.916797,-0.779544,-0.848623,-1.268552


In [18]:
# MxA dataset, 2 replicates. wildtype() is defined outside this cell; the
# call is the cell's last expression, so its return value is what gets displayed.
codon_file = './data/raw_data/MxA_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/MxA.csv.gz'
output_file = './output/gauged_selection_coefficients/MxA.csv.gz'
replicates = 2
wildtype(codon_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,joint,rep_1_gauged,rep_2_gauged,joint_gauged
0,1,A,False,-1.642490e-06,-1.022034e-06,-2.664524e-06,-0.280304,-0.265423,-0.429581
1,1,R,False,3.771611e-03,9.100634e-03,1.451313e-02,-0.276531,-0.256321,-0.415065
2,1,N,False,-6.355282e-07,-4.109946e-07,-1.046523e-06,-0.280303,-0.265422,-0.429579
3,1,D,False,0.000000e+00,-5.827963e-09,-5.827963e-09,-0.280302,-0.265422,-0.429578
4,1,C,False,-3.727336e-09,-3.885309e-09,-7.612645e-09,-0.280302,-0.265422,-0.429578
...,...,...,...,...,...,...,...,...,...
10453,498,T,False,3.979515e-01,3.544422e-01,5.340280e-01,-0.455157,-0.206273,-0.249650
10454,498,W,False,-3.222397e-01,-3.782226e-01,-5.122538e-01,-1.175348,-0.938938,-1.295931
10455,498,Y,False,-2.442741e-01,-2.977706e-01,-4.134223e-01,-1.097383,-0.858486,-1.197100
10456,498,V,False,-5.250783e-01,-5.549608e-01,-6.357450e-01,-1.378187,-1.115676,-1.419423


In [19]:
# MS dataset, 2 replicates. wildtype() is defined outside this cell; the
# call is the cell's last expression, so its return value is what gets displayed.
codon_file = './data/raw_data/MS_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/MS.csv.gz'
output_file = './output/gauged_selection_coefficients/MS.csv.gz'
replicates = 2
wildtype(codon_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,joint,rep_1_gauged,rep_2_gauged,joint_gauged
0,1,A,False,-1.642490e-06,-1.022034e-06,-2.664524e-06,-0.237307,-0.233431,-0.364074
1,1,R,False,7.254972e-03,1.002249e-02,1.881476e-02,-0.230051,-0.223408,-0.345257
2,1,N,False,-6.355282e-07,-4.109946e-07,-1.046523e-06,-0.237306,-0.233431,-0.364073
3,1,D,False,0.000000e+00,-5.827963e-09,-5.827963e-09,-0.237306,-0.233430,-0.364072
4,1,C,False,2.902274e-03,-3.885309e-09,3.066658e-03,-0.234403,-0.233430,-0.361005
...,...,...,...,...,...,...,...,...,...
10453,498,T,False,5.013515e-01,3.558763e-01,5.944799e-01,-0.367383,-0.222222,-0.201849
10454,498,W,False,-3.031567e-01,-3.754200e-01,-4.964028e-01,-1.171892,-0.953519,-1.292732
10455,498,Y,False,-2.576910e-01,-2.718026e-01,-4.031868e-01,-1.126426,-0.849901,-1.199516
10456,498,V,False,-4.746157e-01,-4.938968e-01,-5.653539e-01,-1.343351,-1.071995,-1.361683


In [20]:
# MxAneg dataset, 2 replicates. wildtype() is defined outside this cell; the
# call is the cell's last expression, so its return value is what gets displayed.
# NOTE(review): every row shown in this cell's recorded output (head and tail,
# raw and gauged values) is identical to the MxA cell's recorded output.
# Confirm that the MxAneg input files are not copies of the MxA ones.
codon_file = './data/raw_data/MxAneg_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/MxAneg.csv.gz'
output_file = './output/gauged_selection_coefficients/MxAneg.csv.gz'
replicates = 2
wildtype(codon_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,joint,rep_1_gauged,rep_2_gauged,joint_gauged
0,1,A,False,-1.642490e-06,-1.022034e-06,-2.664524e-06,-0.280304,-0.265423,-0.429581
1,1,R,False,3.771611e-03,9.100634e-03,1.451313e-02,-0.276531,-0.256321,-0.415065
2,1,N,False,-6.355282e-07,-4.109946e-07,-1.046523e-06,-0.280303,-0.265422,-0.429579
3,1,D,False,0.000000e+00,-5.827963e-09,-5.827963e-09,-0.280302,-0.265422,-0.429578
4,1,C,False,-3.727336e-09,-3.885309e-09,-7.612645e-09,-0.280302,-0.265422,-0.429578
...,...,...,...,...,...,...,...,...,...
10453,498,T,False,3.979515e-01,3.544422e-01,5.340280e-01,-0.455157,-0.206273,-0.249650
10454,498,W,False,-3.222397e-01,-3.782226e-01,-5.122538e-01,-1.175348,-0.938938,-1.295931
10455,498,Y,False,-2.442741e-01,-2.977706e-01,-4.134223e-01,-1.097383,-0.858486,-1.197100
10456,498,V,False,-5.250783e-01,-5.549608e-01,-6.357450e-01,-1.378187,-1.115676,-1.419423


In [21]:
# Matrix M1 dataset, 3 replicates. Note the codon-count file is named HomM1 while
# the selection/output files are named Matrix_M1. wildtype() is defined outside
# this cell; the call's return value is what gets displayed.
codon_file = './data/raw_data/HomM1_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/Matrix_M1.csv.gz'
output_file = './output/gauged_selection_coefficients/Matrix_M1.csv.gz'
replicates = 3
wildtype(codon_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,joint_gauged
0,1,A,False,-0.143420,-0.130483,-0.155258,-0.213219,-1.397071,-1.381862,-1.378743,-1.708042
1,1,R,False,-0.157189,-0.146008,-0.153729,-0.215293,-1.410840,-1.397387,-1.377214,-1.710116
2,1,N,False,-0.056040,-0.060483,-0.057589,-0.097759,-1.309691,-1.311862,-1.281074,-1.592582
3,1,D,False,-0.061209,-0.059326,-0.051964,-0.096627,-1.314860,-1.310705,-1.275450,-1.591451
4,1,C,False,-0.052824,-0.048705,-0.047273,-0.080897,-1.306474,-1.300084,-1.270758,-1.575720
...,...,...,...,...,...,...,...,...,...,...,...
5308,253,T,False,-0.057007,-0.064913,-0.044685,-0.099932,-0.774206,-0.822459,-0.711495,-1.229170
5309,253,W,False,-0.003961,-0.009725,0.002024,0.007765,-0.721160,-0.767271,-0.664786,-1.121473
5310,253,Y,False,-0.009920,-0.016659,-0.026625,-0.021214,-0.727119,-0.774205,-0.693435,-1.150451
5311,253,V,False,-0.043550,-0.038977,-0.045208,-0.078253,-0.760749,-0.796523,-0.712018,-1.207490


In [22]:
# PR8 dataset, 3 replicates. wildtype() is defined outside this cell; the
# call is the cell's last expression, so its return value is what gets displayed.
codon_file = './data/raw_data/PR8_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/PR8.csv.gz'
output_file = './output/gauged_selection_coefficients/PR8.csv.gz'
replicates = 3
wildtype(codon_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,joint_gauged
0,1,A,False,-1.588242e-06,-9.116109e-07,-1.015400e-06,-3.515253e-06,-0.558729,-0.165669,-0.130102,-0.583335
1,1,R,False,1.172463e-02,1.176479e-03,-2.269012e-03,1.484895e-02,-0.547003,-0.164491,-0.132370,-0.568482
2,1,N,False,-3.077798e-06,-5.027946e-07,-2.219152e-07,-3.802507e-06,-0.558730,-0.165668,-0.130101,-0.583335
3,1,D,False,-1.890902e-09,-2.501504e-09,-1.469833e-09,-5.862239e-09,-0.558727,-0.165668,-0.130101,-0.583331
4,1,C,False,0.000000e+00,-2.590863e-09,0.000000e+00,-2.590863e-09,-0.558727,-0.165668,-0.130101,-0.583331
...,...,...,...,...,...,...,...,...,...,...,...
10453,498,T,False,-1.846334e-01,-1.332931e-01,-8.458195e-02,-3.361313e-01,-0.180615,-0.981435,-0.597915,-0.787924
10454,498,W,False,-3.153976e-02,-4.990786e-02,-7.890274e-02,-1.561473e-01,-0.027521,-0.898050,-0.592235,-0.607940
10455,498,Y,False,-2.028449e-01,8.705869e-02,3.726587e-01,2.044085e-01,-0.198826,-0.761084,-0.140674,-0.247384
10456,498,V,False,-2.480221e-01,4.274552e-02,3.133219e-02,-1.474319e-01,-0.244004,-0.805397,-0.482000,-0.599225


In [23]:
# Aichi68C dataset, 2 replicates. wildtype() is defined outside this cell; the
# call is the cell's last expression, so its return value is what gets displayed.
codon_file = './data/raw_data/Aichi68C_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/Aichi68C.csv.gz'
output_file = './output/gauged_selection_coefficients/Aichi68C.csv.gz'
replicates = 2
wildtype(codon_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,joint,rep_1_gauged,rep_2_gauged,joint_gauged
0,1,A,False,1.260099e-04,-1.022034e-07,1.277259e-04,-0.037363,-0.031884,-0.066734
1,1,R,False,4.552973e-04,7.052332e-04,1.197320e-03,-0.037034,-0.031179,-0.065664
2,1,N,False,-6.355282e-08,-4.109946e-08,-1.046523e-07,-0.037490,-0.031884,-0.066861
3,1,D,False,0.000000e+00,-5.827963e-10,-5.827963e-10,-0.037489,-0.031884,-0.066861
4,1,C,False,-3.727336e-10,-3.885309e-10,-7.612645e-10,-0.037489,-0.031884,-0.066861
...,...,...,...,...,...,...,...,...,...
10453,498,T,False,4.667052e-02,4.290091e-02,9.936530e-02,-0.442375,-0.152895,-0.334225
10454,498,W,False,-5.076745e-02,-6.087270e-02,-1.009676e-01,-0.539813,-0.256668,-0.534558
10455,498,Y,False,-4.273999e-02,-4.471882e-02,-7.991174e-02,-0.531785,-0.240514,-0.513502
10456,498,V,False,-1.273709e-01,-1.342239e-01,-2.139638e-01,-0.616416,-0.330019,-0.647554


In [24]:
# WSN dataset, 3 replicates. wildtype() is defined outside this cell; the
# call is the cell's last expression, so its return value is what gets displayed.
codon_file = './data/raw_data/WSN_DNA_codoncounts.csv'
selection_file = './output/selection_coefficients/WSN.csv.gz'
output_file = './output/gauged_selection_coefficients/WSN.csv.gz'
replicates = 3
wildtype(codon_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,joint_gauged
0,1,A,False,-5.774015e-10,0.000000e+00,-1.802252e-08,-1.859993e-08,0.003398,0.013481,0.001531,0.016561
1,1,R,False,-2.999165e-04,-2.999176e-04,-2.999207e-04,-8.997548e-04,0.003098,0.013181,0.001231,0.015661
2,1,N,False,-1.639706e-03,-2.761312e-03,-1.698832e-03,-6.117113e-03,0.001758,0.010720,-0.000168,0.010443
3,1,D,False,-9.829320e-09,-1.988292e-08,-1.115037e-08,-4.086261e-08,0.003398,0.013481,0.001531,0.016560
4,1,C,False,0.000000e+00,0.000000e+00,-4.573932e-10,-4.573932e-10,0.003398,0.013481,0.001531,0.016561
...,...,...,...,...,...,...,...,...,...,...,...
11860,565,T,False,-2.269957e-03,-9.044772e-03,-8.256772e-03,-1.340381e-02,-0.251617,-0.285085,-0.282201,-0.585212
11861,565,W,False,-9.687717e-03,-9.078947e-03,-8.186585e-03,-2.198810e-02,-0.259035,-0.285119,-0.282131,-0.593796
11862,565,Y,False,-8.586323e-03,-1.010091e-02,-8.385556e-03,-2.209933e-02,-0.257933,-0.286141,-0.282330,-0.593907
11863,565,V,False,-4.195443e-03,-2.360678e-02,-1.729799e-02,-2.527836e-02,-0.253542,-0.299647,-0.291243,-0.597087


In [25]:
def wildtype_full_length(sequence_file, selection_file, output_file, replicates):
    """Flag wild-type residues and gauge selection coefficients against them.

    The nucleotide sequence in ``sequence_file`` is cut into consecutive
    3-character codons and translated with ``codon2aa`` (defined elsewhere in
    this notebook). The k-th translated residue is taken as the wild type of
    the k-th distinct ``site`` value of the selection table, in order of first
    appearance. For each of the last ``replicates + 1`` columns of the merged
    table, a ``<column>_gauged`` column is added that holds the value minus the
    value of the wild-type row at the same site, so wild-type rows are gauged
    to zero.

    Parameters
    ----------
    sequence_file : str
        Path of a text file holding the reference nucleotide sequence.
    selection_file : str
        Path of a CSV (pandas infers compression from the extension) with at
        least ``site`` and ``amino_acid`` columns followed by the value columns.
    output_file : str
        Path the gauged table is written to as gzip-compressed CSV.
    replicates : int
        Number of replicate columns; the last ``replicates + 1`` columns are
        gauged.

    Returns
    -------
    pandas.DataFrame or None
        The gauged table, or ``None`` when the selection table already has a
        column whose name contains ``'gauged'`` (nothing is written then).

    Raises
    ------
    ValueError
        If the number of translated codons differs from the number of distinct
        sites.
    IndexError
        If some site has no row for its wild-type amino acid. The type matches
        what the former ``.values[0]`` lookup raised.
    """
    with open(sequence_file, 'r') as file:
        # Remove every whitespace character, not only trailing ones, so that a
        # line-wrapped reference sequence keeps its reading frame.
        nucleotide = ''.join(file.read().split())
    codon_list = [nucleotide[start:start + 3] for start in range(0, len(nucleotide), 3)]
    WT_aa = [codon2aa(codon) for codon in codon_list]

    df_selection = pd.read_csv(selection_file)
    site_list = df_selection['site'].unique()
    if len(WT_aa) != len(site_list):
        raise ValueError(
            'reference sequence has %d codons but the selection file has %d sites'
            % (len(WT_aa), len(site_list))
        )
    df_WT = pd.DataFrame({'site': site_list, 'amino_acid': WT_aa, 'WT_indicator': True})

    if 'gauged' in ' '.join(df_selection.columns.tolist()):
        print('gauge existed')
        return None

    df_merge = pd.merge(df_WT, df_selection, on=['site', 'amino_acid'], how='right')
    # Rows without a wild-type match come out of the right merge as NaN.
    # Comparing with True yields a plain bool column without relying on
    # fillna() downcasting an object column.
    df_merge['WT_indicator'] = df_merge['WT_indicator'].eq(True)

    # Must be read before any *_gauged column is appended.
    column_name = df_merge.columns[-(replicates + 1):].tolist()

    # One baseline row per site: the first wild-type row, as before.
    wt_baseline = (
        df_merge.loc[df_merge['WT_indicator'], ['site'] + column_name]
        .drop_duplicates(subset='site', keep='first')
        .set_index('site')
    )
    sites_without_wt = df_merge.loc[~df_merge['site'].isin(wt_baseline.index), 'site'].unique()
    if len(sites_without_wt) > 0:
        raise IndexError(
            'no wild-type row found for site(s): %s' % sites_without_wt.tolist()
        )

    # Subtract the per-site baseline row by row. Looking the baseline up through
    # the site value keeps the result aligned even when the rows of one site
    # are not stored contiguously.
    for name in column_name:
        df_merge[name + '_gauged'] = df_merge[name] - df_merge['site'].map(wt_baseline[name])

    df_merge.to_csv(output_file, compression='gzip', index=False)
    print('gauge done')
    return df_merge

In [26]:
# YAP1 dataset, 2 replicates, gauged against the wild type translated from the
# YAP1 reference sequence. The call is the cell's last expression, so the
# returned frame is what gets displayed.
sequence_file = './data/raw_data/YAP1_Reference_sequence.txt'
selection_file = './output/selection_coefficients/YAP1.csv.gz'
output_file = './output/gauged_selection_coefficients/YAP1.csv.gz'
replicates = 2

wildtype_full_length(sequence_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,joint,rep_1_gauged,rep_2_gauged,joint_gauged
0,2,A,False,0.06310,0.086900,0.06450,0.24310,0.270900,0.25950
1,2,R,False,0.05210,0.051000,0.08840,0.23210,0.235000,0.28340
2,2,N,False,-0.05600,-0.092200,-0.08870,0.12400,0.091800,0.10630
3,2,D,True,-0.18000,-0.184000,-0.19500,0.00000,0.000000,0.00000
4,2,C,False,0.00273,0.003270,0.00414,0.18273,0.187270,0.19914
...,...,...,...,...,...,...,...,...,...
709,35,T,False,0.00378,0.002820,0.00670,-0.38722,-0.391180,-0.39330
710,35,W,False,0.00000,0.000000,0.00000,-0.39100,-0.394000,-0.40000
711,35,Y,False,-0.01350,-0.011200,-0.02300,-0.40450,-0.405200,-0.42300
712,35,V,False,0.00244,0.000896,0.00347,-0.38856,-0.393104,-0.39653


In [27]:
# Ube4b dataset, 2 replicates, gauged against the wild type translated from the
# Ube4b reference sequence. The call is the cell's last expression, so the
# returned frame is what gets displayed.
sequence_file = './data/raw_data/Ube4b_Reference_sequence.txt'
selection_file = './output/selection_coefficients/Ube4b.csv.gz'
output_file = './output/gauged_selection_coefficients/Ube4b.csv.gz'
replicates = 2

wildtype_full_length(sequence_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,joint,rep_1_gauged,rep_2_gauged,joint_gauged
0,2,A,False,0.0000,0.00000,0.0000,-0.1320,-0.09740,-0.1270
1,2,R,False,0.0274,-0.00606,0.0234,-0.1046,-0.10346,-0.1036
2,2,N,False,0.0000,0.00000,0.0000,-0.1320,-0.09740,-0.1270
3,2,D,False,0.0000,0.00000,0.0000,-0.1320,-0.09740,-0.1270
4,2,C,False,0.0000,0.00000,0.0000,-0.1320,-0.09740,-0.1270
...,...,...,...,...,...,...,...,...,...
2158,104,T,False,0.0000,0.00000,0.0000,-0.0225,-0.01100,-0.0168
2159,104,W,False,-0.0832,-0.06210,-0.0793,-0.1057,-0.07310,-0.0961
2160,104,Y,False,0.0000,0.00000,0.0000,-0.0225,-0.01100,-0.0168
2161,104,V,False,-0.0112,-0.01690,-0.0274,-0.0337,-0.02790,-0.0442


In [28]:
# Y2H_1 dataset, 3 replicates, gauged against the wild type translated from the
# BRCA1 reference sequence (the same reference file is used for Y2H_2 and E3).
sequence_file = './data/raw_data/BRCA1_Reference_sequence.txt'
selection_file = './output/selection_coefficients/Y2H_1.csv.gz'
output_file = './output/gauged_selection_coefficients/Y2H_1.csv.gz'
replicates = 3

wildtype_full_length(sequence_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,joint_gauged
0,2,A,False,0.000007,0.000009,0.000011,0.000027,0.000080,0.000036,0.000004,0.000111
1,2,R,False,0.000000,0.000000,0.000000,0.000000,0.000073,0.000028,-0.000008,0.000084
2,2,N,False,-0.000050,-0.000055,-0.000043,-0.000153,0.000024,-0.000028,-0.000050,-0.000069
3,2,D,True,-0.000073,-0.000028,0.000008,-0.000084,0.000000,0.000000,0.000000,0.000000
4,2,C,False,0.000000,0.000000,0.000000,0.000000,0.000073,0.000028,-0.000008,0.000084
...,...,...,...,...,...,...,...,...,...,...,...
6358,304,T,False,0.000000,0.000000,0.000000,0.000000,-0.000299,-0.000337,-0.000288,-0.000900
6359,304,W,False,0.000000,0.000000,0.000000,0.000000,-0.000299,-0.000337,-0.000288,-0.000900
6360,304,Y,False,0.000003,0.000001,0.000001,0.000006,-0.000296,-0.000336,-0.000287,-0.000894
6361,304,V,False,-0.000024,-0.000016,-0.000009,-0.000045,-0.000323,-0.000353,-0.000297,-0.000945


In [29]:
# Y2H_2 dataset, 3 replicates, gauged against the wild type translated from the
# BRCA1 reference sequence (the same reference file is used for Y2H_1 and E3).
sequence_file = './data/raw_data/BRCA1_Reference_sequence.txt'
selection_file = './output/selection_coefficients/Y2H_2.csv.gz'
output_file = './output/gauged_selection_coefficients/Y2H_2.csv.gz'
replicates = 3

wildtype_full_length(sequence_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,joint_gauged
0,2,A,False,-0.000003,-0.000006,-1.050000e-05,-0.000019,0.000029,0.000128,0.000274,0.000358
1,2,R,False,0.000000,0.000000,0.000000e+00,0.000000,0.000032,0.000134,0.000284,0.000377
2,2,N,False,-0.000096,-0.000064,-5.210000e-05,-0.000241,-0.000064,0.000070,0.000232,0.000136
3,2,D,True,-0.000032,-0.000134,-2.840000e-04,-0.000377,0.000000,0.000000,0.000000,0.000000
4,2,C,False,0.000000,0.000000,0.000000e+00,0.000000,0.000032,0.000134,0.000284,0.000377
...,...,...,...,...,...,...,...,...,...,...,...
6358,304,T,False,0.000000,0.000000,0.000000e+00,0.000000,-0.000026,-0.000015,-0.000094,-0.000149
6359,304,W,False,0.000000,0.000000,0.000000e+00,0.000000,-0.000026,-0.000015,-0.000094,-0.000149
6360,304,Y,False,0.000001,0.000004,-1.130000e-06,0.000005,-0.000025,-0.000011,-0.000095,-0.000144
6361,304,V,False,-0.000006,-0.000005,-4.140000e-07,-0.000010,-0.000032,-0.000021,-0.000094,-0.000159


In [30]:
# E3 dataset, 6 replicates, gauged against the wild type translated from the
# BRCA1 reference sequence (the same reference file is used for Y2H_1 and Y2H_2).
sequence_file = './data/raw_data/BRCA1_Reference_sequence.txt'
selection_file = './output/selection_coefficients/E3.csv.gz'
output_file = './output/gauged_selection_coefficients/E3.csv.gz'
replicates = 6

wildtype_full_length(sequence_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,rep_4,rep_5,rep_6,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,rep_4_gauged,rep_5_gauged,rep_6_gauged,joint_gauged
0,2,A,False,-0.00198,-0.002430,-0.00137,-0.001020,-0.000787,-0.00107,-0.00848,-0.02048,-0.012930,-0.00324,-0.007280,-0.000214,-0.01957,-0.02538
1,2,R,False,0.00000,0.000000,0.00000,0.000000,0.000000,0.00000,0.00000,-0.01850,-0.010500,-0.00187,-0.006260,0.000573,-0.01850,-0.01690
2,2,N,False,0.00403,-0.017200,-0.00392,0.011400,0.005000,0.00248,0.01860,-0.01447,-0.027700,-0.00579,0.005140,0.005573,-0.01602,0.00170
3,2,D,True,0.01850,0.010500,0.00187,0.006260,-0.000573,0.01850,0.01690,0.00000,0.000000,0.00000,0.000000,0.000000,0.00000,0.00000
4,2,C,False,0.00000,0.000000,0.00000,0.000000,0.000000,0.00000,0.00000,-0.01850,-0.010500,-0.00187,-0.006260,0.000573,-0.01850,-0.01690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6358,304,T,False,0.00000,0.000000,0.00000,0.000000,0.000000,0.00000,0.00000,0.03130,0.006110,0.04200,0.028400,0.031000,0.03790,0.07240
6359,304,W,False,0.00000,0.000000,0.00000,0.000000,0.000000,0.00000,0.00000,0.03130,0.006110,0.04200,0.028400,0.031000,0.03790,0.07240
6360,304,Y,False,0.00525,0.000717,0.00380,-0.000104,0.002160,0.00233,0.00963,0.03655,0.006827,0.04580,0.028296,0.033160,0.04023,0.08203
6361,304,V,False,0.00266,-0.000511,0.00115,0.001040,0.000888,0.00668,0.00818,0.03396,0.005599,0.04315,0.029440,0.031888,0.04458,0.08058


In [31]:
# TpoR dataset, 6 replicates, gauged against the wild type translated from the
# TpoR reference sequence. The call is the cell's last expression, so the
# returned frame is what gets displayed.
sequence_file = './data/raw_data/TpoR_Reference_sequence.txt'
selection_file = './output/selection_coefficients/TpoR.csv.gz'
output_file = './output/gauged_selection_coefficients/TpoR.csv.gz'
replicates = 6

wildtype_full_length(sequence_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,rep_4,rep_5,rep_6,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,rep_4_gauged,rep_5_gauged,rep_6_gauged,joint_gauged
0,2,A,False,0.001220,0.000208,0.000036,0.000280,0.000299,0.000541,0.00268,0.010540,0.005808,0.006356,0.011580,0.011799,0.014441,0.05558
1,2,R,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.009320,0.005600,0.006320,0.011300,0.011500,0.013900,0.05290
2,2,N,False,0.002130,0.000941,0.001120,0.003250,0.003720,0.002880,0.01230,0.011450,0.006541,0.007440,0.014550,0.015220,0.016780,0.06520
3,2,D,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.009320,0.005600,0.006320,0.011300,0.011500,0.013900,0.05290
4,2,C,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.009320,0.005600,0.006320,0.011300,0.011500,0.013900,0.05290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,32,T,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.006000,0.003990,0.004250,0.004030,0.005670,0.004530,0.02760
647,32,W,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.006000,0.003990,0.004250,0.004030,0.005670,0.004530,0.02760
648,32,Y,False,0.000362,0.000302,0.000202,0.000162,0.000376,0.000138,0.00160,0.006362,0.004292,0.004452,0.004192,0.006046,0.004668,0.02920
649,32,V,False,0.000798,0.000473,0.000312,0.000897,0.000617,0.000823,0.00406,0.006798,0.004463,0.004562,0.004927,0.006287,0.005353,0.03166


In [32]:
# TpoR_S505N dataset, 6 replicates.
# NOTE(review): this run is gauged against the same TpoR reference sequence file
# as the plain TpoR run; confirm that is the intended reference for the S505N
# dataset rather than a sequence carrying the S505N substitution.
sequence_file = './data/raw_data/TpoR_Reference_sequence.txt'
selection_file = './output/selection_coefficients/TpoR_S505N.csv.gz'
output_file = './output/gauged_selection_coefficients/TpoR_S505N.csv.gz'
replicates = 6

wildtype_full_length(sequence_file, selection_file, output_file, replicates)

gauge done


Unnamed: 0,site,amino_acid,WT_indicator,rep_1,rep_2,rep_3,rep_4,rep_5,rep_6,joint,rep_1_gauged,rep_2_gauged,rep_3_gauged,rep_4_gauged,rep_5_gauged,rep_6_gauged,joint_gauged
0,2,A,False,-0.000112,-0.000128,0.001430,0.000504,0.000246,0.000175,0.002090,-0.000308,-0.002118,0.004820,0.001934,0.001616,0.001745,0.007280
1,2,R,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000196,-0.001990,0.003390,0.001430,0.001370,0.001570,0.005190
2,2,N,False,0.000251,-0.000101,0.000591,-0.000506,0.000484,0.000531,0.001210,0.000055,-0.002091,0.003981,0.000924,0.001854,0.002101,0.006400
3,2,D,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000196,-0.001990,0.003390,0.001430,0.001370,0.001570,0.005190
4,2,C,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000196,-0.001990,0.003390,0.001430,0.001370,0.001570,0.005190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,32,T,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.002120,-0.000989,0.003790,-0.000605,0.000499,0.002290,0.002690
647,32,W,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.002120,-0.000989,0.003790,-0.000605,0.000499,0.002290,0.002690
648,32,Y,False,-0.000375,0.000089,0.000246,0.000107,0.000136,0.000078,0.000279,-0.002495,-0.000900,0.004036,-0.000498,0.000635,0.002368,0.002969
649,32,V,False,0.000094,-0.000020,-0.000006,-0.000177,0.000198,0.000160,0.000250,-0.002026,-0.001009,0.003784,-0.000782,0.000697,0.002450,0.002940


In [33]:
# Gauge the DBR1 selection coefficients. Unlike the sequence-file based runs,
# the wild-type residue of each site is taken from the 'Affected codon' /
# 'Reference AA' columns of an Excel sheet.
excel_file = './data/raw_data/DBR1.xlsx'
selection_file = './output/selection_coefficients/DBR1.csv.gz'
output_file = './output/gauged_selection_coefficients/DBR1.csv.gz'
replicates = 2

df_selection = pd.read_csv(selection_file)
# Not used below; kept because later cells may read it (outside this view).
site_list = df_selection['site'].unique()

# One (site, amino_acid) row per wild-type residue, flagged with WT_indicator.
df_WT = (
    pd.read_excel(excel_file)[['Affected codon', 'Reference AA']]
    .drop_duplicates(keep='last')
    .dropna()
    .assign(WT_indicator=True)
    .rename(columns={'Affected codon': 'site', 'Reference AA': 'amino_acid'})
)

if 'gauged' not in ' '.join(df_selection.columns.tolist()):
    df_merge = pd.merge(df_WT, df_selection, on=['site','amino_acid'], how='right')
    # Rows without a wild-type match come out of the right merge as NaN.
    # Comparing with True yields a plain bool column without relying on
    # fillna() downcasting an object column.
    df_merge['WT_indicator'] = df_merge['WT_indicator'].eq(True)

    # Must be read before any *_gauged column is appended.
    column_name = df_merge.columns[-(replicates+1):].tolist()
    gauged_name = [name + '_gauged' for name in column_name]

    # One baseline row per site: the first wild-type row, as before.
    wt_baseline = (
        df_merge.loc[df_merge['WT_indicator'], ['site'] + column_name]
        .drop_duplicates(subset='site', keep='first')
        .set_index('site')
    )
    sites_without_wt = df_merge.loc[~df_merge['site'].isin(wt_baseline.index), 'site'].unique()
    if len(sites_without_wt) > 0:
        # Same exception type the former `.values[0]` lookup raised, but it
        # names the offending sites.
        raise IndexError('no wild-type row found for site(s): %s' % sites_without_wt.tolist())

    # Subtract the per-site baseline row by row. Looking the baseline up through
    # the site value keeps the result aligned even when the rows of one site
    # are not stored contiguously.
    for name, gauged in zip(column_name, gauged_name):
        df_merge[gauged] = df_merge[name] - df_merge['site'].map(wt_baseline[name])

    df_merge.to_csv(output_file, compression = 'gzip', index = False)
    print('gauge done')
else:
    print('gauge existed')

gauge done
