In [17]:
import numpy as np
import pandas as pd
import scipy.stats as st
AA = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

In [114]:
df_freq = pd.read_csv('./data/raw_data/H1_HumanSwine_alignment_frequencies.txt', sep="\t")
df_freq = df_freq.drop(['WT_AA'], axis=1)
df_freq.columns = df_freq.columns.tolist()[:2]+[i[-1] for i in df_freq.columns[2:]]
df_sele = pd.read_csv('./output/selection_coefficients/WSN.csv.gz')
df_sele = df_sele[df_sele['amino_acid']!='*']
df_pref = pd.read_csv('./output/merged_preference/WSN.csv.gz')
df_sele['site'] = df_sele['site'].astype('int')
df_pref['site'] = df_pref['site'].astype('int')
sites = set(np.array(df_pref['site']))

df_pref1 = pd.read_csv('./data/prefs/WSN-1_prefs.txt', sep="\t")
df_pref1 = df_pref1.drop(['WT_AA', 'PI_*'], axis = 1)
df_pref2 = pd.read_csv('./data/prefs/WSN-2_prefs.txt', sep="\t")
df_pref2 = df_pref2.drop(['WT_AA', 'PI_*'], axis = 1)
df_pref3 = pd.read_csv('./data/prefs/WSN-3_prefs.txt', sep="\t")
df_pref3 = df_pref3.drop(['WT_AA', 'PI_*'], axis = 1)
df_entropy  = pd.concat([df_pref1, df_pref2, df_pref3]).groupby(level=0).mean()[['#SITE', "SITE_ENTROPY"]]

ranks_pref = []
z_pref     = []
ranks_sele = []
z_sele     = []
entropy    = []

for s in sites:

    df_f_site = df_freq[df_freq['#SITE']==s]
    f_site = [df_f_site[aa] for aa in AA]
    top = np.argmax(f_site)

    df_p_site = df_pref[df_pref['site']==s]
    df_p_site = df_p_site.sort_values('amino_acid')
    pref_site = df_p_site['average']
    site_ranks = st.rankdata(pref_site) 
    ranks_pref.append(site_ranks[top])

    m_site   = np.mean(pref_site)
    std_site = np.std(pref_site)
#     print(pref_site)
    z_pref.append((pref_site.tolist()[top]-m_site)/std_site)

    df_s_site  = df_sele[df_sele['site']==s]
    df_s_site = df_s_site.sort_values('amino_acid')
    sele_site  = df_s_site['joint']
    site_ranks = st.rankdata(sele_site)
    ranks_sele.append(site_ranks[top])

    m_site   = np.mean(sele_site)
    std_site = np.std(sele_site)
    z_sele.append((sele_site.tolist()[top]-m_site)/std_site)
    
    entropy.append(df_entropy[df_entropy['#SITE']==s]['SITE_ENTROPY'])

print('rank of WT AA (larger is better)')
print('method\tmean\tstd')
print('pref\t%.1f\t%.1f' % (np.mean(ranks_pref), np.std(ranks_pref)))
print('s\t%.1f\t%.1f\n'  % (np.mean(ranks_sele), np.std(ranks_sele)))

ranks_p_norm = np.array(ranks_pref)/np.array(entropy)
ranks_s_norm = np.array(ranks_sele)/np.array(entropy)
print('rank of WT AA, normalized by site entropy (larger is better)')
print('method\tmean\tstd')
print('pref\t%.1f\t%.1f' % (np.mean(ranks_p_norm), np.std(ranks_p_norm)))
print('s\t%.1f\t%.1f\n'  % (np.mean(ranks_s_norm), np.std(ranks_s_norm)))

print('worst rank of WT AA (larger is better)')
print('method\tstd\tnorm')
print('pref\t%.1f\t%.1f' % (np.min(ranks_pref), np.min(ranks_p_norm)))
print('s\t%.1f\t%.1f\n'  % (np.min(ranks_sele), np.min(ranks_s_norm)))

print('z score of WT AA (larger is better)')
print('method\tmean\tstd')
print('pref\t%.1f\t%.1f' % (np.mean(z_pref), np.std(z_pref)))
print('s\t%.1f\t%.1f\n'  % (np.mean(z_sele), np.std(z_sele)))

rank of WT AA (larger is better)
method	mean	std
pref	18.3	3.2
s	18.9	3.7

rank of WT AA, normalized by site entropy (larger is better)
method	mean	std
pref	6.3	1.5
s	6.5	1.7

worst rank of WT AA (larger is better)
method	std	norm
pref	1.0	0.3
s	1.0	0.3

z score of WT AA (larger is better)
method	mean	std
pref	2.6	1.5
s	3.5	1.5

