<a id='contents'></a>

# Figures and analysis

This notebook contains scripts for producing the main figures and results accompanying the manuscript. Here we perform basic organization and processing of the data, which are then passed to functions in `figures.py` and `mplot.py` (the latter is available at this [GitHub repository](https://github.com/johnbarton/mplot)) for detailed formatting. The figures produced are stored as PDFs in the `/figures` folder.

## Contents

- [Overview and table of contents](#contents)
- [Loading libraries and global variables](#global)
- [Figures and data analysis](#figures)  

<a id='global'></a>

## Libraries and variables

In [1]:
# Full library list and version numbers
#
# Imports the libraries used by the notebook and prints the version of the
# interpreter and of each major third-party package, so the environment that
# produced the stored outputs is recorded alongside them.

print('This notebook was prepared using:')

# Standard library
import sys, os
from copy import deepcopy
from importlib import reload
print('python version %s' % sys.version)

# Numerical stack
import numpy as np
print('numpy version %s' % np.__version__)

import scipy as sp
import scipy.stats as st
print('scipy version %s' % sp.__version__)

import pandas as pd
print('pandas version %s' % pd.__version__)

# Plotting
import matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plot
import matplotlib.gridspec as gridspec
import matplotlib.image as mpimg
print('matplotlib version %s' % matplotlib.__version__)

# Project-local modules (figures.py and mplot.py); no version is printed for these
import figures as fig
import mplot as mp

This notebook was prepared using:
python version 3.11.10 | packaged by conda-forge | (main, Oct 16 2024, 01:26:25) [Clang 17.0.6 ]
numpy version 1.26.4
scipy version 1.14.1
pandas version 2.2.2
matplotlib version 3.9.2


In [2]:
from importlib import reload

# Reload the local plotting modules before drawing, so the current on-disk
# versions of mplot and figures are the ones used.
for module in (mp, fig):
    reload(module)

# CH505 / CH848 circle figure
fig.plot_figure_ch505_ch848_circle()

0.0891913699999999 -0.03781679
0.12553002 -0.03616553
CH505-CH848 done.


In [3]:
from importlib import reload

# Reload the local plotting modules before drawing.
for module in (mp, fig):
    reload(module)

# CH505 structure figure; the file name is forwarded as the `filename` keyword.
structure_file = 'fig-ch505-structure.png'
fig.plot_figure_ch505_structure(filename=structure_file)

(-0.5, 715.5) (650.5, -0.5)
CH505-structure done.


In [4]:
from importlib import reload

# Reload the local plotting modules before drawing.
for module in (mp, fig):
    reload(module)

# CH848 structure figure; the file name is passed positionally, as before.
structure_file = 'fig-ch848-structure.png'
fig.plot_figure_ch848_structure(structure_file)

(-0.5, 770.5) (705.5, -0.5)
CH848-structure done.


In [5]:
# Rank the common mutations of each table in two ways -- by the 'mean_S' column
# and by the 'RMs' column -- and print the two rankings side by side.
# The 'm ct' / 'j ct' columns are running counts of how many of the mutations
# listed so far (in the respective ranking) belong to `top_muts`.
df_ch505 = pd.read_csv('data/processed/selection_RMs_common_mutations_SHIVCH505.csv')
df_ch848 = pd.read_csv('data/processed/selection_RMs_common_mutations_SHIVCH848.csv')

top_x = 20  # maximum number of ranks to print per table
top_muts = ['N130D', 'N279D', 'K302N', 'Y330H', 'N334S', 'H417R']

for df in [df_ch505, df_ch848]:
    # Row positions ordered from the largest to the smallest value of each column.
    rank_mean  = np.argsort(np.array(df['mean_S']))[::-1]
    rank_joint = np.argsort(np.array(df['RMs']))[::-1]

    # Never index past the end of a table that has fewer than `top_x` rows.
    n_ranks = min(top_x, len(df))

    m_count = 0
    j_count = 0
    print('')
    print('rank\tm mut\tm s\tm ct\tj mut\tj s\tj ct')
    for i in range(n_ranks):
        m_mut = df.iloc[rank_mean[i]]
        j_mut = df.iloc[rank_joint[i]]
        if str(m_mut.mutation) in top_muts:
            m_count += 1
        if str(j_mut.mutation) in top_muts:
            j_count += 1
        print('%d\t%s\t%.3f\t%d\t%s\t%.3f\t%d' % (i+1, m_mut.mutation, m_mut.mean_S, m_count, j_mut.mutation, j_mut.RMs, j_count))


rank	m mut	m s	m ct	j mut	j s	j ct
1	N334S	0.032	1	N334S	0.024	1
2	Y330H	0.026	2	Y330H	0.020	2
3	K302N	0.022	3	N279D	0.017	3
4	N279D	0.019	4	K462E	0.014	3
5	H356N	0.017	4	H356N	0.012	3
6	H417R	0.017	5	K302N	0.012	4
7	K462E	0.016	5	G153E	0.011	4
8	T234N	0.016	5	-147N	0.011	4
9	G153E	0.013	5	N130D	0.010	5
10	N130D	0.011	6	T234N	0.009	5
11	N474D	0.010	6	H417R	0.009	6
12	N276D	0.010	6	V518M	0.008	6
13	N230S	0.009	6	S137N	0.008	6
14	H356Y	0.009	6	I150S	0.008	6
15	-147N	0.008	6	N230S	0.007	6
16	V518M	0.008	6	T132A	0.007	6
17	S137N	0.008	6	E507G	0.006	6
18	T132A	0.007	6	H356Y	0.006	6
19	N464D	0.007	6	T132I	0.006	6
20	T408A	0.007	6	N279-	0.006	6

rank	m mut	m s	m ct	j mut	j s	j ct
1	P179L	0.020	0	P179L	0.020	0
2	N413T	0.016	0	N413T	0.017	0
3	N462S	0.016	0	N413S	0.014	0
4	R336E	0.015	0	D137S	0.013	0
5	N413S	0.015	0	N462T	0.013	0
6	T459aI	0.014	0	S407N	0.012	0
7	N462D	0.014	0	R336E	0.012	0
8	T459a-	0.013	0	P179S	0.012	0
9	S407N	0.013	0	V142fA	0.012	0
10	R336K	0.012	0	N462D	0.012	0
11	P179S	0.01

In [6]:
# For each enrichment table, print the rows whose 'fraction' equals `test_frac`
# followed by the sum of their 'num_cutoff' column.
test_frac = 0.04

df_ch505    = pd.read_csv('data/processed/enrichment_CH505_multiply_fraction.csv')
df_ch505_rm = pd.read_csv('data/processed/enrichment_grouped_SHIV_CH505_multiply_fraction.csv')
df_ch848    = pd.read_csv('data/processed/enrichment_CH848_multiply_fraction.csv')
df_ch848_rm = pd.read_csv('data/processed/enrichment_grouped_SHIV_CH848_multiply_fraction.csv')

# Printed label -> table, in the order the tables are reported.
enrichment_tables = {
    'CH505':      df_ch505,
    'SHIV.CH505': df_ch505_rm,
    'CH848':      df_ch848,
    'SHIV.CH848': df_ch848_rm,
}

enrichment_subsets = {}
for label, table in enrichment_tables.items():
    # Convert the stored log10 p-value back to a plain p-value.
    table['P'] = 10**table['log10_P']
    # Tolerant comparison: exact `==` on floats silently selects nothing if the
    # stored fraction differs from `test_frac` by rounding error.
    at_test_frac = np.isclose(table['fraction'].to_numpy(), test_frac)
    enrichment_subsets[label] = table[at_test_frac]

# Keep the original per-table names available to any later cell.
df_ch505_sub, df_ch505_rm_sub, df_ch848_sub, df_ch848_rm_sub = enrichment_subsets.values()

for i, (label, subset) in enumerate(enrichment_subsets.items()):
    # Every label after the first is preceded by a blank line.
    print(label if i == 0 else '\n' + label)
    print(subset)
    print(np.sum(subset['num_cutoff']))

CH505
    categories  fraction  num_cutoff  enrichment   log10_P             P
15          LD      0.04           3    8.001882 -1.696659  2.010669e-02
34       CD4BS      0.04           3    1.795544 -0.631573  2.335754e-01
53          V1      0.04           6    8.179702 -3.006772  9.845274e-04
72          V2      0.04           2    1.194117 -0.588050  2.581963e-01
91          V3      0.04           7    4.921884 -2.231098  5.873564e-03
110         V4      0.04          10    7.112784 -4.206573  6.214791e-05
129         V5      0.04           4   10.669177 -2.548404  2.828756e-03
148  Reversion      0.04          11   12.142706 -6.843870  1.432615e-07
167       Hole      0.04           3    1.047609 -0.780847  1.656353e-01
186     Shield      0.04           6   30.972796 -6.240632  5.746030e-07
205      Shift      0.04           2   79.152700 -3.154002  7.014525e-04
224        PNG      0.04           7    2.308620 -0.981163  1.044327e-01
64

SHIV.CH505
    categories  fraction  num_

In [7]:
# For each table, list the mutations with the largest average 'selection' value,
# together with the number of distinct 'individual' entries each mutation
# appears in (printed under '# RMs').
df_ch505 = pd.read_csv('data/processed/selection_RMs_SHIVCH505.csv')
df_ch848 = pd.read_csv('data/processed/selection_RMs_SHIVCH848.csv')

top_n = 20  # maximum number of mutations to list per table

for df in [df_ch505, df_ch848]:
    # One pass over the table instead of one boolean mask per unique mutation.
    # groupby sorts the keys, so rows are ordered by mutation name.
    per_mutation = df.groupby('mutation').agg(
        n_rms=('individual', 'nunique'),
        s_avg=('selection', 'mean'),
    )

    # Positions ordered from the largest to the smallest average; slicing
    # (rather than indexing up to top_n) copes with fewer than top_n mutations.
    s_rank = np.argsort(per_mutation['s_avg'].to_numpy())[::-1]
    top_mutations = per_mutation.iloc[s_rank[:top_n]]

    print('mut\t# RMs\tavg s')
    for row in top_mutations.itertuples():
        print('%s\t%d\t%.2f' % (row.Index, row.n_rms, row.s_avg))
    print('')

mut	# RMs	avg s
N334S	7	0.03
P8T	1	0.03
Y330H	6	0.03
T618N	1	0.02
K302N	4	0.02
T406I	1	0.02
N279D	5	0.02
H417R	7	0.02
T234N	4	0.02
N404D	2	0.01
T406D	1	0.01
P8S	2	0.01
N230K	1	0.01
N334-	1	0.01
R166K	2	0.01
N130D	5	0.01
T406A	1	0.01
N404S	1	0.01
N276D	4	0.01
T676A	3	0.01

mut	# RMs	avg s
N142gS	2	0.02
N413I	1	0.02
N462P	1	0.02
K340-	1	0.02
N413H	2	0.02
N413T	5	0.02
N462S	5	0.02
N413S	6	0.01
N413K	2	0.01
S334G	1	0.01
N462D	5	0.01
N142gT	2	0.01
I150T	1	0.01
Y405N	3	0.01
T144N	1	0.01
D137N	6	0.01
N142gV	1	0.01
N462T	4	0.01
N448S	3	0.01
S461R	1	0.01



In [8]:
from importlib import reload

# Reload the local plotting modules before drawing.
for module in (mp, fig):
    reload(module)

# Keyword arguments for the plot: one file and one tag per data set, in matching order.
pdata = dict(
    s_files=[
        'data/processed/selection_vs_num-of-RMs_CH505.csv',
        'data/processed/selection_vs_num-of-RMs_CH848.csv',
    ],
    tags=['CH505', 'CH848'],
)

fig.plot_selection_vs_rms(**pdata)

In [9]:
from importlib import reload

# Reload the local plotting modules before drawing.
for module in (mp, fig):
    reload(module)

# Keyword arguments for the plot: selection files, trajectory files and tags,
# listed in the same data-set order.
pdata = dict(
    s_files=[
        'data/processed/fig3A_selection_vs_time.csv',
        'data/processed/figS4A_selection_vs_time.csv',
    ],
    traj_files=[
        'data/processed/fig3C_trajectory.csv',
        'data/processed/figS4C_trajectory.csv',
    ],
    tags=['CH505', 'CH848'],
)

fig.plot_trajectory_selection(**pdata)

In [10]:
from importlib import reload

# Reload the local plotting modules before drawing.
for module in (mp, fig):
    reload(module)

# Keyword arguments for the plot: one trajectory file and one tag per data set.
pdata = dict(
    traj_files=[
        'data/processed/fig3C_trajectory.csv',
        'data/processed/figS4C_trajectory.csv',
    ],
    tags=['CH505', 'CH848'],
)

fig.plot_trajectory_expanded(**pdata)

In [11]:
from importlib import reload

# Reload the local plotting modules before drawing.
for module in (mp, fig):
    reload(module)

# Keyword arguments for the plot; every list has one entry per data set,
# in the same order.
pdata = dict(
    s_files=[
        'data/processed/selection_RMs_SHIVCH505.csv',
        'data/processed/selection_RMs_SHIVCH848.csv',
    ],
    traj_files=[
        'data/processed/trajectories_RMs_SHIVCH505.csv',
        'data/processed/trajectories_RMs_SHIVCH848.csv',
    ],
    tags=['SHIV.CH505', 'SHIV.CH848'],
    rms=['RM5695', 'RM6167'],
)

fig.plot_trajectory_selection_shiv(**pdata)

In [12]:
from importlib import reload

# Reload the local plotting modules before drawing.
for module in (mp, fig):
    reload(module)

# Keyword arguments for the plot.
# NOTE(review): the t_breadth values are written as N*7 -- presumably a number
# of weeks converted to days; confirm against plot_fitness_gain_v_time.
pdata = dict(
    f_files=[
        'data/processed/fig6A_fitness_change.csv',
        'data/processed/figS6A_fitness_change.csv',
    ],
    tags=['SHIV.CH505', 'SHIV.CH848'],
    t_breadth={
        'RM5695': 16*7,
        'RM6070': 8*7,
        'RM6163': 80*7,
        'RM6167': 64*7,
    },
    use_breadth=True,
)

fig.plot_fitness_gain_v_time(**pdata)

In [13]:
from importlib import reload

# Reload the local plotting modules before drawing.
for module in (mp, fig):
    reload(module)

# Keyword arguments for the plot. No 't_breadth' entry is passed here; the
# disabled mapping is kept below for reference.
pdata = dict(
    f_files=[
        'data/processed/fitness_change_w_categories_CH505.csv',
        'data/processed/fitness_change_w_categories_CH848.csv',
    ],
    tags=['SHIV.CH505', 'SHIV.CH848'],
    # t_breadth=dict(RM5695=16*7, RM6070=8*7, RM6163=80*7, RM6167=64*7),
    use_breadth=True,
)

fig.plot_fitness_gain_v_time_categories(**pdata)

In [14]:
from importlib import reload

# Reload the local plotting modules before drawing.
for module in (mp, fig):
    reload(module)

# Keyword arguments for the plot: one fitness-comparison file and one tag per data set.
pdata = dict(
    f_files=[
        'data/processed/fig4A_fitness_comparison.csv',
        'data/processed/fig4B_fitness_comparison.csv',
    ],
    tags=['CH505', 'CH848'],
)

fig.plot_fitness_comparison(**pdata)

0.8060000000000002
CH505 PearsonRResult(statistic=0.9613279468407108, pvalue=0.0)
CH848 PearsonRResult(statistic=0.9549845148431755, pvalue=0.0)


In [15]:
from importlib import reload

# Reload the local plotting modules before drawing.
for module in (mp, fig):
    reload(module)

# Keyword arguments for the horizontal layout of the fitness comparison:
# one file and one tag per data set.
pdata = dict(
    f_files=[
        'data/processed/fig4A_fitness_comparison.csv',
        'data/processed/fig4B_fitness_comparison.csv',
    ],
    tags=['CH505', 'CH848'],
)

fig.plot_fitness_comparison_horizontal(**pdata)

0.3285714285714286
CH505 PearsonRResult(statistic=0.9613279468407108, pvalue=0.0)
CH848 PearsonRResult(statistic=0.9549845148431755, pvalue=0.0)


In [16]:
from importlib import reload

# Reload the local plotting modules before drawing.
for module in (mp, fig):
    reload(module)

# Keyword arguments for the horizontal fitness comparison on the
# shuffled-sequence files, with an explicit output file name.
pdata = dict(
    f_files=[
        'data/processed/figS3A_fitness_comparison_using_shuffled_seq.csv',
        'data/processed/figS3B_fitness_comparison_using_shuffled_seq.csv',
    ],
    tags=['CH505', 'CH848'],
    filename='fig-f-compare-h-shuffle-wide',
    use_breadth=False,
)

fig.plot_fitness_comparison_horizontal(**pdata)

0.3285714285714286
CH505 PearsonRResult(statistic=0.8419526537177966, pvalue=0.0)
CH848 PearsonRResult(statistic=0.7438102116203521, pvalue=8.399517373586148e-287)
