### Fitness assay simulations with noise introduced during bottlenecks

The approach is similar to the one used for simulating fitness assays, with one key difference: the bottleneck size and sequencing depth are defined for a "typical" neutral mutation. For example, if the bottleneck size is 50, the sequencing depth is 100, and the number of cells after the bottleneck is $X$, then the number of reads is drawn from a Poisson distribution with mean $2X$ (sequencing depth / bottleneck size = 100/50 = 2).

In [1]:
#importing libraries
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from Bio.SeqIO.FastaIO import SimpleFastaParser
import re
import pandas as pd
import seaborn as sns
import pathlib
import os

In [2]:
#seaborn default theme for all figures, with element sizes scaled for the 'paper' context
sns.set_theme(context='paper')

In [3]:
#directory the notebook is being run from, kept as a plain string
cwd = str(pathlib.Path.cwd())
print(cwd)

/Users/anuraglimdi/github/2022_Limdi_limits-pooled-fitness-assays/AnalysisNotebooks


In [4]:
#step one level up from the working directory: this should be the base directory of the github
#repository (the exact path differs for each user); the metadata and the fitness trajectory
#data are located relative to it
path = pathlib.Path(cwd)
repo = str(path.parents[0])
print(repo)

/Users/anuraglimdi/github/2022_Limdi_limits-pooled-fitness-assays


In [5]:
#folders holding the metadata and the mutant trajectories, both inside the repository
metadata_path = f'{repo}/Metadata/'
data_path = f'{repo}/ProcessedData/Mutant_Trajectories/'

In [6]:
#identifiers of the libraries being analysed
libraries = ["REL606"]
#matching labels that are easier to interpret, used for the figures in the paper
libraries2 = ["Anc"]

In [7]:
#tab-separated table with all the metadata for REL606
all_data = pd.read_csv(metadata_path+"all_metadata_REL606.txt", sep="\t")

#columns are selected by position (0, 2, 3, 4, 5, 6); the variable names say what each one is
#expected to hold -- NOTE(review): positions are not checked against the file header
names, k12_tags, gene_start, gene_end, strand, uniprot_rel606 = (
    all_data.iloc[:, col] for col in (0, 2, 3, 4, 5, 6)
)

#one row per entry of the table: [gene_start, gene_end, strand]
locations = np.vstack([gene_start, gene_end, strand]).T

#genomic coordinates of pseudogenes
locations_pseudogenes = np.loadtxt(metadata_path+'pseudogenes_locations_REL606.txt')

In [8]:
#fractions of each gene, at the 5' and 3' ends, that are left out of the analysis because
#insertions there may not actually be disruptive to protein function
frac5p = 0.1
frac3p = 0.25

#reading the REL606 reference genome and recording the start position of every 'TA' in it
#(if the fasta file held more than one record, only the last record's positions would be kept)
with open(metadata_path+"rel606_reference.fasta") as in_handle:
    for title, seq in SimpleFastaParser(in_handle):
        ta_sites = [hit.start() for hit in re.finditer('TA', seq)]
ta_sites = np.array(ta_sites)

#number of TA sites in the interior of each gene; entries whose strand is neither 1 nor -1 stay at 0
ta_gene = np.zeros(len(names))
for i in range(len(names)):
    start, end, orientation = locations[i]
    length = end - start
    #the two edges of the interior region, once the excluded fraction is trimmed from each end
    edge5p = start + length*frac5p
    edge3p = end - length*frac3p
    if orientation == 1:
        #forward strand: count the sites lying strictly between the 5' edge and the 3' edge
        ta_gene[i] = np.sum((ta_sites > edge5p) & (ta_sites < edge3p))
    elif orientation == -1:
        #reverse strand: same edges with the inequalities flipped, so sites are only counted
        #when start > end
        #NOTE(review): presumably the metadata lists reverse-strand genes with start > end -- confirm
        ta_gene[i] = np.sum((ta_sites < edge5p) & (ta_sites > edge3p))

In [9]:
#all the information from the fitness assay condensed into a couple of matrices, one for the
#'green' file and one for the 'red' file

#loading the files that hold the counts for each TA site for all time points; os.path.join is
#used so that the trailing separator already present in data_path is not doubled
gname = os.path.join(data_path, 'green_'+libraries[0]+'_merged_all_TAsites.txt')
greendata = np.loadtxt(gname)
rname = os.path.join(data_path, 'red_'+libraries[0]+'_merged_all_TAsites.txt')
reddata = np.loadtxt(rname)

#now for extracting the UMI corrected counts: every second row, from row 2 up to row 10
#(five rows when the file has at least 11 rows; the row count of the file is not checked here)
counts_all_green = greendata[2:11:2,:]
counts_all_red = reddata[2:11:2,:]