### Here, I'm finding all the TA sites represented in the t-1 data and finding the corresponding counts at that site for the other time points. I'll save the merged data as a numpy array

In [1]:
import numpy as np
import time
import os
import re
from Bio.SeqIO.FastaIO import SimpleFastaParser

In [2]:
#input directory with all the counts data
directory = #'/Volumes/GoogleDrive/My Drive/novaseq_data/counts_r2_UMI/ara_minus'
##YOUR INPUT DIRECTORY HERE

In [3]:
#output directory for combined data
outpath = #'/Volumes/GoogleDrive/My Drive/novaseq_data/merged_trajectory_r2_UMI'
##YOUR OUTPUT DIRECTORY HERE

In [4]:
#search query for sample name
search = r'_S(.*)_merged'
#create a dictionary: sample names are keys, path to textfiles with counts data are values
dict_files = {}
for filename in os.listdir(directory):
    sample = int(re.search(search,filename).group(1))
    filepath = os.path.join(directory,filename)
    dict_files[sample] = filepath

In [5]:
libraries = ['REL606', 'REL607', 'REL11330', 'REL11333', 'REL11364', 'REL11336', 'REL11339', 'REL11389', 'REL11392', 'REL11342', 'REL11345', 'REL11348', 'REL11367', 'REL11370', 'methods_old', 'methods_new']

## Now I'm going to count how many reads are present at every TA site (note that there are 211995 ta sites in the REl606 reference genome). This should make it much easier to combine all the data into one single array

In [7]:
with open("/Users/anuraglimdi/Desktop/TnSeq_LTEE/ReferenceGenome/rel606_reference.fasta") as in_handle:
    for title, seq in SimpleFastaParser(in_handle):
        ta_sites = [m.start(0) for m in re.finditer('TA', seq)]
ta_sites = np.array(ta_sites)

In [8]:
def merge_all_ta_sites(keys,population):
    #ensure that there are enough keys
    assert len(keys)==9, "Input keys array should be of length 9"
    #loading the data
    tm1 = np.loadtxt(dict_files[keys[0]])
    t0_red = np.loadtxt(dict_files[keys[1]])
    t0_green = np.loadtxt(dict_files[keys[2]])
    t1_red = np.loadtxt(dict_files[keys[3]])
    t1_green = np.loadtxt(dict_files[keys[4]])
    t2_red = np.loadtxt(dict_files[keys[5]])
    t2_green = np.loadtxt(dict_files[keys[6]])
    t3_red = np.loadtxt(dict_files[keys[7]])
    t3_green = np.loadtxt(dict_files[keys[8]])
    #making empty arrays for storing counts
    #each row corresponds to a timepoint
    counts_site_green = np.zeros([11,len(ta_sites)])
    counts_site_red = np.zeros([11,len(ta_sites)])
    #the first row is simply the list of TA sites
    counts_site_green[0,:] = ta_sites
    counts_site_red[0,:] = ta_sites
    #one row for the list of ta_sites
    #10 rows, one for each time point, with UMI uncorrected and corrected counts
     #first the green replicate
    t0 = time.time()
    for i in range(0, len(ta_sites)):
        query = ta_sites[i]
        #if the site is present in t0_green data
        if query in tm1[0,:]:
            counts_site_green[1,i] = tm1[1,np.where(tm1[0,:]==query)]
            counts_site_green[2,i] = tm1[2,np.where(tm1[0,:]==query)]
       #if the site is present in t0_green data
        if query in t0_green[0,:]:
            #extract the corresponding counts
            counts_site_green[3,i] = t0_green[1,np.where(t0_green[0,:]==query)]
            counts_site_green[4,i] = t0_green[2,np.where(t0_green[0,:]==query)]
        if query in t1_green[0,:]:
            counts_site_green[5,i] = t1_green[1,np.where(t1_green[0,:]==query)]
            counts_site_green[6,i] = t1_green[2,np.where(t1_green[0,:]==query)]
        if query in t2_green[0,:]:
            counts_site_green[7,i] = t2_green[1,np.where(t2_green[0,:]==query)]
            counts_site_green[8,i] = t2_green[2,np.where(t2_green[0,:]==query)]
        if query in t3_green[0,:]:
            counts_site_green[9,i] = t3_green[1,np.where(t3_green[0,:]==query)]
            counts_site_green[10,i] = t3_green[2,np.where(t3_green[0,:]==query)]
    t1 = time.time()
    print(t1-t0)
    
    t0 = time.time()
    for i in range(0, len(ta_sites)):
        query = ta_sites[i]
        #if the site is present in t0_red data
        if query in tm1[0,:]:
            counts_site_red[1,i] = tm1[1,np.where(tm1[0,:]==query)]
            counts_site_red[2,i] = tm1[2,np.where(tm1[0,:]==query)]
       #if the site is present in t0_red data
        if query in t0_red[0,:]:
            #extract the corresponding counts
            counts_site_red[3,i] = t0_red[1,np.where(t0_red[0,:]==query)]
            counts_site_red[4,i] = t0_red[2,np.where(t0_red[0,:]==query)]
        if query in t1_red[0,:]:
            counts_site_red[5,i] = t1_red[1,np.where(t1_red[0,:]==query)]
            counts_site_red[6,i] = t1_red[2,np.where(t1_red[0,:]==query)]
        if query in t2_red[0,:]:
            counts_site_red[7,i] = t2_red[1,np.where(t2_red[0,:]==query)]
            counts_site_red[8,i] = t2_red[2,np.where(t2_red[0,:]==query)]
        if query in t3_red[0,:]:
            counts_site_red[9,i] = t3_red[1,np.where(t3_red[0,:]==query)]
            counts_site_red[10,i] = t3_red[2,np.where(t3_red[0,:]==query)]
    t1 = time.time()
    print(t1-t0)

    gname = outpath+'/green_'+population+'_merged_all_TAsites.txt'
    rname = outpath+'/red_'+population+'_merged_all_TAsites.txt'


    
    np.savetxt(gname, counts_site_green, comments=f'#{population}, col 0: TA sites, col 1,2: t-1 counts, uncorrected and UMI corrected resp., col 3,4: t0green counts,uncorrected and UMI corrected resp. col 5,6: t1green counts, uncorrected and UMI corrected resp. col 7,8: t2green counts, uncorrected and UMI corrected resp., col 9,10: t3green counts, uncorrected and UMI corrected resp.')
    np.savetxt(rname, counts_site_red, comments=f'#{population}, col 0: TA sites, col 1,2: t-1 counts, uncorrected and UMI corrected resp., col 3,4: t0red counts,uncorrected and UMI corrected resp. col 5,6: t1red counts, uncorrected and UMI corrected resp. col 7,8: t2red counts, uncorrected and UMI corrected resp., col 9,10: t3red counts, uncorrected and UMI corrected resp.')

    return None

In [9]:
#arranging the keys in an array
keys = np.linspace(1,144,144)
keys = np.reshape(keys, (9,16))
#each column in the reshaped array corresponds to a library
#each row corresponds to a time point

In [10]:
anc_methods = [0,1,14,15]
ara_plus = [8,9,10,11,12,13]
ara_minus = [2,3,4,5,6,7]

In [16]:
for i in ara_minus:
    print(libraries[i])
    merge_all_ta_sites(keys[:,i], libraries[i])

REL11330
190.98461484909058
169.85977816581726
REL11333
140.4428768157959
139.73922204971313
REL11364
163.18904089927673
159.96757793426514
REL11336
163.61236214637756
163.11022305488586
REL11339
183.90242195129395
174.74777221679688
REL11389
161.64354991912842
148.84663200378418


In [None]:
for i in ara_plus:
    print(libraries[i])
    merge_all_ta_sites(keys[:,i], libraries[i])

In [None]:
for i in anc_methods:
    print(libraries[i])
    merge_all_ta_sites(keys[:,i], libraries[i])