In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import re 

In [2]:
# change directory to folder with output files by sample, so the relative
# "./data" and "./experiments" paths used in later cells resolve from here
# NOTE(review): absolute, machine-specific path -- edit it to match your own machine before running
os.chdir('/Users/andysposato/Desktop/jenna_sperm/SpermAnalysis_Muller1')

In [3]:
# Read the per-sample metadata table: the first column becomes the row index
# and 'replicate' is stored as a categorical column.
samples_df = pd.read_csv(
    "./data/samples-info.csv",
    sep=",",
    index_col=0,
    dtype={'replicate': 'category'},
)
# Andy commented out the next line so this code will read your control samples. To look at edited fish only, uncomment the next line:
#samples_df = samples_df[samples_df['condition']=='edited']
print(samples_df)

                fish  age replicate condition
sample_id                                    
fish1_s3_rep1      1    7         1    edited
fish1_s3_rep2      1    7         2    edited
fish1_s3_rep3      1    7         3    edited
fish1_s4_rep1      1    8         1    edited
fish1_s4_rep2      1    8         2    edited
...              ...  ...       ...       ...
fish10_s9_rep2    10   13         2    edited
fish10_s9_rep3    10   13         3    edited
fish10_s10_rep1   10   14         1    edited
fish10_s10_rep2   10   14         2    edited
fish10_s10_rep3   10   14         3    edited

[93 rows x 4 columns]


In [4]:
# reorder dataframe rows by fish, then by age within each fish
# sort_values returns a new, sorted frame that is bound back to samples_df
# instead of mutating the existing object in place (inplace=True)
samples_df = samples_df.sort_values(by=['fish', 'age'])
# the sorted row labels (sample IDs) drive the order in which samples are loaded below
ind_sorted = samples_df.index
print("After reordering for clustering: ")
print(samples_df)
print("This is the new index: ")
print(ind_sorted)

After reordering for clustering: 
                     fish  age replicate condition
sample_id                                         
fish1_s3_rep1           1    7         1    edited
fish1_s3_rep2           1    7         2    edited
fish1_s3_rep3           1    7         3    edited
fish1_s4_rep1           1    8         1    edited
fish1_s4_rep2           1    8         2    edited
...                   ...  ...       ...       ...
fishctrl3_s9_rep2   ctrl3   13         2   control
fishctrl3_s9_rep3   ctrl3   13         3   control
fishctrl3_s10_rep1  ctrl3   14         1   control
fishctrl3_s10_rep2  ctrl3   14         2   control
fishctrl3_s10_rep3  ctrl3   14         3   control

[93 rows x 4 columns]
This is the new index: 
Index(['fish1_s3_rep1', 'fish1_s3_rep2', 'fish1_s3_rep3', 'fish1_s4_rep1',
       'fish1_s4_rep2', 'fish1_s4_rep3', 'fish1_s5_rep1', 'fish1_s5_rep2',
       'fish1_s5_rep3', 'fish1_s6_rep1', 'fish1_s6_rep2', 'fish1_s6_rep3',
       'fish1_s7_rep1', 'fish1_

In [5]:
# plain Python list of the sorted sample IDs (same order as ind_sorted)
sampind_list = list(ind_sorted)

In [6]:
# show the ordered sample IDs as a plain list
print(sampind_list)

['fish1_s3_rep1', 'fish1_s3_rep2', 'fish1_s3_rep3', 'fish1_s4_rep1', 'fish1_s4_rep2', 'fish1_s4_rep3', 'fish1_s5_rep1', 'fish1_s5_rep2', 'fish1_s5_rep3', 'fish1_s6_rep1', 'fish1_s6_rep2', 'fish1_s6_rep3', 'fish1_s7_rep1', 'fish1_s7_rep2', 'fish1_s7_rep3', 'fish1_s8_rep1', 'fish1_s8_rep2', 'fish1_s8_rep3', 'fish1_s9_rep1', 'fish1_s9_rep2', 'fish1_s9_rep3', 'fish1_s10_rep1', 'fish1_s10_rep2', 'fish1_s10_rep3', 'fish1_s11_rep1', 'fish1_s11_rep2', 'fish1_s11_rep3', 'fish1_s12_rep1', 'fish1_s12_rep2', 'fish1_s12_rep3', 'fish10_s8_rep1', 'fish10_s8_rep2', 'fish10_s8_rep3', 'fish10_s9_rep1', 'fish10_s9_rep2', 'fish10_s9_rep3', 'fish10_s10_rep1', 'fish10_s10_rep2', 'fish10_s10_rep3', 'fish5_s3_rep1', 'fish5_s3_rep2', 'fish5_s3_rep3', 'fish5_s4_rep1', 'fish5_s4_rep2', 'fish5_s4_rep3', 'fish5_s5_rep1', 'fish5_s5_rep2', 'fish5_s5_rep3', 'fish5_s6_rep1', 'fish5_s6_rep2', 'fish5_s6_rep3', 'fish5_s7_rep1', 'fish5_s7_rep2', 'fish5_s7_rep3', 'fish5_s8_rep1', 'fish5_s8_rep2', 'fish5_s8_rep3', 'fish5_s9

In [7]:
# show the current working directory and its contents before moving
print(os.getcwd())
print(os.listdir())
# change directory to folder with fish dictionaries to access no fly barcode list
# NOTE(review): absolute, machine-specific path -- edit it to match your own machine before running
os.chdir('/Users/andysposato/Desktop/jenna_sperm/fish_dictionaries/')
# confirm the move and list the files available in the new directory
print(os.getcwd())
print(os.listdir())

/Users/andysposato/Desktop/jenna_sperm/SpermAnalysis_Muller1
['.Rhistory', '.DS_Store', 'experiments', 'data']
/Users/andysposato/Desktop/jenna_sperm/fish_dictionaries
['fish5_dictionary.tsv', 'fish1_dictionary.tsv', 'fishes_dictionaries.tsv', 'fishctrl3_dictionary.tsv', 'no_fly_barcodes_list.txt', 'fish10_dictionary.tsv']


In [8]:
# Read the "no fly" barcode list (one barcode per line) into no_fly_barcodes.
no_fly_barcodes = []
# 'with' closes the file even if reading fails part-way through
with open("no_fly_barcodes_list.txt", "r") as file:
    for line in file:
        # strip only the line terminator; slicing off the last character would
        # truncate the final barcode when the file does not end with a newline
        barcode = line.rstrip("\r\n")
        # skip blank lines so an empty string never ends up in the list
        if barcode:
            no_fly_barcodes.append(barcode)
print("These barcodes should not be used since they show up in multiple animals:")
for barcode in no_fly_barcodes:
    print(barcode)

These barcodes should not be used since they show up in multiple animals:
1D+38_28D+41_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE
NONE_NONE_NONE_NONE_NONE_81D+170_81D+170_81D+170_81D+170_NONE
3D+35_3D+64_NONE_81D+119_81D+119_81D+119_81D+119_NONE_1I+254+C_NONE
2D+36_81D+61_81D+61_81D+61_81D+61_NONE_NONE_NONE_NONE_NONE
2I+36+GA_NONE_NONE_15D+111_72D+130_72D+130_72D+130_NONE_3D+251_NONE
82D+36_82D+36_82D+36_82D+36_NONE_NONE_NONE_NONE_NONE_NONE
NONE_NONE_NONE_NONE_NONE_87D+167_87D+167_87D+167_87D+167_NONE
1D+37_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE
NONE_2D+62_NONE_NONE_NONE_87D+167_87D+167_87D+167_87D+167_NONE
4D+38_15D+65_NONE_54D+119_54D+119_54D+119_NONE_NONE_NONE_NONE
87D+36_87D+36_87D+36_87D+36_NONE_1D+172_NONE_NONE_NONE_NONE
2I+37+GA_27D+51_NONE_3I+119+CCA_NONE_3D+169_54D+199_54D+199_54D+199_NONE
NONE_NONE_NONE_NONE_1D+146_81D+170_81D+170_81D+170_81D+170_NONE
7D+36_NONE_NONE_NONE_NONE_NONE_61D+195_61D+195_61D+195_NONE
17I+38+GTACATCGAGTGTATGC_28D+65_28D+65_NONE_NONE_NONE_

In [9]:
# change directory back to folder with output files by sample, so the relative
# "./data" and "./experiments" paths in the following cells resolve again
# NOTE(review): absolute, machine-specific path -- edit it to match your own machine before running
os.chdir('/Users/andysposato/Desktop/jenna_sperm/SpermAnalysis_Muller1')

In [49]:
# Load one .filteredReadCounts table per sample (in ind_sorted order), filter it,
# and collect the filtered dataframes in readcounts_df.
# (reading "filtered.allReadCounts" files instead would make the filters below unnecessary)
readcounts_df = []
for jj in ind_sorted:
    counts_path = f"./data/viz-Output/filteredReadCount_files/{jj}/{jj}.filteredReadCounts"
    # only columns 0, 2 and 3 of the tab-separated file are read
    raw_counts = pd.read_csv(counts_path, delimiter="\t", usecols=[0, 2, 3])

    # a barcode is kept only if it passes all three checks:
    keep = (
        (raw_counts['proportion'] > 0.005)              # strictly more than 0.5% of read depth
        & (raw_counts['count'] > 9)                     # at least 10 reads
        & ~raw_counts['event'].isin(no_fly_barcodes)    # not on the multi-animal "no fly" list
    )
    df = raw_counts[keep]

    readcounts_df.append(df)

In [44]:
# check what kind of container holds the per-sample dataframes
type(readcounts_df)

list

In [50]:
# print every filtered per-sample dataframe (long output)
print(readcounts_df)

[                                               event  count  proportion
0  1D+38_28D+41_NONE_NONE_NONE_NONE_NONE_NONE_NON...   2434    0.513827
1  3D+37_76I+66+TGTCGTGCAGTCGACTCCATGACAGCAGATACT...    862    0.181972
2  NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE    279    0.058898
3  12D+34_12D+60_NONE_NONE_3D+143_84D+169_84D+169...    268    0.056576
5  12D+34_12D+60_NONE_NONE_NONE_NONE_NONE_NONE_NO...     40    0.008444
6  1D+38_28D+41_NONE_NONE_NONE_NONE_NONE_NONE_1D+...     30    0.006333
7  3I+38+TAT_8D+57_3D+90_9D+115_8D+143_1D+173_NON...     28    0.005911,                                                event  count  proportion
0  1D+38_28D+41_NONE_NONE_NONE_NONE_NONE_NONE_NON...   5002    0.528585
1  3D+37_76I+66+TGTCGTGCAGTCGACTCCATGACAGCAGATACT...   1062    0.112227
2  12D+34_12D+60_NONE_NONE_3D+143_84D+169_84D+169...    542    0.057276
3  40D+18_NONE_NONE_2I+118+TT_1D+145_NONE_NONE_NO...    314    0.033182
4  NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE    293   

In [46]:
# number of per-sample dataframes loaded (one is appended per entry of ind_sorted)
print(len(readcounts_df))

93


In [65]:
# Reduce each sample's table to a single count column, indexed by barcode ('event')
# and named after the sample, ready to be joined into one big dataframe of all samples.
array_sampdf = []
for ind, df in enumerate(readcounts_df):
    sample_id = ind_sorted[ind]
    # column label: the sample ID with any '-sperm' replaced by '_S'
    shortID = re.sub(r'-sperm', '_S', sample_id)
    sampdf = (
        df[['event', 'count']]
        .set_index('event')
        .rename(columns={'count': shortID})
    )
    array_sampdf.append(sampdf)

In [66]:
# print every single-column per-sample dataframe (long output)
print(array_sampdf)

[                                                    fish1_s3_rep1
event                                                            
1D+38_28D+41_NONE_NONE_NONE_NONE_NONE_NONE_NONE...           2434
3D+37_76I+66+TGTCGTGCAGTCGACTCCATGACAGCAGATACTG...            862
NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE             279
12D+34_12D+60_NONE_NONE_3D+143_84D+169_84D+169_...            268
12D+34_12D+60_NONE_NONE_NONE_NONE_NONE_NONE_NON...             40
1D+38_28D+41_NONE_NONE_NONE_NONE_NONE_NONE_1D+2...             30
3I+38+TAT_8D+57_3D+90_9D+115_8D+143_1D+173_NONE...             28,                                                     fish1_s3_rep2
event                                                            
1D+38_28D+41_NONE_NONE_NONE_NONE_NONE_NONE_NONE...           5002
3D+37_76I+66+TGTCGTGCAGTCGACTCCATGACAGCAGATACTG...           1062
12D+34_12D+60_NONE_NONE_3D+143_84D+169_84D+169_...            542
40D+18_NONE_NONE_2I+118+TT_1D+145_NONE_NONE_NON...            314
NONE_NON

In [67]:
# join dataframes together along the column axis, taking the union of indices
# (a barcode that is absent from a sample is left as NaN in that sample's column)
newdf = pd.concat(array_sampdf, axis=1, sort=False)
newdf.index.name = 'barcode'
# JENNA! The matrix is saved in the "Andy_test_10.10.24" subfolder under experiments.
# The subfolder is created here if it does not exist yet, so saving no longer errors when it is missing.
# To save somewhere else, change the subfolder name in matrix_dir.
matrix_dir = "./experiments/Andy_test_10.10.24"
os.makedirs(matrix_dir, exist_ok=True)
newdf.to_csv(matrix_dir + "/barcodeMatrix_filterBelow0.005_wide.tsv", sep="\t") # save for ease of viewing

In [167]:
# Build a dictionary with one key per fish, each starting with an empty list;
# every fish will later get its own barcode matrix file.
fish_names = {}
for sample_label in sampind_list:
    # the fish name is the part of the sample ID before the first underscore
    fish_names.setdefault(sample_label.split("_")[0], [])

In [168]:
# for all the keys in fish_names, fill its list with the single-column dataframes
# (from array_sampdf) that belong to that fish
for fish_name in fish_names.keys():
    matching = []
    for sample in array_sampdf:
        # read the fish name from the dataframe's column label (the sample ID)
        # rather than from its printed text: the printed form depends on display
        # settings and reads "Empty DataFrame ..." for a sample with no rows,
        # which silently dropped such samples
        sample_name = sample.columns[0].split("_")[0]
        if fish_name == sample_name:
            matching.append(sample)
    # assign instead of append so that re-running this cell does not duplicate samples
    fish_names[fish_name] = matching

In [170]:
# inspect the list of per-sample dataframes collected for fish10
fish_names["fish10"]

[                                                    fish10_s8_rep1
 event                                                             
 NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE            19947
 NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_1D+254_...            3839,
                                                     fish10_s8_rep2
 event                                                             
 NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE            29525
 NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_1D+254_...            4700
 18D+35_NONE_NONE_NONE_53D+146_53D+146_53D+146_N...             220,
                                                     fish10_s8_rep3
 event                                                             
 NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE            22279
 NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_1D+254_...            3820
 11I+38+TGGAGTATCAT_5S+60+CACAT_NONE_55D+118_55D...             191,
                                             

In [171]:
# for each fish in the dictionary, write a barcode matrix file
# create the output subfolder first if it is missing, so to_csv does not fail
fish_matrix_dir = "./experiments/Andy_test_10.10.24/"
os.makedirs(fish_matrix_dir, exist_ok=True)
for fish, matrix in fish_names.items():
    # 'matrix' is the list of single-column dataframes for this fish; join them
    # side by side on the union of their barcodes
    fish_df = pd.concat(matrix, axis=1, sort=False)
    fish_df.index.name = 'barcode'
    fish_df.to_csv(fish_matrix_dir + str(fish) + "_barcodeMatrix_filterBelow0.005_wide.tsv", sep="\t") # save for ease of viewing