In [60]:
# The purpose of this notebook is to scan through all barcodes found in a fish and compare to other fish so we can create a list of "no fly" barcodes. 
# "No fly" barcodes show up in multiple animals and are not trustworthy!
# Make sure this ipynb file is in a folder that has access to viz-Output

#### First, find all the allReadCount files in your data directory

In [1]:
import os 
import collections

In [2]:
# Show where the notebook is running and what sits next to it, so the
# relative paths used further down can be checked by eye.
current_dir = os.getcwd()
print(current_dir)
print(os.listdir(current_dir))

/Users/andysposato/Desktop/jenna_sperm
['.DS_Store', 'make_barcode_matrix_Jenna.ipynb', 'find_bad_barcodes_w_filters.ipynb', 'find_bad_barcodes.ipynb', 'muller 2.Rmd', 'muller.nb.html', 'SpermAnalysis_Muller1', '.ipynb_checkpoints', 'fish_dictionaries', 'muller.Rmd']


In [3]:
# Walk the whole tree below the current directory and keep the path of
# every file whose name ends in ".allReadCounts".
filepaths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(".")
    for name in names
    if name.endswith(".allReadCounts")
]

print("Listing paths for every .allReadCounts file in your directory:\n")
for found_path in filepaths:
    print(found_path)

Listing paths for every .allReadCounts file in your directory:

./SpermAnalysis_Muller1/data/viz-Output/fishctrl3_s6_rep1/fishctrl3_s6_rep1.allReadCounts
./SpermAnalysis_Muller1/data/viz-Output/fish1_s5_rep2/fish1_s5_rep2.allReadCounts
./SpermAnalysis_Muller1/data/viz-Output/fishctrl3_s10_rep1/fishctrl3_s10_rep1.allReadCounts
./SpermAnalysis_Muller1/data/viz-Output/fish5_s7_rep2/fish5_s7_rep2.allReadCounts
./SpermAnalysis_Muller1/data/viz-Output/fish1_s9_rep3/fish1_s9_rep3.allReadCounts
./SpermAnalysis_Muller1/data/viz-Output/fish5_s11_rep2/fish5_s11_rep2.allReadCounts
./SpermAnalysis_Muller1/data/viz-Output/fish1_s9_rep2/fish1_s9_rep2.allReadCounts
./SpermAnalysis_Muller1/data/viz-Output/fish5_s3_rep1/fish5_s3_rep1.allReadCounts
./SpermAnalysis_Muller1/data/viz-Output/fish5_s11_rep3/fish5_s11_rep3.allReadCounts
./SpermAnalysis_Muller1/data/viz-Output/fish5_s7_rep3/fish5_s7_rep3.allReadCounts
./SpermAnalysis_Muller1/data/viz-Output/fish1_s5_rep3/fish1_s5_rep3.allReadCounts
./SpermAnaly

#### Now we want to grab barcodes from each file in the paths we added to 'filepaths' and add them to a fish dictionary

In [4]:
# `fishes` maps each fish name to its own (initially empty) barcode dictionary.
# The fish name is the part before the first "_" in the name of the folder
# that contains the .allReadCounts file (e.g. "fish1_s5_rep2" -> "fish1").
fishes = {}
for counts_path in filepaths:
    sample_folder = counts_path.split("/")[-2]
    fishes.setdefault(sample_folder.split("_")[0], {})
print(fishes)

{'fishctrl3': {}, 'fish1': {}, 'fish5': {}}


In [5]:
# Sanity check: for each .allReadCounts file print its path and the line that
# follows the first (header) line, so paths and contents can be checked by eye.
for allReadCounts in filepaths:
    print(allReadCounts)
    # `with` closes the handle even if one of the reads raises
    with open(allReadCounts, 'r') as counts_file:
        counts_file.readline()         # skip the first line
        print(counts_file.readline())  # show the second line
# this looks right

./SpermAnalysis_Muller1/data/viz-Output/fishctrl3_s6_rep1/fishctrl3_s6_rep1.allReadCounts
NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE	0	65932	0.9200541438160227

./SpermAnalysis_Muller1/data/viz-Output/fish1_s5_rep2/fish1_s5_rep2.allReadCounts
1D+38_28D+41_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE	0	12906	0.6402420875086814

./SpermAnalysis_Muller1/data/viz-Output/fishctrl3_s10_rep1/fishctrl3_s10_rep1.allReadCounts
NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE	0	86528	0.9775076537240592

./SpermAnalysis_Muller1/data/viz-Output/fish5_s7_rep2/fish5_s7_rep2.allReadCounts
54D+36_54D+36_54D+36_NONE_NONE_NONE_NONE_NONE_NONE_NONE	0	10526	0.3051102930519725

./SpermAnalysis_Muller1/data/viz-Output/fish1_s9_rep3/fish1_s9_rep3.allReadCounts
1D+38_28D+41_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE	0	1544	0.3356521739130435

./SpermAnalysis_Muller1/data/viz-Output/fish5_s11_rep2/fish5_s11_rep2.allReadCounts
2D+36_81D+61_81D+61_81D+61_81D+61_NONE_NONE_NONE_NONE_NONE	0	28708	0.

In [6]:
# Fill every fish's dictionary so that it maps
#     barcode -> list of "sample.replicate" labels the barcode was seen in
# (e.g. 's5.2' for folder fish1_s5_rep2).
for read_file in filepaths:
    # the parent folder is named <fish>_<sample>_<replicate>
    folder_parts = read_file.split("/")[-2].split("_")
    fish_id_name = folder_parts[0]
    sample = folder_parts[1]
    replicate = folder_parts[2]
    # only the LAST character of the replicate field is kept ("rep2" -> "2"),
    # so replicate numbers are assumed to be a single digit
    samp_rep = sample + "." + replicate[-1]

    # only fill dictionaries that were created for this fish beforehand
    if fish_id_name not in fishes:
        continue
    fish_barcodes = fishes[fish_id_name]

    # `with` guarantees the file is closed even if a line cannot be parsed
    with open(read_file, "r") as counts_file:
        # read the first line so the headers are ignored in the loop
        counts_file.readline()
        # stream the file line by line instead of loading it all with readlines()
        for line in counts_file:
            # the barcode is the first tab-separated column; the remaining
            # columns are not needed here
            barcode = line.rstrip("\n").split("\t")[0]
            if not barcode:
                # blank line (e.g. at the end of the file): nothing to record
                continue
            # create the list on first sight of the barcode, then record this
            # sample replicate once
            occurrences = fish_barcodes.setdefault(barcode, [])
            if samp_rep not in occurrences:
                occurrences.append(samp_rep)

In [7]:
# Print a heading, then let the notebook render the whole nested dictionary.
heading = "Looking at fishes which is a dictionary of individual fish:\n"
print(heading)
fishes

Looking at fishes which is a dictionary of individual fish:



{'fishctrl3': {'NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE': ['s6.1',
   's10.1',
   's3.2',
   's3.3',
   's7.1',
   's8.3',
   's4.3',
   's8.2',
   's9.2',
   's5.3',
   's5.2',
   's9.3',
   's10.2',
   's6.2',
   's6.3',
   's10.3',
   's3.1',
   's7.3',
   's7.2',
   's8.1',
   's9.1',
   's5.1'],
  '3D+35_NONE_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE': ['s6.1'],
  '1D+38_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE': ['s6.1',
   's3.2',
   's3.3',
   's8.2',
   's9.2',
   's5.3',
   's5.2',
   's10.2',
   's6.3',
   's10.3',
   's7.2',
   's5.1'],
  'NONE_NONE_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE': ['s6.1',
   's4.3',
   's5.3',
   's5.2',
   's10.2',
   's10.3',
   's5.1'],
  '3D+35_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE': ['s6.1',
   's3.2',
   's3.3',
   's5.3',
   's5.2',
   's9.3',
   's10.2',
   's6.3',
   's7.2',
   's5.1'],
  '30D+35_30D+35_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE': ['s6.1', 's5.2'],
  '10D+29_NONE_NONE_NONE_1D+146_3D+169_

In [8]:
# Show the barcode dictionary of a single fish.
fish_to_show = "fish1"
print("Looking at the dictionary for just an individual fish...\n")
print(fish_to_show + ":")
print(fishes[fish_to_show])

Looking at the dictionary for just an individual fish...

fish1:
{'1D+38_28D+41_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE': ['s5.2', 's9.3', 's9.2', 's5.3', 's4.3', 's8.2', 's12.2', 's8.3', 's4.2', 's12.3', 's7.1', 's3.3', 's11.1', 's3.2', 's10.1', 's6.1', 's5.1', 's9.1', 's12.1', 's8.1', 's4.1', 's7.2', 's11.3', 's7.3', 's3.1', 's11.2', 's10.2', 's6.3', 's10.3', 's6.2'], '7D+36_12D+60_NONE_1I+119+A&5D+121_19D+130_4S+169+CAGA_6D+195_1D+222&1I+229+T_8D+244_NONE': ['s5.2', 's9.3', 's9.2', 's5.3', 's4.3', 's8.2', 's12.2', 's8.3', 's4.2', 's12.3', 's7.1', 's11.1', 's10.1', 's6.1', 's5.1', 's9.1', 's12.1', 's8.1', 's4.1', 's7.2', 's11.3', 's7.3', 's11.2', 's10.2', 's6.3', 's10.3', 's6.2'], '1D+38_28D+41_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE': ['s5.2', 's9.3', 's5.3', 's4.3', 's8.2', 's12.2', 's8.3', 's4.2', 's12.3', 's7.1', 's3.3', 's11.1', 's3.2', 's10.1', 's6.1', 's5.1', 's9.1', 's12.1', 's8.1', 's4.1', 's7.2', 's11.3', 's7.3', 's3.1', 's11.2', 's10.2', 's6.3', 's10.3', 's6.2'], 

In [117]:
# Dump the barcode -> sample.replicate dictionary collected for fish5.
fish5_barcodes = fishes["fish5"]
print(fish5_barcodes)

{'54D+36_54D+36_54D+36_NONE_NONE_NONE_NONE_NONE_NONE_NONE': ['s7.2', 's11.2', 's3.1', 's11.3', 's7.3', 's6.3', 's10.3', 's10.2', 's6.2', 's5.1', 's9.1', 's8.1', 's12.1', 's4.1', 's3.3', 's11.1', 's7.1', 's3.2', 's6.1', 's10.1', 's5.2', 's9.3', 's9.2', 's5.3', 's12.3', 's4.3', 's8.2', 's8.3', 's4.2', 's12.2'], 'NONE_2D+62_NONE_NONE_NONE_87D+167_87D+167_87D+167_87D+167_NONE': ['s7.2', 's11.2', 's3.1', 's11.3', 's7.3', 's6.3', 's10.3', 's10.2', 's6.2', 's5.1', 's9.1', 's8.1', 's12.1', 's4.1', 's3.3', 's11.1', 's7.1', 's3.2', 's6.1', 's10.1', 's5.2', 's9.3', 's9.2', 's5.3', 's12.3', 's4.3', 's8.2', 's8.3', 's4.2', 's12.2'], 'NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE': ['s7.2', 's11.2', 's3.1', 's11.3', 's7.3', 's6.3', 's10.3', 's10.2', 's6.2', 's5.1', 's9.1', 's8.1', 's12.1', 's4.1', 's3.3', 's11.1', 's7.1', 's3.2', 's6.1', 's10.1', 's5.2', 's9.3', 's9.2', 's5.3', 's12.3', 's4.3', 's8.2', 's8.3', 's4.2', 's12.2'], '1D+38_1D+64_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE': ['s7.2', 's3.

In [118]:
# Dump the barcode -> sample.replicate dictionary collected for fishctrl3.
fishctrl3_barcodes = fishes["fishctrl3"]
print(fishctrl3_barcodes)

{'NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE': ['s6.1', 's10.1', 's3.2', 's3.3', 's7.1', 's8.3', 's4.3', 's8.2', 's9.2', 's5.3', 's5.2', 's9.3', 's10.2', 's6.2', 's6.3', 's10.3', 's3.1', 's7.3', 's7.2', 's8.1', 's9.1', 's5.1'], '3D+35_NONE_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE': ['s6.1'], '1D+38_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE': ['s6.1', 's3.2', 's3.3', 's8.2', 's9.2', 's5.3', 's5.2', 's10.2', 's6.3', 's10.3', 's7.2', 's5.1'], 'NONE_NONE_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE': ['s6.1', 's4.3', 's5.3', 's5.2', 's10.2', 's10.3', 's5.1'], '3D+35_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE': ['s6.1', 's3.2', 's3.3', 's5.3', 's5.2', 's9.3', 's10.2', 's6.3', 's7.2', 's5.1'], '30D+35_30D+35_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE': ['s6.1', 's5.2'], '10D+29_NONE_NONE_NONE_1D+146_3D+169_NONE_NONE_NONE_NONE': ['s6.1'], 'NONE_NONE_NONE_NONE_8D+137_1D+172_NONE_NONE_NONE_NONE': ['s6.1'], 'NONE_NONE_NONE_NONE_NONE_8D+173_2D+201_3D+229_7I+253+ACTCCAT_NONE':

In [9]:
# A raw dictionary print is hard to read, so show control fish 3 with one
# barcode (and the sample replicates it occurs in) per line.
print("printing the dictionary for control fish 3 line by line...\n")
print("fishctrl3:")
for ctrl_barcode, ctrl_occurrences in fishes["fishctrl3"].items():
    print(ctrl_barcode, ctrl_occurrences)

printing the dictionary for control fish 3 line by line...

fishctrl3:
NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE ['s6.1', 's10.1', 's3.2', 's3.3', 's7.1', 's8.3', 's4.3', 's8.2', 's9.2', 's5.3', 's5.2', 's9.3', 's10.2', 's6.2', 's6.3', 's10.3', 's3.1', 's7.3', 's7.2', 's8.1', 's9.1', 's5.1']
3D+35_NONE_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE ['s6.1']
1D+38_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE ['s6.1', 's3.2', 's3.3', 's8.2', 's9.2', 's5.3', 's5.2', 's10.2', 's6.3', 's10.3', 's7.2', 's5.1']
NONE_NONE_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE ['s6.1', 's4.3', 's5.3', 's5.2', 's10.2', 's10.3', 's5.1']
3D+35_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE ['s6.1', 's3.2', 's3.3', 's5.3', 's5.2', 's9.3', 's10.2', 's6.3', 's7.2', 's5.1']
30D+35_30D+35_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE ['s6.1', 's5.2']
10D+29_NONE_NONE_NONE_1D+146_3D+169_NONE_NONE_NONE_NONE ['s6.1']
NONE_NONE_NONE_NONE_8D+137_1D+172_NONE_NONE_NONE_NONE ['s6.1']
NONE_NONE_NONE_NONE_NONE_8D+17

#### Next, write dictionaries to files

In [10]:
# create the fish_dictionaries folder; exist_ok=True makes this a no-op when
# the folder is already there, so no separate existence check is needed
os.makedirs("./fish_dictionaries/", exist_ok=True)

In [12]:
# folder (with trailing slash) that output file names are appended to when
# the per-fish dictionary files and the no-fly list are written
output_dir = "./fish_dictionaries/"

In [13]:
# Write one tab-separated file per fish: <output_dir><fish>_dictionary.tsv
#   header row : <fish name> \t barcode \t occurrence
#   data rows  : (empty)     \t <barcode> \t <list of sample replicates>
# The fish name therefore only appears in the first column of the header.
for fish, barcode_occurrences in fishes.items():
    # the with-block closes the file on exit, so no explicit close() is needed
    # (a close() after the loop would also fail when `fishes` is empty)
    with open(output_dir + fish + "_dictionary.tsv", 'w') as f:
        # write a header on the first line
        f.write(fish + '\t' + 'barcode' + '\t' + 'occurrence' + '\n')
        # one row per barcode with the sample reps it shows up in within that fish
        for barcode, sample_reps in barcode_occurrences.items():
            f.write('\t' + barcode + '\t' + str(sample_reps) + '\n')

In [14]:
# Number of distinct barcodes stored per fish. Each dictionary tsv file
# (opened in Excel or Notepad++) should have this many barcode rows.
for fish_to_count in ("fish1", "fish5", "fishctrl3"):
    print(len(fishes[fish_to_count]))

10023
15899
27354


#### Finally, make a no-fly list for barcodes that show up in multiple animals

In [15]:
# the no-fly list (built below) will hold the barcodes we want to filter out because they show up in multiple animals
# the only barcode we should reasonably expect to show up in multiple animals is the unedited barcode 

In [16]:
# The unedited barcode: "NONE" at each of the ten positions, joined by "_".
unedited = "_".join(["NONE"] * 10)

In [17]:
# Barcodes recorded for fish 1; their order should match the fish 1 dictionary file.
fish1_barcodes = fishes["fish1"]
fish1_barcodes.keys()

dict_keys(['1D+38_28D+41_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE', '7D+36_12D+60_NONE_1I+119+A&5D+121_19D+130_4S+169+CAGA_6D+195_1D+222&1I+229+T_8D+244_NONE', '1D+38_28D+41_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE', '3D+37_76I+66+TGTCGTGCAGTCGACTCCATGACAGCAGATACTGCTGTCGACTCCATACTGCATGATATCTCTGTCGACTCCATAG_NONE_NONE_NONE_82D+172_82D+172_82D+172_82D+172_NONE', '12D+34_12D+60_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE', '2I+37+GA_27D+51_NONE_3I+119+CCA_NONE_3D+169_54D+199_54D+199_54D+199_NONE', '12D+34_12D+60_NONE_1I+119+A&5D+121_19D+130_4S+169+CAGA_6D+195_1D+222&1I+229+T_8D+244_NONE', '12D+34_12D+60_NONE_NONE_3D+143_NONE_54D+199_54D+199_54D+199_NONE', 'NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE', 'NONE_NONE_NONE_NONE_NONE_81D+170_81D+170_81D+170_81D+170_NONE', '4D+38_81D+61_81D+61_81D+61_81D+61_NONE_NONE_NONE_NONE_NONE', '2I+36+GA_NONE_NONE_15D+111_72D+130_72D+130_72D+130_NONE_3D+251_NONE', '4D+38_15D+65_NONE_54D+119_54D+119_54D+119_NONE_NONE_NONE_NONE', '8D+31_3D+64

In [18]:
# Re-point `filepaths` at the per-fish dictionary files: every file below the
# current directory whose name ends in "dictionary.tsv".
# NOTE: this replaces the list of .allReadCounts paths held in `filepaths` before.
filepaths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(".")
    for name in names
    if name.endswith("dictionary.tsv")
]

print("Listing paths for every dictionary files in your directory:\n")
for found_path in filepaths:
    print(found_path)

Listing paths for every dictionary files in your directory:

./fish_dictionaries/fish5_dictionary.tsv
./fish_dictionaries/fish1_dictionary.tsv
./fish_dictionaries/fishctrl3_dictionary.tsv


In [19]:
# Collect each fish name once, in the order the dictionary files are listed.
# The name is the text before the first "_" of the file name
# (last "/"-separated element of the path).
fish_list = []
for dictionary_path in filepaths:
    name_from_file = dictionary_path.split("/")[-1].split("_")[0]
    if name_from_file in fish_list:
        continue
    fish_list.append(name_from_file)
print(fish_list)

['fish5', 'fish1', 'fishctrl3']


In [20]:
# Build three lists from the per-fish dictionary tsv files:
#   all_barcodes  - every edited barcode, in first-seen order
#   bad_barcodes  - barcodes found in more than one fish (the "no fly" list)
#   good_barcodes - barcodes found in exactly one fish
# The unedited barcode is skipped: it is expected in every animal.
all_barcodes = []
bad_barcodes = []

# barcode -> fish it was first seen in (dict lookups keep membership tests
# fast even with tens of thousands of barcodes)
first_fish_for_barcode = {}
# set mirror of bad_barcodes, used for fast membership tests
bad_barcode_set = set()

# for all fish dictionary tsv files
for read_file in filepaths:
    # `with` closes every file, not just the last one opened
    with open(read_file, 'r') as dictionary_file:
        # the first column of the header line holds the fish name
        fish = dictionary_file.readline().split("\t")[0]
        # each remaining line is: <empty> \t <barcode> \t <occurrences>
        for line in dictionary_file:
            fields = line.split("\t")
            if len(fields) < 2:
                # blank or malformed line: no barcode column to read
                continue
            barcode = fields[1]
            # the unedited barcode is never compared across fish
            if barcode == unedited:
                continue
            if barcode not in first_fish_for_barcode:
                # first time this barcode is seen in any fish
                first_fish_for_barcode[barcode] = fish
                all_barcodes.append(barcode)
            elif first_fish_for_barcode[barcode] != fish and barcode not in bad_barcode_set:
                # seen before in a DIFFERENT fish -> not trustworthy;
                # the set check keeps it from being listed more than once
                bad_barcode_set.add(barcode)
                bad_barcodes.append(barcode)

# every barcode that was never flagged is a good barcode
good_barcodes = [barcode for barcode in all_barcodes if barcode not in bad_barcode_set]

In [21]:
# Summary counts for the three barcode lists.
for count_label, counted_barcodes in (
    ("number of unique barcodes across all fish: ", all_barcodes),
    ("number of good barcodes across all fish: ", good_barcodes),
    ("number of bad barcodes across all fish: ", bad_barcodes),
):
    print(count_label + str(len(counted_barcodes)))

number of unique barcodes across all fish: 49779
number of good barcodes across all fish: 46664
number of bad barcodes across all fish: 3115


In [22]:
# Heading on one line, the full list of flagged barcodes on the next.
print("These are bad barcodes, not to be trusted:", bad_barcodes, sep="\n")

These are bad barcodes, not to be trusted:
['1D+38_28D+41_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE', '7D+36_12D+60_NONE_1I+119+A&5D+121_19D+130_4S+169+CAGA_6D+195_1D+222&1I+229+T_8D+244_NONE', '1D+38_28D+41_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE', '3D+37_76I+66+TGTCGTGCAGTCGACTCCATGACAGCAGATACTGCTGTCGACTCCATACTGCATGATATCTCTGTCGACTCCATAG_NONE_NONE_NONE_82D+172_82D+172_82D+172_82D+172_NONE', '12D+34_12D+60_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE', '2I+37+GA_27D+51_NONE_3I+119+CCA_NONE_3D+169_54D+199_54D+199_54D+199_NONE', '12D+34_12D+60_NONE_NONE_3D+143_NONE_54D+199_54D+199_54D+199_NONE', 'NONE_NONE_NONE_NONE_NONE_81D+170_81D+170_81D+170_81D+170_NONE', '2I+36+GA_NONE_NONE_15D+111_72D+130_72D+130_72D+130_NONE_3D+251_NONE', '4D+38_15D+65_NONE_54D+119_54D+119_54D+119_NONE_NONE_NONE_NONE', '12D+34_12D+60_NONE_NONE_NONE_NONE_NONE_NONE_NONE_NONE', '12D+34_12D+60_NONE_NONE_3D+143_84D+169_84D+169_84D+169_84D+169_NONE', 'NONE_NONE_NONE_NONE_NONE_1D+172_54D+199_54D+199_54D+199_NO

In [23]:
# Write the no-fly list to <output_dir>no_fly_barcodes_list.txt, one barcode
# per line. `with` closes the file even if a write fails.
with open(output_dir+'no_fly_barcodes_list.txt', 'w') as output:
    for barcode in bad_barcodes:
        output.write(barcode + '\n')

In [24]:
# when you open no_fly_barcodes_list.txt, you should be able to copy a barcode string and search for it in your directory
# you'll have peace of mind that this works if the barcode is found in allReadCounts files corresponding to different fish

# for example: 
# 1D+38_28D+41_NONE_NONE_NONE_NONE_54D+199_54D+199_54D+199_NONE shows up in fish 1 and fish 5
# 7D+36_12D+60_NONE_1I+119+A&5D+121_19D+130_4S+169+CAGA_6D+195_1D+222&1I+229+T_8D+244_NONE shows up in fish 1 and fishctrl3

