In [1]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import glob
import seaborn as sns
import gzip
import matplotlib.pyplot as plt
from adjustText import adjust_text

In [2]:
allowed_barcodes = pd.read_excel("../data/220922_BD354_barcodes.xlsx").iloc[:42,]
allowed_barcodes["Name"] = [x.replace("BD354_", "") for x in allowed_barcodes["Name"]]
allowed_barcodes.head()

Unnamed: 0,Sample num,Tx,Antibody,Fraction,Replicate,Name,Cell number,Nanodrop 1:10,ul,nM,P5,P7,hash,Unnamed: 13,index 1 (P7),index 2 (P5),Unnamed: 16,Unnamed: 17,Unnamed: 18
0,1.0,DMSO,CD11b,low,1.0,DMSO_CD11b_low_rep1,,21.5,4.651163,,1.0,1.0,11.0,,ATTACTCG,AGGCTATA,,,
1,2.0,DMSO,CD11b,high,1.0,DMSO_CD11b_high_rep1,,18.6,5.376344,25.2,2.0,2.0,22.0,,TCCGGAGA,GCCTCTAT,,Conversion from nanodrop to nM,
2,3.0,DMSO,CD11b,low,2.0,DMSO_CD11b_low_rep2,,12.5,8.0,,3.0,3.0,33.0,,CGCTCATT,AGGATAGG,,nanodrop,nM
3,4.0,DMSO,CD11b,high,2.0,DMSO_CD11b_high_rep2,,16.0,6.25,,4.0,4.0,44.0,,GAGATTCC,TCAGAGCC,,18.6,25.2
4,5.0,DMSO,GFP,low,1.0,DMSO_GFP_low_rep1,,15.3,6.535948,,5.0,5.0,55.0,,ATTCAGAA,CTTCGCCT,,16.4,21.7


In [3]:
# weissman sgRNAs
library = pd.read_csv("../data/TableS4_mCRISPRiv2_library.csv")
library = library[library["Sublibrary half"] == "Top5"]

weissman_guides = [x.upper() for x in list(library["protospacer sequence"])]
library.head()

Unnamed: 0,sgID,gene,transcript,protospacer sequence,selection rank,predicted score,empirical score,off-target stringency,Sublibrary,Sublibrary half
0,0610007P14Rik_+_85824465.23-P1P2,0610007P14Rik,P1P2,GTAGTACTGGATCGTCAGGT,1.0,0.898157,,0,m3,Top5
1,0610007P14Rik_+_85824469.23-P1P2,0610007P14Rik,P1P2,GGCAGTAGTACTGGATCGTC,2.0,0.862836,,0,m3,Top5
2,0610007P14Rik_+_85824478.23-P1P2,0610007P14Rik,P1P2,GGATACTGCTGCAGTAGTAC,3.0,0.760573,,0,m3,Top5
3,0610007P14Rik_+_85824456.23-P1P2,0610007P14Rik,P1P2,GATCGTCAGGTAGGAGAATT,4.0,0.747215,,0,m3,Top5
4,0610007P14Rik_+_85824523.23-P1P2,0610007P14Rik,P1P2,GTGGGACCGGAGCTGGCCTA,5.0,0.740656,,0,m3,Top5


In [21]:
def get_lines(fn, counter, start_at_zero=False):
    i = 0
    with gzip.open(fn,'r') as fin:
        for line in fin:
            i += 1
            if ((i + 2) % 4 == 0): 
                # change the first letter to G because it's often read wrong
                # also read the first 20
                if not start_at_zero: seq = 'G' + line.rstrip()[1:20].decode() 
                else: seq = 'G' + line.rstrip()[:19].decode() 
                
                if seq not in counter: counter[seq] = 1
                else: counter[seq] += 1
        
    return(counter)

# get dictionary of reads for all files
def get_read_dict(fn1, fn2, fn3, fn4, fn5):
    print(fn1)
    counter = defaultdict(lambda: 0)
    
    counter = get_lines(fn1, counter)
    counter = get_lines(fn2, counter)
    counter = get_lines(fn3, counter)
    counter = get_lines(fn4, counter)
    counter = get_lines(fn5, counter, start_at_zero=True)  # these were from the third run where the first base is not G 

    
    return(dict(counter))

In [22]:
# fix the counter for fn5, which didn't get added because I forgot to set start_at_zero to True

for j in range(42):
    # these samples were numbered 1 to 42
    fn5 = glob.glob("../fastq_guess3/*_" + str(j+1) + ".fastq.gz")[0]
    print(fn5)
    print(j)

    sample_to_dicts[j] = get_lines(fn5, sample_to_dicts[j], start_at_zero=True)

../fastq_guess3/ATTACTCG_AGGCTATA_1.fastq.gz
0
../fastq_guess3/TCCGGAGA_GCCTCTAT_2.fastq.gz
1
../fastq_guess3/CGCTCATT_AGGATAGG_3.fastq.gz
2
../fastq_guess3/GAGATTCC_TCAGAGCC_4.fastq.gz
3
../fastq_guess3/ATTCAGAA_CTTCGCCT_5.fastq.gz
4
../fastq_guess3/GAATTCGT_TAAGATTA_6.fastq.gz
5
../fastq_guess3/ATTACTCG_ACGTCCTG_7.fastq.gz
6
../fastq_guess3/TCCGGAGA_AGGCTATA_8.fastq.gz
7
../fastq_guess3/CGCTCATT_GCCTCTAT_9.fastq.gz
8
../fastq_guess3/GAGATTCC_AGGATAGG_10.fastq.gz
9
../fastq_guess3/ATTCAGAA_TCAGAGCC_11.fastq.gz
10
../fastq_guess3/GAATTCGT_CTTCGCCT_12.fastq.gz
11
../fastq_guess3/ATTACTCG_TAAGATTA_13.fastq.gz
12
../fastq_guess3/TCCGGAGA_ACGTCCTG_14.fastq.gz
13
../fastq_guess3/CGCTCATT_AGGCTATA_15.fastq.gz
14
../fastq_guess3/GAGATTCC_GCCTCTAT_16.fastq.gz
15
../fastq_guess3/ATTCAGAA_AGGATAGG_17.fastq.gz
16
../fastq_guess3/GAATTCGT_TCAGAGCC_18.fastq.gz
17
../fastq_guess3/ATTACTCG_CTTCGCCT_19.fastq.gz
18
../fastq_guess3/TCCGGAGA_TAAGATTA_20.fastq.gz
19
../fastq_guess3/CGCTCATT_ACGTCCTG_21.fa

In [8]:
sample_to_dicts = []

for i in range(42):
    # these samples were numbered 1 to 42
    fn1 = glob.glob("../fastq_guess/*_" + str(i+1) + ".fastq.gz")[0]
    fn3 = glob.glob("../fastq_guess2/*_" + str(i+1) + ".fastq.gz")[0]
    fn5 = glob.glob("../fastq_guess3/*_" + str(i+1) + ".fastq.gz")[0]

    # these samples were numbered starting from 2 to 43
    fn2 = "../fastq_perfect/220927Van_D22-2430" + str(i+2).zfill(2) + "_NA_sequence.fastq.gz"
    fn4 = "../fastq_perfect2/220927Van_D22-2430" + str(i+2).zfill(2) + "_NA_sequence.fastq.gz"
    sample_to_dicts.append(get_read_dict(fn1, fn2, fn3, fn4, fn5))

../fastq_guess/ATTACTCG_AGGCTATA_1.fastq.gz
../fastq_guess/TCCGGAGA_GCCTCTAT_2.fastq.gz
../fastq_guess/CGCTCATT_AGGATAGG_3.fastq.gz
../fastq_guess/GAGATTCC_TCAGAGCC_4.fastq.gz
../fastq_guess/ATTCAGAA_CTTCGCCT_5.fastq.gz
../fastq_guess/GAATTCGT_TAAGATTA_6.fastq.gz
../fastq_guess/ATTACTCG_ACGTCCTG_7.fastq.gz
../fastq_guess/TCCGGAGA_AGGCTATA_8.fastq.gz
../fastq_guess/CGCTCATT_GCCTCTAT_9.fastq.gz
../fastq_guess/GAGATTCC_AGGATAGG_10.fastq.gz
../fastq_guess/ATTCAGAA_TCAGAGCC_11.fastq.gz
../fastq_guess/GAATTCGT_CTTCGCCT_12.fastq.gz
../fastq_guess/ATTACTCG_TAAGATTA_13.fastq.gz
../fastq_guess/TCCGGAGA_ACGTCCTG_14.fastq.gz
../fastq_guess/CGCTCATT_AGGCTATA_15.fastq.gz
../fastq_guess/GAGATTCC_GCCTCTAT_16.fastq.gz
../fastq_guess/ATTCAGAA_AGGATAGG_17.fastq.gz
../fastq_guess/GAATTCGT_TCAGAGCC_18.fastq.gz
../fastq_guess/ATTACTCG_CTTCGCCT_19.fastq.gz
../fastq_guess/TCCGGAGA_TAAGATTA_20.fastq.gz
../fastq_guess/CGCTCATT_ACGTCCTG_21.fastq.gz
../fastq_guess/GAGATTCC_AGGCTATA_22.fastq.gz
../fastq_guess/ATTC

In [27]:
import pickle
pickle.dump(sample_to_dicts, open("221029_bd354_counts.pkl", "wb" ) )

In [23]:
# a dict of count vectors
sample_to_counts = {}
for i in range(42):
    sample_to_counts[i] = [sample_to_dicts[i].get(k, 0) for k in weissman_guides]

In [25]:
counts = pd.DataFrame.from_dict(sample_to_counts)
counts.index = library["sgID"]
counts.columns = allowed_barcodes["Name"]

counts.to_csv("221029_bd354_counts_by_sgrna.csv")
counts.head()

Name,DMSO_CD11b_low_rep1,DMSO_CD11b_high_rep1,DMSO_CD11b_low_rep2,DMSO_CD11b_high_rep2,DMSO_GFP_low_rep1,DMSO_GFP_high_rep1,DMSO_GFP_low_rep2,DMSO_GFP_high_rep2,HU_CD11b_low_rep1,HU_CD11b_high_rep1,...,Day0_all_all_rep1,Day0_all_all_rep2,DMSO_all_all_rep1,DMSO_all_all_rep2,HU_all_all_rep1,HU_all_all_rep2,APH_all_all_rep1,APH_all_all_rep2,E2wd_all_all_rep1,E2wd_all_all_rep2
sgID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0610007P14Rik_+_85824465.23-P1P2,123,120,63,101,103,126,208,118,118,140,...,307,115,167,136,122,125,61,127,334,100
0610007P14Rik_+_85824469.23-P1P2,144,152,105,132,134,184,213,148,126,132,...,214,133,172,206,147,123,118,184,261,137
0610007P14Rik_+_85824478.23-P1P2,250,302,268,259,365,523,536,435,348,259,...,672,326,430,400,417,391,173,422,694,247
0610007P14Rik_+_85824456.23-P1P2,58,98,107,140,113,108,171,148,109,122,...,192,87,102,101,156,135,80,104,243,74
0610007P14Rik_+_85824523.23-P1P2,65,122,65,93,132,172,150,101,69,94,...,234,119,132,139,109,113,91,106,187,82


## MAGECK

In [41]:
counts_mageck = pd.read_csv("221029_bd354_counts_by_sgrna.csv", index_col=0)
counts_mageck.head()

Unnamed: 0_level_0,DMSO_CD11b_low_rep1,DMSO_CD11b_high_rep1,DMSO_CD11b_low_rep2,DMSO_CD11b_high_rep2,DMSO_GFP_low_rep1,DMSO_GFP_high_rep1,DMSO_GFP_low_rep2,DMSO_GFP_high_rep2,HU_CD11b_low_rep1,HU_CD11b_high_rep1,...,Day0_all_all_rep1,Day0_all_all_rep2,DMSO_all_all_rep1,DMSO_all_all_rep2,HU_all_all_rep1,HU_all_all_rep2,APH_all_all_rep1,APH_all_all_rep2,E2wd_all_all_rep1,E2wd_all_all_rep2
sgID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0610007P14Rik_+_85824465.23-P1P2,123,120,63,101,103,126,208,118,118,140,...,307,115,167,136,122,125,61,127,334,100
0610007P14Rik_+_85824469.23-P1P2,144,152,105,132,134,184,213,148,126,132,...,214,133,172,206,147,123,118,184,261,137
0610007P14Rik_+_85824478.23-P1P2,250,302,268,259,365,523,536,435,348,259,...,672,326,430,400,417,391,173,422,694,247
0610007P14Rik_+_85824456.23-P1P2,58,98,107,140,113,108,171,148,109,122,...,192,87,102,101,156,135,80,104,243,74
0610007P14Rik_+_85824523.23-P1P2,65,122,65,93,132,172,150,101,69,94,...,234,119,132,139,109,113,91,106,187,82


In [42]:
# Making a file for MAGECK:
gene_name = [x.split('_')[0] for x in counts_mageck.index]
counts_mageck["gene_name"] = gene_name
col_list = ["gene_name"]
col_list.extend(counts_mageck.columns[:42])

counts_mageck = counts_mageck[col_list]
counts_mageck.head()

Unnamed: 0_level_0,gene_name,DMSO_CD11b_low_rep1,DMSO_CD11b_high_rep1,DMSO_CD11b_low_rep2,DMSO_CD11b_high_rep2,DMSO_GFP_low_rep1,DMSO_GFP_high_rep1,DMSO_GFP_low_rep2,DMSO_GFP_high_rep2,HU_CD11b_low_rep1,...,Day0_all_all_rep1,Day0_all_all_rep2,DMSO_all_all_rep1,DMSO_all_all_rep2,HU_all_all_rep1,HU_all_all_rep2,APH_all_all_rep1,APH_all_all_rep2,E2wd_all_all_rep1,E2wd_all_all_rep2
sgID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0610007P14Rik_+_85824465.23-P1P2,0610007P14Rik,123,120,63,101,103,126,208,118,118,...,307,115,167,136,122,125,61,127,334,100
0610007P14Rik_+_85824469.23-P1P2,0610007P14Rik,144,152,105,132,134,184,213,148,126,...,214,133,172,206,147,123,118,184,261,137
0610007P14Rik_+_85824478.23-P1P2,0610007P14Rik,250,302,268,259,365,523,536,435,348,...,672,326,430,400,417,391,173,422,694,247
0610007P14Rik_+_85824456.23-P1P2,0610007P14Rik,58,98,107,140,113,108,171,148,109,...,192,87,102,101,156,135,80,104,243,74
0610007P14Rik_+_85824523.23-P1P2,0610007P14Rik,65,122,65,93,132,172,150,101,69,...,234,119,132,139,109,113,91,106,187,82


In [49]:
counts_mageck.to_csv("../mageck/230408_bd354_counts_mageck.txt", sep='\t')

In [45]:
# negative control genes -- should just be a list
neg_ctrl = counts_mageck[counts_mageck["gene_name"] == "non-targeting"].index
neg_ctrl

Index(['non-targeting_00000', 'non-targeting_00001', 'non-targeting_00002',
       'non-targeting_00003', 'non-targeting_00004', 'non-targeting_00005',
       'non-targeting_00006', 'non-targeting_00007', 'non-targeting_00008',
       'non-targeting_00009',
       ...
       'non-targeting_02160', 'non-targeting_02161', 'non-targeting_02162',
       'non-targeting_02163', 'non-targeting_02164', 'non-targeting_02165',
       'non-targeting_02166', 'non-targeting_02167', 'non-targeting_02168',
       'non-targeting_02169'],
      dtype='object', name='sgID', length=2170)

In [48]:
with open("230408_bd354_neg_ctrl_guides.txt", 'w') as ofile:
    for i in neg_ctrl:
        ofile.write(i + '\n')