### this notebook is for final examinations of the full oligo pool

A couple things to check:
1. no RE site
2. no more motifs passing threshold
3. mutagenesis correct
4. BC unique and within proper GC content
5. properly named with correct BC

Then compile all sequences into one tsv and one fasta

In [1]:
import os, sys, warnings, re, glob2, itertools, string, random, math

import numpy as np
import pandas as pd
import scipy
from scipy import stats
from scipy.stats import mannwhitneyu, normaltest
from scipy import interpolate
import statsmodels
import statsmodels.api as sm
import fastcluster

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec
from matplotlib import cm
from matplotlib_venn import venn3
from matplotlib.ticker import FormatStrFormatter
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.font_manager
import seaborn as sns

### I. housekeeping and load data

In [2]:
# set up working directories (do not run twice)
base_dir = os.path.split(os.getcwd())[0]
sys.path.insert(0, os.path.join(os.getcwd(),"utils"))
os.chdir(base_dir)
scriptdir=os.path.join(base_dir, "scripts")

In [4]:
from utils import specseq_plot_utils,sequence_annotator, mpra_CRE_utils, mpra_plot_utils
specseq_plot_utils.set_manuscript_params() # max 7pt

In [40]:
# note the actual primer sequences comtain partial restriction enzyme site, so be careful not to duplicate those sequences
primerF = "GTAGCGTCTGTCCGT" #15
primerR = "CTGTAGTAGTAGTTGG" #16
primerR_revcomp = sequence_annotator.rev_comp(primerR)
padding="C" # for padding betweeing SpeI and SphI site
EcoRI="GAATTC"
SpeI="ACTAGT"
SphI="GCATGC"
EagI="CGGCCG"
basalFiller="GTCCCCCTTGCAGAATTACATGCAACCCTCAACTGCTCGATCTAGCTCTCTAATGGCAAGATCCGCAAGGGCAAAACAGACTATTACCGCGAGATCCGAACGAAATGAAGATTGAACATGGCTGTAATTGGGCT"

In [41]:
# peak lists
fimo_th = 2.5e-3
#fimo_th = 1.0e-3
# full peak list with annotation
mpraAnnot_df = pd.read_csv(os.path.join(base_dir, "peaksets", f"allCRE_annotation.withMutants.fimo{fimo_th}.tsv"), sep="\t", header=0)
# motif position information
allMonomerSites = pd.read_csv(os.path.join(base_dir, "peaksets", f"allMonomerMutatedSites.fimo{fimo_th}.tsv"), sep="\t", header=0)
allDimerSites = pd.read_csv(os.path.join(base_dir, "peaksets", f"allDimerMutatedSites.fimo{fimo_th}.tsv"), sep="\t", header=0)
# 20 controls trimmed to 134bp
trimmed_control_CREs = pd.read_csv(os.path.join(base_dir, "peaksets", "positiveControls.trimmed.tsv"), sep="\t", header=0)
# 150 scrambles
scrambled_CREs = pd.read_csv(os.path.join(base_dir, "peaksets", "scrambled150.withShuffles.tsv"), sep="\t", header=0)

# fastas
allCRE_fasta = sequence_annotator.read_fasta(os.path.join(base_dir, "sequences", "mpraAllCRE.fa"))
singleDimerMutants = sequence_annotator.read_fasta(os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", "mutatedCRE.k88n_olap.MEME.2.mutated.fa"))
singleMonomerMutants = sequence_annotator.read_fasta(os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", "mutatedCRE.CRX_Corbo.mutated.fa"))
doubleMutants = sequence_annotator.read_fasta(os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", "mutatedCRE.dimerMonomer.mutated.fa"))
trimmed_control_fasta = sequence_annotator.read_fasta(os.path.join(base_dir, "sequences", "positiveControls.trimmed.fa"))
scrambled_fasta = sequence_annotator.read_fasta(os.path.join(base_dir,"sequences", "scrambled150_shuffled.fa"))

# occupancy data
mu = 0.1
# unmaked occupancy landscape
unmasked_occupancy_df = pd.read_csv(os.path.join(base_dir, "predicted_occp", f"allCRE.unmasked.occupancy.mu{str(mu)}.tsv"), sep="\t", header=0)
# dimeric motif maksed occupancy landscape
masked_occupancy_df = pd.read_csv(os.path.join(base_dir, "predicted_occp", f"allCRE.masked.occupancy.mu{str(mu)}.tsv"), sep="\t", header=0)
mu = 9
# unmaked occupancy landscape
unmasked_occupancy_df2 = pd.read_csv(os.path.join(, "predicted_occp", f"allCRE.unmasked.occupancy.mu{str(mu)}.tsv"), sep="\t", header=0)
# dimeric motif maksed occupancy landscape
masked_occupancy_df2 = pd.read_csv(os.path.join(base_dir, "predicted_occp", f"allCRE.masked.occupancy.mu{str(mu)}.tsv"), sep="\t", header=0)

In [106]:
fimo_th = 2.5e-3
mpra_library_oligo = sequence_annotator.read_fasta(os.path.join(base_dir, "sequences", f"hdmuts_library.Oligos.fimo{fimo_th}.filtered.fa"))
control_oligo = sequence_annotator.read_fasta(os.path.join(base_dir, "sequences", f"hdmuts_control.Oligos.fimo{fimo_th}.fa"))
basal_oligo = sequence_annotator.read_fasta(os.path.join(base_dir, "sequences", "hdmuts_basal.Oligos.fa"))

#### there are some barcodes that ends with CGGC or starts with GGCCG|GCATG|GAATT will create RE site and needs to be drop

In [98]:
all_barcodes = pd.read_csv(os.path.join(base_dir,"barcodes","validated10bp_barcodes.txt"), header=None).rename(columns={0:"BC"})

In [208]:
all_barcodes = all_barcodes[~all_barcodes.BC.apply(lambda row: (row[-4:]=="CGGC")|(row[-5:]=="GCATG")|(row[:5]=="GGCCG")|(row[-5:]=="GAATT"))].reset_index(drop=True)

In [209]:
all_barcodes.to_csv(os.path.join(base_dir,"barcodes","validated10bp_barcodes.txt"), index=False, header=False)

In [210]:
len(all_barcodes)

19738

### check BC match in oligo and that in the label, check RE site

In [26]:
mpra_library_oligo.index[0]

'chr1-4360266-4360399_peak.3_RetinalGene_WT_TTGATCCTAC'

In [33]:
control_oligo.index[0]

'chr14-20440153-20440286_peak.9841_control_WT_TGTATGCCGG'

In [34]:
basal_oligo.index[0]

'basal.1_GGTAATCTAT'

In [108]:
mpra_library_CREs = mpra_library_oligo.index.str.split(pat="_|-", expand=True).to_frame().reset_index(drop=True)

In [211]:
CRX_CREs = mpra_library_CREs[mpra_library_CREs[7].isnull()].drop(columns=7).copy().rename(columns={0:"seqnames",
                                                                                                    1:"start",
                                                                                                    2:"end",
                                                                                                    3:"peak.id",
                                                                                                    4:"annotation",
                                                                                                    5:"motif",
                                                                                                    6:"BC"})
CRX_oligo = mpra_library_oligo[CRX_CREs.index].reset_index().rename(columns={0:"fullOligo"})
CRX_CREs = pd.merge(CRX_CREs.reset_index(drop=True), CRX_oligo, left_index=True, right_index=True)

# use new BCs
CRX_CREs["BC"] = all_barcodes[:len(CRX_CREs)].reset_index(drop=True)
CRX_CREs["label"] = CRX_CREs["seqnames"]+"-"+CRX_CREs["start"].astype(str)+"-"+CRX_CREs["end"].astype(str)+"_"+CRX_CREs["peak.id"]+"_"+CRX_CREs["annotation"]+"_"+CRX_CREs["motif"]+"_"+CRX_CREs["BC"]
CRX_CREs["CRE"] = CRX_CREs["fullOligo"].apply(lambda row: row[len(primerF+EcoRI):-len(SpeI+"C"+SphI+"NNNNNNNNNN"+EagI+primerR)])
CRX_CREs["fullOligo"] = primerF+EcoRI+CRX_CREs["CRE"]+SpeI+"C"+SphI+CRX_CREs["BC"]+EagI+primerR_revcomp
CRX_CREs["BC.inOligo"] = CRX_CREs["fullOligo"].apply(lambda row: row[-len("NNNNNNNNNN"+EagI+primerR):-len(EagI+primerR)])

In [183]:
# if all BCs in oligo matches that in label
set(CRX_CREs["BC"]) - set(CRX_CREs["BC.inOligo"])

0

In [184]:
CRX_CREs.iloc[mpra_CRE_utils.find_REsite_match(CRX_CREs["CRE"], RE_list=[EcoRI,SpeI,SphI,EagI]).index,:]

Looking for matches: GAATTC|ACTAGT|GCATGC|CGGCCG


Unnamed: 0,seqnames,start,end,peak.id,annotation,motif,BC,label,fullOligo,CRE,BC.inOligo


In [106]:
# there is a match of TACTAGTG match due to a mutation in the monomer site, manually change it to TCCTAGTG in the fasta and reload

In [185]:
mpra_CRE_utils.find_REsite_match(CRX_CREs["CRE"], RE_list=[EcoRI,SpeI,SphI,EagI])

Looking for matches: GAATTC|ACTAGT|GCATGC|CGGCCG


Series([], Name: CRE, dtype: object)

In [212]:
scrambled_CREs = mpra_library_CREs[mpra_library_CREs[7].notnull()].copy().rename(columns={0:"seqnames",
                                                                                                    1:"start",
                                                                                                    2:"end",
                                                                                                    3:"peak.id",
                                                                                                    4:"shuffled",
                                                                                                    5:"annotation",
                                                                                                    6:"motif",
                                                                                                    7:"BC"})
scrambled_oligo = mpra_library_oligo[scrambled_CREs.index].reset_index().rename(columns={0:"fullOligo"})
scrambled_CREs = pd.merge(scrambled_CREs.reset_index(drop=True), scrambled_oligo, left_index=True, right_index=True)

# use new BCs
scrambled_CREs["BC"] = all_barcodes[len(CRX_CREs):len(CRX_CREs)+len(scrambled_CREs)].reset_index(drop=True)
# rename shulffled sequences so that the _shuff tag would not be cutoff ?? is this necessary/which is more convenient when parsing the final fasta
scrambled_CREs["label"] = scrambled_CREs["seqnames"]+"-"+scrambled_CREs["start"].astype(str)+"-"+scrambled_CREs["end"].astype(str)+"_"+scrambled_CREs["peak.id"]+".shuff"+"_"+scrambled_CREs["annotation"]+"_"+scrambled_CREs["motif"]+"_"+scrambled_CREs["BC"]
scrambled_CREs["CRE"] = scrambled_CREs["fullOligo"].apply(lambda row: row[len(primerF+EcoRI):-len(SpeI+"C"+SphI+"NNNNNNNNNN"+EagI+primerR)])
scrambled_CREs["fullOligo"] = primerF+EcoRI+scrambled_CREs["CRE"]+SpeI+"C"+SphI+scrambled_CREs["BC"]+EagI+primerR_revcomp
scrambled_CREs["BC.inOligo"] = scrambled_CREs["fullOligo"].apply(lambda row: row[-len("NNNNNNNNNN"+EagI+primerR):-len(EagI+primerR)])

In [187]:
scrambled_CREs["label"][0]

'chr1-66831349-66831482_peak.159.shuff_ELost_scrambled_CAGCTATCGT'

In [188]:
# if all BCs in oligo matches that in label
sum(scrambled_CREs["BC"]!=scrambled_CREs["BC.inOligo"])

0

In [189]:
mpra_CRE_utils.find_REsite_match(scrambled_CREs["CRE"], RE_list=[EcoRI,SpeI,SphI,EagI])

Looking for matches: GAATTC|ACTAGT|GCATGC|CGGCCG


Series([], Name: CRE, dtype: object)

In [213]:
control_CREs = control_oligo.index.str.split(pat="_|-", expand=True).to_frame().reset_index(drop=True).rename(columns={0:"seqnames",
                                                                                                                        1:"start",
                                                                                                                        2:"end",
                                                                                                                        3:"peak.id",
                                                                                                                        4:"annotation",
                                                                                                                        5:"motif",
                                                                                                                        6:"BC"})
control_CREs = pd.merge(control_CREs.reset_index(drop=True), control_oligo.reset_index().rename(columns={0:"fullOligo"}), left_index=True, right_index=True)


# use new BCs
control_CREs["BC"] = all_barcodes[len(CRX_CREs)+len(scrambled_CREs):len(CRX_CREs)+len(scrambled_CREs)+len(control_CREs)].reset_index(drop=True)
control_CREs["label"] = control_CREs["seqnames"]+"-"+control_CREs["start"].astype(str)+"-"+control_CREs["end"].astype(str)+"_"+control_CREs["peak.id"]+"_"+control_CREs["annotation"]+"_"+control_CREs["motif"]+"_"+control_CREs["BC"]
control_CREs["CRE"] = control_CREs["fullOligo"].apply(lambda row: row[len(primerF+EcoRI):-len(SpeI+"C"+SphI+"NNNNNNNNNN"+EagI+primerR)])
control_CREs["fullOligo"] = primerF+EcoRI+control_CREs["CRE"]+SpeI+"C"+SphI+control_CREs["BC"]+EagI+primerR_revcomp
control_CREs["BC.inOligo"] = control_CREs["fullOligo"].apply(lambda row: row[-len("NNNNNNNNNN"+EagI+primerR):-len(EagI+primerR)])

In [191]:
# if all BCs in oligo matches that in label
sum(control_CREs["BC"]!=control_CREs["BC.inOligo"])

0

In [192]:
mpra_CRE_utils.find_REsite_match(control_CREs["CRE"], RE_list=[EcoRI,SpeI,SphI,EagI])

Looking for matches: GAATTC|ACTAGT|GCATGC|CGGCCG


Series([], Name: CRE, dtype: object)

In [246]:
basal_CREs = basal_oligo.index.str.split(pat="_|-", expand=True).to_frame().reset_index(drop=True).reset_index(drop=True)
basal_CREs = basal_oligo.index.str.split(pat="_|-", expand=True).to_frame().reset_index(drop=True).rename(columns={0:"peak.id",
                                                                                                                    1:"BC"})
basal_CREs = pd.merge(basal_CREs.reset_index(drop=True), basal_oligo.reset_index().rename(columns={0:"fullOligo"}), left_index=True, right_index=True)

# use new BCs
basal_CREs["BC"] = all_barcodes[len(CRX_CREs)+len(scrambled_CREs)+len(control_CREs):len(CRX_CREs)+len(scrambled_CREs)+len(control_CREs)+20].reset_index(drop=True)
basal_CREs["label"] = basal_CREs["peak.id"]+"_"+basal_CREs["BC"]
basal_CREs["fullOligo"] = primerF+EcoRI+SpeI+"C"+basalFiller+SphI+basal_CREs["BC"]+EagI+primerR_revcomp
basal_CREs["BC.inOligo"] = basal_CREs["fullOligo"].apply(lambda row: row[-len("NNNNNNNNNN"+EagI+primerR):-len(EagI+primerR)])

In [194]:
# if all BCs in oligo matches that in label
sum(basal_CREs["BC"]!=basal_CREs["BC.inOligo"])

0

### check unique BCs

In [247]:
CRX_oligo_BCs = CRX_CREs["BC.inOligo"].copy()
scrambled_oligo_BCs = scrambled_CREs["BC.inOligo"].copy()
control_oligo_BCs = control_CREs["BC.inOligo"].copy()
basal_oligo_BCs = basal_CREs["BC.inOligo"].copy()

In [196]:
len(pd.concat([CRX_oligo_BCs,scrambled_oligo_BCs,control_oligo_BCs,basal_oligo_BCs]).unique())

17920

In [197]:
len(mpra_library_oligo)+len(control_oligo)+len(basal_oligo)

17920

### check remaining marked HD motifs not mutated

In [147]:
# 1-indexed nucleotide position dictionaries
dimer_core = {1:"T", 2:"A", 3:"A", 9:"T", 10:"T", 11:"A"} # k88n_olap.MEME.2
corbo_mono_core = {2:"T", 3:"A", 4:"A"} # CRX_Corbo
crx_mono_core = {8:"T", 9:"T", 10:"A"} # JASPAR
n50_mono_core = {5:"T", 6:"T", 7:"A"} # k88n_olap.DERME.1
dimer_mutant_core = {3:"C", 9:"G"}
corbo_mutant_core = {4:"C"}
crx_mutant_core = {8:"G"}
n50_mutant_core = {5:"G"}

dimer_motif="k88n_olap.MEME.2"
mono_motif = "CRX_Corbo"
n50_motif = "k88n_olap.DREME.1"

In [142]:
# motif position information
allMonomerSites = pd.read_csv(os.path.join(base_dir, "peaksets", f"allMonomerMutatedSites.fimo{fimo_th}.tsv"), sep="\t", header=0)
allDimerSites = pd.read_csv(os.path.join(base_dir, "peaksets", f"allDimerMutatedSites.fimo{fimo_th}.tsv"), sep="\t", header=0)

In [144]:
dimerMutated_fasta = CRX_CREs.loc[lambda df: df.motif=="mutD","CRE"].copy().dropna()

In [148]:
dimerMatched_fimo_score, _ = mpra_CRE_utils.find_and_mutate_motif(dimerMutated_fasta, allDimerSites, dimer_motif, dimer_mutant_core, dimer_core)

In [149]:
dimerMatched_fimo_score

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq


In [182]:
monomerMatched_fimo_score, _ = mpra_CRE_utils.find_and_mutate_motif(dimerMutated_fasta, allMonomerSites, mono_motif, corbo_mutant_core, corbo_mono_core)
monomerMatched_fimo_score2, _ = mpra_CRE_utils.find_and_mutate_motif(dimerMutated_fasta, allMonomerSites, mono_motif, n50_mutant_core, n50_mono_core)

In [151]:
monomerMatched_fimo_score

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq


In [183]:
monomerMatched_fimo_score2

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq


In [152]:
doubleMatched_fimo_score1, _ = mpra_CRE_utils.find_and_mutate_motif(dimerMutated_fasta, allDimerSites, dimer_motif, dimer_mutant_core, dimer_core)
doubleMatched_fimo_score2, _ = mpra_CRE_utils.find_and_mutate_motif(dimerMutated_fasta, allMonomerSites, mono_motif, corbo_mutant_core, corbo_mono_core)

In [153]:
doubleMatched_fimo_score1

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq


In [154]:
doubleMatched_fimo_score2

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq


### just to be sure, scan the full mutated CRE library again with fimo

In [171]:
allMutated_fasta = CRX_CREs.drop_duplicates(subset=["seqnames","start"]).loc[lambda df: df.motif!="wt",["label","CRE"]].copy().set_index("label").squeeze()

In [173]:
equence_annotator.write_fasta(allMutated_fasta, os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", "hdmuts_library.allMutatedCREs.fa"))

In [164]:
meme_dir = f"{base_dir}/meme/all_chip_pwm.meme"
fimo_meta = f"{base_dir}/scripts/fimo_meta.csv"

In [166]:
fimo_sample_list = pd.read_csv(fimo_meta, header=0)
fimo_sample_list.tail(1)

Unnamed: 0,sampleName,inputFA,markovBG,outputDir
12,final_exam,final_exam/hdmuts_library.allMutatedCREs.fa,final_exam/hdmuts_library.allMutatedCREs.backg...,final_exam


In [174]:
# copy the newly generated fasta to fimo folder
old_dir = os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", "hdmuts_library.allMutatedCREs.fa")
new_dir = os.path.join(base_dir, f"fimo_{fimo_th}", os.path.split(fimo_sample_list.iloc[-1,1])[0].split("/")[-1])
!mkdir -p "{new_dir}"
!cp "{old_dir}" "{new_dir}"

In [175]:
!bash ./scripts/meme_fimo_scanning.sh "{scriptdir}" "{base_dir}" "{meme_dir}" "{fimo_th}" "{fimo_meta}" 13

working directory: /mnt/v/yqzheng/qiaoer/VSCode_yiqiao/SPEC-SEQ/scripts
query fasta: /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/final_exam/hdmuts_library.allMutatedCREs.fa
[K1788 134 134 134.0 239592
Scanning with threshold 0.0025
FIMO output will be written to /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/final_exam
ha! this is the end of the script!


In [176]:
# read fimo score for the unmasked fasta sequences
fimo_score = pd.read_csv(os.path.join(base_dir, f"fimo_{fimo_th}", "final_exam", "fimo.tsv"), sep="\t", header=0)

In [178]:
dimerMatched_fimo_score2, _ = mpra_CRE_utils.find_and_mutate_motif(dimerMutated_fasta, fimo_score, dimer_motif, dimer_mutant_core, dimer_core)

In [179]:
dimerMatched_fimo_score2

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq


In [184]:
monomerMatched_fimo_score3, _ = mpra_CRE_utils.find_and_mutate_motif(dimerMutated_fasta, fimo_score, mono_motif, corbo_mutant_core, corbo_mono_core)
monomerMatched_fimo_score4, _ = mpra_CRE_utils.find_and_mutate_motif(dimerMutated_fasta, fimo_score, mono_motif, n50_mutant_core, n50_mono_core)

In [185]:
monomerMatched_fimo_score3

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq


In [186]:
monomerMatched_fimo_score4

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq


### everything listed looks good, compile annotation dataframe and fasta

In [248]:
# annotation dataframe
full_library_df = pd.concat([CRX_CREs, scrambled_CREs, control_CREs, basal_CREs]).drop(columns="shuffled").reset_index(drop=True)

#### peak.8438 starts with GGCCG and will create a EagI site when placed next to EcoRI site, remove all instances for this peak

In [249]:
full_library_df = full_library_df.loc[lambda df: ~(df["peak.id"]=="peak.8483"),:].reset_index(drop=True)

In [250]:
sum(full_library_df["fullOligo"].apply(lambda row: len(row)==200))

17912

In [251]:
sum(full_library_df["BC.inOligo"].apply(lambda row: (row[-4:]=="CGGC")|(row[-5:]=="GCATG")|(row[:5]=="GGCCG")|(row[-5:]=="GAATT")))

0

In [252]:
full_library_df.to_csv(os.path.join(base_dir, "submission", f"full_library.fimo{fimo_th}.tsv"), sep="\t", header=True, index=False)

In [253]:
# annotation dataframe in Twist format
full_library_forTwist = full_library_df.loc[:,["label","fullOligo"]].copy()
full_library_forTwist.to_csv(os.path.join(base_dir, "submission", f"full_library.fimo{fimo_th}.Twist.tsv"), sep="\t", header=True, index=False)

In [254]:
# label to BC map
full_library_BCs = full_library_df.loc[:,["label","BC","BC.inOligo"]].copy()
full_library_BCs.to_csv(os.path.join(base_dir, "submission", f"full_library.fimo{fimo_th}.BarCodes.tsv"), sep="\t", header=True, index=False)

In [255]:
# CRE sequences only
full_library_CREs = full_library_df.copy().dropna().loc[:,["label","CRE"]].set_index("label").squeeze()
sequence_annotator.write_fasta(full_library_CREs, os.path.join(base_dir, "submission", f"full_library.CREsOnly.fa"))

In [256]:
# full oligo sequences
full_library_fasta = full_library_df.copy().loc[:,["label","fullOligo"]].set_index("label").squeeze()
sequence_annotator.write_fasta(full_library_fasta, os.path.join(base_dir, "submission", f"full_library.fullOligo.fa"))

In [175]:
primerR_revcomp

'CCAACTACTACTACAG'

In [174]:
mpra_CRE_utils.find_REsite_match(all_barcodes.squeeze(), RE_list=[EcoRI,SpeI,SphI,EagI])

Looking for matches: GAATTC|ACTAGT|GCATGC|CGGCCG


Series([], Name: BC, dtype: object)

In [148]:
EagI

'CGGCCG'

In [147]:
SphI

'GCATGC'

## FINAL! 06-27-2022 15:50

In [None]:
full_library_forTwist = pd.read_csv(os.path.join(base_dir, "submission", f"full_library.fimo{fimo_th}.Twist.tsv"), sep="\t", header=0)

In [134]:
full_library_forTwist["BC.inOligo"] = full_library_forTwist["fullOligo"].apply(lambda row: row[-len("NNNNNNNNNN"+EagI+primerR):-len(EagI+primerR)])

In [135]:
len(full_library_forTwist["BC.inOligo"].unique())

17920

In [136]:
full_library_forTwist["BC.inLabel"] = full_library_forTwist["label"].apply(lambda row: row[-10:])

In [137]:
len(full_library_forTwist["BC.inLabel"].unique())

17920

In [138]:
sum(full_library_forTwist["BC.inLabel"]==full_library_forTwist["BC.inOligo"])

17920

In [139]:
check_basal = full_library_forTwist.iloc[-20,1]
full_library_forTwist.iloc[-20,0]

'peak.9841_CACGTTCACG'

In [140]:
primerF+EcoRI+SpeI+"C"+basalFiller == check_basal[0:len(primerF+EcoRI+SpeI+"C"+basalFiller)]

True

In [142]:
SphI+full_library_forTwist.iloc[-20,0][-10:]+EagI+primerR_revcomp == check_basal[-len(SphI+"NNNNNNNNNN"+EagI+primerR_revcomp):]

True