### for random selection and shuffling of fasta sequences

In [1]:
import os, sys, warnings, re, glob2, itertools, string, random, math

import numpy as np
import pandas as pd
import scipy
from scipy import stats

# fix the seed of Python's `random` module so the random.sample draws below are repeatable
random.seed(4444)

### I. housekeeping

In [2]:
# Set up working directories.
# base_dir is the parent of the directory the kernel was started in; the kernel's
# start directory is expected to hold the "utils" helper package.
# The guard makes the cell safe to re-run: without it a second execution would
# climb one more directory level and leave base_dir pointing at the wrong folder.
if "base_dir" not in globals():
    notebook_dir = os.getcwd()
    base_dir = os.path.split(notebook_dir)[0]
    # make the helper modules that sit next to the notebook importable
    sys.path.insert(0, os.path.join(notebook_dir, "utils"))
    # all later relative paths (e.g. ./scripts/...) are resolved from base_dir
    os.chdir(base_dir)
scriptdir = os.path.join(base_dir, "scripts")

In [4]:
# project-local helper modules; `mpra_CRE_utils` is the name every later cell calls
# (the previous spelling `mpra_CRE_utilss` did not match those calls)
from utils import specseq_plot_utils, sequence_annotator, mpra_CRE_utils
specseq_plot_utils.set_manuscript_params() # max 7pt

In [5]:
# recognition sequences of the restriction enzymes (RE) that are searched for in the
# shuffled sequences further down (see find_REsite_match)
EcoRI="GAATTC"
SpeI="ACTAGT"
SphI="GCATGC"
EagI="CGGCCG"

### II. retrieve all peaks

In [139]:
# FIMO threshold; it is part of the annotation file name
fimo_th = 2.5e-3
# annotation table covering the complete peak list
mpraAnnot_df = pd.read_csv(
    os.path.join(base_dir, "peaksets", f"allCRE_annotation.fimo{fimo_th}.tsv"),
    sep="\t",
    header=0,
)
# FASTA records of all CREs
allCRE_fasta = sequence_annotator.read_fasta(
    os.path.join(base_dir, "sequences", "mpraAllCRE.fa")
)

### III. select N numbers of sequences and generate scramble sequences

In [64]:
# positional (0-based) indices of all CREs, as consumed by .iloc in the next cell;
# starting the range at 0 keeps the first CRE eligible for selection
allCRE = range(len(allCRE_fasta))
# randomly draw 180 distinct positions (reproducible through random.seed above)
subsetCRE = random.sample(allCRE, 180)

In [65]:
# retrieve annotation and fasta of the CREs drawn above.
# subsetCRE holds row positions, so both objects are indexed with .iloc; plain
# `series[int_list]` on a label-indexed Series relies on a deprecated positional fallback.
# assumes mpraAnnot_df rows and allCRE_fasta records are in the same order — the next cell checks this
subsetCRE_df = mpraAnnot_df.iloc[subsetCRE,:].copy()
subsetCRE_fasta = allCRE_fasta.iloc[subsetCRE].copy()

In [51]:
# make sure the annotation and fasta are matched:
# peak ids present in the annotation subset but absent from the fasta subset (expected: empty set)
set(subsetCRE_df["peak.id"])-set(subsetCRE_fasta.index)

set()

In [7]:
# read directly from file
# NOTE(review): this replaces the random draw made above with the subset stored in
# peaksets/scrambledSubset.tsv, which is the file the following cell writes; presumably an
# alternative entry point for re-using an earlier selection — confirm the intended run order.
subsetCRE_df = pd.read_csv(os.path.join(base_dir, "peaksets", "scrambledSubset.tsv"), sep="\t", header=0)
# select the fasta records by peak id (label-based)
subsetCRE_fasta = allCRE_fasta[subsetCRE_df["peak.id"]].copy()

In [66]:
# write selected scramble peak annotations to file (tab-separated, with header, without the row index)
subsetCRE_df.to_csv(os.path.join(base_dir, "peaksets", "scrambledSubset.tsv"), sep="\t", header=True, index=False)

In [67]:
# output fasta to a new file; this file is the input of the shuffling step below
sequence_annotator.write_fasta(subsetCRE_fasta, os.path.join(base_dir, "sequences", "subsetCREs.fa"))

In [9]:
# fasta of the selected CREs (input) and destination of the shuffled sequences (output)
input_fa =os.path.join(base_dir, "sequences" , "subsetCREs.fa")
output_fa = os.path.join(base_dir,"sequences", "subsetCREs_shuffled.fa")

In [69]:
# use the fasta-shuffle-letters tool that comes with MEME to shuffle all sequences while preserving
# dinucleotide frequencies (-kmer 2); every shuffled record gets the "_shuff" tag appended to its name
!fasta-shuffle-letters -kmer 2 -dna -tag "_shuff" "{input_fa}" "{output_fa}"

In [70]:
# load the shuffled sequences and look for restriction-enzyme sites in them
scrambled_fasta = sequence_annotator.read_fasta(output_fa)
scrambled_REmatch = mpra_CRE_utils.find_REsite_match(
    scrambled_fasta,
    RE_list=[EcoRI, SpeI, SphI, EagI],
)

Looking for matches: GAATTC|ACTAGT|GCATGC|CGGCCG


In [71]:
# display the result of the RE-site search (presumably the shuffled sequences that contain a site — confirm in find_REsite_match)
scrambled_REmatch

label
peak.3932_shuff    GCTGTATTGAGGGAATTCCAGTCTTCCAGGAATTCGACTGAAAGCC...
peak.1358_shuff    GAGCTTAGTTGGCAAGGGGCAGGGACTGAGCAGCATGACTTCAGGG...
peak.9184_shuff    TAGCAGGGCCTGGCCTGCTAGTCCTGAGCAAGCAACCCTCTTCTAG...
peak.2769_shuff    GATGGAACCGGTGACAACAACCTCCTGTAGGTAGTATTCCCTCAAT...
peak.8225_shuff    TTATTAAGAAAAAAATAGCTCATCAGTCTATGCACTCTGCATGCTA...
peak.857_shuff     GGTTCCGTGGGCTTTAGGTCGGAGGAGTGTAAGTTAAGTCAGGTGT...
peak.5896_shuff    GTTGAGTATGGGGTGCCCTGTCAGGCTCCGCGCTGGTTGTATTGGC...
peak.4736_shuff    CTTGGTAGGGCTGAATTCTACTTCCTGAGTTGGTCTCCCTTGAAGA...
peak.71_shuff      AATTAGCAGCAGGTTAAGCTTGGGTGCTGGCACGTCTTTGTGCACT...
peak.2626_shuff    TGCACTGCTGTGGGGTGGTAACGTAAGGCTCAGATGAGGAACTGTC...
peak.3005_shuff    TGATTTCAGTTTGAGATGCAATGAGAGATCCTAAACAACCACCCTT...
peak.1369_shuff    GGGCCACTTCAAGAAGATCACGGAGCTTAGCGTGACACAGGCGGCG...
dtype: object

#### drop any scrambled sequences that contain a restriction enzyme (RE) site match

In [60]:
# remove every shuffled sequence that matched a restriction site.
# Filtering with isin (instead of .drop) keeps the cell re-runnable: .drop raises a
# KeyError once the labels have already been removed from scrambled_fasta.
scrambled_fasta = scrambled_fasta[~scrambled_fasta.index.isin(scrambled_REmatch.index)]

In [61]:
# attach the shuffled fasta sequences to the annotation dataframe
scrambled_df = scrambled_fasta.to_frame().reset_index(drop=False).rename(columns={"label":"peak.id", 0:"shuffled.FASTA"})
# remove the "_shuff" tag to recover the original peak id.
# str.strip("_shuff") would strip any of the characters _, s, h, u, f from BOTH ends of the
# label rather than the literal suffix, so an anchored regex is used instead.
scrambled_df["peak.id"] = scrambled_df["peak.id"].str.replace(r"_shuff$", "", regex=True)
# keep only CREs that still have a shuffled counterpart (inner join on peak id)
scrambled_df = pd.merge(subsetCRE_df, scrambled_df, left_on="peak.id", right_on="peak.id", how="inner")

In [62]:
# display the annotation table with the shuffled sequence attached to each CRE
scrambled_df

Unnamed: 0,peak.id,seqnames,start,end,summit,width,strand,CRX_Corbo,N50,HDmono,HDdimer,NRL,AP1,annotation,FASTA,shuffled.FASTA
0,peak.8080,chr7,34142450,34142583,34142517,134,+,1,0,2,0,1,0,KLost,CTGCTCAAAGAGCCAGATGCACAGTTTATGGCCTACAAATAAGCAG...,CCCCAAATGCATGTTAGGCATACGCCGGAGCCTACAGAAGCTAGAC...
1,peak.4849,chr19,37211579,37211712,37211646,134,+,3,0,2,0,0,0,KLost,CACGTGTAACCTTAGTGCTAGAGAAGCAGGAACAGGATAATCCCTG...,CATGAGGCGTAGTAGTGTCCCTAGCTCAAAACATGGAGCTTAGCTA...
2,peak.9110,chr9,16257517,16257650,16257584,134,+,0,0,0,0,0,0,KGain,TAGCAGGAAACCTATTAGGGCTTTTCTATTTACAGCTTTCTACAAG...,TTAGCATTGTCTCACCACCTATATCTTACTATCCAGGCTAGATTCC...
3,peak.5992,chr3,100538894,100539027,100538961,134,+,2,0,2,0,0,1,KLost,AGACATTTCAGTCACTCCTGAAAGCTTACTGGCGCTCTGCTGCTCA...,AGGAAGCCTACACAAGCAAACTTGGCATCTCTGCTGTCTTCTTTGT...
4,peak.1842,chr11,118825478,118825611,118825545,134,+,1,0,1,0,0,0,NotDB,CAGGGTTGTCAACAGCTCCGGGGAGGGGCTGTTGGGTGAAGCTCTG...,CTGGGTTGCGCTATGGGCCTACCAACAGCAGAAGGCGTGGTGAGAG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,peak.9755,chrX,57892454,57892587,57892521,134,+,0,1,1,1,1,0,NotDB,AATAAAGCCAGGAGCAGGTGTCAGCTGCTCCTGCAGTCAAAACAAC...,AGTAAGAGCCAGTTCTATTGGTACATGGGCTTTTGGGGCATTTCAT...
161,peak.9378,chr9,60603065,60603198,60603132,134,+,2,0,1,0,0,0,NotDB,CTCCCCTCCTCCCACAGACTCCCTGCTGCCTTCATGTCACATGTAT...,CAAAGGCTAACTACATCTGCATTGGATGAAATGTAAATCAACCTCC...
162,peak.9711,chr9,123496490,123496623,123496557,134,+,4,1,3,0,0,2,KLost,TTCCCAATAATGACACAGAGAGACCAAGATTTATTTTCGGCTTAAA...,TTGTCCTCTAGTTTATTTCTATAATACACTTCAGTAATAAATGGAC...
163,peak.5739,chr3,19405166,19405299,19405233,134,+,1,0,1,0,2,0,NotDB,TGCAAGTATTGAGAGGTTCCGGAGAGTACTAAGCAGGTAATGTAAG...,TGCCAGCATGAAGAGCAACTAGATTGAGGGCTAGGTTAAGGAGCTA...


In [63]:
# write selected scramble peak annotations to file.
# The table is written under base_dir because that is where the next section reads
# "scrambledSubset.withShuffles.tsv" back from; `mpraout_dir` is not defined in this notebook.
scrambled_df.to_csv(os.path.join(base_dir, "peaksets", "scrambledSubset.withShuffles.tsv"), sep="\t", header=True, index=False)

### Now that more than 150 validated scrambled FASTA sequences are available, select 150

In [140]:
# reload the validated scrambles: annotation table first, then the shuffled fasta
scrambled_df = pd.read_csv(
    os.path.join(base_dir, "peaksets", "scrambledSubset.withShuffles.tsv"),
    sep="\t",
    header=0,
)
scrambled_fasta = sequence_annotator.read_fasta(
    os.path.join(base_dir, "sequences", "subsetCREs_shuffled.fa")
)
# restrict the fasta to the peaks kept in the table (labels carry the "_shuff" tag)
scrambled_fasta = scrambled_fasta.loc[scrambled_df["peak.id"] + "_shuff"]

In [17]:
# positional (0-based) indices of all scrambled CREs; starting at 0 keeps the
# first record eligible (range(1, n) silently excluded it)
allScrambled = range(len(scrambled_fasta))
# randomly draw 150 distinct positions
subsetScrambled = random.sample(allScrambled, 150)
# get the selected peaks; both objects are indexed by position (.iloc) because
# subsetScrambled holds row positions, not labels
subsetScrambled_df = scrambled_df.iloc[subsetScrambled,:].copy()
subsetScrambled_fasta = scrambled_fasta.iloc[subsetScrambled].copy()

In [20]:
# sanity check: tagged peak ids of the selected table rows that are missing from the selected fasta (expected: empty set)
set(subsetScrambled_df["peak.id"]+"_shuff")-set(subsetScrambled_fasta.index)

set()

In [155]:
# write the 150 selected scramble peak annotations and their shuffled fasta to file.
# The table written is the 150-row selection (subsetScrambled_df) so that it matches
# the fasta written next; writing scrambled_df would export every validated scramble.
subsetScrambled_df.to_csv(os.path.join(base_dir, "peaksets", "scrambled150.withShuffles.tsv"), sep="\t", header=True, index=False)
sequence_annotator.write_fasta(subsetScrambled_fasta, os.path.join(base_dir, "sequences", "scrambled150_shuffled.fa"))

### IV. mutagenize dimeric HD motif cores

In [7]:
# 1-indexed nucleotide position dictionaries: {position within the motif match: nucleotide}
# *_core dictionaries describe the wild-type core of each motif
dimer_core = {1:"T", 2:"A", 3:"A", 9:"T", 10:"T", 11:"A"} # k88n_olap.MEME.2
corbo_mono_core = {2:"T", 3:"A", 4:"A"} # CRX_Corbo
crx_mono_core = {8:"T", 9:"T", 10:"A"} # JASPAR
n50_mono_core = {5:"T", 6:"T", 7:"A"} # k88n_olap.DREME.1
# *_mutant_core dictionaries give the replacement nucleotide at the mutated position(s),
# e.g. position 3 A>C turns the dimer half-site TAA into TAC
dimer_mutant_core = {3:"C", 9:"G"}
corbo_mutant_core = {4:"C"}
crx_mutant_core = {8:"G"}
n50_mutant_core = {5:"G"}

In [8]:
# inputs handed to the FIMO scanning script below
# note: despite its name, meme_dir is the path of the motif file itself (all_chip_pwm.meme)
meme_dir = f"{base_dir}/meme/all_chip_pwm.meme"
fimo_th = 2.5E-3 # as in Ryan's eLife paper
#fimo_th = 1.0E-3 # as in Drew's Genome Research paper
# csv sheet listing, per scan, the sample name, input fasta, background model and output folder
fimo_meta = f"{scriptdir}/fimo_meta.csv"

In [97]:
# load the scan metadata sheet and display it; later cells pick rows from it by position
fimo_sample_list = pd.read_csv(fimo_meta, header=0)
fimo_sample_list

Unnamed: 0,sampleName,inputFA,markovBG,outputDir
0,allCRE,allCRE/mpraAllCRE.fa,allCRE/mpraAllCRE_background,allCRE_fimo
1,allCRE_fimo,allCRE_fimo/allCRE_fimo.k88n_olap.MEME.2.maske...,allCRE_fimo/allCRE_fimo_background,allCRE_fimo2
2,allCRE_fimo2,allCRE_fimo2/allCRE_fimo2.k88n_olap.MEME.2.mas...,allCRE_fimo2/allCRE_fimo2_background,allCRE_fimo3
3,allCRE_fimo3,allCRE_fimo3/allCRE_fimo3.k88n_olap.MEME.2.mas...,allCRE_fimo3/allCRE_fimo2_background,allCRE_fimo4
4,dimerMutCRE,dimerMutCRE/allCRE.k88n_olap.MEME.2.mutated.fa,dimerMutCRE/dimerMutCRE_fimo_background,dimerMutCRE_fimo
5,monoMutCRE,monoMutCRE/allCRE.Crx.MA0467.1.mutated.fa,monoMutCRE/monoMutCRE_fimo_background,monoMutCRE_fimo
6,monoMutCRE_fimo,monoMutCRE_fimo/allCRE.Crx.MA0467.1.mutated.fa,monoMutCRE_fimo/monoMutCRE_fimo_background,monoMutCRE_fimo2
7,monoMutCRE_fimo2,monoMutCRE_fimo2/allCRE.Crx.MA0467.1.mutated.fa,monoMutCRE_fimo2/monoMutCRE_fimo2_background,monoMutCRE_fimo3
8,monoMutCRE,monoMutCRE/allCRE.CRX_Corbo.mutated.fa,monoMutCRE/monoMutCRE_fimo_background,monoMutCRE_fimo
9,monoMutCRE_fimo,monoMutCRE_fimo/allCRE.CRX_Corbo.mutated.fa,monoMutCRE_fimo/monoMutCRE_fimo_background,monoMutCRE_fimo2


In [10]:
# copy the newly generated fasta to fimo folder
new_dir = os.path.join(base_dir, f"fimo_{fimo_th}", os.path.split(fimo_sample_list.iloc[0,1])[0].split("/")[-1])
outputFasta=os.path.join(base_dir,"sequences","mpraAllCRE.fa")
!cp "{outputFasta}" "{new_dir}"

In [11]:
!bash ./meme_fimo_scanning.sh "{scriptdir}" "{base_dir}" "{meme_dir}" "{fimo_th}" "{fimo_meta}" 1

working directory: /mnt/v/yqzheng/qiaoer/VSCode_yiqiao/SPEC-SEQ/scripts
query fasta: /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/allCRE/mpraAllCRE.fa
[Kcessed: 100.0%9829 134 134 134.0 1317086
Scanning with threshold 0.0025
FIMO output will be written to /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/allCRE_fimo
ha! this is the end of the script!


In [12]:
# name of the dimeric HD motif in the motif file
dimer_motif = "k88n_olap.MEME.2"
# FIMO hits for the unmasked (wild-type) fasta sequences
raw_fimo_score = pd.read_csv(
    os.path.join(base_dir, f"fimo_{fimo_th}", "allCRE_fimo", "fimo.tsv"),
    sep="\t",
    header=0,
)
# locate matches of the dimeric HD site and mutate the core (TAA > TAC)
dimerMatched_fimo_score, dimerMutated_fasta = mpra_CRE_utils.find_and_mutate_motif(
    allCRE_fasta,
    raw_fimo_score,
    dimer_motif,
    dimer_mutant_core,
    dimer_core,
)

In [13]:
# number of distinct peaks with at least one dimeric match in the first round
len(dimerMatched_fimo_score["peak.id"].unique())

1410

In [14]:
# save the fasta returned by the first round of dimer mutation
sequence_annotator.write_fasta(dimerMutated_fasta, os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"allCRE.{dimer_motif}.mutated.fa"))

#### go through the second fimo scan results to remove any further dimeric motifs

In [21]:
# FIMO hits of the second scan
raw_fimo_score2 = pd.read_csv(
    os.path.join(base_dir, f"fimo_{fimo_th}", "allCRE_fimo2", "fimo.tsv"),
    sep="\t",
    header=0,
)
# second round: mutate dimeric sites reported by that scan, starting from the round-one fasta
dimerMatched_fimo_score2, dimerMutated_fasta2 = mpra_CRE_utils.find_and_mutate_motif(
    dimerMutated_fasta,
    raw_fimo_score2,
    dimer_motif,
    dimer_mutant_core,
    dimer_core,
)

In [22]:
# display the dimeric sites handled in the second round
dimerMatched_fimo_score2

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq
0,peak.822,k88n_olap.MEME.2,74,84,+,-1.23,TAACCGACTTA,TACCCGACGTA
1,peak.1212,k88n_olap.MEME.2,61,71,+,-1.19,TAACCTGCTTA,TACCCTGCGTA
2,peak.1467,k88n_olap.MEME.2,118,128,+,-1.15,TAAGCGTGTTA,TACGCGTGGTA
3,peak.3193,k88n_olap.MEME.2,71,81,+,-1.19,TAACCTGTTTA,TACCCTGTGTA
4,peak.3801,k88n_olap.MEME.2,38,48,+,-1.19,TAACCTGCTTA,TACCCTGCGTA
5,peak.3823,k88n_olap.MEME.2,89,99,-,-1.15,TAAGCGTCTTA,TACGACGCGTA
6,peak.4961,k88n_olap.MEME.2,50,60,+,-1.19,TAACCTGCTTA,TACCCTGCGTA
7,peak.5196,k88n_olap.MEME.2,2,12,+,-1.19,TAACCTGCTTA,TACCCTGCGTA
8,peak.5411,k88n_olap.MEME.2,67,77,+,-1.15,TAAGCGTCTTA,TACGCGTCGTA
9,peak.6337,k88n_olap.MEME.2,37,47,-,-1.19,TAACCTGCTTA,TACGCAGGGTA


In [23]:
# save the round-two fasta; same path as the round-one file, which is therefore overwritten
sequence_annotator.write_fasta(dimerMutated_fasta2, os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"allCRE.{dimer_motif}.mutated.fa"))

#### scan with FIMO again to check any spurious dimeric motifs generated

In [26]:
# copy the newly generated fasta to fimo folder
old_dir = os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"allCRE.{dimer_motif}.mutated.fa")
new_dir = os.path.join(base_dir, f"fimo_{fimo_th}", os.path.split(fimo_sample_list.iloc[4,1])[0].split("/")[-1])
!cp "{old_dir}" "{new_dir}"

In [27]:
# scan the dimer-mutated fasta with FIMO
# the trailing integer presumably selects the (1-based) row of fimo_meta — TODO confirm in the script
!bash ./scripts/meme_fimo_scanning.sh "{scriptdir}" "{base_dir}" "{meme_dir}" "{fimo_th}" "{fimo_meta}" 5

working directory: /mnt/v/yqzheng/qiaoer/VSCode_yiqiao/SPEC-SEQ/scripts
query fasta: /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/dimerMutCRE/allCRE.k88n_olap.MEME.2.mutated.fa
[K9829 134 134 134.0 1317086
Scanning with threshold 0.0025
FIMO output will be written to /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/dimerMutCRE_fimo
ha! this is the end of the script!


In [28]:
# path of the FIMO table: the output folder is the last column of metadata row 4
f = os.path.join(base_dir, f"fimo_{fimo_th}", fimo_sample_list.iloc[4,-1], "fimo.tsv")
# name of the folder that holds the table
name = os.path.split(f)[0].split("/")[-1]
print('reading ' + f)
# load the raw FIMO output and discard its last three rows
raw_fimo_score3 = pd.read_csv(f, sep="\t", header=0).iloc[:-3]

reading /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/dimerMutCRE_fimo/fimo.tsv


In [29]:
# count remaining dimeric core motifs per sequence.
# raw_fimo_score3 comes from the scan of the dimer-mutated fasta, so the hits are
# evaluated against that same fasta (dimerMutated_fasta2) rather than the wild-type
# sequences — this mirrors how the monomer checks below pair fasta and scan.
# NOTE(review): confirm against count_motif_occur how the fasta argument is used.
match_count = mpra_CRE_utils.count_motif_occur(dimerMutated_fasta2, raw_fimo_score3, dimer_motif, coremotif_dict=dimer_core)
# show only sequences that still carry at least one motif
match_count.loc[match_count["motif_count"]!=0,:]

Unnamed: 0_level_0,motif_count
label,Unnamed: 1_level_1


In [30]:
# third round on the latest scan; note that dimerMutated_fasta2 is rebound to the helper's output here
dimerMatched_fimo_score3, dimerMutated_fasta2 = mpra_CRE_utils.find_and_mutate_motif(dimerMutated_fasta2, raw_fimo_score3, dimer_motif, dimer_mutant_core, dimer_core)

In [31]:
# display the dimeric sites found in the third round (the saved output is an empty table)
dimerMatched_fimo_score3

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq


In [32]:
# all dimeric sites have been mutated but peak.id convention is kept
# (written to the same path as the earlier rounds, overwriting them)
sequence_annotator.write_fasta(dimerMutated_fasta2, os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"allCRE.{dimer_motif}.mutated.fa"))

In [204]:
# union of the peak ids that had a dimeric site mutated in any of the three rounds
allDimer_CREs = set().union(
    *(
        hits["peak.id"]
        for hits in (dimerMatched_fimo_score, dimerMatched_fimo_score2, dimerMatched_fimo_score3)
    )
)

In [208]:
# keep only sequences that contain a mutated dimeric HD site.
# A boolean isin mask keeps the records in their original fasta order; indexing with
# list(set) would order them by set iteration, which can differ from run to run.
dimerMutated_fasta3 = dimerMutated_fasta2[dimerMutated_fasta2.index.isin(allDimer_CREs)].copy()
# all peaks containing dimeric motifs that have been mutated (original ids, captured before renaming)
allMutatedDimer_peaks = dimerMutated_fasta3.index
# tag the mutant records so they can be told apart from the wild-type ones
dimerMutated_fasta3.index = dimerMutated_fasta3.index + ".mutD"
len(dimerMutated_fasta3) # total number of D mutants

1420

In [209]:
# all dimeric sites have been mutated; only the mutated sequences are kept, and their names carry the ".mutD" suffix
sequence_annotator.write_fasta(dimerMutated_fasta3, os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"mutatedCRE.{dimer_motif}.mutated.fa"))

### VI. mutate all WT monomeric HD core motifs

In [35]:
# motif identifiers of the two monomeric HD motifs, as they appear in the FIMO tables
mono_motif = "CRX_Corbo"
n50_motif = "k88n_olap.DREME.1"

In [222]:
# start from the dimer-mutated fasta; the per-round monomer hit tables are collected
# later so that all monomer sites can be mutated from the wild-type fasta in one go
monoMutated_fimo_score, monoMutated_fasta = mpra_CRE_utils.find_and_mutate_motif(
    dimerMutated_fasta2, raw_fimo_score3, mono_motif, corbo_mutant_core, corbo_mono_core
)
# second motif on top of the result of the first
n50Mutated_fimo_score, monoMutated_fasta = mpra_CRE_utils.find_and_mutate_motif(
    monoMutated_fasta, raw_fimo_score3, n50_motif, n50_mutant_core, n50_mono_core
)

In [223]:
# number of distinct peaks with a CRX_Corbo site in round one
len(monoMutated_fimo_score["peak.id"].unique())

7909

In [224]:
# number of distinct peaks with a k88n_olap.DREME.1 site in round one
len(n50Mutated_fimo_score["peak.id"].unique())

3210

In [225]:
# save the round-one monomer-mutated fasta (input of the next FIMO scan)
sequence_annotator.write_fasta(monoMutated_fasta, os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"allCRE.{mono_motif}.mutated.fa"))

#### scan again to spot any spurious monomeric HD motif

In [226]:
# copy the newly generated fasta to fimo folder
old_dir = os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"allCRE.{mono_motif}.mutated.fa")
# destination folder = first path component of the inputFA entry of metadata row 8
new_dir = os.path.join(base_dir, f"fimo_{fimo_th}", os.path.split(fimo_sample_list.iloc[8,1])[0].split("/")[-1])
# make sure the folder exists before copying into it
!mkdir -p "{new_dir}"
!cp "{old_dir}" "{new_dir}"

In [227]:
# scan the round-one monomer-mutated fasta with FIMO
# the trailing integer presumably selects the (1-based) row of fimo_meta — TODO confirm in the script
!bash ./scripts/meme_fimo_scanning.sh "{scriptdir}" "{base_dir}" "{meme_dir}" "{fimo_th}" "{fimo_meta}" 9

working directory: /mnt/v/yqzheng/qiaoer/VSCode_yiqiao/SPEC-SEQ/scripts
query fasta: /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/monoMutCRE/allCRE.CRX_Corbo.mutated.fa
[K9829 134 134 134.0 1317086
Scanning with threshold 0.0025
FIMO output will be written to /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/monoMutCRE_fimo
ha! this is the end of the script!


In [228]:
# path of the FIMO table: the output folder is the last column of metadata row 8
f = os.path.join(base_dir, f"fimo_{fimo_th}", fimo_sample_list.iloc[8,-1], "fimo.tsv")
# name of the folder that holds the table
name = os.path.split(f)[0].split("/")[-1]
print('reading ' + f)
# load the raw FIMO output and discard its last three rows
raw_fimo_score4 = pd.read_csv(f, sep="\t", header=0).iloc[:-3]

reading /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/monoMutCRE_fimo/fimo.tsv


In [229]:
# round two: mutate the CRX_Corbo sites reported by the latest scan ...
monoMutated_fimo_score2, monoMutated_fasta2 = mpra_CRE_utils.find_and_mutate_motif(
    monoMutated_fasta, raw_fimo_score4, mono_motif, corbo_mutant_core, corbo_mono_core
)
# ... then the k88n_olap.DREME.1 sites on top of that result
n50Mutated_fimo_score2, monoMutated_fasta2 = mpra_CRE_utils.find_and_mutate_motif(
    monoMutated_fasta2, raw_fimo_score4, n50_motif, n50_mutant_core, n50_mono_core
)

In [230]:
# number of distinct peaks with a CRX_Corbo site in round two
len(monoMutated_fimo_score2["peak.id"].unique())

970

In [231]:
# number of distinct peaks with a k88n_olap.DREME.1 site in round two
len(n50Mutated_fimo_score2["peak.id"].unique())

1103

In [232]:
# save the round-two fasta; same path as round one, which is overwritten
sequence_annotator.write_fasta(monoMutated_fasta2, os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"allCRE.{mono_motif}.mutated.fa"))

In [233]:
# copy the newly generated fasta to fimo folder
old_dir = os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"allCRE.{mono_motif}.mutated.fa")
# destination folder = first path component of the inputFA entry of metadata row 9
new_dir = os.path.join(base_dir, f"fimo_{fimo_th}", os.path.split(fimo_sample_list.iloc[9,1])[0].split("/")[-1])
# make sure the folder exists before copying into it
!mkdir -p "{new_dir}"
!cp "{old_dir}" "{new_dir}"

In [234]:
# scan the round-two monomer-mutated fasta with FIMO
# the trailing integer presumably selects the (1-based) row of fimo_meta — TODO confirm in the script
!bash ./scripts/meme_fimo_scanning.sh "{scriptdir}" "{base_dir}" "{meme_dir}" "{fimo_th}" "{fimo_meta}" 10

working directory: /mnt/v/yqzheng/qiaoer/VSCode_yiqiao/SPEC-SEQ/scripts
query fasta: /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/monoMutCRE_fimo/allCRE.CRX_Corbo.mutated.fa
[K9829 134 134 134.0 1317086
Scanning with threshold 0.0025
FIMO output will be written to /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/monoMutCRE_fimo2
ha! this is the end of the script!


In [235]:
# path of the FIMO table: the output folder is the last column of metadata row 9
f = os.path.join(base_dir, f"fimo_{fimo_th}", fimo_sample_list.iloc[9,-1], "fimo.tsv")
# name of the folder that holds the table
name = os.path.split(f)[0].split("/")[-1]
print('reading ' + f)
# load the raw FIMO output and discard its last three rows
raw_fimo_score5 = pd.read_csv(f, sep="\t", header=0).iloc[:-3]

reading /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/monoMutCRE_fimo2/fimo.tsv


In [236]:
# round three: mutate the CRX_Corbo sites reported by the latest scan ...
monoMutated_fimo_score3, monoMutated_fasta3 = mpra_CRE_utils.find_and_mutate_motif(
    monoMutated_fasta2, raw_fimo_score5, mono_motif, corbo_mutant_core, corbo_mono_core
)
# ... then the k88n_olap.DREME.1 sites on top of that result
n50Mutated_fimo_score3, monoMutated_fasta3 = mpra_CRE_utils.find_and_mutate_motif(
    monoMutated_fasta3, raw_fimo_score5, n50_motif, n50_mutant_core, n50_mono_core
)

In [237]:
# display the CRX_Corbo sites handled in round three
monoMutated_fimo_score3

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq
0,peak.41,CRX_Corbo,75,82,+,6.752,ATAATACT,ATACTACT
1,peak.122,CRX_Corbo,52,59,+,6.288,GTAATTCT,GTACTTCT
2,peak.580,CRX_Corbo,79,86,-,9.136,GTAATCCT,AGGAGTAC
3,peak.615,CRX_Corbo,95,102,-,9.136,GTAATCCT,AGGAGTAC
4,peak.622,CRX_Corbo,91,98,+,8.744,GTAATCCA,GTACTCCA
...,...,...,...,...,...,...,...,...
64,peak.9178,CRX_Corbo,112,119,-,6.376,GTAATACT,AGTAGTAC
65,peak.9472,CRX_Corbo,90,97,-,9.568,GTAATCCC,GGGAGTAC
66,peak.9490,CRX_Corbo,54,61,+,6.376,GTAATACT,GTACTACT
67,peak.9604,CRX_Corbo,63,70,+,8.744,GTAATCCA,GTACTCCA


In [238]:
# display the k88n_olap.DREME.1 sites handled in round three
n50Mutated_fimo_score3

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq
0,peak.1282,k88n_olap.DREME.1,37,43,+,2.16,TTATTTA,TTATGTA
1,peak.1446,k88n_olap.DREME.1,72,78,+,2.16,TCAATTA,TCAAGTA
2,peak.2730,k88n_olap.DREME.1,40,46,-,2.16,TAAATTA,TACTTTA
3,peak.2730,k88n_olap.DREME.1,60,66,+,2.16,TTATTTA,TTATGTA
4,peak.2811,k88n_olap.DREME.1,33,39,+,2.16,TTACTTA,TTACGTA
5,peak.5050,k88n_olap.DREME.1,4,10,+,2.16,TTATTTA,TTATGTA
6,peak.5050,k88n_olap.DREME.1,8,14,+,2.16,TTATTTA,TTATGTA
7,peak.5050,k88n_olap.DREME.1,12,18,+,2.16,TTATTTA,TTATGTA
8,peak.5050,k88n_olap.DREME.1,16,22,+,2.16,TTATTTA,TTATGTA
9,peak.5050,k88n_olap.DREME.1,20,26,+,2.16,TTATTTA,TTATGTA


In [239]:
# save the round-three fasta; same path as the earlier rounds, which is overwritten
sequence_annotator.write_fasta(monoMutated_fasta3, os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"allCRE.{mono_motif}.mutated.fa"))

In [240]:
# copy the newly generated fasta to fimo folder
old_dir = os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"allCRE.{mono_motif}.mutated.fa")
# destination folder = first path component of the inputFA entry of metadata row 10
# NOTE(review): the metadata table displayed earlier lists rows 0-9 only, so this needs a
# fimo_meta.csv (and a reloaded fimo_sample_list) that contains row 10 — confirm
new_dir = os.path.join(base_dir, f"fimo_{fimo_th}", os.path.split(fimo_sample_list.iloc[10,1])[0].split("/")[-1])
# make sure the folder exists before copying into it
!mkdir -p "{new_dir}"
!cp "{old_dir}" "{new_dir}"

In [241]:
# scan the round-three monomer-mutated fasta with FIMO
# the trailing integer presumably selects the (1-based) row of fimo_meta — TODO confirm in the script
!bash ./scripts/meme_fimo_scanning.sh "{scriptdir}" "{base_dir}" "{meme_dir}" "{fimo_th}" "{fimo_meta}" 11

working directory: /mnt/v/yqzheng/qiaoer/VSCode_yiqiao/SPEC-SEQ/scripts
query fasta: /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/monoMutCRE_fimo2/allCRE.CRX_Corbo.mutated.fa
[K9829 134 134 134.0 1317086
Scanning with threshold 0.0025
FIMO output will be written to /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/monoMutCRE_fimo3
ha! this is the end of the script!


In [242]:
# path of the FIMO table: the output folder is the last column of metadata row 10
f = os.path.join(base_dir, f"fimo_{fimo_th}", fimo_sample_list.iloc[10,-1], "fimo.tsv")
# name of the folder that holds the table
name = os.path.split(f)[0].split("/")[-1]
print('reading ' + f)
# load the raw FIMO output and discard its last three rows
raw_fimo_score6 = pd.read_csv(f, sep="\t", header=0).iloc[:-3]

reading /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/monoMutCRE_fimo3/fimo.tsv


In [243]:
# total number of CRX_Corbo core motifs still counted after round three
mpra_CRE_utils.count_motif_occur(monoMutated_fasta3, raw_fimo_score6, mono_motif, coremotif_dict=corbo_mono_core).sum()

motif_count    3.0
dtype: float64

In [244]:
# total number of k88n_olap.DREME.1 core motifs still counted after round three
mpra_CRE_utils.count_motif_occur(monoMutated_fasta3, raw_fimo_score6, n50_motif, coremotif_dict=n50_mono_core).sum()

motif_count    12.0
dtype: float64

In [245]:
# round four: mutate the CRX_Corbo sites reported by the latest scan ...
monoMutated_fimo_score4, monoMutated_fasta4 = mpra_CRE_utils.find_and_mutate_motif(
    monoMutated_fasta3, raw_fimo_score6, mono_motif, corbo_mutant_core, corbo_mono_core
)
# ... then the k88n_olap.DREME.1 sites on top of that result
n50Mutated_fimo_score4, monoMutated_fasta4 = mpra_CRE_utils.find_and_mutate_motif(
    monoMutated_fasta4, raw_fimo_score6, n50_motif, n50_mutant_core, n50_mono_core
)

In [246]:
# display the CRX_Corbo sites handled in round four
monoMutated_fimo_score4

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq
0,peak.2811,CRX_Corbo,29,36,-,6.528,GTAAGCCA,TGGCGTAC
1,peak.5176,CRX_Corbo,28,35,-,6.272,ATAATTCA,TGAAGTAT
2,peak.8097,CRX_Corbo,63,70,-,6.24,ATAAGCTT,AAGCGTAT


In [247]:
# display the k88n_olap.DREME.1 sites handled in round four
n50Mutated_fimo_score4

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq
0,peak.5050,k88n_olap.DREME.1,4,10,+,2.16,TTATTTA,TTATGTA
1,peak.5050,k88n_olap.DREME.1,8,14,+,2.16,TTATTTA,TTATGTA
2,peak.5050,k88n_olap.DREME.1,12,18,+,2.16,TTATTTA,TTATGTA
3,peak.5050,k88n_olap.DREME.1,16,22,+,2.16,TTATTTA,TTATGTA
4,peak.5050,k88n_olap.DREME.1,20,26,+,2.16,TTATTTA,TTATGTA
5,peak.5050,k88n_olap.DREME.1,24,30,+,2.16,TTATTTA,TTATGTA
6,peak.5176,k88n_olap.DREME.1,28,34,+,2.16,TGAATTA,TGAAGTA
7,peak.5507,k88n_olap.DREME.1,71,77,+,2.16,TTATTTA,TTATGTA
8,peak.5507,k88n_olap.DREME.1,75,81,+,2.16,TTATTTA,TTATGTA
9,peak.5507,k88n_olap.DREME.1,79,85,+,2.16,TTATTTA,TTATGTA


In [248]:
# save the round-four fasta; same path as the earlier rounds, which is overwritten
sequence_annotator.write_fasta(monoMutated_fasta4, os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"allCRE.{mono_motif}.mutated.fa"))

In [249]:
# copy the newly generated fasta to fimo folder
old_dir = os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"allCRE.{mono_motif}.mutated.fa")
# destination folder = first path component of the inputFA entry of metadata row 11
# NOTE(review): the metadata table displayed earlier lists rows 0-9 only, so this needs a
# fimo_meta.csv (and a reloaded fimo_sample_list) that contains row 11 — confirm
new_dir = os.path.join(base_dir, f"fimo_{fimo_th}", os.path.split(fimo_sample_list.iloc[11,1])[0].split("/")[-1])
# make sure the folder exists before copying into it
!mkdir -p "{new_dir}"
!cp "{old_dir}" "{new_dir}"

In [250]:
# scan the round-four monomer-mutated fasta with FIMO
# the trailing integer presumably selects the (1-based) row of fimo_meta — TODO confirm in the script
!bash ./scripts/meme_fimo_scanning.sh "{scriptdir}" "{base_dir}" "{meme_dir}" "{fimo_th}" "{fimo_meta}" 12

working directory: /mnt/v/yqzheng/qiaoer/VSCode_yiqiao/SPEC-SEQ/scripts
query fasta: /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/monoMutCRE_fimo3/allCRE.CRX_Corbo.mutated.fa
[Kcessed: 100.0%9829 134 134 134.0 1317086
Scanning with threshold 0.0025
FIMO output will be written to /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/monoMutCRE_fimo4
ha! this is the end of the script!


In [251]:
# path of the FIMO table: the output folder is the last column of metadata row 11
f = os.path.join(base_dir, f"fimo_{fimo_th}", fimo_sample_list.iloc[11,-1], "fimo.tsv")
# name of the folder that holds the table
name = os.path.split(f)[0].split("/")[-1]
print('reading ' + f)
# load the raw FIMO output and discard its last three rows
raw_fimo_score7 = pd.read_csv(f, sep="\t", header=0).iloc[:-3]

reading /mnt/v/yqzheng/qiaoer/PhD Thesis/Experiment/MPRA/hdmuts_library/fimo_0.0025/monoMutCRE_fimo4/fimo.tsv


In [252]:
# total number of CRX_Corbo core motifs still counted after round four
mpra_CRE_utils.count_motif_occur(monoMutated_fasta4, raw_fimo_score7, mono_motif, coremotif_dict=corbo_mono_core).sum()

motif_count    0.0
dtype: float64

In [253]:
# total number of k88n_olap.DREME.1 core motifs still counted after round four
mpra_CRE_utils.count_motif_occur(monoMutated_fasta4, raw_fimo_score7, n50_motif, coremotif_dict=n50_mono_core).sum()

motif_count    9.0
dtype: float64

In [254]:
# round five: mutate the CRX_Corbo sites reported by the latest scan ...
monoMutated_fimo_score5, monoMutated_fasta5 = mpra_CRE_utils.find_and_mutate_motif(
    monoMutated_fasta4, raw_fimo_score7, mono_motif, corbo_mutant_core, corbo_mono_core
)
# ... then the k88n_olap.DREME.1 sites on top of that result
n50Mutated_fimo_score5, monoMutated_fasta5 = mpra_CRE_utils.find_and_mutate_motif(
    monoMutated_fasta5, raw_fimo_score7, n50_motif, n50_mutant_core, n50_mono_core
)

In [255]:
# display the k88n_olap.DREME.1 sites that are still reported in round five
n50Mutated_fimo_score5

Unnamed: 0,peak.id,motif,start,end,strand,score,match_seq,mutated_seq
0,peak.5050,k88n_olap.DREME.1,4,10,+,2.16,TTATTTA,TTATGTA
1,peak.5050,k88n_olap.DREME.1,8,14,+,2.16,TTATTTA,TTATGTA
2,peak.5050,k88n_olap.DREME.1,12,18,+,2.16,TTATTTA,TTATGTA
3,peak.5050,k88n_olap.DREME.1,16,22,+,2.16,TTATTTA,TTATGTA
4,peak.5050,k88n_olap.DREME.1,20,26,+,2.16,TTATTTA,TTATGTA
5,peak.5507,k88n_olap.DREME.1,71,77,+,2.16,TTATTTA,TTATGTA
6,peak.5507,k88n_olap.DREME.1,75,81,+,2.16,TTATTTA,TTATGTA
7,peak.5507,k88n_olap.DREME.1,79,85,+,2.16,TTATTTA,TTATGTA
8,peak.5507,k88n_olap.DREME.1,83,89,+,2.16,TTATTTA,TTATGTA


#### bizarre (highly repetitive) sequences in the three remaining CREs; drop them from the candidate set

In [256]:
# peak.2730 — look the sequence up by its peak id instead of a hard-coded row label,
# so the cell keeps showing the intended CRE if the row order of the annotation changes
mpraAnnot_df.loc[mpraAnnot_df["peak.id"] == "peak.2730", "FASTA"].iloc[0]

'TTGTGGGATCTTCGAGTAATAGCACAAAGATATAATAAGTAATTTATTTATTTATTTATTTATTTATTTATATAATTCCTACAAGTCTTTTCCTCACTAATCTACTCTAATGAACAACTCTTTGGAGAAACGCA'

In [257]:
# peak.5050 — look the sequence up by its peak id instead of a hard-coded row label,
# so the cell keeps showing the intended CRE if the row order of the annotation changes
mpraAnnot_df.loc[mpraAnnot_df["peak.id"] == "peak.5050", "FASTA"].iloc[0]

'GTTTTATTTATTTATTTATTTATTTATTTATTTATTTACAGCTGAGTCCTTTCTGGGTAGCCTAGGCTGACCTGAAACTTACTAATCTCCAAGCCCTAGGATTATAGATGTGCACTGGCACACCTGGCCTCCTT'

In [258]:
# peak.5507 — look the sequence up by its peak id instead of a hard-coded row label,
# so the cell keeps showing the intended CRE if the row order of the annotation changes
mpraAnnot_df.loc[mpraAnnot_df["peak.id"] == "peak.5507", "FASTA"].iloc[0]

'CCCCACCCCCACCTCCCCATCCTTTAATCTTTGGGTAAATCCTCCCAAGACTAATCCCTGGGCACAAACTTTATTTATTTATTTATTTATTTATTTATTTATTTTCATCTTGGATGGCTTATGGTAGAGACCGA'

#### no more identifiable monomeric HD sites; now collect all monomer motif FIMO instances and mutate from the WT sequences

In [274]:
# gather the CRX_Corbo sites recorded in rounds one to four, ordered by peak id
allMonomerSites = (
    pd.concat([monoMutated_fimo_score, monoMutated_fimo_score2, monoMutated_fimo_score3, monoMutated_fimo_score4])
    .sort_values(by="peak.id")
    .reset_index(drop=True)
)
# append the k88n_olap.DREME.1 sites of the same rounds and order the combined table again
allMonomerSites = (
    pd.concat([allMonomerSites, n50Mutated_fimo_score, n50Mutated_fimo_score2, n50Mutated_fimo_score3, n50Mutated_fimo_score4])
    .sort_values(by="peak.id")
    .reset_index(drop=True)
)

In [275]:
# number of distinct peaks that carry at least one recorded monomeric site
len(allMonomerSites["peak.id"].unique())

8465

In [276]:
# rename the columns (motif_id, matched_sequence, sequence_name, stop) so the table can be
# passed to find_and_mutate_motif in place of a raw FIMO table below
# (presumably these are the raw FIMO column names — TODO confirm)
allMonomerSites = allMonomerSites.rename(columns={"motif":"motif_id", "match_seq": "matched_sequence", "peak.id": "sequence_name", "end": "stop"})

In [277]:
# display the combined, renamed table of monomeric sites
allMonomerSites

Unnamed: 0,sequence_name,motif_id,start,stop,strand,score,matched_sequence,mutated_seq
0,peak.1,CRX_Corbo,104,111,+,6.70635,ATAATCAC,ATACTCAC
1,peak.1,k88n_olap.DREME.1,105,111,-,2.23000,GTGATTA,TACTCAC
2,peak.1,k88n_olap.DREME.1,9,15,-,2.64000,TTTATTA,TACTAAA
3,peak.10,CRX_Corbo,66,73,-,6.82540,ATAAGCCA,TGGCGTAT
4,peak.100,k88n_olap.DREME.1,26,32,+,2.64000,TTGATTA,TTGAGTA
...,...,...,...,...,...,...,...,...
23108,peak.998,CRX_Corbo,40,47,+,10.65870,TTAATCCC,TTACTCCC
23109,peak.998,CRX_Corbo,63,70,+,8.95238,ATAATCCA,ATACTCCA
23110,peak.999,CRX_Corbo,66,73,+,9.78571,CTAATCCT,CTACTCCT
23111,peak.999,k88n_olap.DREME.1,17,23,+,1.94000,CTAGTTA,CTAGGTA


In [278]:
# save the combined monomeric site table (tab-separated, with header, without the row index)
allMonomerSites.to_csv(os.path.join(base_dir, "peaksets", f"allMonomerMutatedSites.fimo{fimo_th}.tsv"), sep="\t", header=True, index=False)

In [282]:
# apply every collected monomeric site to the wild-type fasta: CRX_Corbo sites first ...
_, allmonoMutated_fasta = mpra_CRE_utils.find_and_mutate_motif(
    allCRE_fasta, allMonomerSites, mono_motif, corbo_mutant_core, corbo_mono_core
)
# ... then the k88n_olap.DREME.1 sites on top of that result
_, allmonoMutated_fasta = mpra_CRE_utils.find_and_mutate_motif(
    allmonoMutated_fasta, allMonomerSites, n50_motif, n50_mutant_core, n50_mono_core
)

In [283]:
# save the fasta with all monomeric sites mutated from wild type (overwrites the per-round file at the same path)
sequence_annotator.write_fasta(allmonoMutated_fasta, os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"allCRE.{mono_motif}.mutated.fa"))

In [284]:
# compile peaks that contain at least one K50 or N50 monomeric motif
# (distinct sequence names of the combined site table)
allMonomer_CREs = allMonomerSites["sequence_name"].unique()

In [285]:
# keep only sequences that contain a mutated monomeric HD site
# (note: this rebinds monoMutated_fasta5, which earlier held the round-five result)
monoMutated_fasta5 = allmonoMutated_fasta[lambda df: df.index.isin(allMonomer_CREs)]
# all peaks containing monomeric motifs that have been mutated (original ids, captured before renaming)
allMutatedMonomer_peaks = monoMutated_fasta5.index
# tag the mutant records with the ".mutM" suffix
monoMutated_fasta5.index = monoMutated_fasta5.index + ".mutM"

In [286]:
# all monomeric sites have been mutated; only the mutated sequences are kept, and their names carry the ".mutM" suffix
sequence_annotator.write_fasta(monoMutated_fasta5, os.path.join(base_dir, "sequences", f"mutatedCREs_fimo{fimo_th}", f"mutatedCRE.{mono_motif}.mutated.fa"))

### VII. mutating both dimeric and monomeric sites

In [167]:
# collect the dimeric sites recorded in the three mutation rounds into one table, ordered by peak id
allDimerSites = pd.concat([dimerMatched_fimo_score, dimerMatched_fimo_score2, dimerMatched_fimo_score3]).sort_values(by="peak.id").reset_index(drop=True)

In [168]:
# number of distinct peaks that carry at least one recorded dimeric site
len(allDimerSites["peak.id"].unique())

1420

In [175]:
# same column renaming as applied to the monomeric site table, so the table can be
# handed to find_and_mutate_motif below
allDimerSites = allDimerSites.rename(columns={"motif":"motif_id", "match_seq": "matched_sequence", "peak.id": "sequence_name", "end": "stop"})

In [176]:
# save the combined dimeric site table (tab-separated, with header, without the row index)
allDimerSites.to_csv(os.path.join(base_dir, "peaksets", f"allDimerMutatedSites.fimo{fimo_th}.tsv"), sep="\t", header=True, index=False)

#### mutate both monomeric and dimeric sites that were in the single mutant CREs

In [287]:
# start from the fasta in which all monomeric sites are already mutated and additionally
# mutate every recorded dimeric site
_, dimerMonoMutated_fasta = mpra_CRE_utils.find_and_mutate_motif(allmonoMutated_fasta, allDimerSites, dimer_motif, dimer_mutant_core, dimer_core)
# the two disabled calls below would re-apply the monomer mutations; they are not needed
# because the input fasta already contains them
#dimerMonoMutated_fimo_score1, dimerMonoMutated_fasta = mpra_CRE_utils.find_and_mutate_motif(dimerMonoMutated_fasta, allMonomerSites, mono_motif, corbo_mutant_core, corbo_mono_core)
#dimerMonoMutated_fimo_score2, dimerMonoMutated_fasta = mpra_CRE_utils.find_and_mutate_motif(dimerMonoMutated_fasta, allMonomerSites, n50_motif, n50_mutant_core, n50_mono_core)

In [288]:
# Write the dimer+monomer mutated FASTA for all CREs (labels are still the plain peak names).
sequence_annotator.write_fasta(
    dimerMonoMutated_fasta,
    os.path.join(
        base_dir,
        "sequences",
        f"mutatedCREs_fimo{fimo_th}",
        "allCRE.dimerMonomer.mutated.fa",
    ),
)

#### there's no need to run additional FIMO since we need to match the patterns in single site mutants

In [289]:
# Keep only peaks that appear in BOTH site tables, i.e. peaks with at least one dimeric
# and at least one monomeric site.
dimerMonoMutated_fasta2 = dimerMonoMutated_fasta.loc[
    dimerMonoMutated_fasta.index.isin(allDimerSites["sequence_name"])
    & dimerMonoMutated_fasta.index.isin(allMonomerSites["sequence_name"])
]
# Remember the un-suffixed names of the peaks that received both kinds of mutation.
allMutatedDouble_peaks = dimerMonoMutated_fasta2.index
# Tag the double mutants with ".mutDM".
dimerMonoMutated_fasta2.index = allMutatedDouble_peaks + ".mutDM"

In [290]:
# Number of double-mutant sequences retained.
dimerMonoMutated_fasta2.index.size

1255

In [291]:
# Only peaks containing both site types are written; all their dimeric and monomeric sites
# have been mutated and the sequence labels carry the ".mutDM" suffix.
sequence_annotator.write_fasta(
    dimerMonoMutated_fasta2,
    os.path.join(
        base_dir,
        "sequences",
        f"mutatedCREs_fimo{fimo_th}",
        "mutatedCRE.dimerMonomer.mutated.fa",
    ),
)

#### take a look at some stats of single and double mutants

In [292]:
# Sequences taken from allCRE_fasta (not from a mutated FASTA) for peaks that have a
# dimeric site but no monomeric site.
singleDimerMutants = allCRE_fasta.loc[
    allCRE_fasta.index.isin(allDimerSites["sequence_name"])
    & ~allCRE_fasta.index.isin(allMonomerSites["sequence_name"])
]

In [293]:
# Save the dimer-only peak subset.
sequence_annotator.write_fasta(
    singleDimerMutants,
    os.path.join(
        base_dir,
        "sequences",
        f"mutatedCREs_fimo{fimo_th}",
        "singleDimerMutants.fa",
    ),
)

In [294]:
# Sequences taken from allCRE_fasta for peaks that have a monomeric site but no dimeric site.
singleMonomerMutants = allCRE_fasta.loc[
    ~allCRE_fasta.index.isin(allDimerSites["sequence_name"])
    & allCRE_fasta.index.isin(allMonomerSites["sequence_name"])
]

In [295]:
# Save the monomer-only peak subset.
sequence_annotator.write_fasta(
    singleMonomerMutants,
    os.path.join(
        base_dir,
        "sequences",
        f"mutatedCREs_fimo{fimo_th}",
        "singleMonomerMutants.fa",
    ),
)

In [296]:
# Sequences taken from allCRE_fasta for peaks that have both a dimeric and a monomeric site.
doubleMutants = allCRE_fasta.loc[
    allCRE_fasta.index.isin(allDimerSites["sequence_name"])
    & allCRE_fasta.index.isin(allMonomerSites["sequence_name"])
]

In [297]:
# Save the subset of peaks that contain both site types.
sequence_annotator.write_fasta(
    doubleMutants,
    os.path.join(
        base_dir,
        "sequences",
        f"mutatedCREs_fimo{fimo_th}",
        "doubleMutants.fa",
    ),
)

#### compile WT and mutated FASTA into the same dataframe

In [298]:
# The annotation table (mpraAnnot_df) and the motif-site tables (allMonomerSites,
# allDimerSites) are reused from the running kernel; the original cell kept their
# read_csv reload lines commented out.
#
# Reload the three mutated FASTA files from disk. NOTE: these names were bound to
# subsets of allCRE_fasta in the preceding cells; from here on they hold the mutated
# sequences read from file.
singleDimerMutants = sequence_annotator.read_fasta(
    os.path.join(
        base_dir,
        "sequences",
        f"mutatedCREs_fimo{fimo_th}",
        "mutatedCRE.k88n_olap.MEME.2.mutated.fa",
    )
)
singleMonomerMutants = sequence_annotator.read_fasta(
    os.path.join(
        base_dir,
        "sequences",
        f"mutatedCREs_fimo{fimo_th}",
        "mutatedCRE.CRX_Corbo.mutated.fa",
    )
)
doubleMutants = sequence_annotator.read_fasta(
    os.path.join(
        base_dir,
        "sequences",
        f"mutatedCREs_fimo{fimo_th}",
        "mutatedCRE.dimerMonomer.mutated.fa",
    )
)

In [299]:
# Reshape each mutant FASTA Series into a two-column frame (peak.id, <mutant>.FASTA) so it
# can be joined to the WT annotation on "peak.id".
#
# The mutant labels carry a suffix that has to be removed to recover the peak name
# (".mutM" and ".mutDM" are appended earlier in this notebook; ".mutD" is assumed for the
# dimer file — confirm against the cell that wrote it).
#
# The suffix is removed with an end-anchored regex rather than str.strip(): strip() treats
# its argument as a *set of characters* and removes them from BOTH ends, so it only worked
# because peak names start with "p" and end with a digit. The double-mutant suffix is also
# spelled ".mutDM" here (the original passed ".mutMD", which strip() happened to tolerate).
dimer_df = singleDimerMutants.to_frame().reset_index(drop=False).rename(columns={"label":"peak.id", 0:"mutD.FASTA"})
dimer_df["peak.id"] = dimer_df["peak.id"].str.replace(r"\.mutD$", "", regex=True)

monomer_df = singleMonomerMutants.to_frame().reset_index(drop=False).rename(columns={"label":"peak.id", 0:"mutM.FASTA"})
monomer_df["peak.id"] = monomer_df["peak.id"].str.replace(r"\.mutM$", "", regex=True)

double_df = doubleMutants.to_frame().reset_index(drop=False).rename(columns={"label":"peak.id", 0:"mutDM.FASTA"})
double_df["peak.id"] = double_df["peak.id"].str.replace(r"\.mutDM$", "", regex=True)

In [300]:
# Outer-join the WT annotation with the dimer, monomer and double-mutant tables on
# "peak.id", in that order, so every peak keeps a row and mutant columns are NaN where a
# peak has no such mutant.
# Written as an explicit loop instead of reduce(): `reduce` is not a builtin in Python 3
# and would need `from functools import reduce`; the loop performs the same left-to-right
# sequence of merges without that dependency.
data_merge = mpraAnnot_df
for mutant_df in (dimer_df, monomer_df, double_df):
    data_merge = data_merge.merge(mutant_df, on="peak.id", how="outer")
# The annotation's own sequence column is the wild-type sequence; name it accordingly.
data_merge = data_merge.rename(columns={"FASTA": "wt.FASTA"})

#### check if all steps were correct

In [301]:
# Spot check: peak.29 has one dimeric site and no monomeric site, so only the
# dimer-mutant column should be filled.
data_merge[data_merge["peak.id"].eq("peak.29")].to_numpy()

array([['peak.29', 'chr1', 16229659.0, 16229792.0, 16229726.0, 134.0,
        '+', 0, 0, 0, 1, 1, 0, 'KGain',
        'TCTTCACATAAGACTCATAATGCTATTAGTCATTCGTTTCCTAGAACACAGACACCCTCGTGCGTGCTGACAGGTTTATACGGACGCAGTGGCAGCCGTGGGTTCTGGGAGCTGCTAGATGGCCGACTTTGATT',
        'TCTTCACATAAGACTCATACTGCTAGTAGTCATTCGTTTCCTAGAACACAGACACCCTCGTGCGTGCTGACAGGTTTATACGGACGCAGTGGCAGCCGTGGGTTCTGGGAGCTGCTAGATGGCCGACTTTGATT',
        nan, nan]], dtype=object)

In [302]:
# Spot check: peak.9 has monomeric sites but no dimeric site, so only the
# monomer-mutant column should be filled.
data_merge[data_merge["peak.id"].eq("peak.9")].to_numpy()

array([['peak.9', 'chr1', 4970713.0, 4970846.0, 4970780.0, 134.0, '+', 1,
        2, 1, 0, 1, 1, 'KLost',
        'TCTGGGCTTAGCTGCCTACTCTGAAAATAGCAACAGGTGAAAGCAGCTTCTGTGAATCACATCCTCCCAGGATAATGCTGGGTCACCTGATGCTAATTAAGTCACAAACAGGCACTCTGCCAGCCCAGGCTGCC',
        nan,
        'TCTGGGCGTAGCTGCCTACTCTGAAAATAGCAACAGGTGAAAGCAGCTTCTGTGAATCACATCCTCCCAGGATAATGCTGGGTCACCTGATGCTACTTAAGTCACAAACAGGCACTCTGCCAGCCCAGGCTGCC',
        nan]], dtype=object)

In [305]:
# Spot check: show the full merged row for peak.1943.
data_merge[data_merge["peak.id"].eq("peak.1943")].to_numpy()

array([['peak.1943', 'chr12', 32456940.0, 32457073.0, 32457007.0, 134.0,
        '+', 1, 1, 1, 2, 1, 2, 'KGain',
        'TCATCTCAAGTAATCAGAATGTGGCATAGTGCCAGAGATGCAATTTAAAAGTGACACTGCCTTATAGGCTTAGGTTTTAATATAGAGCTTAATAGTGTTATTTTAAGCCCCTTAATGATCTCGGAGGTCTCTAT',
        'TCATCTCAAGTAATCAGAATGTGGCATAGTGCCAGAGATGCAATTTAAAAGTGACACTGCCTTATAGGCTTAGGTTTTAATATAGAGCTTACTAGTGGTATTTTACGCCCCGTAATGATCTCGGAGGTCTCTAT',
        'TCATCTCAAGTACTCAGAATGTGGCATAGTGCCAGAGATGCAATTTAAAAGTGACACTGCCTTATAGGCGTAGGTTTTAATATAGAGCGTACTAGTGTTATTTTAAGCCCCTTACTGATCTCGGAGGTCTCTAT',
        'TCATCTCAAGTACTCAGAATGTGGCATAGTGCCAGAGATGCAATTTAAAAGTGACACTGCCTTATAGGCGTAGGTTTTAATATAGAGCGTACTAGTGGTATTTTACGCCCCGTACTGATCTCGGAGGTCTCTAT']],
      dtype=object)

In [303]:
# Display the merged WT + mutant table (the rendered output is truncated by pandas).
data_merge

Unnamed: 0,peak.id,seqnames,start,end,summit,width,strand,CRX_Corbo,N50,HDmono,HDdimer,NRL,AP1,annotation,wt.FASTA,mutD.FASTA,mutM.FASTA,mutDM.FASTA
0,peak.1,chr1,4357711.0,4357844.0,4357778.0,134.0,+,1,2,0,0,2,1,KLost,TTTTAAGATAATAAAGGTAGCCATAGCAGACAAGTGCGTGAGTAGC...,,TTTTAAGATACTAAAGGTAGCCATAGCAGACAAGTGCGTGAGTAGC...,
1,peak.2,chr1,4358542.0,4358675.0,4358609.0,134.0,+,1,0,1,0,0,1,KLost,ATCCACAAAGGACAAGCTGAAGATTGCCATGCTCTGGAAGACTTGA...,,ATCCACAAAGGACAAGCTGAAGATTGCCATGCTCTGGAAGACTTGA...,
2,peak.3,chr1,4360266.0,4360399.0,4360333.0,134.0,+,3,2,3,0,2,1,RetinalGene,GGATATGCAACCTGCTTGTTTCACGTAAACAAATGTCTTTGGATTT...,,GGATATGCAACCTGCTTGTTTCACGTAAACAAATGTCTTTGGATTT...,
3,peak.4,chr1,4383772.0,4383905.0,4383839.0,134.0,+,3,1,0,0,3,1,NotDB,GTTCCTGTGTGTTTGTTTCCCTGCACACACAGGCTCAGCAGCACAT...,,GTTCCTGTGTGTTTGTTTCCCTGCACACACAGGCTCAGCAGCACAT...,
4,peak.5,chr1,4802559.0,4802692.0,4802626.0,134.0,+,0,0,0,1,0,1,ELost,AAACTCTGTCTGAAAAACCATAAAAGAAAAAGAAAGATGTAGCCTC...,AAACTCTGTCTGAAAAACCATAAAAGAAAAAGAAAGATGTAGCCTC...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9824,peak.9836,chr6,29380515.0,29380648.0,29380582.0,134.0,+,2,0,2,0,0,0,NotDB,TGAGACTCTGAACTATCCTAAGCCTCCCAAAGACAAAGTCCCAGAT...,,TGAGACTCTGAACTATCCTACGCCTCCCAAAGACAAAGTCCCAGAG...,
9825,peak.9837,chr12,84569704.0,84569837.0,84569771.0,134.0,+,0,0,2,0,1,0,NotDB,CGGCGGGAGCTGCCAGCTTTTTGGAATTCCTAATCGCTCCTGGCCC...,,,
9826,peak.9838,chr11,50842591.0,50842724.0,50842658.0,134.0,+,2,1,2,0,0,0,NotDB,ATGAAGTAGATATTACCAAATTGCTTTTTCAGCATCCATTTAGATA...,,ATGAAGTAGATATTACCAAATTGCTTTTTCAGCATCCATTTAGATA...,
9827,peak.9839,chr7,13397989.0,13398122.0,13398056.0,134.0,+,2,0,1,0,0,1,NotDB,TCACCCTAATCCCTCTTTCAAAATGTACTATCCAATTCCATTCTGG...,,TCACCCTACTCCCTCTTTCAAAATGTACTATCCAATTCCATTCTGG...,


In [304]:
# Persist the merged WT + mutant annotation as a tab-separated file (header row, no index
# column); the FIMO threshold is embedded in the file name.
data_merge.to_csv(
    os.path.join(
        base_dir,
        "peaksets",
        f"allCRE_annotation.withMutants.fimo{fimo_th}.tsv",
    ),
    sep="\t",
    header=True,
    index=False,
)