### Set up data directory, data set names, and external data paths

In [2]:
import os
# Root directory of the seCLIP analysis data, resolved relative to the current working directory.
dataDirectory = os.path.abspath(
    os.path.join(os.pardir, os.pardir, "data", "okita", "main_seCLIP_analysis")
)

# Replicate labels for each RNA-binding protein; "_combined" labels are part of the file names.
repsByProtein = {
    "RBP_A": ["rep1", "rep2", "rep3"],
    "RBP_I": ["rep1", "rep2", "rep3_combined"],
    "RBP_J": ["rep1_combined", "rep2", "rep3_combined"],
    "RBP_K": ["rep1", "rep2", "rep3"],
}
# Proteins are processed in the order they are declared above.
bindingProteins = list(repsByProtein)

# External reference files sit in "__external_data", a sibling of the data directory.
annotatedGenesFilePath, exonsFilePath, genomeFastaFilePath = (
    os.path.abspath(os.path.join(dataDirectory, os.pardir, "__external_data", externalFileName))
    for externalFileName in (
        "all.locus_brief_info.7.0_sorted.tsv",
        "Osativa7_exons.bed",
        "Osativa7.fa",
    )
)

### Annotate Peaks
Note: Make sure all seCLIP .narrowPeak files have their headers removed and are sorted before running this step!

In [3]:
from AnnotatePeaks import annotatePeaks
# Annotate every replicate's peak file; each annotatePeaks call receives a single-file list.
for bindingProtein in bindingProteins:
    proteinDirectory = os.path.join(dataDirectory, bindingProtein)
    for rep in repsByProtein[bindingProtein]:
        narrowPeakFilePath = os.path.join(proteinDirectory, f"{bindingProtein}_{rep}.narrowPeak")
        # NOTE(review): the meaning of the trailing True flag is defined in AnnotatePeaks — confirm there.
        annotatePeaks(
            [narrowPeakFilePath],
            annotatedGenesFilePath,
            exonsFilePath,
            genomeFastaFilePath,
            True,
        )


Working with RBP_A_rep1.narrowPeak
Checking for encompassing genes...
Checking input files for proper sorting...
Checking encompassed features file for proper sorting...
Counting in Chr1
Counting in Chr10
Counting in Chr11
Counting in Chr12
Counting in Chr2
Counting in Chr3
Counting in Chr4
Counting in Chr5
Counting in Chr6
Counting in Chr7
Counting in Chr8
Counting in Chr9
Counting in ChrSy
Counting in ChrUn
Sorting output...
Checking for encompassing exons...
Checking input files for proper sorting...
Checking encompassed features file for proper sorting...
Checking encompassing features file for proper sorting...
Counting in Chr1
Counting in Chr10
Counting in Chr11
Counting in Chr12
Counting in Chr2
Counting in Chr3
Counting in Chr4
Counting in Chr5
Counting in Chr6
Counting in Chr7
Counting in Chr8
Counting in Chr9
Counting in ChrSy
Counting in ChrUn
Sorting output...
Adding peak sequences...

Working with RBP_A_rep2.narrowPeak
Checking for encompassing genes...
Checking input fil

### Find common loci between repetitions

In [4]:
from FindCommonLoci import findCommonLoci
# For each protein, gather the full-annotation file of every replicate and
# write the loci they have in common to "<protein>_common_loci.tsv".
for bindingProtein in bindingProteins:
    proteinDirectory = os.path.join(dataDirectory, bindingProtein)
    fullAnnotationFilePaths = [
        os.path.join(proteinDirectory, f"{bindingProtein}_{rep}_full_annotation.bed")
        for rep in repsByProtein[bindingProtein]
    ]
    commonLociFilePath = os.path.join(proteinDirectory, f"{bindingProtein}_common_loci.tsv")
    findCommonLoci(fullAnnotationFilePaths, commonLociFilePath)

Finding unique loci in RBP_A_rep1_full_annotation.bed...
Finding unique loci in RBP_A_rep2_full_annotation.bed...
Finding unique loci in RBP_A_rep3_full_annotation.bed...
Writing results...
Finding unique loci in RBP_I_rep1_full_annotation.bed...
Finding unique loci in RBP_I_rep2_full_annotation.bed...
Finding unique loci in RBP_I_rep3_combined_full_annotation.bed...
Writing results...
Finding unique loci in RBP_J_rep1_combined_full_annotation.bed...
Finding unique loci in RBP_J_rep2_full_annotation.bed...
Finding unique loci in RBP_J_rep3_combined_full_annotation.bed...
Writing results...
Finding unique loci in RBP_K_rep1_full_annotation.bed...
Finding unique loci in RBP_K_rep2_full_annotation.bed...
Finding unique loci in RBP_K_rep3_full_annotation.bed...
Writing results...


#### TODO: Consider pooling the common loci at this point

### Use the common loci file to format read sequences for STREME

In [3]:
from FormatReadSequencesForSTREME import formatReadSequencesForSTREME
# Produce three STREME input FASTA files per protein, each with a different filtering setup.
for bindingProtein in bindingProteins:
    proteinDirectory = os.path.join(dataDirectory, bindingProtein)
    fullAnnotationFilePaths = [
        os.path.join(proteinDirectory, f"{bindingProtein}_{rep}_full_annotation.bed")
        for rep in repsByProtein[bindingProtein]
    ]
    commonLociFilePath = os.path.join(proteinDirectory, f"{bindingProtein}_common_loci.tsv")

    # Minimal filtering: keep only loci common to at least 2 files.
    formatReadSequencesForSTREME(
        fullAnnotationFilePaths,
        os.path.join(proteinDirectory, f"{bindingProtein}_STREME_input_minimal_filtering.fa"),
        genomeFastaFilePath, commonLociFilePath, 2
    )

    # Loci common to at least 2 files, sequences of at most 50 base pairs,
    # with sequences shorter than 20 base pairs expanded.
    formatReadSequencesForSTREME(
        fullAnnotationFilePaths,
        os.path.join(proteinDirectory, f"{bindingProtein}_STREME_input_20-50bp.fa"),
        genomeFastaFilePath, commonLociFilePath, 2, 50, 20
    )

    # Loci common to at least 2 files, with sequences shorter than 20 base pairs
    # expanded and no upper length limit passed.
    formatReadSequencesForSTREME(
        fullAnnotationFilePaths,
        os.path.join(proteinDirectory, f"{bindingProtein}_STREME_input_20bp_min.fa"),
        genomeFastaFilePath, commonLociFilePath, 2,
        minSequenceLength=20
    )

Filtering requested. Finding valid loci...
Found 185 valid loci.
Writing positions from RBP_A_rep1_full_annotation.bed to intermediate bed file...
Writing positions from RBP_A_rep2_full_annotation.bed to intermediate bed file...
Writing positions from RBP_A_rep3_full_annotation.bed to intermediate bed file...
Finished writitng 787 sequences!
Filtering requested. Finding valid loci...
Found 185 valid loci.
Writing positions from RBP_A_rep1_full_annotation.bed to intermediate bed file...
Writing positions from RBP_A_rep2_full_annotation.bed to intermediate bed file...
Writing positions from RBP_A_rep3_full_annotation.bed to intermediate bed file...
Finished writitng 587 sequences!
Filtering requested. Finding valid loci...
Found 185 valid loci.
Writing positions from RBP_A_rep1_full_annotation.bed to intermediate bed file...
Writing positions from RBP_A_rep2_full_annotation.bed to intermediate bed file...
Writing positions from RBP_A_rep3_full_annotation.bed to intermediate bed file...
F