### Set up data directory, data set names, and external data paths

In [None]:
import os
# Root directory of the analysis; every binding protein has its own sub-directory
# underneath it. Resolved relative to the current working directory.
dataDirectory = os.path.abspath(
    os.path.join("..", "..", "data", "okita", "main_seCLIP_analysis")
)

# Replicate names available for each binding protein. These names are spliced into
# the per-replicate file names (e.g. "<protein>_<rep>.narrowPeak").
repsByProtein = {
    "RBP_A": ["rep1", "rep2", "rep3"],
    "RBP_I": ["rep1", "rep2", "rep3_combined"],
    "RBP_J": ["rep1_combined", "rep2", "rep3_combined"],
    "RBP_K": ["rep1", "rep2", "rep3"],
}

# Processing order of the binding proteins (also the names of their sub-directories).
bindingProteins = ["RBP_A", "RBP_I", "RBP_J", "RBP_K"]


def _externalDataPath(fileName):
    """Return the absolute path of fileName in the "__external_data" directory.

    The "__external_data" directory is a sibling of dataDirectory.
    """
    return os.path.abspath(
        os.path.join(dataDirectory, "..", "__external_data", fileName)
    )


# Reference files shared by all of the analysis steps.
annotatedGenesFilePath = _externalDataPath("all.locus_brief_info.7.0_sorted.tsv")
exonsFilePath = _externalDataPath("Osativa7_exons.bed")
locusSortedExonsFilePath = _externalDataPath("Osativa7_exons_locus_sorted.bed")
genomeFastaFilePath = _externalDataPath("Osativa7.fa")

### Annotate Peaks
Note: Make sure all seCLIP .narrowPeak files have their headers removed and are sorted before running this step!

In [None]:
from AnnotatePeaks import annotatePeaks

# One .narrowPeak file per (binding protein, replicate) pair, ordered by protein
# and then by replicate.
narrowPeakFilePaths = [
    os.path.join(dataDirectory, bindingProtein, f"{bindingProtein}_{rep}.narrowPeak")
    for bindingProtein in bindingProteins
    for rep in repsByProtein[bindingProtein]
]

for narrowPeakFilePath in narrowPeakFilePaths:
    # Each file is annotated in its own call, passed as a single-element list.
    # NOTE(review): the meaning of the trailing positional True is defined by
    # annotatePeaks and is not visible from this notebook - confirm before changing.
    annotatePeaks(
        [narrowPeakFilePath],
        annotatedGenesFilePath,
        exonsFilePath,
        genomeFastaFilePath,
        True,
    )

### Combine adjacent peaks

In [None]:
from CombineAdjacentPeaks import combineAdjacentPeaks

# Gather the "_full_annotation.bed" file of every replicate of every binding
# protein so that all of them are handed to combineAdjacentPeaks in one call.
fullAnnotationFilePaths = [
    os.path.join(dataDirectory, bindingProtein, f"{bindingProtein}_{rep}_full_annotation.bed")
    for bindingProtein in bindingProteins
    for rep in repsByProtein[bindingProtein]
]

# NOTE(review): the meaning of the positional 10 and 1 is defined by
# combineAdjacentPeaks and is not visible from this notebook - confirm before changing.
combineAdjacentPeaks(fullAnnotationFilePaths, locusSortedExonsFilePath, 10, 1)

### Find common loci between replicates

In [None]:
from FindCommonLoci import findCommonLoci

# Run findCommonLoci once per binding protein, over that protein's replicates only.
for bindingProtein in bindingProteins:
    proteinDirectory = os.path.join(dataDirectory, bindingProtein)

    # The "_full_annotation.bed" file of each replicate of this protein.
    fullAnnotationFilePaths = [
        os.path.join(proteinDirectory, f"{bindingProtein}_{rep}_full_annotation.bed")
        for rep in repsByProtein[bindingProtein]
    ]

    # Per-protein common loci file (presumably the output of findCommonLoci;
    # a later cell passes the same path to formatReadSequencesForSTREME).
    commonLociFilePath = os.path.join(proteinDirectory, f"{bindingProtein}_common_loci.tsv")

    findCommonLoci(fullAnnotationFilePaths, commonLociFilePath)

#### (Maybe pool common loci here later...)

### Use the common loci file to format read sequences for STREME

In [None]:
from FormatReadSequencesForSTREME import formatReadSequencesForSTREME

# Produce four STREME input FASTA files per binding protein, each with a
# different filtering / peak-calling configuration.
for bindingProtein in bindingProteins:
    proteinDirectory = os.path.join(dataDirectory, bindingProtein)
    replicates = repsByProtein[bindingProtein]

    # Per-replicate annotation files, with and without the "_combined" suffix.
    fullAnnotationFilePaths = [
        os.path.join(proteinDirectory, f"{bindingProtein}_{rep}_full_annotation.bed")
        for rep in replicates
    ]
    fullAnnotationCombinedFilePaths = [
        os.path.join(proteinDirectory, f"{bindingProtein}_{rep}_full_annotation_combined.bed")
        for rep in replicates
    ]

    # The common loci file is shared by all four calls below.
    commonLociFilePath = os.path.join(proteinDirectory, f"{bindingProtein}_common_loci.tsv")

    # Only filter on common loci in at least 2 files.
    minimalFilteringFastaPath = os.path.join(
        proteinDirectory, f"{bindingProtein}_STREME_input_minimal_filtering.fa"
    )
    formatReadSequencesForSTREME(
        fullAnnotationFilePaths, minimalFilteringFastaPath,
        genomeFastaFilePath, commonLociFilePath, 2
    )

    # Filter on common loci in at least 2 files and sequences <= 50 base pairs.
    # Also expand sequences under 20 base pairs.
    boundedLengthFastaPath = os.path.join(
        proteinDirectory, f"{bindingProtein}_STREME_input_20-50bp.fa"
    )
    formatReadSequencesForSTREME(
        fullAnnotationFilePaths, boundedLengthFastaPath,
        genomeFastaFilePath, commonLociFilePath, 2, 50, 20
    )

    # Filter on common loci in at least 2 files. Also expand sequences under 20 base pairs.
    minimumLengthFastaPath = os.path.join(
        proteinDirectory, f"{bindingProtein}_STREME_input_20bp_min.fa"
    )
    formatReadSequencesForSTREME(
        fullAnnotationFilePaths, minimumLengthFastaPath,
        genomeFastaFilePath, commonLociFilePath, 2,
        minSequenceLength=20
    )

    # Use combined peaks. Filter on common loci in at least 2 files.
    # Call peaks from the 3' end, expanded 30 bp in the 5' direction and 10 bp in the 3' direction.
    # (Parameters based on relative positions of known prolamine binding motifs)
    knownMotifFastaPath = os.path.join(
        proteinDirectory, f"{bindingProtein}_STREME_input_based_on_known_motif.fa"
    )
    formatReadSequencesForSTREME(
        fullAnnotationCombinedFilePaths, knownMotifFastaPath,
        genomeFastaFilePath, commonLociFilePath, 2,
        callFromThreePrimeEnd = True, fivePrimeExtension = 30, threePrimeExtension = 10
    )