## Set up the pipeline with basic imports, data directories, and input file paths.

In [None]:
import os
from benbiohelpers.DataPipelineManagement.GenomeManager import getGenomeFastaFilePath
from bpde_chromatin_analysis.helper_scripts.BPDE_DataDir import getDataDir
from mutperiodpy.helper_scripts.UsefulFileSystemFunctions import getExternalDataDirectory as getMutperiodExternalDataDirectory

# Reference genome fasta and the mutperiod external data directory.
hg19FastaFilePath = getGenomeFastaFilePath("hg19")
mutperiodHg19Directory = getMutperiodExternalDataDirectory()

# All CTCF-related inputs sit under one directory inside the project data directory.
CTCF_DataDirectory = os.path.join(getDataDir(), "CTCF_binding_sites")

# Each nucleosome map is stored as <nucleosome_maps>/<sub-directory>/<bed file>.
CTCF_NucMapDirectory = os.path.join(CTCF_DataDirectory, "nucleosome_maps")
CTCF_NucMapFilePaths = [
    os.path.join(CTCF_NucMapDirectory, subDirectory, bedFileName)
    for subDirectory, bedFileName in (
        ("hybrid", "hg19_hybrid_nucleosome_map.bed"),
        ("LCL_all_mappable", "hg19_LCL_MNase_nuc_map_all_mappable.bed"),
        ("NHF1_all_mappable", "hg19_NHF1_MNase_nuc_map_all_mappable.bed"),
    )
]

# Binding sites file used as input for the sequence logo section.
CTCF_BindingSitesFilePaths = [
    os.path.join(CTCF_DataDirectory, "sequence_logo", "hg19_CTCF_known_all.bed"),
]

## First, run the nucleosome analysis

### Parse the nucleosome dyad positions for mutperiod input

In [None]:
from mutperiodpy.input_parsing.ParseStandardBed import parseStandardBed

# Parse every nucleosome map bed file (together with the hg19 fasta) for mutperiod.
# The result is passed as the first argument to runAnalysisSuite in a later cell;
# presumably it is a list of the generated mutperiod input file paths -- TODO confirm.
CTCF_MutperiodInputFilePaths = parseStandardBed(CTCF_NucMapFilePaths, hg19FastaFilePath)

### Run the mutperiod pipeline to determine nucleosome positions relative to CTCF binding sites.

In [None]:
from mutperiodpy.RunAnalysisSuite import runAnalysisSuite
# The CTCF binding sites bed file in the mutperiod directory fills the "nucleosome map"
# role here, so the parsed nucleosome dyads are analyzed relative to the binding sites.
nucleosomeMapFilePaths = [
    os.path.join(mutperiodHg19Directory, "hg19_CTCF_known_all", "hg19_CTCF_known_all.bed"),
]

# runAnalysisSuite receives names, not paths: the base name minus its final extension.
nucleosomeMapNames = [os.path.basename(mapFilePath).rsplit('.', 1)[0] for mapFilePath in nucleosomeMapFilePaths]

runAnalysisSuite(
    CTCF_MutperiodInputFilePaths,
    nucleosomeMapNames,
    normalizationMethod="No Normalization",
    customBackgroundDir=None,
    useSingleNucRadius=False,
    includeLinker=False,
    useNucGroupRadius=True,
)

In [None]:
from mutperiodpy.helper_scripts.UsefulFileSystemFunctions import DataTypeStr
from mutperiodpy.RunNucleosomeMutationAnalysis import runNucleosomeMutationAnalysis
from benbiohelpers.FileSystemHandling.DirectoryHandling import getFilesInDirectory

# Collect the files under the nucleosome map directory selected by the raw-counts suffix.
rawNucCountsFilePaths = getFilesInDirectory(CTCF_NucMapDirectory, DataTypeStr.rawNucCounts + ".tsv")

# The results table is written alongside the nucleosome maps.
periodicityOutputFilePath = os.path.join(CTCF_NucMapDirectory, "Nucleosome-relative_CTCF_periodicity_data.tsv")

runNucleosomeMutationAnalysis(
    rawNucCountsFilePaths,
    outputFilePath=periodicityOutputFilePath,
    overridePeakPeriodicityWithExpected=False,
    alignStrands=True,
)

## Next, generate a sequence logo for CTCF binding sites
Note that this analysis can be run independently of the nucleosome analysis; it only requires the setup cell at the top.

### Extend the CTCF binding sites by 10 nucleotides on either side

In [None]:
from benbiohelpers.FileSystemHandling.ExpandSequenceContext import expandSequenceContext
# Number of nucleotides to add on either side of each CTCF binding site.
CTCF_ExpansionRadius = 10
expandedCTCF_FilePaths = expandSequenceContext(
    CTCF_BindingSitesFilePaths,
    hg19FastaFilePath,
    CTCF_ExpansionRadius,
)

### The relevant file is the fasta file in the intermediate_files directory. It should contain few enough sequences to make the logo at https://weblogo.berkeley.edu/logo.cgi