### Set up pipeline with basic imports, data directories, and file basenames.

In [None]:
import os
from benbiohelpers.DataPipelineManagement.GenomeManager import getGenomeFastaFilePath
from bpde_chromatin_analysis.helper_scripts.BPDE_DataDir import getDataDir
from mutperiodpy.helper_scripts.UsefulFileSystemFunctions import getExternalDataDirectory as getMutperiodExternalDataDirectory

# Locations supplied by the helper packages: the hg19 genome fasta file and
# mutperiod's external data directory (used later to find the binding site files).
hg19FastaFilePath = getGenomeFastaFilePath("hg19")
mutperiodHg19Directory = getMutperiodExternalDataDirectory()

# The Alexandrov lung adenocarcinoma data lives in its own subdirectory of the
# project data directory; the base data file is the raw mutation call table.
alexandrovDataDirectory = os.path.join(getDataDir(), "Alexandrov_LUAD")
alexandrovBaseDataFilePath = os.path.join(
    alexandrovDataDirectory,
    "Lung_Adeno_clean_somatic_mutations_for_signature_analysis.txt",
)

### Parse the mutation data for mutperiod input

In [None]:
from mutperiodpy.input_parsing.ParseAlexandrov import parseAlexandrov
from mutperiodpy.helper_scripts.UsefulFileSystemFunctions import DataTypeStr
from benbiohelpers.FileSystemHandling.DirectoryHandling import getFilesInDirectory

# Look for previously parsed mutperiod input exactly once and reuse the result, so that
# the value being checked is the same value that gets used (and the directory is only
# scanned a single time).
# NOTE(review): this code treats the result as either None or a single file path
# (it is wrapped in a one-element list below) - confirm against getFilesInDirectory.
parsedAlexandrovFilePath = getFilesInDirectory(alexandrovDataDirectory, DataTypeStr.mutations + ".bed",
                                               searchRecursively = False)

if parsedAlexandrovFilePath is None:
    # Nothing parsed yet: generate the mutperiod input from the base data file.
    print("Parsed data not found. Generating...")
    alexandrovMutperiodInputFilePaths = parseAlexandrov([alexandrovBaseDataFilePath], hg19FastaFilePath)
else:
    # Reuse the existing parsed file, keeping the same list shape as the branch above.
    alexandrovMutperiodInputFilePaths = [parsedAlexandrovFilePath]
    print(f"Found parsed data at {alexandrovMutperiodInputFilePaths[0]}. Continuing.")

### Run the mutperiod pipeline to count mutations relative to transcription factor binding sites.

In [None]:
from mutperiodpy.RunAnalysisSuite import runAnalysisSuite

# Binding site files to count mutations against; each one sits in a directory
# of the same name inside the mutperiod external data directory.
TFBS_FilePaths = [
    os.path.join(mutperiodHg19Directory, "hg19_CTCF_known", "hg19_CTCF_known.bed"),
    os.path.join(mutperiodHg19Directory, "hg19_ETS_and_GABPA_known", "hg19_ETS_and_GABPA_known.bed"),
    os.path.join(mutperiodHg19Directory, "hg19_SP1_known", "hg19_SP1_known.bed"),
]

# The names handed to the analysis suite are the file basenames without the ".bed" extension.
TFBS_Names = []
for bindingSiteFilePath in TFBS_FilePaths:
    bindingSiteName, _ = os.path.splitext(os.path.basename(bindingSiteFilePath))
    TFBS_Names.append(bindingSiteName)

runAnalysisSuite(
    alexandrovMutperiodInputFilePaths,
    TFBS_Names,
    normalizationMethod = "Trinuc/Quadrunuc",
    customBackgroundDir = None,
    useSingleNucRadius = True,
    includeLinker = False,
    useNucGroupRadius = True,
    useNucStrand = True,
)