### Set up pipeline with basic imports, data directories, and file basenames.

In [5]:
import os
from benbiohelpers.DataPipelineManagement.GenomeManager import getGenomeFastaFilePath
from bpde_chromatin_analysis.helper_scripts.BPDE_DataDir import getDataDir
from mutperiodpy.helper_scripts.UsefulFileSystemFunctions import getExternalDataDirectory as getMutperiodExternalDataDirectory

# Path to the hg19 genome FASTA file, as resolved by the benbiohelpers genome manager.
hg19FastaFilePath = getGenomeFastaFilePath("hg19")
# mutperiod's external data directory; later cells join nucleosome map directory names onto it.
# NOTE(review): the variable name suggests an hg19-specific directory, but no "hg19" path
# component is added here - confirm what the helper actually returns.
mutperiodHg19Directory = getMutperiodExternalDataDirectory()

# tXR-seq data directory (under the project data directory) and the base bed file
# that is handed to parseXRSeq when no parsed data exists yet.
liDataDirectory = os.path.join(getDataDir(), "Li_tXR-seq")
liBaseDataFilePath = os.path.join(liDataDirectory, "Li_tXR-seq.bed")

# Damage-seq data directory; passed as customBackgroundDir for the "Custom Background" analysis run.
adarDamageSeqDataDir = os.path.join(getDataDir(), "Adar_damage-seq", "A549_BPDE_cell_2h")

### Parse the tXR-seq data for mutperiod input

In [6]:
from mutperiodpy.input_parsing.ParseXRSeq import parseXRSeq, PresetCallParams
from mutperiodpy.helper_scripts.UsefulFileSystemFunctions import DataTypeStr
from benbiohelpers.FileSystemHandling.DirectoryHandling import getFilesInDirectory

# Look for an already-parsed mutperiod input file directly inside the tXR-seq data directory.
# The lookup is performed once and its result reused, so the existence check and the
# value that is actually used can never come from two different directory scans.
# NOTE(review): the `is None` test and the recorded output suggest a non-recursive search
# yields a single path (or None) rather than a list - confirm against benbiohelpers.
existingLiMutperiodInput = getFilesInDirectory(liDataDirectory, DataTypeStr.mutations + ".bed", searchRecursively = False)

if existingLiMutperiodInput is None:
    # Nothing parsed yet: generate the mutperiod input from the base bed file.
    print("Parsed data not found. Generating...")
    liMutperiodInputFilePaths = parseXRSeq([liBaseDataFilePath], PresetCallParams.BPDE, hg19FastaFilePath)
else:
    # Reuse the existing file, wrapped in a list to match the shape used in the other branch.
    liMutperiodInputFilePaths = [existingLiMutperiodInput]
    print(f"Found parsed data at {liMutperiodInputFilePaths[0]}. Continuing.")

Found parsed data at /home/ben/bioinformatics/bpde_chromatin_analysis/BPDE_data/Li_tXR-seq/Li_tXR-seq_trinuc_context_mutations.bed. Continuing.


### Run the mutperiod pipeline to determine translational and rotational periodicity of the data.
NOTE: The "Custom Background" normalization in this analysis requires that the damage-seq data has already been parsed for mutperiod using the `Adar_damage-seq_nucleosome_periodicity_data_generation` notebook.

In [7]:
from mutperiodpy.RunAnalysisSuite import runAnalysisSuite
# Nucleosome maps to analyze, named by the stem shared by each map's directory and bed file
# under the mutperiod external data directory. Commented-out entries are alternative maps
# that are currently disabled; uncomment to include them.
activeNucleosomeMapStems = (
    "hg19_hybrid_nuc_map",
    "hg19_LCL_MNase_nuc_map",
    # "hg19_LCL_MNase_nuc_map_euchromatin",
    # "hg19_LCL_MNase_nuc_map_heterochromatin",
    # "hg19_LCL_MNase_nuc_map_stringent_euchromatin",
    # "hg19_NHF1_MNase_nuc_map",
)
nucleosomeMapFilePaths = [os.path.join(mutperiodHg19Directory, mapStem, mapStem + ".bed")
                          for mapStem in activeNucleosomeMapStems]

# runAnalysisSuite is given map names, i.e. each bed file's basename without its final extension.
nucleosomeMapNames = [mapFileName.rsplit('.', 1)[0]
                      for mapFileName in map(os.path.basename, nucleosomeMapFilePaths)]

# (normalization method, custom background directory) pairs, run in the order listed.
# Commented-out pairs are normalization runs that are currently disabled.
normalizationRuns = (
    # ("No Normalization", None),
    # ("Singlenuc/Dinuc", None),
    ("Custom Background", adarDamageSeqDataDir),
    ("Trinuc/Quadrunuc", None),
)
for method, backgroundDir in normalizationRuns:
    runAnalysisSuite(liMutperiodInputFilePaths, nucleosomeMapNames,
                     normalizationMethod = method, customBackgroundDir = backgroundDir,
                     useSingleNucRadius = True, includeLinker = False, useNucGroupRadius = True)


Counting mutations at each dyad position...

Working with Li_tXR-seq_trinuc_context_mutations.bed
Counting with nucleosome map: hg19_hybrid_nuc_map
Counting mutations at each nucleosome position in a 73 bp radius + 0 bp linker DNA.
Checking input files for proper sorting...
Checking encompassed features file for proper sorting...
Checking encompassing features file for proper sorting...
Counting in chr1
Counting in chr10
Counting in chr11
Counting in chr12
Counting in chr13
Counting in chr14
Counting in chr15
Counting in chr16
Counting in chr17
Counting in chr18
Counting in chr19
Counting in chr2
Counting in chr20
Counting in chr21
Counting in chr22
Counting in chr3
Counting in chr4
Counting in chr5
Counting in chr6
Counting in chr7
Counting in chr8
Counting in chr9
Counting in chrX
Counting mutations at each nucleosome position in a 1000 bp radius.
Checking input files for proper sorting...
Checking encompassed features file for proper sorting...
Checking encompassing features file f

In [None]:
from mutperiodpy.RunNucleosomeMutationAnalysis import runNucleosomeMutationAnalysis

# Disabled variant: the same analysis on the raw (un-normalized) counts. Uncomment to run it.
# rawNucleosomeCountsFilePaths = list()
# for nucleosomeMapName in nucleosomeMapNames:
#     rawNucleosomeCountsFilePaths.extend(
#         getFilesInDirectory(os.path.join(liDataDirectory, nucleosomeMapName), DataTypeStr.rawNucCounts + ".tsv"))
# for rawOutputBasename, alignStrands in (("Li_tXR-seq_raw_periodicity_data.tsv", False),
#                                         ("Li_tXR-seq_raw_aligned_periodicity_data.tsv", True)):
#     runNucleosomeMutationAnalysis(rawNucleosomeCountsFilePaths,
#                                   outputFilePath = os.path.join(liDataDirectory, rawOutputBasename),
#                                   overridePeakPeriodicityWithExpected = False, alignStrands = alignStrands)

# Collect the normalized nucleosome counts files found under each nucleosome map's
# subdirectory of the tXR-seq data directory.
normalizedNucleosomeCountsFilePaths = list()
for nucleosomeMapName in nucleosomeMapNames:
    nucleosomeMapDirectory = os.path.join(liDataDirectory, nucleosomeMapName)
    normalizedNucleosomeCountsFilePaths.extend(
        getFilesInDirectory(nucleosomeMapDirectory, DataTypeStr.normNucCounts + ".tsv"))

# Run the periodicity analysis twice on the same inputs: first without strand alignment,
# then with it, writing each result to its own file in the tXR-seq data directory.
for outputBasename, alignStrands in (("Li_tXR-seq_normalized_periodicity_data.tsv", False),
                                     ("Li_tXR-seq_normalized_aligned_periodicity_data.tsv", True)):
    runNucleosomeMutationAnalysis(normalizedNucleosomeCountsFilePaths,
                                  outputFilePath = os.path.join(liDataDirectory, outputBasename),
                                  overridePeakPeriodicityWithExpected = False, alignStrands = alignStrands)

Generating inputs to run analysis without grouped comparison...
Calling R script...
[1] "Working with Li_tXR-seq_LCL_MNase_map_nuc-group_raw"
[1] "Running periodicity analysis..."
[1] "Working with Li_tXR-seq_LCL_MNase_map_raw"
[1] "Running periodicity analysis..."
[1] "Working with Li_tXR-seq_hybrid_nuc_map_nuc-group_raw"
[1] "Running periodicity analysis..."
[1] "Working with Li_tXR-seq_hybrid_nuc_map_raw"
[1] "Running periodicity analysis..."
Results can be found at /home/ben/bioinformatics/bpde_chromatin_analysis/BPDE_data/Li_tXR-seq/Li_tXR-seq_raw_periodicity_data.tsv
Generating inputs to run analysis without grouped comparison...
Calling R script...
[1] "Working with Li_tXR-seq_LCL_MNase_map_nuc-group_raw"
[1] "Running periodicity analysis..."
[1] "Working with Li_tXR-seq_LCL_MNase_map_raw"
[1] "Running periodicity analysis..."
[1] "Working with Li_tXR-seq_hybrid_nuc_map_nuc-group_raw"
[1] "Running periodicity analysis..."
[1] "Working with Li_tXR-seq_hybrid_nuc_map_raw"
[1] "Run