In [1]:
import os
import tkinter as tk
from tkinter import filedialog
from python.RelateMismatchesToFeature import relateMismatchesToFeature
from FormatMismatchesForRelation import formatMismatchesForRelation, combineSingleAndTandemMismatches
# --- Locate the project root -------------------------------------------------
# Create and hide the Tk root window so that only the directory dialog is shown.
tk.Tk().withdraw()
bioinformaticsDir = filedialog.askdirectory(title = "Choose Bioinformatics Directory")
# A cancelled dialog yields an empty value. Without this check, every path built
# below would silently become relative to the current working directory and the
# failure would only surface later, inside the processing cell.
if not bioinformaticsDir:
    raise ValueError("No bioinformatics directory was selected; cannot build the data paths.")

# --- Genomic feature files ---------------------------------------------------
# Keys of the two dictionaries below double as labels: the processing cell prints
# them and embeds them in the output suffix passed to relateMismatchesToFeature.
deaminationDeterminationDataDir = os.path.join(bioinformaticsDir, "deamination_determination", "data")
TFBS_Dir = os.path.join(deaminationDeterminationDataDir, "TFBS")
TFBS_MidpointFilePaths = {
    "CTCF_known1_DHS": os.path.join(TFBS_Dir, "CTCF_known1_DHS_midpoint.bed"),
    "CTCF_known1_non-DHS": os.path.join(TFBS_Dir, "CTCF_known1_non-DHS_midpoint.bed"),
}

mutperiodExternalDataDir = os.path.join(bioinformaticsDir, "mutperiod", "mutperiod_data", "__external_data")
nucleosomeFilePaths = {
    "hg19_mnase_all": os.path.join(mutperiodExternalDataDir, "hg19", "hg19_MNase_nucleosome_map_all",
                                   "hg19_MNase_nucleosome_map_all.bed"),
}

# --- Samples to process ------------------------------------------------------
cellTypes = ("NHF1",)
timepointsByCellType = {"NHF1":["1h", "8h", "24h", "48h"]}

# Per-cell-type input directories, keyed by cell type.
mismatchesByReadDirectories = dict()
sequenceAndPositionDirectories = dict()
for cellType in cellTypes:
    mismatchesByReadDirectories[cellType] = os.path.join(deaminationDeterminationDataDir, cellType, "mismatches_by_read")
    sequenceAndPositionDirectories[cellType] = os.path.join(deaminationDeterminationDataDir, cellType, "sequence_and_position_analysis")

In [2]:
# For every cell type and timepoint: format the single-base (C>T) and tandem (CC>TT)
# mismatch files, combine the two results, then relate the combined file to each
# TFBS midpoint file and each nucleosome map.
for cellType in cellTypes:
    for timepoint in timepointsByCellType[cellType]:

        print(f"\nWorking with: {cellType}; {timepoint}")
        print("Formatting and filtering...")

        # Collects one result per mismatch type, single-base first and tandem second.
        formattedMismatchesByReadFilePaths = []
        for mismatchType in ("C_to_T", "CC_to_TT"):

            # Only the tandem input file carries the mismatch type in its name.
            if mismatchType != "C_to_T":
                originalFileName = f"{cellType}_CPD_{timepoint}_all_reps_{mismatchType}_mismatches_by_read_TGG_filtered.bed"
            else:
                originalFileName = f"{cellType}_CPD_{timepoint}_all_reps_mismatches_by_read_TGG_filtered.bed"
            originalMismatchesByReadFilePath = os.path.join(mismatchesByReadDirectories[cellType], originalFileName)

            # The z-scores file name always includes the mismatch type.
            zScoresFileName = f"{cellType}_CPD_{timepoint}_all_reps_{mismatchType}_mismatch_frequency_z-scores_TGG_filtered.tsv"
            zScoresFilePath = os.path.join(sequenceAndPositionDirectories[cellType], zScoresFileName)

            # "C_to_T" becomes "C>T" (and "CC_to_TT" becomes "CC>TT") for the type filter.
            # NOTE(review): the return value is used as a file path below - confirm against the helper.
            formattedFilePath = formatMismatchesForRelation(
                originalMismatchesByReadFilePath, zScoresFilePath,
                acceptableMismatchTypes = [mismatchType.replace("_to_",'>')], sortOutput = False
            )
            formattedMismatchesByReadFilePaths.append(formattedFilePath)

        print("Combining single and tandem mismatches...")
        singleMismatchesFilePath, tandemMismatchesFilePath = formattedMismatchesByReadFilePaths
        combinedFormattedMismatchesFilePath = combineSingleAndTandemMismatches(singleMismatchesFilePath,
                                                                               tandemMismatchesFilePath)

        print("Relating genomic features...")
        # Each dictionary key labels the feature set and is embedded in the output suffix.
        for TFBS_Midpoint, TFBS_MidpointFilePath in TFBS_MidpointFilePaths.items():
            print(f"\tWorking with {TFBS_Midpoint}")
            relateMismatchesToFeature(combinedFormattedMismatchesFilePath, TFBS_MidpointFilePath,
                                      outputSuffix = f"_{TFBS_Midpoint}_related", enforcedStrand = '+')

        for nucleosomeMap, nucleosomeFilePath in nucleosomeFilePaths.items():
            print(f"\tWorking with {nucleosomeMap}")
            relateMismatchesToFeature(combinedFormattedMismatchesFilePath, nucleosomeFilePath,
                                      outputSuffix = f"_{nucleosomeMap}_related", enforcedStrand = '+')



Working with: NHF1; 1h
Formatting and filtering...
Combining single and tandem mismatches...
Relating genomic features...
	Working with CTCF_known1_DHS
	Working with CTCF_known1_non-DHS
	Working with hg19_mnase_all

Working with: NHF1; 8h
Formatting and filtering...
Combining single and tandem mismatches...
Relating genomic features...
	Working with CTCF_known1_DHS
	Working with CTCF_known1_non-DHS
	Working with hg19_mnase_all

Working with: NHF1; 24h
Formatting and filtering...
Combining single and tandem mismatches...
Relating genomic features...
	Working with CTCF_known1_DHS
	Working with CTCF_known1_non-DHS
	Working with hg19_mnase_all

Working with: NHF1; 48h
Formatting and filtering...
Combining single and tandem mismatches...
Relating genomic features...
	Working with CTCF_known1_DHS
	Working with CTCF_known1_non-DHS
	Working with hg19_mnase_all
