In [None]:
import os
import tkinter as tk
from tkinter import filedialog
from FormatMismatchesForRelation import formatMismatchesForRelation, combineSingleAndTandemMismatches
# Ask the user where the project lives. A Tk root window is created for the
# dialog and hidden so that only the directory chooser itself is shown.
dialogRoot = tk.Tk()
dialogRoot.withdraw()
try:
    bioinformaticsDir = filedialog.askdirectory(title = "Choose Bioinformatics Directory")
finally:
    # Dispose of the hidden root so it does not outlive the dialog.
    dialogRoot.destroy()

# Cancelling the dialog yields an empty value. Joining onto that would silently
# build paths relative to the current working directory, so stop here instead.
if not bioinformaticsDir:
    raise ValueError("No bioinformatics directory was selected.")

deaminationDeterminationDataDir = os.path.join(bioinformaticsDir, "deamination_determination", "data")

# Per-cell-type configuration read by the processing cells below.
cellTypes = ("NHF1","Arabidopsis", "yeast")
timepointsByCellType = {"NHF1":["1h","8h","24h","48h"], "Arabidopsis":["ZT2"], "yeast":["20min"]}
readLengthsByCellType = {"NHF1":range(22,31), "Arabidopsis":range(24,31), "yeast":range(22,26)}

# Data directories, keyed by cell type.
mismatchesByReadDirectories = dict()
sequenceAndPositionDirectories = dict()
for cellType in cellTypes:
    mismatchesByReadDirectories[cellType] = os.path.join(deaminationDeterminationDataDir, cellType, "mismatches_by_read")
    sequenceAndPositionDirectories[cellType] = os.path.join(deaminationDeterminationDataDir, cellType, "sequence_and_position_analysis")

In [None]:
# Generate C>T- and CC>TT-specific reads.
# For every cell type / timepoint pair, each mismatch type is formatted twice
# (with and without the z-score file), and the two z-score-filtered outputs are
# then combined into a single file.
for cellType in cellTypes:
    mismatchesByReadDir = mismatchesByReadDirectories[cellType]
    sequenceAndPositionDir = sequenceAndPositionDirectories[cellType]
    readLengths = readLengthsByCellType[cellType]

    for timepoint in timepointsByCellType[cellType]:
        print(f"\nWorking with: {cellType}; {timepoint}")
        formattedMismatchesByReadFilePaths = []

        for mismatchType in ("C_to_T", "CC_to_TT"):
            print(f"Formatting and filtering for {mismatchType} mismatches...")
            mismatchLabel = mismatchType.replace("_to_", '>')  # e.g. "C_to_T" -> "C>T"

            # The C_to_T input file name carries no mismatch-type infix; any
            # other mismatch type is read from its own type-specific file.
            originalFileName = (
                f"{cellType}_CPD_{timepoint}_all_reps_mismatches_by_read_TGG_filtered.bed"
                if mismatchType == "C_to_T" else
                f"{cellType}_CPD_{timepoint}_all_reps_{mismatchType}_mismatches_by_read_TGG_filtered.bed"
            )
            originalMismatchesByReadFilePath = os.path.join(mismatchesByReadDir, originalFileName)
            zScoresFilePath = os.path.join(
                sequenceAndPositionDir,
                f"{cellType}_CPD_{timepoint}_all_reps_{mismatchType}_mismatch_frequency_z-scores_TGG_filtered.tsv",
            )

            # Run using the z-scores file; only this output is combined below.
            formattedMismatchesByReadFilePath = formatMismatchesForRelation(
                originalMismatchesByReadFilePath, zScoresFilePath,
                acceptableMismatchTypes = [mismatchLabel],
                acceptableReadLengths = readLengths,
            )
            formattedMismatchesByReadFilePaths.append(formattedMismatchesByReadFilePath)

            # Also generate results without position filtering
            formatMismatchesForRelation(
                originalMismatchesByReadFilePath, zScoresFilePath = None,
                acceptableMismatchTypes = [mismatchLabel],
                acceptableReadLengths = readLengths,
                outputSuffix = "_relation_formatted_no_position_filtering",
            )

        print("Combining single and tandem mismatches...")
        # Appended in loop order: the C_to_T result first, then CC_to_TT.
        singleMismatchesFilePath, tandemMismatchesFilePath = formattedMismatchesByReadFilePaths
        combinedFormattedMismatchesFilePath = combineSingleAndTandemMismatches(singleMismatchesFilePath, tandemMismatchesFilePath)


In [None]:
# Generate mismatch-omitted reads.
# Every cell type / timepoint pair is formatted with the listed mismatch type
# passed as unacceptable instead of acceptable, and with no z-scores file.
for cellType in cellTypes:
    for timepoint in timepointsByCellType[cellType]:
        print(f"\nWorking with: {cellType}; {timepoint}")

        # The input file does not depend on the mismatch type being excluded.
        originalFileName = f"{cellType}_CPD_{timepoint}_all_reps_mismatches_by_read_TGG_filtered.bed"

        for mismatchType in ("C_to_T",):
            print(f"Formatting and filtering for non-{mismatchType} mismatches...")
            originalMismatchesByReadFilePath = os.path.join(mismatchesByReadDirectories[cellType], originalFileName)
            excludedMismatchLabel = mismatchType.replace("_to_", '>')  # "C_to_T" -> "C>T"
            formatMismatchesForRelation(
                originalMismatchesByReadFilePath, None,
                acceptableMismatchTypes = None,
                unacceptableMismatchTypes = [excludedMismatchLabel],
                filterCompositeMismatches = True,
                acceptableReadLengths = readLengthsByCellType[cellType],
            )