In [None]:
import os
from benbiohelpers.DataPipelineManagement.GenomeManager import getGenomeFastaFilePath
from bpde_chromatin_analysis.helper_scripts.BPDE_DataDir import getDataDir
from mutperiodpy.helper_scripts.UsefulFileSystemFunctions import getExternalDataDirectory as getMutperiodExternalDataDirectory

# hg19 reference resources: the genome fasta and mutperiod's per-genome external data directory.
hg19FastaFilePath = getGenomeFastaFilePath("hg19")
mutperiodHg19Directory = os.path.join(getMutperiodExternalDataDirectory(), "hg19")

# smm-seq inputs: the base mutation calls and the same calls with trinucleotide context.
smmSeqDataDirectory = os.path.join(getDataDir(), "Adar_smm-seq")
smmSeqBaseDataFilePath, smmSeqTrinucDataFilePath = (
    os.path.join(smmSeqDataDirectory, bedFileName)
    for bedFileName in ("smm-seq_BPDE_all_reps.bed", "smm-seq_BPDE_all_reps_trinuc_context.bed")
)

### Generate trinucleotide mutation counts inside and outside nucleosomes

In [None]:
from benbiohelpers.CountThisInThat.Counter import ThisInThatCounter
from benbiohelpers.CountThisInThat.CounterOutputDataHandler import ENCOMPASSED_DATA, OutputDataWriter, CounterOutputDataHandler
from benbiohelpers.CountThisInThat.InputDataStructures import EncompassingDataDefaultStrand

from benbiohelpers.FileSystemHandling.DirectoryHandling import checkDirs

def getCountDerivatives(outputDataWriter: "OutputDataWriter", getHeaders):
    """
    Callback that supplies the extra "Ratio" column for each row of counts.

    It is handed to createOutputDataWriter through the getCountDerivatives keyword.

    outputDataWriter: The writer producing the current row. This function reads its
        totalCounts, previousKeys and outputDataStructure attributes.
    getHeaders: When truthy, the header for the derived column is returned instead of a value.

    Returns a one-element list in every case:
        - ["Ratio"] when headers are requested,
        - ["0"] when the writer's total count is zero (avoids dividing by zero),
        - otherwise the current row's count divided by totalCounts, as a string.
    """
    if getHeaders: return ["Ratio"]

    # With nothing counted there is no meaningful ratio. Return the value in the same
    # list-of-strings shape as the other branches instead of a bare integer.
    if outputDataWriter.totalCounts == 0: return ["0"]

    # NOTE(review): previousKeys presumably holds the stratifier keys of the row being written
    # (Mutated_From, then Mutated_To), with None as the placeholder stratifier's key - confirm
    # against benbiohelpers' OutputDataWriter.
    firstKey = outputDataWriter.previousKeys[0]
    secondKey = outputDataWriter.previousKeys[1]
    rowCount = outputDataWriter.outputDataStructure[firstKey][secondKey][None]
    return [str(rowCount / outputDataWriter.totalCounts)]

class MutationsInNucleosomes(ThisInThatCounter):
    """
    ThisInThatCounter configured to stratify counts by the "Mutated_From" and "Mutated_To"
    columns and to write a "Ratio" column (via getCountDerivatives) next to each count.
    """

    def constructEncompassingFeature(self, line) -> EncompassingDataDefaultStrand:
        """
        Build the encompassing-feature object for one input line, restricted to this
        counter's acceptable chromosomes.
        """
        # NOTE(review): presumably `line` comes from the nucleosome map file - confirm.
        encompassingFeature = EncompassingDataDefaultStrand(line, self.acceptableChromosomes)
        return encompassingFeature

    def initOutputDataHandler(self):
        """
        Create the CounterOutputDataHandler for this counter, passing along only the
        counter's "writeIncrementally" value.
        """
        writeIncrementally = self.writeIncrementally
        self.outputDataHandler = CounterOutputDataHandler(writeIncrementally)

    def setupOutputDataStratifiers(self):
        """
        Register the stratifiers: columns 3 and 4 of the encompassed data as "Mutated_From"
        and "Mutated_To", followed by a placeholder stratifier named "Counts".
        """
        handler = self.outputDataHandler
        for outputName, colIndex in (("Mutated_From", 3), ("Mutated_To", 4)):
            handler.addSimpleEncompassedColStratifier(outputName=outputName, colIndex=colIndex)
        handler.addPlaceholderStratifier(outputName="Counts")

    def setupOutputDataWriter(self):
        """
        Create the output writer for self.outputFilePath, omitting zero rows and using
        getCountDerivatives to produce the derived column.
        """
        handler = self.outputDataHandler
        handler.createOutputDataWriter(
            self.outputFilePath,
            getCountDerivatives=getCountDerivatives,
            omitZeroRows=True,
        )

class MutationsOutsideNucleosomes(ThisInThatCounter):
    """
    ThisInThatCounter whose output data handler is created with trackAllEncompassed and
    countOnlyNonEncompassed enabled. Output is stratified by "Mutated_From" and "Mutated_To",
    with a "Ratio" column (via getCountDerivatives) next to each count.
    """

    def constructEncompassingFeature(self, line) -> EncompassingDataDefaultStrand:
        """
        Build the encompassing-feature object for one input line, restricted to this
        counter's acceptable chromosomes.
        """
        # NOTE(review): presumably `line` comes from the nucleosome map file - confirm.
        encompassingFeature = EncompassingDataDefaultStrand(line, self.acceptableChromosomes)
        return encompassingFeature

    def initOutputDataHandler(self):
        """
        Create the CounterOutputDataHandler with the counter's "writeIncrementally" value
        plus trackAllEncompassed=True and countOnlyNonEncompassed=True.
        """
        # NOTE(review): these two flags presumably restrict counting to mutations that fall
        # in no encompassing feature - confirm against benbiohelpers.
        handlerOptions = dict(trackAllEncompassed=True, countOnlyNonEncompassed=True)
        self.outputDataHandler = CounterOutputDataHandler(self.writeIncrementally, **handlerOptions)

    def setupOutputDataStratifiers(self):
        """
        Register the stratifiers: columns 3 and 4 of the encompassed data as "Mutated_From"
        and "Mutated_To", followed by a placeholder stratifier named "Counts".
        """
        handler = self.outputDataHandler
        for outputName, colIndex in (("Mutated_From", 3), ("Mutated_To", 4)):
            handler.addSimpleEncompassedColStratifier(outputName=outputName, colIndex=colIndex)
        handler.addPlaceholderStratifier(outputName="Counts")

    def setupOutputDataWriter(self):
        """
        Create the output writer for self.outputFilePath, omitting zero rows and using
        getCountDerivatives to produce the derived column.
        """
        handler = self.outputDataHandler
        handler.createOutputDataWriter(
            self.outputFilePath,
            getCountDerivatives=getCountDerivatives,
            omitZeroRows=True,
        )

# Nucleosome maps to count against; each lives at <mutperiod hg19 dir>/<name>/<name>.bed
nucleosomeMapFilePaths = [
    os.path.join(mutperiodHg19Directory, mapDirectoryName, mapDirectoryName + ".bed")
    for mapDirectoryName in ("hg19_hybrid_nuc_map", "hg19_LCL_MNase_nuc_map")
]

# Both dictionaries map a nucleosome map file path to the TRINUCLEOTIDE output file for that map
# (the base-level outputs are written to disk but not recorded here).
mutationsInNucleosomesFilePaths = {}
mutationsOutsideNucleosomesFilePaths = {}

for nucleosomeMapFilePath in nucleosomeMapFilePaths:
    nucleosomeMapName = os.path.basename(nucleosomeMapFilePath).rsplit('.', 1)[0]
    nucleosomeMapOutputDirectory = os.path.join(smmSeqDataDirectory, nucleosomeMapName)
    checkDirs(nucleosomeMapOutputDirectory)

    # Inside-nucleosome counts first, then outside; within each, base data before trinuc data.
    for counterClass, location, trinucOutputFilePaths in (
        (MutationsInNucleosomes, "in", mutationsInNucleosomesFilePaths),
        (MutationsOutsideNucleosomes, "outside", mutationsOutsideNucleosomesFilePaths),
    ):
        for inputFilePath, outputPrefix in ((smmSeqBaseDataFilePath, ""), (smmSeqTrinucDataFilePath, "trinuc_")):
            outputFilePath = os.path.join(
                nucleosomeMapOutputDirectory,
                outputPrefix + "mutations_" + location + "_" + nucleosomeMapName + ".tsv"
            )
            counterClass(inputFilePath, nucleosomeMapFilePath, outputFilePath,
                         encompassingFeatureExtraRadius=73).count()
        # outputFilePath now holds the trinuc output, the last one written by the inner loop.
        trinucOutputFilePaths[nucleosomeMapFilePath] = outputFilePath


### Fill in missing trinucleotide counts

In [None]:
import pandas as pd
def addMissingTrinucContexts(mutationCounts):
    """
    Return a copy of a mutation counts table in which every pyrimidine-centered trinucleotide
    (C or T in the middle) has a row for each of the three possible substitutions.

    Combinations absent from the table are appended with zeros in the two columns after
    "Mutated_From" and "Mutated_To", and the result is sorted by those two columns.

    mutationCounts: DataFrame with "Mutated_From" (trinucleotide) and "Mutated_To" columns.
        Appended rows carry four values, so the table is assumed to have four columns
        (presumably Mutated_From, Mutated_To, Counts, Ratio - confirm against the counter output).
    """
    nucleotides = ['A','C','T','G']
    observedMutations = set(zip(mutationCounts["Mutated_From"], mutationCounts["Mutated_To"]))

    missedContexts = [
        [fivePrimeNucleotide + mutatedFrom + threePrimeNucleotide, mutatedTo, 0, 0]
        for fivePrimeNucleotide in nucleotides
        for mutatedFrom in ['C','T']
        for threePrimeNucleotide in nucleotides
        for mutatedTo in nucleotides
        if mutatedTo != mutatedFrom
        and (fivePrimeNucleotide + mutatedFrom + threePrimeNucleotide, mutatedTo) not in observedMutations
    ]
    missedContexts = pd.DataFrame(missedContexts, columns=mutationCounts.columns)

    combinedCounts = pd.concat([mutationCounts, missedContexts], ignore_index=True)
    return combinedCounts.sort_values(by=["Mutated_From", "Mutated_To"])


for nucleosomeMapFilePath in nucleosomeMapFilePaths:
    inNucleosomesFilePath = mutationsInNucleosomesFilePaths[nucleosomeMapFilePath]
    outsideNucleosomesFilePath = mutationsOutsideNucleosomesFilePaths[nucleosomeMapFilePath]

    # Read both tables before writing anything, so a failure leaves both files untouched.
    mutationsInNucleosomes = pd.read_table(inNucleosomesFilePath)
    mutationsOutsideNucleosomes = pd.read_table(outsideNucleosomesFilePath)

    mutationsInNucleosomes = addMissingTrinucContexts(mutationsInNucleosomes)
    mutationsOutsideNucleosomes = addMissingTrinucContexts(mutationsOutsideNucleosomes)

    # Overwrite the original files in place with the completed tables.
    mutationsInNucleosomes.to_csv(inNucleosomesFilePath, sep = '\t', index = False)
    mutationsOutsideNucleosomes.to_csv(outsideNucleosomesFilePath, sep = '\t', index = False)

### Compute cosine similarity on mutation signatures inside and outside nucleosomes

In [None]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
for nucleosomeMapFilePath in nucleosomeMapFilePaths:
    # Load the trinucleotide tables for this nucleosome map as currently stored on disk.
    mutationsInNucleosomes = pd.read_table(mutationsInNucleosomesFilePaths[nucleosomeMapFilePath])
    mutationsOutsideNucleosomes = pd.read_table(mutationsOutsideNucleosomesFilePaths[nucleosomeMapFilePath])

    # Align the two tables on mutation type; a type present in only one table gets 0 in the other.
    mergedMutations = mutationsInNucleosomes.merge(
        mutationsOutsideNucleosomes,
        how="outer",
        on=["Mutated_From", "Mutated_To"],
        sort=True,
        suffixes=("_In_Nucleosomes", "_Outside_Nucleosomes"),
    )
    mergedMutations = mergedMutations.fillna(0)

    mapLabel = os.path.basename(nucleosomeMapFilePath).rsplit('.', 1)[0]
    print(mapLabel, ":", sep='')

    # Cosine similarity = dot product of the two ratio vectors over the product of their norms.
    ratioIn = mergedMutations["Ratio_In_Nucleosomes"].to_numpy()
    ratioOut = mergedMutations["Ratio_Outside_Nucleosomes"].to_numpy()
    normProduct = norm(ratioIn) * norm(ratioOut)
    cosineSimilarity = np.dot(ratioIn, ratioOut) / normProduct

    print("Cosine similarity (mutations inside vs. outside nucleosomes):", cosineSimilarity)
    print()