# Mutational Signature Extraction
In this notebook, we will extract SNV and indel signatures for the metastatic pheochromocytoma and paraganglioma cohort.

In [None]:
# Import required packages
import csv
import os

import pandas as pd
from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen

In [None]:
# Set global variables
read_depth = 8      # threshold compared against each row's Tumor.AltDepth when filtering variants
genome = 'GRCh38'   # SigProfiler spells the reference build "GRCh38" (capital "GRC")
project = 'mPPGL'   # project label written to every input row and passed to the matrix generator

## Create SigProfilerMatrixGenerator Input
We will use basic data wrangling to create the required input for SigProfilerMatrixGenerator.

In [None]:
# Create list of SNV files. Match on the actual extension so that stray files
# such as "x.csv.bak" are not picked up, and sort the names because
# os.listdir() returns entries in arbitrary order.
snvs = sorted(name for name in os.listdir("../../data/raw/snvs/") if name.endswith(".csv"))

# Extract sample names: everything in the file name before its first "."
samples = [snv_file.split(".")[0] for snv_file in snvs]

In [None]:
# Create input files: for every SNV csv, write one tab-separated
# "<sample>.SigProfiler.txt" file containing only the rows that pass the
# Tumor.AltDepth threshold.
outfields = ['Project', 'Sample', 'ID', 'Genome', 'mut_type', 'chrom', 'pos_start', 'pos_end', 'ref', 'alt', 'type']

# Convert the threshold once instead of once per row
min_alt_depth = float(read_depth)

for snv, sample in zip(snvs, samples):
    in_path = "../../data/raw/snvs/{0}".format(snv)
    out_path = "../../data/processed/mutational_signatures/input/{0}.SigProfiler.txt".format(sample)

    # newline='' leaves line-ending handling to the csv module, as the csv
    # documentation recommends for files passed to csv readers and writers.
    with open(in_path, 'r', newline='') as infile, open(out_path, 'w', newline='') as outfile:

        reader = csv.DictReader(infile, delimiter=',')
        # Terminate rows with '\n' rather than a bare carriage return, which
        # many line-oriented tools do not recognise as a line break.
        writer = csv.DictWriter(outfile, delimiter='\t', fieldnames=outfields, lineterminator='\n')
        writer.writeheader()

        for row in reader:

            # Skip rows whose Tumor.AltDepth is below the threshold
            if float(row['Tumor.AltDepth']) < min_alt_depth:
                continue

            # NOTE(review): pos_end is filled with the Start column for every
            # row, regardless of variant class - confirm this is intended.
            writer.writerow({
                'Project': project,
                'Sample': row['Tumor.ID'],
                'ID': '.',
                'Genome': genome,
                'mut_type': row['Variant.Class'],
                'chrom': row['Chr'],
                'pos_start': row['Start'],
                'pos_end': row['Start'],
                'ref': row['REF'],
                'alt': row['ALT'],
                'type': 'SOMATIC',
            })

## Run SigProfilerMatrixGenerator

In [None]:
# Run SigProfilerMatrixGenerator on the input directory that holds the
# per-sample text files; the returned object is kept in `matrices`.
matrices = matGen.SigProfilerMatrixGeneratorFunc(
    project,
    genome,
    "../../data/processed/mutational_signatures/input",
    plot=True,
    exome=False,
    bed_file=None,
    chrom_based=False,
    tsb_stat=True,
    seqInfo=True,
    cushion=100,
)

## Extract SBS Signatures

In [None]:
from SigProfilerExtractor import sigpro as sig

In [None]:
# Load the SBS96 matrix from the matrix generator's output folder.
# pd.read_table parses the file as tab-delimited by default.
df1_path = "../../data/processed/mutational_signatures/input/output/SBS/mPPGL.SBS96.all"
df1 = pd.read_table(df1_path)
df1.head()  # preview the first rows

In [None]:
# Extract SBS signatures from the SBS96 matrix held in df1, testing between
# 1 and 10 signatures on 4 CPUs; results are written to the
# SigProfilerExtractor_SBS folder.
sig.sigProfilerExtractor(
    "matrix",
    "../../data/processed/mutational_signatures/SigProfilerExtractor_SBS",
    df1,
    opportunity_genome="GRCh38",  # build name as SigProfiler spells it (was "GrCh38")
    exome=True,
    minimum_signatures=1,
    maximum_signatures=10,
    cpu=4,
)

In [None]:
# Load the ID83 (indel) matrix from the matrix generator's output folder.
# Read df2_path here: reading df1_path would load the SBS96 matrix again and
# feed it to the indel extraction.
df2_path = "../../data/processed/mutational_signatures/input/output/ID/mPPGL.ID83.all"
df2 = pd.read_table(df2_path)
df2.head()  # preview the first rows

In [None]:
# Extract indel (ID) signatures from the matrix held in df2, testing between
# 1 and 10 signatures on 4 CPUs; results are written to the
# SigProfilerExtractor_ID folder.
sig.sigProfilerExtractor(
    "matrix",
    "../../data/processed/mutational_signatures/SigProfilerExtractor_ID",
    df2,
    opportunity_genome="GRCh38",  # build name as SigProfiler spells it (was "GrCh38")
    exome=True,
    minimum_signatures=1,
    maximum_signatures=10,
    cpu=4,
)