In [None]:
import os
import pandas as pd
from SigProfilerAssignment import Analyzer as Analyze
import matplotlib.pyplot as plt
import gc
import re

In [None]:
# Show the working directory; every file path used below is relative to it.
os.getcwd()

In [None]:
# Load the tab-separated somatic variant table that the rest of the notebook works from.
tmp = pd.read_csv(
    "somatic_variant_v1.tsv",
    sep="\t",
)

In [None]:
# Preview the first rows of the loaded variant table.
tmp.head()

In [None]:
# Count how many rows each value of `id` has, then lay the counts out as a
# two-column table (sample_id, variant_count) ordered as value_counts returns them.
sample_counts = tmp['id'].value_counts()
aa = pd.DataFrame(
    {
        "sample_id": sample_counts.index,
        "variant_count": sample_counts.values,
    }
)

In [None]:
# Display the per-sample variant count table.
aa

In [None]:
# Persist the per-sample counts (no index column); the merge section below reads this file back.
aa.to_csv(
    "variant_per_sample_v1.tsv",
    index=None,
    sep="\t",
)

In [None]:
# Summary statistics of the number of variants per sample.
aa['variant_count'].describe()

In [None]:
# Distinct sample identifiers, in order of first appearance in the variant table.
samples = tmp["id"].unique()

#### Looped mutational signature analysis

In [None]:
# The signature -> aetiology lookup is identical for every sample, so it is read
# once up front (this also fails fast, before any fitting, if the file is missing).
aetiology=pd.read_csv('aetiology_map.tsv',sep='\t')

for sample in samples:
    # Keep this sample's SNVs whose `tumor` value is at least 0.02
    # (presumably the tumour allele fraction -- TODO confirm).
    sub_variants=tmp.loc[(tmp['id']==sample) & (tmp['Variant_type']=="SNV") & (tmp['tumor']>=0.02) ,['Chr','Start','id','Ref','Alt']]
    result_path=f"signature_v3/{sample}"
    # makedirs also creates the parent "signature_v3" folder on a fresh run and
    # is a no-op when the sample folder already exists.
    os.makedirs(result_path,exist_ok=True)
    # Headerless, index-free, tab-separated file that cosmic_fit picks up from result_path.
    sub_variants.to_csv(f"{result_path}/filtered.vcf",sep="\t",index=None,header=None)
    Analyze.cosmic_fit(samples=result_path, 
                   output=f"{result_path}/Assignment",
                   input_type="vcf",
                   context_type="96",
                   genome_build="GRCh37",
                   make_plots=True,
                   sample_reconstruction_plots=True,
                   exclude_signature_subgroups=None,
                   cosmic_version=3.4)
    activities=pd.read_csv(f"{result_path}/Assignment/Assignment_Solution/Activities/Assignment_Solution_Activities.txt",sep='\t')
    # Map each column name (column 0 is skipped) to its value in the first row.
    tmp_signature_assignment={activities.columns[i]:activities.iloc[0,i] for i in range(1,activities.shape[1]) }
    # Compute the total once instead of once per signature.
    total_activity=sum(tmp_signature_assignment.values())
    if total_activity==0:
        # Dividing by zero would yield NaN fractions that cannot be drawn as a pie.
        print(f"{sample}: total signature activity is 0; skipping pie chart")
        continue
    # Convert to fractions of the total, rounded to two decimals.
    tmp_signature_assignment={name:round(count/total_activity,2) for name,count in tmp_signature_assignment.items()}
    plotdata=pd.DataFrame({'signature':list(tmp_signature_assignment.keys()),'freq':list(tmp_signature_assignment.values())})
    # Drop signatures whose rounded fraction is zero so they do not clutter the chart.
    plotdata=plotdata[plotdata['freq']!=0]
    plotdata=pd.merge(plotdata,aetiology,on='signature',how='left')
    # Signatures missing from the aetiology map get a fallback label.
    plotdata.loc[plotdata['aetiology'].isnull(),'aetiology']='Possible sequencing artefact'
    fig1, ax1 = plt.subplots()
    ax1.pie(plotdata['freq'], labels=plotdata['signature']+'\n'+plotdata['aetiology'], autopct='%1.1f%%', startangle=90)
    fig1.savefig(f"{result_path}/pie_chart.pdf", format="pdf", bbox_inches="tight")
    plt.show()
    # Release the figure; otherwise one open figure per sample accumulates in memory.
    plt.close(fig1)

### Merge all signatures

In [None]:
# Build one table of per-sample signature fractions from the saved assignment results.
sample_full=pd.read_csv("variant_per_sample_v1.tsv",sep="\t")
# Collect the per-sample frames and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop copies all previous rows on every
# iteration and concatenates onto an empty frame, which pandas 2.x deprecates.
per_sample_activities=[]
for sample in sample_full['sample_id']:
    result_path=f"signature_v3/{sample}"
    tmp_signature_assignment=pd.read_csv(f"{result_path}/Assignment/Assignment_Solution/Activities/Assignment_Solution_Activities.txt",sep='\t')
    # Label the rows with the sample id used for the folder name
    # (assumes 'Samples' is the file's first column -- TODO confirm).
    tmp_signature_assignment['Samples']=sample
    # Sum of every value outside column 0.
    total=tmp_signature_assignment.iloc[:,1:].values.sum()
    # Turn the SBS columns into fractions of that total in a single step.
    sbs_columns=tmp_signature_assignment.columns[tmp_signature_assignment.columns.str.contains('SBS')]
    tmp_signature_assignment[sbs_columns]=tmp_signature_assignment[sbs_columns]/total
    per_sample_activities.append(tmp_signature_assignment)
# pd.concat raises on an empty list, so fall back to an empty frame as before.
sig_matrix=pd.concat(per_sample_activities,axis=0) if per_sample_activities else pd.DataFrame()

In [None]:
# Preview the combined per-sample signature table.
sig_matrix.head()

In [None]:
# Attach the signature table to the per-sample variant counts; the inner join
# keeps only samples present in both tables.
merged_results = sample_full.merge(
    sig_matrix,
    how="inner",
    left_on="sample_id",
    right_on="Samples",
)

In [None]:
# Preview the variant counts joined with the signature table.
merged_results.head()

#### Summarize QC metrics for the mutational signature assignments

In [None]:
# Gather the per-sample assignment statistics into one table.
# Frames are collected in a list and concatenated once: pd.concat inside the
# loop re-copies all earlier rows each iteration and starts from an empty
# frame, which pandas 2.x deprecates.
per_sample_qc=[]
for sample in sample_full.sample_id:
    result_path=f"signature_v3/{sample}"
    tmp_signature_assignment_quality=pd.read_csv(f"{result_path}/Assignment/Assignment_Solution/Solution_Stats/Assignment_Solution_Samples_Stats.txt",sep='\t')
    # Label the rows with the sample id used for the folder name so they join
    # cleanly with sample_full later.
    tmp_signature_assignment_quality['Sample Names']=sample
    per_sample_qc.append(tmp_signature_assignment_quality)
# pd.concat raises on an empty list, so fall back to an empty frame as before.
sig_qc_table=pd.concat(per_sample_qc,axis=0) if per_sample_qc else pd.DataFrame()

In [None]:
# Preview the combined per-sample QC statistics.
sig_qc_table.head()

#### Combine all metrics

In [None]:
# Join variant counts, QC statistics and signature fractions per sample, drop
# the now-redundant join keys, and give the leading columns report-friendly names.
merged_results = (
    sample_full
    .merge(sig_qc_table, how="inner", left_on='sample_id', right_on='Sample Names')
    .merge(sig_matrix, how="inner", left_on="sample_id", right_on="Samples")
    .drop(columns=['Sample Names', 'Samples'])
    .rename(
        columns={
            'sample_id': 'Sample ID',
            'variant_count': 'Variant count',
            'Total Mutations': 'Total SNV',
        }
    )
)

In [None]:
# Render sample IDs as text, left-padded with zeros to a width of 8 characters.
sample_id_text = merged_results['Sample ID'].astype(str)
merged_results['Sample ID'] = sample_id_text.str.zfill(8)

In [None]:
# Write the combined table to a single-sheet Excel workbook without the index column.
merged_results.to_excel(
    'merged_mutational_signatures_v3.xlsx',
    sheet_name="full metrics",
    index=None,
)