In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay, confusion_matrix, ConfusionMatrixDisplay, mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, median_absolute_error, explained_variance_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
#############################
#### Signature Database #####
#############################
folder_path = '../../../Downloads/new_sigs/'
sig_data = pd.DataFrame()

# Loop through all CSV files in the folder
for filename in os.listdir(folder_path):
    # Filter to match the pattern 'SBS96_catalogue.<sample_name>.hg19.tally.csv' -  only doing sbs first
    if filename.startswith('SBS96_catalogue.') and filename.endswith('.hg19.tally.csv'):
        file_path = os.path.join(folder_path, filename)
        data = pd.read_csv(file_path, index_col=None)

        # Extract the sample name from the filename (e.g., 'TCGA-CA-6717-01')
        sample_name = filename.split('.')[1]
        select_data = data.drop(columns=['type', 'count'])

        # Pivot the DataFrame so that 'channel' becomes the columns
        pivot_data = select_data.pivot_table(index=None, columns="channel", values="fraction").reset_index(drop=True)
        pivot_data.insert(0, 'sample', sample_name)
        sig_data = pd.concat([sig_data, pivot_data], ignore_index=True)

# Add final column to data (empty) - maybe dont do it here??
sig_data["p53 status"] = ""

# Display the final DataFrame or save it to a file
print("\n--------------------------\n")
print(sig_data)
print("\n--------------------------\n")


--------------------------

channel           sample   A[C>A]A   A[C>A]C   A[C>A]G  A[C>A]T   A[C>G]A  \
0        TCGA-A2-A0T5-01  0.000000  0.000000  0.000000  0.00000  0.000000   
1        TCGA-CF-A9FF-01  0.000000  0.010811  0.010811  0.00000  0.000000   
2        TCGA-CA-6717-01  0.000848  0.001774  0.000309  0.01049  0.000848   

channel   A[C>G]C   A[C>G]G  A[C>G]T   A[C>T]A  ...   T[T>A]T   T[T>C]A  \
0        0.000000  0.000000  0.00000  0.000000  ...  0.000000  0.000000   
1        0.000000  0.010811  0.00000  0.010811  ...  0.000000  0.005405   
2        0.000848  0.000000  0.00054  0.001465  ...  0.002854  0.006788   

channel   T[T>C]C   T[T>C]G   T[T>C]T   T[T>G]A   T[T>G]C  T[T>G]G   T[T>G]T  \
0        0.000000  0.000000  0.000000  0.000000  0.000000   0.0000  0.000000   
1        0.000000  0.000000  0.000000  0.000000  0.000000   0.0000  0.005405   
2        0.017123  0.009873  0.017663  0.007636  0.009873   0.0027  0.082298   

channel  p53 status  
0                 

In [3]:
#############################
######### MAF FILE ##########
#############################
maf_file_path = '../../../Downloads/October_2016_whitelist_2583.snv_mnv_indel.maf.xena.nonUS'
maf_data = pd.read_csv(maf_file_path, sep='\t', comment='#', low_memory=False)

filtered_data = maf_data[(maf_data['start'] == maf_data['end']) & (maf_data['alt'] != '-')]
filtered_data['HG19_Variant'] = 'chr17:g.' + \
                                (filtered_data['start'] - 1).astype(str) + \
                                filtered_data['reference'] + '>' + \
                                filtered_data['alt']

maf_filtered = filtered_data[['Sample', 'HG19_Variant', 'gene', 'effect']]

print(maf_filtered)
print("\n--------------------------\n")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['HG19_Variant'] = 'chr17:g.' + \


           Sample          HG19_Variant      gene             effect
0         DO46416    chr17:g.1230447G>A     ACAP3             Intron
1         DO46416    chr17:g.1609722C>T  SLC35E2B             Intron
2         DO46416    chr17:g.1903275C>T   Unknown                IGR
3         DO46416    chr17:g.2574998C>T     TTC34             Intron
4         DO46416    chr17:g.3151249G>A    PRDM16             Intron
...           ...                   ...       ...                ...
21980512  DO51503  chr17:g.150611902G>A   Unknown                IGR
21980513  DO51503  chr17:g.151231465T>C   Unknown                IGR
21980514  DO51503  chr17:g.151815421C>T     GABRQ  Missense_Mutation
21980515  DO51503  chr17:g.152000524T>C     NSDHL             Intron
21980516  DO51503  chr17:g.152538786G>A   Unknown                IGR

[20621575 rows x 4 columns]

--------------------------



In [4]:
#############################
####### TP53 Database #######
#############################
new_file_tester =  '../../../Downloads/UMD_variants_EU.xlsx'
tester_data =  pd.read_excel(new_file_tester)

pattern = r'^chr17:g\.\d+[A, C, G, T]>[A, C, G, T]$'
tester_data_filtered = tester_data[tester_data['HG19_Variant'].str.contains(pattern, regex=True)]
p53_db = tester_data_filtered[['HG19_Variant', 'COSMIC_ID', 'Pathogenicity', 'Final comment']]

print(p53_db)
print("\n--------------------------\n")

            HG19_Variant    COSMIC_ID        Pathogenicity  \
0     chr17:g.7578406G>A    COSM10648           Pathogenic   
1     chr17:g.7577538G>A    COSM10662           Pathogenic   
2     chr17:g.7577120G>A    COSM10660           Pathogenic   
3     chr17:g.7577121C>T    COSM10659           Pathogenic   
4     chr17:g.7577539C>T    COSM10656           Pathogenic   
...                  ...          ...                  ...   
6857  chr17:g.7574040G>A          NaN               Benign   
6859  chr17:g.7574041T>A  COSM3773295               Benign   
6862  chr17:g.7574032T>C          NaN  Possibly pathogenic   
6863  chr17:g.7574031C>G          NaN                  VUS   
6867  chr17:g.7574029G>C  COSM4749449                  VUS   

                                          Final comment  
0     Published research as well as database analysi...  
1     Published research as well as database analysi...  
2     Published research as well as database analysi...  
3     Published researc

  warn(msg)


In [5]:
#############################
###### Join Dataframes ######
#############################
merged_data = pd.merge(maf_filtered, p53_db[['HG19_Variant', 'COSMIC_ID', 'Pathogenicity', 'Final comment']], 
                       on='HG19_Variant', how='left')
merged_data.fillna({ 'Pathogenicity' : 'Unknown' }, inplace=True)
merged_data.fillna({ 'Final comment' : 'No Comment' }, inplace=True)

pathogenicity_mapping = {
    'Pathogenic': 1,
    'Likely Pathogenic': 0.75,
    'Possibly pathogenic': 0.5,
    'VUS': 0.25,
    'Benign': 0,
    'Unknown': -1,
}

merged_data['Pathogenicity Score'] = merged_data['Pathogenicity'].map(pathogenicity_mapping)
formatted_merged = merged_data[['HG19_Variant', 'gene', 'effect', 'Pathogenicity', 'Final comment', 'Pathogenicity Score']]

print(formatted_merged[(formatted_merged['Pathogenicity Score'] != -1.0) & (formatted_merged['gene'] == 'TP53')])
print("\n--------------------------\n")

                HG19_Variant  gene             effect        Pathogenicity  \
128974    chr17:g.7577546C>T  TP53  Missense_Mutation                  VUS   
142812    chr17:g.7577555C>A  TP53  Missense_Mutation           Pathogenic   
192463    chr17:g.7577093G>A  TP53  Missense_Mutation  Possibly pathogenic   
260865    chr17:g.7578265T>A  TP53  Missense_Mutation    Likely Pathogenic   
441994    chr17:g.7577093G>A  TP53  Missense_Mutation  Possibly pathogenic   
...                      ...   ...                ...                  ...   
20327269  chr17:g.7578393T>C  TP53  Missense_Mutation               Benign   
20345243  chr17:g.7574017G>A  TP53  Missense_Mutation  Possibly pathogenic   
20389252  chr17:g.7578405C>T  TP53  Missense_Mutation               Benign   
20399949  chr17:g.7578211G>A  TP53  Nonsense_Mutation           Pathogenic   
20491625  chr17:g.7578507C>T  TP53  Missense_Mutation                  VUS   

                                              Final comment  \


In [None]:
##############################
###### Machine Learning ######
##############################