In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report, precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay, confusion_matrix, ConfusionMatrixDisplay, mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, median_absolute_error, explained_variance_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
Mutational Signature Dataset - Tally file

In [2]:
#############################
#### Signature Database #####
#############################
folder_path = '../../../Downloads/new_sigs/'
sig_data = pd.DataFrame()

# Loop through all CSV files in the folder
for filename in os.listdir(folder_path):
    # Filter to match the pattern 'SBS96_catalogue.<sample_name>.hg19.tally.csv' -  only doing sbs first
    if filename.startswith('SBS96_catalogue.') and filename.endswith('.hg19.tally.csv'):
        file_path = os.path.join(folder_path, filename)
        data = pd.read_csv(file_path, index_col=None)

        # Extract the sample name from the filename (e.g., 'TCGA-CA-6717-01')
        sample_name = filename.split('.')[1]
        select_data = data.drop(columns=['type', 'count'])

        # Pivot the DataFrame so that 'channel' becomes the columns
        pivot_data = select_data.pivot_table(index=None, columns="channel", values="fraction").reset_index(drop=True)
        pivot_data.insert(0, 'Sample', sample_name)
        sig_data = pd.concat([sig_data, pivot_data], ignore_index=True)

# Add final column to data (empty) - maybe dont do it here??
sig_data["p53 status"] = ""

MAF File: https://xenabrowser.net/datapages/?dataset=October_2016_whitelist_2583.snv_mnv_indel.maf.xena.nonUS&host=https%3A%2F%2Fpcawg.xenahubs.net&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443

In [3]:
#############################
######### MAF FILE ##########
#############################
maf_file_path = '../../../Downloads/October_2016_whitelist_2583.snv_mnv_indel.maf.xena.nonUS'
maf_data = pd.read_csv(maf_file_path, sep='\t', comment='#', low_memory=False)

filtered_data = maf_data[(maf_data['start'] == maf_data['end']) & (maf_data['alt'] != '-')]
filtered_data['HG19_Variant'] = 'chr17:g.' + \
                                (filtered_data['start'] - 1).astype(str) + \
                                filtered_data['reference'] + '>' + \
                                filtered_data['alt']

maf_filtered = filtered_data[['Sample', 'HG19_Variant', 'gene', 'effect']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['HG19_Variant'] = 'chr17:g.' + \


Dataset: Download at https://p53.fr/download-the-database - Using .xlsx EU Variants

In [4]:
#############################
####### TP53 Database #######
#############################
new_file_tester =  '../../../Downloads/UMD_variants_EU.xlsx'
tester_data =  pd.read_excel(new_file_tester)

pattern = r'^chr17:g\.\d+[A, C, G, T]>[A, C, G, T]$'
tester_data_filtered = tester_data[tester_data['HG19_Variant'].str.contains(pattern, regex=True)]
tester_data_filtered['type'] = tester_data_filtered['Mutational_event']

  warn(msg)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tester_data_filtered['type'] = tester_data_filtered['Mutational_event']


In [5]:
p53_db = tester_data_filtered[['HG19_Variant', 'Pathogenicity', 'Final comment']]

In [6]:
#############################
###### Join Dataframes ######
#############################
merged_data = pd.merge(maf_filtered, p53_db[['HG19_Variant', 'Pathogenicity', 'Final comment']], 
                       on='HG19_Variant', how='left')
merged_data.fillna({ 'Pathogenicity' : 'Unknown' }, inplace=True)
merged_data.fillna({ 'Final comment' : 'No Comment' }, inplace=True)

In [35]:
def find_tp53_genes(df):
    rows = df[df['gene'] == 'TP53']
    return rows

def assign_pathogenicity_status(df):
    pathogenic_rows = df[df['Pathogenicity'].isin(['Pathogenic', 'Likely Pathogenic'])]
    return pathogenic_rows

all_tp53_genes = find_tp53_genes(merged_data)                    
pathogenic_tp53 = assign_pathogenicity_status(all_tp53_genes[['HG19_Variant', 'Pathogenicity']])

In [39]:
all_tp53_pathogenicity = all_tp53_genes[['HG19_Variant', 'Pathogenicity']]

print("\n--------------------------\n")
print(all_tp53_pathogenicity)          # 569 TP53 genes
print("\n--------------------------\n")
print(pathogenic_tp53)                 # 55 Pathogenic or Likely Pathogenic TP53
print("\n--------------------------\n")


--------------------------

                HG19_Variant Pathogenicity
4925      chr17:g.7577057C>A       Unknown
45681     chr17:g.7577559A>G       Unknown
92006     chr17:g.7578554C>T       Unknown
128974    chr17:g.7577546C>T           VUS
142812    chr17:g.7577555C>A    Pathogenic
...                      ...           ...
20549255  chr17:g.7578554C>T       Unknown
20595734  chr17:g.7577547C>T       Unknown
20605153  chr17:g.7584138G>A       Unknown
20612527  chr17:g.7578280G>A       Unknown
20619304  chr17:g.7577134T>A       Unknown

[569 rows x 2 columns]

--------------------------

                HG19_Variant      Pathogenicity
142812    chr17:g.7577555C>A         Pathogenic
260865    chr17:g.7578265T>A  Likely Pathogenic
1091836   chr17:g.7578211G>A         Pathogenic
1707413   chr17:g.7577538G>A         Pathogenic
2022113   chr17:g.7578211G>A         Pathogenic
2093424   chr17:g.7578211G>A         Pathogenic
2492096   chr17:g.7578211G>A         Pathogenic
2698036   chr17:g.

In [40]:
##########################################
####### Sample-Level Summarisation #######
##########################################
def group_variants_sample(df):
    grouped = df.groupby('Sample')

    new_frame = grouped.apply(
        lambda group: pd.Series({
            'HG19_Variants': group['HG19_Variant'].tolist()
        }), include_groups=False
    ).reset_index()

    return new_frame

reduced_filtered_data = filtered_data[['Sample', 'HG19_Variant']]
list_hg19_sample = group_variants_sample(reduced_filtered_data)

In [43]:
def sample_level_summary(df):
    # Convert relevant columns to sets for fast lookup
    pathogenic_set = set(pathogenic_tp53['HG19_Variant'])
    tp53_set = set(all_tp53_genes['HG19_Variant'])

    def classify_row(variant_list):
        if any(variant in pathogenic_set for variant in variant_list):
            return "Pathogenic"
        elif any(variant in tp53_set for variant in variant_list):
            return "Unknown"
        else:
            return "Not Pathogenic"

    df['Pathogenicity'] = df['HG19_Variants'].apply(classify_row)
    return df

value = sample_level_summary(list_hg19_sample)
sample_level_df = value[['Sample',  'Pathogenicity']]

In [44]:
print(sample_level_df)

      Sample   Pathogenicity
0     DO1000  Not Pathogenic
1     DO1001      Pathogenic
2     DO1002  Not Pathogenic
3     DO1003  Not Pathogenic
4     DO1004  Not Pathogenic
...      ...             ...
1777  DO7196  Not Pathogenic
1778  DO7214  Not Pathogenic
1779  DO7280  Not Pathogenic
1780  DO7304  Not Pathogenic
1781  DO7328  Not Pathogenic

[1782 rows x 2 columns]


Sample Level Data Frame: 1782 Samples

        - 56 Pathogenic
        - 1276 Benign
        - 450 Unknown