In [1]:
import pandas as pd
import os
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [5]:
# Directories containing the RNA and DNA input files, each listed once up front.
rna_folder = '../../DNA2RNA_ML/Data/RNA/'
rna_files = os.listdir(rna_folder)

dna_folder = '../../DNA2RNA_ML/Data/DNA/'
dna_files = os.listdir(dna_folder)

#### Map DNA to RNA file

In [6]:
terra_folder = '../../DNA2RNA_ML/Data/Terra_download_commands/'
terra_files_list = []

for file in os.listdir(terra_folder):
    # Only the tab-separated tables in this folder are read.
    if not file.endswith('.tsv'):
        continue
    terra_df = pd.read_csv(os.path.join(terra_folder, file), sep = '\t')

    # Tables whose name starts with 'LUAD' keep the DNA path in a different column.
    if file.startswith('LUAD'):
        dna_path_column = 'maf_file_capture_oxoG_filtered'
    else:
        dna_path_column = 'maf_file_capture_novo_realign_filtered'

    # Reduce each path to its last '/'-separated component (the file name).
    terra_df['DNA_file'] = terra_df[dna_path_column].str.split('/').str[-1]
    terra_df['RNA_file'] = terra_df['maf_file_rna_final_paper_v1_3'].str.split('/').str[-1]
    terra_files_list.append(terra_df[['DNA_file', 'RNA_file']])

# Stack all tables and keep only the rows where both file names are present.
terra_df = pd.concat(terra_files_list)
terra_df = terra_df[terra_df['DNA_file'].notna() & terra_df['RNA_file'].notna()]

# Map each DNA file name to the list of RNA file names paired with it,
# in the order the pairs appear in the stacked table.
dna_to_rna = {}
for dna, rna in zip(terra_df['DNA_file'], terra_df['RNA_file']):
    dna_to_rna.setdefault(dna, []).append(rna)

#### Create a merged dataframe for DNA and RNA data

In [None]:
# Build one labelled DNA mutation table per (DNA file, RNA file) pair and stack them.
# Each DNA row is flagged with whether the same Chromosome/Start_position/
# Tumor_Seq_Allele2 combination also occurs in the paired RNA file.
merged_df_list = []
chosen_columns = [
    'Hugo_Symbol', 'Entrez_Gene_Id', 'Tumor_Sample_Barcode', 'Chromosome',
    'Start_position', 'Variant_Classification', 'Reference_Allele',
    'Tumor_Seq_Allele2', 'Transcript_Exon', 'Transcript_Position',
    'cDNA_Change', 'Codon_Change', 'Protein_Change',
    'COSMIC_tissue_types_affected', 'COSMIC_total_alterations_in_gene',
    'ref_context', 'gc_content', 'i_COSMIC_n_overlapping_mutations',
    'i_init_t_lod', 'i_t_lod_fstar', 't_alt_count', 't_ref_count', 'i_tumor_f',
]

# Number of leading lines to skip in a DNA file, keyed by file-name suffix.
dna_skiprows_by_suffix = {'.maf.annotated': 3, '.maf': 1, '.txt': 1}

for dna_file in dna_to_rna:
    dna_skiprows = next(
        (rows for suffix, rows in dna_skiprows_by_suffix.items() if dna_file.endswith(suffix)),
        None,
    )
    if dna_skiprows is None:
        # Without this guard the frame read for the previous sample would be
        # reused (or a NameError raised on the very first pair).
        print(dna_file, "has an unsupported DNA file extension, skipping")
        continue

    # NOTE(review): a DNA file paired with several RNA files contributes one
    # frame per pair, each with its own Appears_in_rna labels - confirm that
    # any later de-duplication keeps the intended label.
    for rna_file in dna_to_rna[dna_file]:
        if not rna_file.endswith('.txt'):
            # Same guard for the RNA side: never label against a stale RNA frame.
            print(rna_file, "has an unsupported RNA file extension, skipping")
            continue

        rna_sample_df = pd.read_csv(os.path.join(rna_folder, rna_file), sep = '\t', encoding = 'latin1', usecols = chosen_columns)
        dna_sample_df = pd.read_csv(os.path.join(dna_folder, dna_file), sep = '\t', encoding = 'latin1', skiprows = dna_skiprows, usecols = chosen_columns)

        # Index both tables by the same 'Chromosome_Start_position_Tumor_Seq_Allele2' key.
        for sample_df in (dna_sample_df, rna_sample_df):
            sample_df.index = sample_df['Chromosome'].astype(str) + '_' + sample_df['Start_position'].astype(str) + '_' + sample_df['Tumor_Seq_Allele2'].astype(str)

        dna_sample_df['Appears_in_rna'] = dna_sample_df.index.isin(rna_sample_df.index)
        merged_df_list.append(dna_sample_df)

merged_df = pd.concat(merged_df_list)

In [74]:
# Columns whose cells may hold several '|'-separated values; only the first
# value is kept and converted to float.
numeric_annotation_columns = ['i_init_t_lod','i_t_lod_fstar','t_alt_count', 't_ref_count', 'i_tumor_f']
for col in numeric_annotation_columns:
    as_text = merged_df[col].astype(str) + '|'
    first_value = as_text.str.split('|').str[0]
    merged_df[col] = first_value.astype(float)

#### Format data for full model

In [75]:
# Drop the 'i_' prefix from the COSMIC overlap column name (literal match, not a regex).
merged_df.columns = merged_df.columns.str.replace('i_COSMIC_n_overlapping_mutations', 'COSMIC_n_overlapping_mutations', regex=False)
# Extend the row key with the sample barcode.
# Gotcha: re-running this cell appends the barcode to the key a second time.
merged_df.index = merged_df.index + '__' + merged_df['Tumor_Sample_Barcode']
# Keep the first row for every key; .copy() makes the result an independent
# frame so the column assignment below does not raise SettingWithCopyWarning.
merged_df = merged_df[~merged_df.index.duplicated()].copy()
# Cancer type is the part of the barcode before the first '-'.
merged_df['Cancer_type'] = merged_df['Tumor_Sample_Barcode'].str.split('-').str[0]

In [76]:
# Accepted chromosome labels: '1' through '22' plus 'X' and 'Y'.
chromosomes = {str(number) for number in range(1, 23)} | {'X', 'Y'}

# Compare as strings, then keep only the rows on an accepted chromosome.
merged_df['Chromosome'] = merged_df['Chromosome'].astype(str)
on_accepted_chromosome = merged_df['Chromosome'].isin(chromosomes)
merged_df = merged_df[on_accepted_chromosome]

#### Filter by variant type (TODO: confirm this filter is appropriate)

In [77]:
# Fraction of rows flagged Appears_in_rna within each variant classification.
variant_removal = merged_df[['Variant_Classification', 'Appears_in_rna']].groupby('Variant_Classification').mean()
# Keep only the classifications where that fraction is not zero.
non_zero_relevant = list(variant_removal[variant_removal['Appears_in_rna'] != 0].index)
# TODO: remove RNA, and lincRNA?
# .copy() makes the filtered frame independent so the column assignments below
# do not raise SettingWithCopyWarning.
merged_df = merged_df[merged_df['Variant_Classification'].isin(non_zero_relevant)].copy()

# Upper-cased characters at positions 9 and 11 of ref_context.
# NOTE(review): presumably the bases flanking the mutated position at index 10 - confirm.
merged_df['Left_flank_base'] =  merged_df['ref_context'].str.slice(9,10).str.upper()
merged_df['Right_flank_base'] =  merged_df['ref_context'].str.slice(11,12).str.upper()

# Reduce both alleles to their first character.
merged_df['Reference_Allele'] = merged_df['Reference_Allele'].str.slice(0, 1)
merged_df['Tumor_Seq_Allele2'] = merged_df['Tumor_Seq_Allele2'].str.slice(0, 1)

# Values that cannot be parsed as numbers become NaN.
merged_df['Transcript_Position'] = pd.to_numeric(merged_df['Transcript_Position'], errors='coerce')

In [78]:
# Expand COSMIC_tissue_types_affected ('name(count)|name(count)|...') into one
# integer column per tissue, named 'Tissue_<name>' and keyed by the row index.
tissues_dict = {}
cosmic_tissue_lists = merged_df['COSMIC_tissue_types_affected'].dropna().str.split('|')
for index, row in cosmic_tissue_lists.items():
    for tissues in row:
        tissue, open_paren, remainder = tissues.partition('(')
        if not open_paren:
            # Entry has no '(count)' part (for example an empty string left by
            # a stray '|'); there is nothing to record for it.
            continue
        try:
            number = int(remainder.partition(')')[0])
        except ValueError:
            # A count that is not an integer is recorded as 0.
            number = 0
        tissues_dict.setdefault('Tissue_' + tissue, {})[index] = number

# Tissues not listed for a row get a count of 0.
tissues_df = pd.DataFrame(tissues_dict).fillna(0).astype(int)

In [85]:
# Categorical columns expanded into 0/1 integer indicator columns.
categorical_columns = [
    'Chromosome',
    'Variant_Classification',
    'Reference_Allele',
    'Tumor_Seq_Allele2',
    'Left_flank_base',
    'Right_flank_base',
    'Cancer_type'
]
numerical_df = pd.get_dummies(merged_df, columns=categorical_columns, dtype=int)

# Attach the per-tissue columns by row index.
# NOTE(review): with how='left', rows that have no match in tissues_df get NaN
# in the tissue columns - confirm whether those should be 0 instead.
numerical_df = numerical_df.merge(tissues_df, how='left', left_index=True, right_index=True)

# Annotation columns that are not carried into the output table.
dropped_columns = ['Protein_Change', 'Transcript_Exon', 'cDNA_Change', 'Codon_Change', 'Entrez_Gene_Id', 'COSMIC_tissue_types_affected', 'ref_context']
numerical_df = numerical_df.drop(columns=dropped_columns)

# get_dummies replaced the raw Cancer_type column with indicators; add the raw label back.
numerical_df['Cancer_type'] = merged_df['Cancer_type']

# Keep only the columns that take more than one distinct value.
has_variation = numerical_df.nunique() > 1
numerical_df = numerical_df.loc[:, has_variation]


In [90]:
# Write the final table, including its row index, to CSV.
output_csv = '../data/TCGA_mutations.csv'
numerical_df.to_csv(output_csv)