In [23]:
import pandas as pd
import numpy as np
import os
import warnings
from pandas.core.common import SettingWithCopyWarning

In [24]:
def load(file_path):
    """Read the CSV at ``file_path`` into a DataFrame, using its first column as the index."""
    return pd.read_csv(file_path, index_col=0)

# Silence two warning families for the rest of the session:
#   * RuntimeWarnings whose message matches the regex below
#     (NOTE(review): the pattern ends in `e*`, so it matches "slic" plus any
#     number of "e"; `.*slice.*` may have been intended - confirm);
#   * pandas' SettingWithCopyWarning.
warnings.filterwarnings(action="ignore", message=".*slice*", category=RuntimeWarning)
warnings.filterwarnings(action="ignore", category=SettingWithCopyWarning)

## Cell Line Mutation Data

#### 1. Load data

In [25]:
# All relative file names below are resolved against this directory.
os.chdir('/Users/amyhayward/Documents/BIOINFORMATICS MSC/Project/Mutation_DATA')
mutation_data_1 = 'model_list_20230608.csv'
mutation_data_2 = 'mutations_all_20230202.csv'

# Both tables are read the same way: first column as index, whole-file type
# inference (low_memory=False).
mut1_df, mut2_df = (
    pd.read_csv(csv_name, index_col=0, low_memory=False)
    for csv_name in (mutation_data_1, mutation_data_2)
)

#### 2. Drop mutations

In [26]:
# MUTATIONS OF INTEREST - we are only interested in these mutation effects.
# The pattern is matched as a regex anywhere inside 'effect'. na=False makes a
# row with a missing effect count as "no match"; without it the mask contains
# NaN and cannot be used for boolean indexing.
effects_of_interest = 'nonsense|missense|ess_splice|frameshift'
mutation_filtered = mut2_df[mut2_df['effect'].str.contains(effects_of_interest, na=False)]
drop_rows = 'prime'

# Drop the rows whose 'effect' also contains the `drop_rows` substring.
mutation_filtered = mutation_filtered[~mutation_filtered['effect'].str.contains(drop_rows, na=False)]

#### 3. Keep only breast data

In [27]:
# Tissue label to look for, and the annotation columns to carry forward.
defined_string = 'Breast'
columns_to_keep = ['model_name', 'tissue_status', 'sample_site', 'sample_treatment', 'sample_treatment_details']

# Case-insensitive substring match on 'tissue'; rows with a missing tissue are
# treated as non-matching. Row and column selection happen in a single .loc.
is_breast = mut1_df['tissue'].str.contains(defined_string, case=False, na=False)
mut1_df_sub = mut1_df.loc[is_breast, columns_to_keep]

In [28]:
# Matching: collect the index labels of the tissue-filtered model table
# (presumably model IDs - confirm against the model list file).
defined_strings_set = set(mut1_df_sub.index)


def contains_any_defined_string(text):
    """Return True if any label in ``defined_strings_set`` is a substring of ``text``.

    Non-string values (for example NaN from a missing ``model_id``) never
    match, rather than raising ``TypeError`` on the ``in`` test.
    """
    if not isinstance(text, str):
        return False
    return any(defined_string in text for defined_string in defined_strings_set)


# Evaluate the substring test once per distinct model_id, not once per
# mutation row, then keep the rows of mutation_filtered whose model_id matched.
matched_ids = [
    model_id
    for model_id in mutation_filtered['model_id'].unique()
    if contains_any_defined_string(model_id)
]
# .copy() gives an independent frame, so later column assignments on
# mut2_matched do not write into a slice of mutation_filtered.
mut2_matched = mutation_filtered[mutation_filtered['model_id'].isin(matched_ids)].copy()

#### 4. Model_name formatting

In [29]:
# Normalise model names in one pass: strip hyphens, then upper-case every "s".
mut2_matched['model_name'] = (
    mut2_matched['model_name']
    .str.replace("-", "")
    .str.replace("s", "S")
)

#### 5. Reindex - set model_name as index

In [30]:
# Move the current index back into a regular column, then index by model_name.
mut2_matched = mut2_matched.reset_index(drop=False).set_index('model_name')

#### 6. Keep cell lines which are present in IC50 subset

In [31]:
# Switch to the folder of finalised files and read the table whose index is
# used below to select cell lines.
os.chdir('/Users/amyhayward/Documents/BIOINFORMATICS MSC/Project/FINAL_FILES')
cl_df_tala = load(file_path="TALA_TERTILE_LN.csv")

In [32]:
# Index labels present in both the mutation table and cl_df_tala.
commonrows = mut2_matched.index.intersection(cl_df_tala.index)

# Keep every mutation row for those labels and name the index 'model_name'.
mut0_filtered = mut2_matched.loc[commonrows].rename_axis('model_name')

#### 7. Output - for mutation landscape plots

In [33]:
# Genes to export for the mutation landscape plots.
# NOTE(review): 'MAD2L2' is listed twice; harmless for isin(), but check
# whether a different gene was intended for one of the entries.
gof = [
    'ATM', 'BRCA1', 'BRCA2',
    'RAD51', 'RAD51B', 'RAD51D',
    'XRCC2', 'XRCC3',
    'RAD52', 'RAD54L', 'RAD50',
    'MRE11', 'RBBP8', 'MUS81',
    'EME1', 'EME2',
    'SLX1A', 'SLX1B',
    'TP53BP1', 'MAD2L2', 'MAD2L2', 'RIF1',
    'SHLD1', 'SHLD2', 'SHLD3',
    'SETD1A', 'BOD1L1',
]

# Rows whose gene_symbol is one of the listed genes.
in_gene_list = mut0_filtered['gene_symbol'].isin(gof)
gof_mut = mut0_filtered.loc[in_gene_list]

# Written tab-separated, index included.
file_path0 = 'TALA_mutationNEW.txt'
gof_mut.to_csv(file_path0, sep='\t', index=True)

#### 8. Create gene mutation matrix

In [34]:
# MUTATION DATA TRANSPOSED: build a model x gene presence/absence matrix.
#
# Every row of mut0_filtered has already passed the effect filter, so each row
# marks "this gene is mutated in this model". Presence is encoded directly as
# the integer 1 rather than by rewriting the 'effect' text: the upstream filter
# is a substring match, so an effect that contains a keyword plus other text
# would otherwise become a string such as "nc_1" and break the integer cast.
mutation_sub = (
    mut0_filtered.loc[:, ['gene_symbol']]
    .assign(mutated=1)
    # 'model_name' is the name of the index level; one cell per (model, gene).
    .pivot_table(index='model_name', columns='gene_symbol', values='mutated', aggfunc='max')
    # Model/gene pairs with no row have no recorded mutation -> 0.
    .fillna(0)
    .astype(int)
    .rename_axis('Cell line')
)

#### 9. Check TCGA mutation matrix is in the same format

In [35]:
# Read the raw TCGA mutation table and flip it so that the file's columns
# become the rows.
os.chdir('/Users/amyhayward/Documents/BIOINFORMATICS MSC/Project/Mutation_DATA')
tcga_mut = load("TCGA_mutationdataraw.csv").T

#### 10. Filter to contain only samples without radiation

In [36]:
# Back to the folder of finalised files; the index of this table is used
# below to select TCGA samples.
os.chdir('/Users/amyhayward/Documents/BIOINFORMATICS MSC/Project/FINAL_FILES')
tcga_no_rad_df = load(file_path="TCGA_MATCH_TALA.csv")

In [37]:
# Row labels shared by the TCGA mutation table and tcga_no_rad_df.
matched_index = tcga_mut.index.intersection(tcga_no_rad_df.index)

# Put both tables on exactly those rows, in the same order.
ge_match = tcga_no_rad_df.reindex(matched_index)
mut_match = tcga_mut.reindex(matched_index)

#### 11. Save new matched TCGA gene expression

In [38]:
# Write the row-matched table to CSV in the current directory, index included.
file = "X_UNLABELLED_EXP_TALA.csv"
ge_match.to_csv(path_or_buf=file, index=True)

#### 12. Drop genes which have no mutations

In [39]:
# A column is kept when at least one of its entries differs from 0.
has_mutation = mut_match.ne(0).any(axis=0)
tcga_mutations = mut_match.loc[:, has_mutation]

#### 13. Match Mutations in cell line and TCGA

In [40]:
# Columns present in both the cell-line matrix and the TCGA matrix.
matched_cols = mutation_sub.columns.intersection(tcga_mutations.columns)

# Restrict each matrix to the shared columns.
mut_match_cols = tcga_mutations.loc[:, matched_cols]
cl_match_cols = mutation_sub.loc[:, matched_cols]

#### 14. Save TCGA matrix

In [41]:
# Write the column-matched TCGA matrix to CSV in the current directory, index included.
fle = "X_UNLABELLED_MUT_TALA.csv"
mut_match_cols.to_csv(path_or_buf=fle, index=True)

#### 15. Match matrices to corresponding expression data (cell lines)

In [42]:
# Read the test and train tables from the current directory; their indexes
# are used below to split the cell-line mutation matrix.
test_df = load(file_path="X_TEST_MATCH_TALA.csv")
train_df = load(file_path="X_TRAIN_MATCH_TALA.csv")

In [43]:
# Labels shared between each split table and the cell-line mutation matrix.
test_index = test_df.index.intersection(cl_match_cols.index)
train_index = train_df.index.intersection(cl_match_cols.index)

# Select the mutation-matrix rows for each split.
mut_test = cl_match_cols.reindex(test_index)
mut_train = cl_match_cols.reindex(train_index)

In [44]:
# Output file names for the two split matrices.
testmatrix = "TEST_MATRIX_TALA.csv"
trainmatrix = "TRAIN_MATRIX_TALA.csv"

# Write each matrix to CSV in the current directory, index included.
mut_test.to_csv(path_or_buf=testmatrix, index=True)
mut_train.to_csv(path_or_buf=trainmatrix, index=True)