In [None]:
import random as rd
import pandas as pd
import numpy as np
from collections import Counter

In [None]:
#Create genotype (cell2mutation) data

def create_genotype_data(gene_list, cell_list, cell_gene_map):
    """Build a binary cell-by-gene mutation matrix.

    Parameters
    ----------
    gene_list : list
        Gene names; one output column per entry, in this order.
    cell_list : list
        Cell identifiers; one output row per entry, in this order.
    cell_gene_map : dict
        Maps every cell in ``cell_list`` to an iterable of the genes mutated
        in that cell (duplicates are allowed).

    Returns
    -------
    pandas.DataFrame
        Integer frame of shape ``(len(cell_list), len(gene_list))`` with
        columns ``gene_list`` and a 0..n-1 index. Entry ``[i, j]`` is 1 when
        ``gene_list[j]`` is listed for ``cell_list[i]`` and 0 otherwise.

    Raises
    ------
    KeyError
        If a cell in ``cell_list`` has no entry in ``cell_gene_map``.
    """
    # Preallocate the whole matrix instead of enlarging a DataFrame one row at
    # a time with ``.loc``: enlargement copies the frame on every row and
    # leaves object-typed columns behind.
    genotype = np.zeros((len(cell_list), len(gene_list)), dtype=int)

    for row_idx, cell in enumerate(cell_list):
        # A set gives constant-time membership tests for each gene.
        mutated = set(cell_gene_map[cell])
        genotype[row_idx] = [1 if gene in mutated else 0 for gene in gene_list]

    return pd.DataFrame(genotype, columns=gene_list)

In [None]:
# creating clinical trial gene list

def create_clinical_trial_gene_list():
    """Return the sorted clinical-trial genes that also occur in ``nest_gene_list``.

    Three gene sources are pooled: the ``G`` column of ``ctDNA_genes_df``, the
    ``Gene`` column of the ``gene_panels`` rows where ``FM One`` or
    ``Tempus xT`` equals 1, and the ``G`` column of ``genie_oncokb_genes_df``.
    The size of each source, then the size of their de-duplicated union, is
    printed along the way.

    All four inputs are read from module-level names rather than passed in,
    so they must exist before this function is called.
    """
    ctdna_source = list(ctDNA_genes_df['G'])
    print(len(ctdna_source))

    panel_rows = gene_panels.query('`FM One` == 1 or `Tempus xT` == 1')
    panel_source = list(panel_rows['Gene'])
    print(len(panel_source))

    genie_source = list(genie_oncokb_genes_df['G'])
    print(len(genie_source))

    # De-duplicate and sort the pooled genes before reporting their count.
    pooled_genes = sorted({*ctdna_source, *panel_source, *genie_source})
    print(len(pooled_genes))

    # Keep only the pooled genes that are also present in nest_gene_list.
    return sorted(set(pooled_genes).intersection(set(nest_gene_list)))

In [None]:
# Load the CCLE mutation, gene copy-number and sample-info tables.
ccle_mutations = pd.read_csv("../data/CCLE/CCLE_mutations.csv")

# The column pandas labels 'Unnamed: 0' is relabelled 'DepMap_ID'
# (presumably the cell-line ID column has no header in the CSV -- verify).
ccle_cn = pd.read_csv("../data/CCLE/CCLE_gene_cn.csv").rename(columns={'Unnamed: 0': 'DepMap_ID'})

ccle_sample_info = pd.read_csv("../data/CCLE/sample_info.csv")

# Tab-separated, header-less index files; the columns are named here.
cell_index = pd.read_csv(
    "../data/training_files_av/cell2ind_av.txt",
    sep="\t",
    header=None,
    names=['I', 'C'],
)

# Only the 'G' column of the gene index file is kept, as a plain list.
gene_list_ctg = pd.read_csv(
    "../data/training_files_av/gene2ind_ctg_av.txt",
    sep="\t",
    header=None,
    names=['I', 'G'],
)['G'].tolist()

In [None]:
#Remove cell lines from DrugCell data not present in CCLE

# Build {CCLE_Name: DepMap_ID} for every cell line of cell_index['C'] that
# also appears in ccle_sample_info, then keep the sorted names as cell_list.
#
# A set gives constant-time membership tests, and zipping the two columns
# avoids building a Series per row as DataFrame.iterrows() does. If a
# CCLE_Name occurs more than once, the last occurrence wins.
drugcell_cells = set(cell_index['C'])

cell_id_map = {
    ccle_name: depmap_id
    for ccle_name, depmap_id in zip(ccle_sample_info['CCLE_Name'], ccle_sample_info['DepMap_ID'])
    if ccle_name in drugcell_cells
}

cell_list = sorted(cell_id_map)

In [None]:
#Get all valid cell-gene_list pairs in form of dict {cell : genes[]}

# For the most-frequently-mutated and random gene lists, change the query below to filter Hugo_Symbol on nest_gene_list instead of gene_list_ctg

# Variant classifications that are kept when filtering the mutation table.
mut_list = ['Missense_Mutation', 'Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins', 
            'Splice_Site', 'Splice_Region', 'In_Frame_Del', 'In_Frame_Ins', 'Nonstop_Mutation']

# Keep mutation rows whose cell line is in cell_id_map, whose variant class is
# in mut_list and whose gene is in gene_list_ctg.
filtered_ccle_mutations_df = ccle_mutations.query('DepMap_ID in @cell_id_map.values() and Variant_Classification in @mut_list and Hugo_Symbol in @gene_list_ctg')

# Group the gene symbols by DepMap ID in a single pass (row order preserved)
# instead of re-querying the whole frame once per cell line.
genes_by_depmap_id = dict()
for depmap_id, gene in zip(filtered_ccle_mutations_df['DepMap_ID'], filtered_ccle_mutations_df['Hugo_Symbol']):
    genes_by_depmap_id.setdefault(depmap_id, []).append(gene)

# {cell name: [mutated genes]}; cell lines without any retained mutation get
# an empty list.
cell_gene_map = dict()
for cell in cell_list:
    cell_gene_map[cell] = list(genes_by_depmap_id.get(cell_id_map[cell], []))

In [None]:
# Copy-number columns whose leading token (the text before the first space)
# is a gene in gene_list_ctg, followed by the 'DepMap_ID' key column.
cn_filtered_columns = [
    column
    for column in ccle_cn.columns
    if column.split(" ")[0] in gene_list_ctg
] + ['DepMap_ID']

In [None]:
# DepMap IDs that have at least one retained mutation.
mut_cell_lines = set(filtered_ccle_mutations_df['DepMap_ID'])

# Copy-number rows for those cell lines, restricted to the selected columns.
filtered_cn_df = ccle_cn.loc[ccle_cn['DepMap_ID'].isin(mut_cell_lines), cn_filtered_columns]

In [None]:
# Map every column label that contains a space to its first space-separated
# token, then relabel the copy-number columns accordingly. Labels without a
# space (such as 'DepMap_ID') are left as they are.
col_name = {
    column: column.split(" ")[0]
    for column in filtered_cn_df.columns
    if " " in column
}
filtered_cn_df = filtered_cn_df.rename(columns=col_name)

In [None]:
# Gene names of the renamed copy-number columns (a view on col_name's values).
cn_genes = col_name.values()

# Cell lines that have mutations but no copy-number row. The set of IDs that
# already have a row is built once, not once per candidate cell line.
cn_cell_lines = set(filtered_cn_df['DepMap_ID'])
extra_cl = [c for c in mut_cell_lines if c not in cn_cell_lines]

new_rows_list = []
if extra_cl:
    # One padding frame for all missing cell lines: their IDs plus a zero in
    # every copy-number gene column (scalars are broadcast to all rows).
    new_row = dict()
    new_row['DepMap_ID'] = extra_cl
    for g in cn_genes:
        new_row[g] = 0
    new_rows_list.append(pd.DataFrame(new_row))

# Padding rows come first, followed by the existing copy-number rows; the
# index is renumbered from 0.
new_rows_list.append(filtered_cn_df)
filtered_cn_df = pd.concat(new_rows_list, axis=0, ignore_index=True)

In [None]:
# Every gene of gene_list_ctg that has no copy-number column gets an
# all-zero column.
for gene in gene_list_ctg:
    if gene in cn_genes:
        continue
    filtered_cn_df[gene] = 0

In [None]:
# Replace each DepMap ID in the 'DepMap_ID' column by its cell name from
# cell_id_map. Only that column is rewritten, in one pass, instead of scanning
# the whole frame once per cell line.
depmap_to_cell = dict()
for cell_id, depmap_id in cell_id_map.items():
    # If two cell names share a DepMap ID, the first one in cell_id_map wins,
    # matching the outcome of replacing sequentially.
    depmap_to_cell.setdefault(depmap_id, cell_id)

# IDs without an entry in the mapping are left unchanged.
filtered_cn_df['DepMap_ID'] = filtered_cn_df['DepMap_ID'].map(lambda value: depmap_to_cell.get(value, value))

In [None]:
# Order the rows by the 'DepMap_ID' column and renumber the index from 0,
# then display the frame.
filtered_cn_df = filtered_cn_df.sort_values(by='DepMap_ID', ignore_index=True)
filtered_cn_df

In [None]:
# Copy-number values of the selected genes, one row per cell line.
cn_values = filtered_cn_df[gene_list_ctg]

# Deletion / amplification cut-offs.
# NOTE(review): presumably copy numbers 0.87 and 3.36 at ploidy 2, expressed on
# a log2(ratio + 1) scale to match the input file -- confirm against the data.
ploidy = 2
del_th = np.log2(0.87/ploidy + 1)
amp_th = np.log2(3.36/ploidy + 1)

# Deletion flag: 1 where the value is at or below del_th, 0 elsewhere.
# An exact 0 is kept as "not deleted", the same outcome as masking with
# `> del_th` and then `!= 0`.
# The flags are computed as explicit boolean masks rather than by overwriting
# the values with 0/1 sentinels, so that NaN (which fails every comparison) is
# no longer turned into 1 by a `!= 0` test.
cnd_df = ((cn_values <= del_th) & (cn_values != 0)).astype('int')
cnd_df = cnd_df.reindex(sorted(cnd_df.columns), axis=1)

# Amplification flag: 1 where the value is at or above amp_th, 0 elsewhere.
# A raw value of exactly 1 lies below amp_th and is therefore 0 here; the
# sentinel approach (`!= 1` after writing 1s) left it flagged as amplified.
cna_df = (cn_values >= amp_th).astype('int')
cna_df = cna_df.reindex(sorted(cna_df.columns), axis=1)

In [None]:
# Clinical trial genes

# Gene set for this variant: the genes read from gene2ind_ctg_av.txt.
filtered_gene_list = gene_list_ctg

# File paths for this variant. The save cells write cell2mut_file,
# cell2cnd_file and cell2cna_file; the writes to gene2ind_file and
# cell2ind_file are commented out there.
gene2ind_file = "../data/training_files_av/gene2ind_ctg_av.txt"
cell2ind_file = "../data/training_files_av/cell2ind_av.txt"
cell2mut_file = "../data/training_files_av/cell2mutation_ctg_av.txt"
cell2cnd_file = "../data/training_files_av/cell2cndeletion_ctg_av.txt"
cell2cna_file = "../data/training_files_av/cell2cnamplification_ctg_av.txt"

In [None]:
#Most frequently mutated genes

# Alternative gene selection: the n most frequently mutated genes, where n is
# the size of the clinical-trial gene list. This cell overwrites
# filtered_gene_list, gene2ind_file and cell2mut_file, so run only one of the
# gene-selection cells before saving.
#
# NOTE(review): this cell used `gene_list_cg` and `filtered_ccle_map_df`,
# neither of which is defined in the visible notebook; they are assumed to be
# stale names for `gene_list_ctg` and `filtered_ccle_mutations_df` -- confirm.
# The ranking is only meaningful if the mutation filter was re-run with the
# broader gene list, as the comment above that filter describes.
n = len(gene_list_ctg)
print(n)

# Count mutation rows per gene and keep the n most frequent, sorted by name.
gene_mutation_freq = Counter(filtered_ccle_mutations_df['Hugo_Symbol'])
filtered_gene_list = sorted(gene for gene, _count in gene_mutation_freq.most_common(n))

gene2ind_file = "../data/gene2ind_mf_" + str(n) + ".txt"
cell2mut_file = "../data/cell2mutation_mf_" + str(n) + ".txt"

In [None]:
#Random genes

# Alternative gene selection: n genes drawn at random from the mutated genes,
# where n is the size of the clinical-trial gene list. This cell overwrites
# filtered_gene_list, gene2ind_file and cell2mut_file, so run only one of the
# gene-selection cells before saving.
#
# NOTE(review): this cell used `gene_list_cg` and `filtered_ccle_map_df`,
# neither of which is defined in the visible notebook; they are assumed to be
# stale names for `gene_list_ctg` and `filtered_ccle_mutations_df` -- confirm.
# The mutation filter must have been re-run with the broader gene list (see
# the comment above that filter): rd.sample raises ValueError when fewer than
# n distinct genes are available.
# NOTE(review): the draw is unseeded, so the selection differs on every run.
n = len(gene_list_ctg)
print(n)

candidate_genes = list(filtered_ccle_mutations_df['Hugo_Symbol'].unique())
filtered_gene_list = sorted(rd.sample(candidate_genes, n))

gene2ind_file = "../data/gene2ind_random_" + str(n) + "_e.txt"
cell2mut_file = "../data/cell2mutation_random_" + str(n) + "_e.txt"

In [None]:
#Saving both gene2ind and cell2mut files

gene_index = pd.DataFrame(filtered_gene_list, columns=(['G']))
#gene_index.to_csv(gene2ind_file, sep='\t', header=False, index=True)

cell_index = pd.DataFrame(cell_list, columns=(['C']))
cell_gene_df = create_genotype_data(filtered_gene_list, cell_list, cell_gene_map)

# Cell lines without a single mutation in the selected genes are dropped.
# The test is explicitly "every entry is 0": comparing each entry with the
# first entry of its row would also discard rows made up entirely of 1s, and
# would fail on a matrix with no columns.
is_empty = (cell_gene_df == 0).all(axis=1).to_numpy()
empty_rows = [position for position, flag in enumerate(is_empty) if flag]

# Remaining cell names as a Series renumbered from 0.
cell_index = cell_index.loc[~is_empty, 'C'].reset_index(drop=True)
#cell_index.to_csv(cell2ind_file, sep='\t', header=False, index=True)

# The mutation matrix is written without header or index, one row per
# remaining cell line.
cell_gene_df = cell_gene_df.loc[~is_empty]
cell_gene_df.to_csv(cell2mut_file, header=False, index=False)

In [None]:
# Write the deletion matrix, then the amplification matrix, each without
# header or index.
for cn_matrix, out_path in ((cnd_df, cell2cnd_file), (cna_df, cell2cna_file)):
    cn_matrix.to_csv(out_path, header=False, index=False)