In [1]:
import random as rd
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
#Create genotype (cell2mutation) data

def create_genotype_data(gene_list, cell_list, cell_gene_map):
    """Build a binary cell-by-gene mutation matrix.

    Parameters
    ----------
    gene_list : sequence
        Genes that become the columns, in the given order.
    cell_list : sequence
        Cell lines that become the rows, in the given order.
    cell_gene_map : dict
        Maps every cell line in ``cell_list`` to an iterable of the genes
        mutated in it (duplicates are allowed and have no extra effect).

    Returns
    -------
    pandas.DataFrame
        One row per cell (index 0 .. len(cell_list) - 1) and one column per
        gene; an entry is 1 if the gene is listed for that cell, otherwise 0.

    Raises
    ------
    KeyError
        If a cell in ``cell_list`` has no entry in ``cell_gene_map``.
    """
    rows = []
    for cell in cell_list:
        # A set makes each membership test O(1) instead of scanning the
        # cell's gene list once per gene.
        mutated = set(cell_gene_map[cell])
        rows.append([1 if gene in mutated else 0 for gene in gene_list])

    # Build the frame in one go; growing a DataFrame row by row with .loc is
    # slow and leaves the columns with object dtype.
    return pd.DataFrame(rows, columns=gene_list)

In [3]:
# creating clinical trial gene list

def create_clinical_trial_gene_list(ctdna_df=None, panels_df=None, genie_df=None, nest_genes=None):
    """Return the sorted clinical-trial genes that are also NeST genes.

    The clinical-trial set is the union of three sources: the 'G' column of
    the ctDNA table, the 'Gene' column of the panel table restricted to rows
    where ``FM One`` or ``Tempus xT`` equals 1, and the 'G' column of the
    GENIE table. The size of each source list and of the union is printed.

    Parameters
    ----------
    ctdna_df : pandas.DataFrame, optional
        Table with a 'G' column. Defaults to the notebook-level ``ctDNA_genes_df``.
    panels_df : pandas.DataFrame, optional
        Table with 'Gene', 'FM One' and 'Tempus xT' columns. Defaults to the
        notebook-level ``gene_panels``.
    genie_df : pandas.DataFrame, optional
        Table with a 'G' column. Defaults to the notebook-level
        ``genie_oncokb_genes_df``.
    nest_genes : iterable, optional
        Genes to intersect with. Defaults to the notebook-level ``nest_gene_list``.

    Returns
    -------
    list
        Sorted genes present in both the clinical-trial union and ``nest_genes``.
    """
    # Fall back to the notebook-level tables so the original zero-argument
    # call keeps working; explicit arguments make the function usable (and
    # testable) without those globals.
    if ctdna_df is None:
        ctdna_df = ctDNA_genes_df
    if panels_df is None:
        panels_df = gene_panels
    if genie_df is None:
        genie_df = genie_oncokb_genes_df
    if nest_genes is None:
        nest_genes = nest_gene_list

    ctDNA_genes = list(ctdna_df['G'])
    print(len(ctDNA_genes))

    fm_tempus_genes = list(panels_df.query('`FM One` == 1 or `Tempus xT` == 1')['Gene'])
    print(len(fm_tempus_genes))

    genie_oncokb_genes = list(genie_df['G'])
    print(len(genie_oncokb_genes))

    # Union of the three sources; no intermediate sort is needed because only
    # the final intersection is returned in sorted order.
    clinical_trial_genes = set(ctDNA_genes) | set(fm_tempus_genes) | set(genie_oncokb_genes)
    print(len(clinical_trial_genes))

    return sorted(clinical_trial_genes & set(nest_genes))

In [4]:
# CCLE mutation table and the per-sample annotation table.
ccle_map = pd.read_csv("../data/CCLE/CCLE_mutations.csv")
ccle_sample_info = pd.read_csv("../data/CCLE/sample_info.csv")

# Clinical gene panel table (tab-separated, default header handling).
gene_panels = pd.read_csv("../data/ClinicalGenePanels.txt", sep='\t')

# Header-less, single-column gene lists; the column is named 'G' on load.
ctDNA_genes_df = pd.read_csv(
    "../data/gene_list_ctDNA.txt", sep="\t", header=None, names=['G']
)
genie_oncokb_genes_df = pd.read_csv(
    "../data/GENIE/GENIE_gene_list.txt", sep="\t", header=None, names=['G']
)

# NeST genes: every whitespace-separated token of the text file.
nest_gene_list = []
with open('../data/NeST/gene_list_NeST.txt', 'r') as nest_file:
    nest_gene_list = nest_file.read().split()

# Header-less two-column index tables: 'I' is read as int, 'C' as str.
cell_index = pd.read_csv(
    "../data/cell2ind_cg.txt",
    sep="\t",
    header=None,
    names=['I', 'C'],
    dtype={'I': int, 'C': str},
)

# Only the gene-name column ('G') of the gene index table is kept, as a list.
gene_list_cg = list(
    pd.read_csv("../data/gene2ind_cg.txt", sep="\t", header=None, names=['I', 'G'])['G']
)


  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
#Remove cell lines from DrugCell data not present in CCLE

# Names from the 'C' column of the cell index table, held in a set so each
# membership test is O(1) instead of a scan over the whole list.
drugcell_cells = set(cell_index['C'])

# Map CCLE_Name -> DepMap_ID for the matching rows. Zipping the two columns
# avoids building a Series per row as iterrows() does; if a CCLE_Name occurs
# more than once the last row wins, exactly as with repeated dict assignment.
cell_id_map = {
    ccle_name: depmap_id
    for ccle_name, depmap_id in zip(ccle_sample_info['CCLE_Name'], ccle_sample_info['DepMap_ID'])
    if ccle_name in drugcell_cells
}

# Only cell lines found in the CCLE sample table remain, in sorted order.
cell_list = sorted(cell_id_map)

In [6]:
# Number of DrugCell cell lines kept after matching against the CCLE sample table.
print(len(cell_list))

1258


In [7]:
#Get all valid cell-gene_list pairs in form of dict {cell : genes[]}

#For most mutated and random lists change the query to have nest_gene_list

mut_list = ['Missense_Mutation', 'Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins', 
            'Splice_Site', 'Splice_Region', 'In_Frame_Del', 'In_Frame_Ins', 'Nonstop_Mutation']

# Keep rows for the retained cell lines, the variant classes above, and NeST genes.
filtered_ccle_map_df = ccle_map.query('DepMap_ID in @cell_id_map.values() and Variant_Classification in @mut_list and Hugo_Symbol in @nest_gene_list')

# Group the filtered rows by cell line once instead of re-querying the whole
# frame for every cell; rows keep their original order inside each group, so
# each gene list (duplicates included) matches a per-cell query.
genes_by_depmap_id = {
    depmap_id: list(genes)
    for depmap_id, genes in filtered_ccle_map_df.groupby('DepMap_ID', sort=False)['Hugo_Symbol']
}

# Cell lines with no retained mutation get an empty list; list(...) gives
# every cell its own list object.
cell_gene_map = {
    cell: list(genes_by_depmap_id.get(cell_id_map[cell], []))
    for cell in cell_list
}

In [9]:
# Clinical trial genes
#
# One of three alternative gene-set configurations in this notebook: the
# "Most frequently mutated genes" and "Random genes" cells assign the same
# filtered_gene_list / gene2ind_file / cell2mut_file names, so whichever of
# these cells ran last decides what the save cell writes.

# Alias (not a copy) of the gene list loaded from gene2ind_cg.txt.
filtered_gene_list = gene_list_cg

# Output paths. In the visible part of the notebook, cell2ind_file is only
# referenced by the commented-out lines of the save cell.
gene2ind_file = "../data/gene2ind_nci.txt"
cell2ind_file = "../data/cell2ind_nci.txt"
cell2mut_file = "../data/cell2mutation_nci.txt"

In [8]:
#Most frequently mutated genes

# Use as many genes as the reference gene list contains.
n = len(gene_list_cg)
print(n)

# Count the filtered mutation rows per gene, keep the n most common genes,
# and store their names alphabetically.
gene_mutation_freq = Counter(filtered_ccle_map_df['Hugo_Symbol'].tolist())
filtered_gene_list = sorted(gene for gene, _ in gene_mutation_freq.most_common(n))

# Output paths carry the gene count in their names.
gene2ind_file = "../data/gene2ind_mf_" + str(n) + ".txt"
cell2mut_file = "../data/cell2mutation_mf_" + str(n) + ".txt"

718


In [23]:
#Random genes

# Use as many genes as the reference gene list contains.
n = len(gene_list_cg)
print(n)

# Fixed seed so the sampled gene set, and the files written from it, can be
# regenerated. A dedicated Random instance leaves the global RNG state alone.
# NOTE(review): the "_e" suffix in the file names below looks like a replicate
# label; if several random replicates are wanted, give each its own seed.
random_seed = 0

# Distinct genes in the filtered mutation table, in order of first appearance.
mutated_genes = list(filtered_ccle_map_df['Hugo_Symbol'].unique())

# sample() raises ValueError if n exceeds the number of distinct genes.
filtered_gene_list = sorted(rd.Random(random_seed).sample(mutated_genes, n))

gene2ind_file = "../data/gene2ind_random_" + str(n) + "_e.txt"
cell2mut_file = "../data/cell2mutation_random_" + str(n) + "_e.txt"

718


In [24]:
#Saving both gene2ind and cell2mut files

# gene2ind: tab-separated "<row index>\t<gene>" lines without a header row.
gene_index = pd.DataFrame(filtered_gene_list, columns=['G'])
gene_index.to_csv(gene2ind_file, sep='\t', index=True, header=False)

#cell_index = pd.DataFrame(cell_list, columns=(['C']))
#cell_index.to_csv(cell2ind_file, sep='\t', header=False, index=True)

# cell2mut: the 0/1 genotype matrix, written with the default comma
# separator and with neither header nor index column.
cell_gene_df = create_genotype_data(
    gene_list=filtered_gene_list,
    cell_list=cell_list,
    cell_gene_map=cell_gene_map,
)
cell_gene_df.to_csv(cell2mut_file, index=False, header=False)

In [25]:
# Display the genotype matrix built in the save cell (rows follow cell_list,
# columns follow filtered_gene_list).
cell_gene_df

Unnamed: 0,A4GNT,ABCD1,ABHD5,ACACA,ACBD5,ACOT4,ACSS3,ACY3,ADAMTS17,ADAMTS9,...,ZNF541,ZNF645,ZNF707,ZNF729,ZNF746,ZNF780B,ZNF839,ZNF880,ZSCAN21,ZSCAN32
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1253,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1254,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1256,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
