In [None]:
import copy
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
def generate_test_data(cell_lines, smiles):
    """Build a test table that pairs every cell line with a single drug.

    Parameters
    ----------
    cell_lines : iterable
        Cell line / sample identifiers; one output row is produced per item,
        in iteration order.
    smiles : str
        SMILES string repeated on every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``cell_line``, ``smiles``, ``auc`` and ``dataset``. ``auc`` is
        the constant 0.5 (presumably a placeholder, confirm with the consumer
        of the file) and ``dataset`` is the constant ``'GENIE'``.
    """
    rows = [(cell_line, smiles, 0.5, 'GENIE') for cell_line in cell_lines]
    return pd.DataFrame(rows, columns=['cell_line', 'smiles', 'auc', 'dataset'])

In [None]:
def generate_cell_mutation_file(all_genie_genes, other_genes, genie_data, all_mutation_data):
    """Align a sample-by-gene mutation matrix to the gene set ``other_genes``.

    Rows are ordered by ``sampleId``; the ``sampleId`` and ``Altered`` columns
    are removed. Genes listed in ``all_genie_genes`` but not in ``other_genes``
    are dropped, and genes listed in ``other_genes`` but not in
    ``all_genie_genes`` are added as all-zero columns. Columns are finally
    sorted by label.

    Parameters
    ----------
    all_genie_genes : list
        Gene names expected as columns of ``all_mutation_data``.
    other_genes : list
        Target gene names the output should be aligned to.
    genie_data : object
        Unused; kept so existing callers keep working.
    all_mutation_data : pandas.DataFrame
        Must contain ``sampleId`` and ``Altered`` columns plus one column per
        gene. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The aligned matrix (the original row index labels are kept).

    Raises
    ------
    KeyError
        If ``sampleId``/``Altered`` or a gene that has to be dropped is not a
        column of ``all_mutation_data``.
    """
    # sort_values/drop return new frames, so the caller's frame is untouched.
    filtered_mut_data = (
        all_mutation_data
        .sort_values(by=['sampleId'], axis=0)
        .drop(columns=['sampleId', 'Altered'])
    )

    # Sets give O(1) membership tests instead of scanning the lists each time.
    genie_gene_set = set(all_genie_genes)
    other_gene_set = set(other_genes)

    genes_to_drop = [g for g in all_genie_genes if g not in other_gene_set]  # A-B
    filtered_mut_data = filtered_mut_data.drop(columns=genes_to_drop)

    # B-A, de-duplicated in first-seen order so each gene yields one column.
    genes_to_add = list(dict.fromkeys(g for g in other_genes if g not in genie_gene_set))

    if genes_to_add:
        # Add all zero columns in one concat; inserting them one at a time
        # fragments the frame and triggers pandas' PerformanceWarning.
        zero_block = pd.DataFrame(0, index=filtered_mut_data.index, columns=genes_to_add)
        # A same-named column that is already present is replaced by zeros,
        # which is what a plain `frame[gene] = 0` assignment would do.
        filtered_mut_data = pd.concat(
            [filtered_mut_data.drop(columns=genes_to_add, errors='ignore'), zero_block],
            axis=1,
        )

    return filtered_mut_data.sort_index(axis=1)

In [None]:
def generate_cn_file(all_genie_genes, other_genes, genie_data, cn_data):
    """Build binary deletion and amplification matrices aligned to ``other_genes``.

    Rows are ordered by ``SAMPLE_ID``; the ``STUDY_ID`` and ``SAMPLE_ID``
    columns are removed. Genes listed in ``all_genie_genes`` but not in
    ``other_genes`` are dropped, genes listed in ``other_genes`` but not in
    ``all_genie_genes`` are added as all-zero columns, and columns are sorted
    by label.

    Parameters
    ----------
    all_genie_genes : list
        Gene names expected as columns of ``cn_data``.
    other_genes : list
        Target gene names the outputs should be aligned to.
    genie_data : object
        Unused; kept so existing callers keep working.
    cn_data : pandas.DataFrame
        Must contain ``STUDY_ID`` and ``SAMPLE_ID`` columns plus one numeric
        column per gene. It is not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(cnd_data, cna_data)``: integer 0/1 frames where ``cnd_data`` is 1
        for values ``<= -1`` and ``cna_data`` is 1 for values ``>= 1``.
        Missing values (NaN) map to 0 in both frames.

    Raises
    ------
    KeyError
        If ``STUDY_ID``/``SAMPLE_ID`` or a gene that has to be dropped is not
        a column of ``cn_data``.
    """
    # sort_values/drop return new frames, so the caller's frame is untouched.
    filtered_data = (
        cn_data
        .sort_values(by=['SAMPLE_ID'], axis=0)
        .drop(columns=['STUDY_ID', 'SAMPLE_ID'])
    )

    # Sets give O(1) membership tests instead of scanning the lists each time.
    genie_gene_set = set(all_genie_genes)
    other_gene_set = set(other_genes)

    genes_to_drop = [g for g in all_genie_genes if g not in other_gene_set]  # A-B
    filtered_data = filtered_data.drop(columns=genes_to_drop)

    # B-A, de-duplicated in first-seen order so each gene yields one column.
    genes_to_add = list(dict.fromkeys(g for g in other_genes if g not in genie_gene_set))

    if genes_to_add:
        # Add all zero columns in one concat; inserting them one at a time
        # fragments the frame and triggers pandas' PerformanceWarning.
        zero_block = pd.DataFrame(0, index=filtered_data.index, columns=genes_to_add)
        filtered_data = pd.concat(
            [filtered_data.drop(columns=genes_to_add, errors='ignore'), zero_block],
            axis=1,
        )

    filtered_data = filtered_data.sort_index(axis=1)

    # Direct comparisons instead of two-step masking: a comparison with NaN is
    # False, so a missing value is never reported as a deletion or an
    # amplification (the masking approach turned NaN into 1 in both outputs).
    cnd_data = (filtered_data <= -1).astype('int')
    cna_data = (filtered_data >= 1).astype('int')

    return cnd_data, cna_data

In [None]:
#Common data

# Clinical table. Column names are normalised: spaces and slashes become
# underscores, parentheses are removed.
genie_data = pd.read_csv('../data/GENIE/brca_akt1_genie_2019_clinical_data.tsv', sep='\t')
genie_data.columns = (
    genie_data.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Gene names are the second column ('G') of the headerless, tab-separated file.
drugcell_genes = pd.read_csv(
    '../data/training_files_av/gene2ind_ctg_av.txt',
    sep='\t',
    header=None,
    names=['I', 'G'],
)['G']

# Single-column, headerless gene list.
all_genie_genes = pd.read_csv(
    '../data/GENIE/GENIE_gene_list.txt',
    header=None,
    names=['G'],
)['G']

# The sample id is the part after the first ':' of 'studyID:sampleId';
# the combined column is dropped afterwards.
mutation_data = (
    pd.read_csv('../data/GENIE/sample_matrix.txt', sep='\t')
    .assign(sampleId=lambda frame: frame['studyID:sampleId'].str.split(":", expand=True)[1])
    .drop(columns=['studyID:sampleId'])
)

# Missing copy-number entries are replaced by 0.
cn_data = pd.read_csv('../data/GENIE/cna.txt', sep='\t').fillna(0)

In [None]:
# Align the mutation matrix to the DrugCell gene list and write it as a bare
# CSV (no header row, no index column).
filtered_mut_data = generate_cell_mutation_file(
    all_genie_genes=list(all_genie_genes),
    other_genes=list(drugcell_genes),
    genie_data=genie_data,
    all_mutation_data=mutation_data,
)
filtered_mut_data.to_csv('../data/GENIE/cell2mutation.txt', header=False, index=False)

In [None]:
# Build the deletion / amplification indicator matrices aligned to the
# DrugCell gene list and write each as a bare CSV (no header, no index).
cnd_data, cna_data = generate_cn_file(
    all_genie_genes=list(all_genie_genes),
    other_genes=list(drugcell_genes),
    genie_data=genie_data,
    cn_data=cn_data,
)
cnd_data.to_csv('../data/GENIE/cell2cndeletion.txt', header=False, index=False)
cna_data.to_csv('../data/GENIE/cell2cnamplification.txt', header=False, index=False)

In [None]:
# Sample ids in ascending order, written as "<row number><TAB><sample id>".
# NOTE(review): the mutation matrix is ordered by its own 'sampleId' column;
# this index presumably assumes both tables hold the same samples. Confirm.
cell_lines = list(cn_data['SAMPLE_ID'])
cell_lines.sort()
cell_line_df = pd.DataFrame({'C': cell_lines})
cell_line_df.to_csv('../data/GENIE/cell2ind.txt', sep='\t', header=False, index=True)

In [None]:
# Drug name -> SMILES string; one tab-separated test file is written per drug.
smiles_map = {'Palbociclib':"CC1=C(C(=O)N(C2=NC(=NC=C12)NC3=NC=C(C=C3)N4CCNCC4)C5CCCC5)C(=O)C"}

for d, drug_smiles in smiles_map.items():
    test_data = generate_test_data(cell_lines, drug_smiles)
    test_data.to_csv("../data/GENIE/test_" + d + ".txt", sep='\t', header=False, index=False)