# **Exploratory Data Analysis**
In this notebook we load the data we chose, inspect it, and try to understand what it represents and how to build the best possible dataset for our classifier.

In [1]:
import pandas as pd

## **Data Loading**
We use a dataset from the Cancer Cell Line Encyclopedia (CCLE), called <code>CCLE_RNAseq_genes_rpkm_20180929</code>.

In [3]:
import zipfile

# Unpack every member of the archive into the current working directory.
with zipfile.ZipFile('ccle.zip', mode='r') as archive:
    archive.extractall()

In [2]:
# Build the expression table with cell lines as rows and genes as columns.
# The first two lines of the .gct file are skipped so that the third line is
# used as the header row.
df = (
    pd.read_csv('CCLE_RNAseq_genes_rpkm_20180929.gct', sep='\t', skiprows=2)
    .drop(columns=['Name'])                    # keep only 'Description' as the gene label
    .rename(columns={'Description': 'Gene'})
    .set_index('Gene')
    .T                                         # genes become columns, cell lines become rows
)
df.columns.name = None  # remove the 'Gene' label left on the column axis by the transpose
df.head()

Unnamed: 0,DDX11L1,WASH7P,MIR1302-11,FAM138A,OR4G4P,OR4G11P,OR4F5,RP11-34P13.7,CICP27,AL627309.1,...,MT-ND4,MT-TH,MT-TS2,MT-TL2,MT-ND5,MT-ND6,MT-TE,MT-CYB,MT-TT,MT-TP
22RV1_PROSTATE,0.053448,5.75256,0.039576,0.037401,0.0,0.032334,0.049664,0.077574,0.098682,3.34955,...,6914.89,0.770865,0.90152,1.17724,2324.83,1794.95,4.62519,4630.21,0.230258,0.446972
2313287_STOMACH,0.043078,4.26104,0.011961,0.022608,0.019401,0.048864,0.080056,0.044659,0.098997,4.04952,...,8190.65,0.532545,0.467105,0.517544,1909.74,1974.76,3.06213,3685.24,0.0,0.270188
253JBV_URINARY_TRACT,0.037956,3.10048,0.031618,0.01328,0.025642,0.025833,0.008817,0.011805,0.149851,0.212384,...,6250.53,0.586535,0.548758,1.82404,1822.52,2097.98,1.29038,1778.51,0.613195,1.30935
253J_URINARY_TRACT,0.028313,3.07909,0.031446,0.026416,0.008501,0.04282,0.035077,0.023481,0.100099,0.131432,...,12656.7,0.933357,1.228,1.81413,3220.78,3909.72,3.8501,4159.0,0.85381,2.48609
42MGBA_CENTRAL_NERVOUS_SYSTEM,0.009377,3.99494,0.02083,0.006562,0.008447,0.008509,0.017427,0.023332,0.053046,0.069961,...,4004.12,0.231852,0.813448,0.450643,1158.29,1622.68,2.55038,2215.69,0.121196,0.117631


In [3]:
# Total number of missing entries: count NaNs per column, then add up the per-column counts.
df.isna().sum(axis='index').sum()

np.int64(0)

In [4]:
# Preview the first five rows of the expression table.
df.head(n=5)

Unnamed: 0,DDX11L1,WASH7P,MIR1302-11,FAM138A,OR4G4P,OR4G11P,OR4F5,RP11-34P13.7,CICP27,AL627309.1,...,MT-ND4,MT-TH,MT-TS2,MT-TL2,MT-ND5,MT-ND6,MT-TE,MT-CYB,MT-TT,MT-TP
22RV1_PROSTATE,0.053448,5.75256,0.039576,0.037401,0.0,0.032334,0.049664,0.077574,0.098682,3.34955,...,6914.89,0.770865,0.90152,1.17724,2324.83,1794.95,4.62519,4630.21,0.230258,0.446972
2313287_STOMACH,0.043078,4.26104,0.011961,0.022608,0.019401,0.048864,0.080056,0.044659,0.098997,4.04952,...,8190.65,0.532545,0.467105,0.517544,1909.74,1974.76,3.06213,3685.24,0.0,0.270188
253JBV_URINARY_TRACT,0.037956,3.10048,0.031618,0.01328,0.025642,0.025833,0.008817,0.011805,0.149851,0.212384,...,6250.53,0.586535,0.548758,1.82404,1822.52,2097.98,1.29038,1778.51,0.613195,1.30935
253J_URINARY_TRACT,0.028313,3.07909,0.031446,0.026416,0.008501,0.04282,0.035077,0.023481,0.100099,0.131432,...,12656.7,0.933357,1.228,1.81413,3220.78,3909.72,3.8501,4159.0,0.85381,2.48609
42MGBA_CENTRAL_NERVOUS_SYSTEM,0.009377,3.99494,0.02083,0.006562,0.008447,0.008509,0.017427,0.023332,0.053046,0.069961,...,4004.12,0.231852,0.813448,0.450643,1158.29,1622.68,2.55038,2215.69,0.121196,0.117631


### **TP53 Mutation DB**
[Link to the TP53 website 1](https://p53.fr/download-the-database)

[Link to the TP53 website 2](https://tp53.cancer.gov/get_tp53data#get_annot)

In [5]:
# Load the TP53 cell-line table from the CSV file in the working directory.
mut = pd.read_csv(filepath_or_buffer='CellLineDownload_r21.csv')

In [6]:
# Preview the first five rows of the TP53 cell-line table.
mut.head(n=5)

Unnamed: 0,Sample_ID,Sample_Name,ATCC_ID,Cosmic_ID,Short_topo,Morphology,Tumor_origin,Add_info,Sex,Age,...,exon2,exon3,exon4,exon5,exon6,exon7,exon8,exon9,exon10,exon11
0,2946,SW13,CCL-105,909744.0,ADRENAL GLAND,Adrenal cortical carcinoma (C74.0),,,F,55.0,...,False,False,False,True,True,True,True,False,False,False
1,24833,NCI-H295,CRL-10296,908466.0,ADRENAL GLAND,Adrenal cortical carcinoma (C74.0),,,F,48.0,...,False,False,False,True,True,True,True,False,False,False
2,24979,NCI-H295R,CRL-2128,,ADRENAL GLAND,Adrenal cortical carcinoma (C74.0),primary,Derived from NCI-H295 cell line,F,48.0,...,True,True,True,True,True,True,True,True,True,True
3,21569,NCI-H295,CRL-10296,908466.0,ADRENAL GLAND,Adrenal cortical carcinoma (C74.0),,,F,48.0,...,True,True,True,True,True,True,True,True,True,True
4,21329,ETK-1,,906861.0,BILIARY TRACT,"Adenocarcinoma, NOS",,,F,,...,True,True,True,True,True,True,True,True,True,True


In [7]:
# Normalise every sample name of the TP53 table: lower-case it and remove
# underscores, hyphens, dots and spaces.
# NOTE(review): str() turns a missing Sample_Name into the literal 'nan' —
# confirm the column has no missing values.
cell_lines_mut = mut['Sample_Name']
separator_table = str.maketrans('', '', '_-. ')  # deletes the four separator characters
cell_lines_mut_names = [
    str(raw_name).lower().translate(separator_table)
    for raw_name in cell_lines_mut.values
]

In [8]:
# Display the normalised TP53 sample names.
# NOTE(review): this prints the whole list; a slice such as
# cell_lines_mut_names[:10] would keep the saved output short.
cell_lines_mut_names

['sw13',
 'ncih295',
 'ncih295r',
 'ncih295',
 'etk1',
 'tgbc1tkb',
 'egi1',
 'snu478',
 'snu869',
 'snu1196',
 'snu245',
 'hucct1',
 'tgbc24tkb',
 'huh28',
 'snu1079',
 'cubiii',
 'ej',
 'ld700',
 'ld627',
 'ld660',
 'ld692',
 'ld630',
 '5637',
 'scaber',
 'sd',
 'fhs738b1',
 'ht1197',
 'dsh1',
 'lb831blc',
 'jmsu1',
 'bftc905',
 'j82',
 'j82',
 'j82',
 'j82',
 'j82',
 'ht1376',
 't24p',
 't24a',
 'tccsup',
 'sw1710',
 'rt112',
 'rt112',
 'cal29',
 't24',
 'umuc3',
 'vmcub1',
 'vmcub1',
 'umuc1',
 'vmcub2',
 'bftc909',
 'bt1',
 'ej',
 'sd',
 'vmcub3',
 'ucrubl17',
 '647v',
 '647v',
 'hs172t',
 '253j',
 '486p',
 'ld583',
 'ld611',
 'ld71',
 'sw780',
 'bc3c',
 'ht1197',
 'ku1919',
 '253jbv',
 'rt4',
 'sw1353',
 'cal78',
 'rdes',
 'tc71',
 'hossl',
 'huo9',
 'u2os',
 'sjsa1',
 'oums27',
 'g292clonea141b1',
 'mg63',
 'hc98',
 'jj',
 'staet21',
 'ham9458',
 'ham9459',
 'ham9463',
 'mhhes1',
 'scmces1',
 'wes',
 'es1ot',
 'rdes',
 'scmces2',
 'a673',
 'es1',
 'ew11',
 'ew12',
 'rm82',
 'ske

In [9]:
# Count the CCLE cell lines whose name also appears among the normalised
# TP53 sample names.
#
# Each row label of `df` is split on '_' and only the part before the first
# underscore is kept (e.g. '22RV1_PROSTATE' -> '22RV1'); it is lower-cased
# before the comparison.
# NOTE(review): unlike the TP53 names, the CCLE prefix is not stripped of
# '-', '.' or spaces — confirm CCLE identifiers never contain them.
#
# The names are put in a set so each membership test is a hash lookup instead
# of a scan over the whole list.
mut_name_lookup = set(cell_lines_mut_names)
count = sum(
    cell.split('_')[0].lower() in mut_name_lookup
    for cell in df.index
)

In [10]:
# Number of CCLE cell lines whose name prefix matched a normalised TP53 sample name.
count

949

In [11]:
# Distinct sample names (a missing value counts as one) versus total rows in the TP53 table.
print(cell_lines_mut.nunique(dropna=False), len(cell_lines_mut))

2632 2913
