The primary data source for this notebook is the main CCLE portal (Broad Institute). A secondary data source is https://ocg.cancer.gov/<br>

1. **Broad Institute:**
    - https://portals.broadinstitute.org/ccle/data


2. **ocg.cancer.gov:**
    - https://ocg.cancer.gov/ctd2-data-project/translational-genomics-research-institute-quantified-cancer-cell-line-encyclopedia
    - ftp://caftpd.nci.nih.gov/pub/OCG-DCC/CTD2/TGen/CCLE_RNA-seq_Analysis/

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
from glob import glob
from collections import OrderedDict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Utils
# file_path = os.getcwd()  # os.path.dirname(os.path.relpath(__file__))
# utils_path = os.path.abspath(os.path.join(file_path, 'utils_py'))
# sys.path.append(utils_path)
# import utils_all as utils

import warnings

# Suppress every warning category so library warnings do not clutter cell outputs.
warnings.filterwarnings(action='ignore')

# Seed value shared by the notebook.
SEED = 0

In [2]:
# Root directory of the CCLE files for the Broad Institute source; the load
# cells below build their file paths by joining relative names onto it.
# The leading '~' is stored unexpanded in this string.
DATADIR = '~/work/jdacs/cell-line-data/ccle/from_broad_institute'

## Load cell metadata

In [3]:
# Read the tab-separated cell-line annotation table and report its raw layout.
cell_meta_path = os.path.join(DATADIR, 'cell_line_annotations/CCLE_sample_info_file_2012-10-18.txt')
cell_meta = pd.read_csv(cell_meta_path, sep='\t')
print(cell_meta.shape)
print(list(cell_meta.columns))

# Optional column subset (kept disabled, as before):
# cell_meta = cell_meta[['CCLE name', 'Cell line primary name', 'Cell line aliases', 'Gender',
#                        'Site Primary', 'Histology', 'Hist Subtype1']]

# Replace the space-separated raw headers with identifier-style names.
colNameMap = {
    'CCLE name': 'CCLEName',
    'Cell line primary name': 'CellName',
    'Cell line aliases': 'CellNameAliases',
    'Site Primary': 'SitePrimary',
    'Hist Subtype1': 'HistSubtype1',
    'Expression arrays': 'ExpressionArrays',
    'SNP arrays': 'SNPArrays',
}
cell_meta = cell_meta.rename(columns=colNameMap)
display(cell_meta.head(2))

(1046, 13)
['CCLE name', 'Cell line primary name', 'Cell line aliases', 'Gender', 'Site Primary', 'Histology', 'Hist Subtype1', 'Notes', 'Source', 'Expression arrays', 'SNP arrays', 'Oncomap', 'Hybrid Capture Sequencing']


Unnamed: 0,CCLEName,CellName,CellNameAliases,Gender,SitePrimary,Histology,HistSubtype1,Notes,Source,ExpressionArrays,SNPArrays,Oncomap,Hybrid Capture Sequencing
0,1321N1_CENTRAL_NERVOUS_SYSTEM,1321N1,,M,central_nervous_system,glioma,astrocytoma,"Identical lines: U-118 MG, U-138 MG and 1321N1...",ECACC,NIECE_p_NCLE_RNA3_HG-U133_Plus_2_B06_296024,HONEY_p_NCLE_DNAAffy3_S_GenomeWideSNP_6_E09_29...,yes,
1,143B_BONE,143B,,F,bone,osteosarcoma,NS,"Identical lines: HTK-, HOS and 143B share high...",ATCC,MAKER_p_NCLE_RNA7_HG-U133_Plus_2_F09_454702,BOWER_p_NCLE_DNAAffy8_GenomeWideSNP_6_D02_464552,yes,


In [4]:
# Number of distinct values in each cell-metadata column.
cell_meta.nunique()

CCLEName                     1046
CellName                     1046
CellNameAliases               194
Gender                          3
SitePrimary                    24
Histology                      22
HistSubtype1                   68
Notes                          54
Source                         10
ExpressionArrays             1036
SNPArrays                     995
Oncomap                         1
Hybrid Capture Sequencing       1
dtype: int64

## Load drug metadata

In [5]:
# NOTE(review): path carried over from the original cell, presumably an older
# local copy of the drug metadata -- confirm or remove:
# /Users/apartin/Dropbox/work/pilot1/pharmaco/ccle/ccle_drugmeta
drugmeta_path = os.path.join(DATADIR, 'pharmacological_profiling/CCLE_NP24.2009_profiling_2012.02.20.csv')
drugmeta = pd.read_csv(drugmeta_path, encoding='unicode_escape')

# Shorten the descriptive raw headers to identifier-style column names.
colNameMap = {
    'Compound (code or generic name)': 'Drug',
    'Compound (brand name)': 'DrugBrandName',
    'Target(s)': 'Target',
    'Mechanism of action': 'MechOfAction',
    'Highest Phase': 'HighestPhase',
}
drugmeta = drugmeta.rename(columns=colNameMap)

# Table size, distinct values per column, and a two-row preview.
print(drugmeta.shape)
print(drugmeta.nunique())
display(drugmeta.head(2))

(24, 7)
Drug             24
DrugBrandName    14
Target           22
MechOfAction     20
Class             3
HighestPhase     13
Organization     10
dtype: int64


Unnamed: 0,Drug,DrugBrandName,Target,MechOfAction,Class,HighestPhase,Organization
0,Erlotinib,Tarceva,EGFR,EGFR Inhibitor,Kinase inhibitor,Launched-2004,Genentech
1,Lapatinib,Tykerb,"EGFR, HER2",EGFR and HER2 Inhibitor,Kinase inhibitor,Launched-2007,GlaxoSmithKline


## Load dose response data
Response data for each cell-drug pair.

In [6]:
# Read the dose-response table; the shape is printed before the columns are renamed.
rspdata_path = os.path.join(DATADIR, 'pharmacological_profiling/CCLE_NP24.2009_Drug_data_2015.02.24.csv')
rspdata = pd.read_csv(rspdata_path)
print(rspdata.shape)

# Map the raw headers (spaces, units in parentheses) to identifier-style names.
colNameMap = {
    'CCLE Cell Line Name': 'CCLEName',
    'Primary Cell Line Name': 'CellName',
    'Compound': 'Drug',
    'Doses (uM)': 'Doses_uM',
    'Activity Data (median)': 'ActivityMedian',
    'Activity SD': 'ActivitySD',
    'Num Data': 'nDataPoints',
    'EC50 (uM)': 'EC50um',
    'IC50 (uM)': 'IC50um',
}
rspdata = rspdata.rename(columns=colNameMap)

# Distinct cell-line names and drugs, followed by a two-row preview.
id_cols = ['CCLEName', 'CellName', 'Drug']
print(rspdata[id_cols].nunique())
display(rspdata.head(2))

(11670, 13)
CCLEName    504
CellName    504
Drug         24
dtype: int64


Unnamed: 0,CCLEName,CellName,Drug,Target,Doses_uM,ActivityMedian,ActivitySD,nDataPoints,FitType,EC50um,IC50um,Amax,ActArea
0,1321N1_CENTRAL_NERVOUS_SYSTEM,1321N1,AEW541,IGF1R,".0025,.0080,.025,.080,.25,.80,2.53,8","8.67,11.0,2.16,.27,-10,-13,-26,-43","3.31,3.72,5.36,4.67,13.1,.18,2.42,7.51",8,Sigmoid,8.717774,8.0,-42.558014,0.7124
1,22RV1_PROSTATE,22Rv1,AEW541,IGF1R,".0025,.0080,.025,.080,.25,.80,2.53,8",".94,12.5,-14,4.16,-25,-32,-52,-71","1.95,13.3,6.98,21.8,16.0,18.8,4.84,7.93",8,Sigmoid,8.165164,2.329924,-71.58934,1.6723


In [7]:
# Number of response rows recorded for each drug.
rspdata['Drug'].value_counts()

PD-0325901      504
PF2341066       504
Topotecan       504
Nutlin-3        504
Lapatinib       504
AZD0530         504
TKI258          504
TAE684          504
LBW242          503
PHA-665752      503
AEW541          503
17-AAG          503
AZD6244         503
Paclitaxel      503
Erlotinib       503
Sorafenib       503
Panobinostat    500
ZD-6474         496
PLX4720         496
L-685458        491
RAF265          460
PD-0332991      434
Nilotinib       420
Irinotecan      317
Name: Drug, dtype: int64

## Load expression data

In [8]:
# RNA-Seq read-count table (tab-separated). header=2 takes the column names
# from the third line of the file, skipping the two lines above it.
path = os.path.join(
    DATADIR,
    'current_data_11-08-2018/CCLE_DepMap_18q3_RNAseq_reads_20180718.gct.txt',
)
rna_cnt = pd.read_csv(path, sep='\t', header=2)
# The rows are left in file order here; sorting by gene ID happens after the
# columns are renamed.
print(rna_cnt.shape)
display(rna_cnt.head(2))

(56318, 1158)


Unnamed: 0,Name,Description,22RV1_PROSTATE (ACH-000956),2313287_STOMACH (ACH-000948),253JBV_URINARY_TRACT (ACH-000026),253J_URINARY_TRACT (ACH-000011),42MGBA_CENTRAL_NERVOUS_SYSTEM (ACH-000323),5637_URINARY_TRACT (ACH-000905),59M_OVARY (ACH-000520),639V_URINARY_TRACT (ACH-000973),...,UMUC16_URINARY_TRACT (ACH-001409),UMUC4_URINARY_TRACT (ACH-001410),UMUC5_URINARY_TRACT (ACH-001411),UMUC6_URINARY_TRACT (ACH-001414),UMUC7_URINARY_TRACT (ACH-001415),UMUC9_URINARY_TRACT (ACH-001416),UPCISCC152_UPPER_AERODIGESTIVE_TRACT (ACH-001228),UW228_CENTRAL_NERVOUS_SYSTEM (ACH-001232),Y79_AUTONOMIC_GANGLIA (ACH-001295),YAMATO_SOFT_TISSUE (ACH-001277)
0,ENSG00000223972.4,DDX11L1,0,2,0,4,0,1,1,2,...,0,3,3,4,4,3,0,2,3,0
1,ENSG00000227232.4,WASH7P,2316,1538,1094,1148,1367,1280,900,1109,...,1634,2346,2208,3215,2429,2298,2252,971,2485,1374


In [9]:
# Tidy the expression table: rename the gene columns, strip the version suffix
# from the gene IDs, and reduce each cell-line header to the text before the
# first space (dropping the parenthesised ID that follows it).
cols_raw = list(rna_cnt.columns)

rename_map = {'Name': 'ENSGName', 'Description': 'GeneName'}
for raw_col in rna_cnt.columns[2:]:
    rename_map[raw_col] = raw_col.split(' ')[0]
df = rna_cnt.rename(columns=rename_map)

# 'ENSG00000223972.4' -> 'ENSG00000223972'
df['ENSGName'] = [gene_id.split('.')[0] for gene_id in df['ENSGName']]

# Order rows by gene ID and rebuild a clean 0..n-1 index.
df = df.sort_values('ENSGName').reset_index(drop=True)
display(df.head(2))

Unnamed: 0,ENSGName,GeneName,22RV1_PROSTATE,2313287_STOMACH,253JBV_URINARY_TRACT,253J_URINARY_TRACT,42MGBA_CENTRAL_NERVOUS_SYSTEM,5637_URINARY_TRACT,59M_OVARY,639V_URINARY_TRACT,...,UMUC16_URINARY_TRACT,UMUC4_URINARY_TRACT,UMUC5_URINARY_TRACT,UMUC6_URINARY_TRACT,UMUC7_URINARY_TRACT,UMUC9_URINARY_TRACT,UPCISCC152_UPPER_AERODIGESTIVE_TRACT,UW228_CENTRAL_NERVOUS_SYSTEM,Y79_AUTONOMIC_GANGLIA,YAMATO_SOFT_TISSUE
0,ENSG00000000003,TSPAN6,990,1113,4566,5026,3886,12040,3411,6146,...,22825,20495,16168,6468,35596,15750,8392,7324,8914,7830
1,ENSG00000000005,TNMD,0,0,0,0,0,0,0,0,...,0,38,1,2,9,0,2,0,0,259


In [11]:
# Keep cell lines that were actually screened
# `cells_screened`: every distinct CCLE cell-line name in the dose-response table.
cells_screened = rspdata['CCLEName'].unique().tolist()  # cells that were screened
print('Unique cells screend:', len(cells_screened))

# `usecells`: expression-table columns (skipping the two leading gene identifier
# columns) whose cell line was also screened. Membership is tested against
# `cells_screened`; testing against `usecells` itself fails with a NameError on
# a fresh kernel, or silently reuses a stale list from an earlier run.
screened_lookup = set(cells_screened)  # set gives O(1) membership checks
usecells = [c for c in df.columns[2:] if c in screened_lookup]
print('Cells that were screened and have RNA-Seq:', len(usecells))

Unique cells screend: 504
Cells that were screened and have RNA-Seq: 472


In [110]:
# Restrict the expression table to the screened cell lines while keeping the
# two gene identifier columns in front. After the renaming cell these columns
# are 'ENSGName' and 'GeneName'; 'Description' no longer exists, so using it as
# an index key raises a KeyError.
# Selecting the columns directly (instead of set_index / reset_index) also makes
# the cell safe to re-run: the result always has the same columns.
gene_id_cols = ['ENSGName', 'GeneName']
df = df[gene_id_cols + usecells]
print(df.shape)

(56318, 474)


# ocg.cancer.gov

In [68]:
# Metadata summary of the CCLE RNA-seq files from the ocg.cancer.gov source.
# The file sits in a sibling folder of DATADIR, so the path is derived from
# DATADIR's parent instead of a user-specific absolute path; expanduser turns
# the leading '~' into the current user's home directory.
ocs_dir = os.path.join(os.path.dirname(DATADIR), 'from_ocs_cancer_center')
path = os.path.expanduser(os.path.join(ocs_dir, 'cgHub_CCLE_RNA-seq_metadata_summary.txt'))
ccle_meta_ocs = pd.read_csv(path, sep='\t')

In [69]:
# Drop columns that carry no information (at most one distinct value).
varies = ccle_meta_ocs.nunique() > 1
ccle_meta_ocs = ccle_meta_ocs.loc[:, varies]
print(ccle_meta_ocs.nunique())
display(ccle_meta_ocs.head(2))

barcode             935
disease              22
disease_name         22
sample_type           2
sample_type_name      2
filename            935
files_size          935
checksum            935
analysis_id         935
aliquot_id          935
published             6
uploaded              6
modified              4
dtype: int64


Unnamed: 0,barcode,disease,disease_name,sample_type,sample_type_name,filename,files_size,checksum,analysis_id,aliquot_id,published,uploaded,modified
0,CCLE-TE 441.T-RNA-08,SARC,Sarcoma,CELL,50,G30604.TE_441.T.1.bam,7991352314,0cdd5b9904c75b288faf09683f6fddfa,68dd4e2b-a352-4ac9-8944-90cb2c5538f6,1437d186-8b9b-49b4-846d-450243eb0a1d,2013-04-30,2013-04-16,2013-05-16
1,CCLE-SNU-1196-RNA-08,LIHC,Liver hepatocellular carcinoma,CELL,50,G27465.SNU-1196.2.bam,17730569372,740fcb7dd4e8fc12888289c100466420,29a6b3ce-44ec-4f33-898b-8f920a44072c,1466282a-aff6-41dd-8559-d980b740f57a,2013-04-30,2013-04-17,2013-05-16


In [76]:
# Number of metadata rows per value of the 'disease' column.
ccle_meta_ocs['disease'].value_counts()

LUSC    184
LCLL     81
LGG      65
COAD     58
DLBC     57
BRCA     56
SKCM     52
OV       45
PAAD     41
STAD     41
SARC     40
HNSC     33
LIHC     32
BLCA     26
ESCA     26
KIRC     25
MM       25
CESC     25
THCA     12
PRAD      7
UCEC      3
MESO      1
Name: disease, dtype: int64

In [77]:
# Number of metadata rows per value of the 'disease_name' column.
ccle_meta_ocs['disease_name'].value_counts()

Lung squamous cell carcinoma                                        184
Chronic Lymphocytic Leukemia                                         81
Brain Lower Grade Glioma                                             65
Colon adenocarcinoma                                                 58
Lymphoid Neoplasm Diffuse Large B-cell Lymphoma                      57
Breast invasive carcinoma                                            56
Skin Cutaneous Melanoma                                              52
Ovarian serous cystadenocarcinoma                                    45
Pancreatic adenocarcinoma                                            41
Stomach adenocarcinoma                                               41
Sarcoma                                                              40
Head and Neck squamous cell carcinoma                                33
Liver hepatocellular carcinoma                                       32
Bladder Urothelial Carcinoma                                    

In [73]:
# Number of metadata rows per 'published' date.
ccle_meta_ocs['published'].value_counts()

2013-04-30    746
2014-07-10    141
2013-04-29     34
2014-07-23      7
2014-07-24      6
2013-05-11      1
Name: published, dtype: int64

In [74]:
# Number of metadata rows per 'uploaded' date.
ccle_meta_ocs['uploaded'].value_counts()

2013-04-17    405
2013-04-16    374
2014-06-20     87
2014-06-21     67
2013-04-18      1
2013-04-25      1
Name: uploaded, dtype: int64

In [75]:
# Number of metadata rows per 'modified' date.
ccle_meta_ocs['modified'].value_counts()

2013-05-16    781
2014-07-10    141
2014-07-23      7
2014-07-24      6
Name: modified, dtype: int64