In [2]:
import os
import re
import datetime
import tempfile

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 250)
# Don't cut off long string
# http://stackoverflow.com/questions/26277757/pandas-to-html-truncates-string-contents
pd.set_option('display.max_colwidth', -1)

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (15, 9)

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [3]:
df = pd.read_csv('data/LATEST_MANIFEST.final.tsv.gz', low_memory=False, sep='\t', compression='gzip')

In [4]:
pd.options.display.max_columns = 99

In [5]:
# Last time it's run:
import datetime
print(datetime.datetime.today().date())

2016-08-24


In [6]:
# only interested in those with state equals Live
ndf = df[(df.study == 'CCLE') & (df.library_type == 'RNA-Seq') & (df.state == 'Live')]

### Library types

In [19]:
ndf.library_type.value_counts()

RNA-Seq    935
Name: library_type, dtype: int64

### File types

In [20]:
ndf.file_type.value_counts()

bam    935
Name: file_type, dtype: int64

### Disease names

In [27]:
ndf['disease_name'] = ndf.disease_name.apply(lambda x: x.capitalize())

In [28]:
from misc.utils import count

In [29]:
count(ndf, ['disease', 'disease_name'])

Unnamed: 0_level_0,Unnamed: 1_level_0,study,percent
disease,disease_name,Unnamed: 2_level_1,Unnamed: 3_level_1
MESO,Mesothelioma,1,0.11%
UCEC,Uterine corpus endometrioid carcinoma,3,0.32%
PRAD,Prostate adenocarcinoma,7,0.75%
THCA,Thyroid carcinoma,12,1.28%
CESC,Cervical squamous cell carcinoma and endocervical adenocarcinoma,25,2.67%
MM,Multiple myeloma,25,2.67%
KIRC,Kidney renal clear cell carcinoma,25,2.67%
BLCA,Bladder urothelial carcinoma,26,2.78%
ESCA,Esophageal carcinoma,26,2.78%
LIHC,Liver hepatocellular carcinoma,32,3.42%


### Number of samples

In [30]:
ndf.shape[0]

935

## File sizes

In [31]:
# Note: the count doesn't make sense when converting to filesize, so it's shown separately
from misc.utils import sizeof_fmt
# args: pass a sep character for better readbility. See sizeof_fmt?? for more details
ndf.files_size.describe().apply(sizeof_fmt, args=(' ',))

count    935.0 Bytes
mean     13.8 GB    
std      3.3 GB     
min      4.8 GB     
25%      11.7 GB    
50%      13.4 GB    
75%      16.1 GB    
max      34.2 GB    
Name: files_size, dtype: object

#### Would be interesting to find out what the smallest and biggest files are

In [32]:
print(sorted(ndf.columns.tolist()))

['aliquot_id', 'analysis_id', 'analyte_type', 'analyte_type_code', 'assembly', 'barcode', 'catalog_number', 'center', 'center_name', 'checksum', 'disease', 'disease_name', 'file_type', 'filename', 'files_size', 'files_size_readable', 'is_custom', 'library_type', 'modified', 'participant_id', 'platform', 'platform_full_name', 'platform_name', 'probe_file_url', 'published', 'reagent_name', 'reagent_vendor', 'reason', 'sample_accession', 'sample_id', 'sample_type', 'sample_type_code', 'sample_type_name', 'state', 'study', 'target_file_url', 'tss_id', 'uploaded']


In [33]:
cols_to_show = ['disease_name', 'sample_type_name', 'filename', 'file_type',
                'files_size', 'files_size_readable', 'library_type', 'state', 'aliquot_id', 'analysis_id']
# turn off warning, http://stackoverflow.com/questions/20625582/how-to-deal-with-this-pandas-warning
ndf.is_copy = False
ndf['files_size_readable'] = ndf.files_size.apply(sizeof_fmt)
sorted_ndf = ndf.sort_values('files_size').reset_index()
sorted_ndf.ix[[0, 1, 2, sorted_ndf.shape[0]-1]][cols_to_show]

Unnamed: 0,disease_name,sample_type_name,filename,file_type,files_size,files_size_readable,library_type,state,aliquot_id,analysis_id
0,Breast invasive carcinoma,Cell Lines,G28034.MDA-MB-361.1.bam,bam,5159984000.0,4.8 GB,RNA-Seq,Live,bdda4bb5-c124-4976-ab0f-5ed93325029e,a337c425-4314-40c6-a40a-a444781bd1b7
1,Bladder urothelial carcinoma,Cell Lines,G30630.VM-CUB1.3.bam,bam,5645505000.0,5.3 GB,RNA-Seq,Live,b39aceb7-b5a0-4d66-a258-5dab2e605093,618953ee-2747-4568-aa34-2a5b6ef2c7d2
2,Kidney renal clear cell carcinoma,Cell Lines,G30603.TUHR4TKB.1.bam,bam,6278609000.0,5.8 GB,RNA-Seq,Live,94df1b60-37a3-4bbc-8ab0-50d1462d9f20,9abc5ed0-86d4-4e9b-959d-66000671301c
934,Uterine corpus endometrioid carcinoma,Cell Lines,G41715.JHUEM-1.5.bam,bam,36760080000.0,34.2 GB,RNA-Seq,Live,7bf1907c-bc47-47ee-9eeb-6d0ddf66aa28,b680faac-af24-4590-b206-da028a7d9358


In [34]:
ndf.head()

Unnamed: 0,study,barcode,disease,disease_name,sample_type,sample_type_name,analyte_type,library_type,center,center_name,platform,platform_name,assembly,filename,files_size,checksum,analysis_id,aliquot_id,participant_id,sample_id,tss_id,sample_accession,published,uploaded,modified,state,sample_type_code,analyte_type_code,platform_full_name,file_type,reason,reagent_vendor,reagent_name,catalog_number,is_custom,target_file_url,probe_file_url,files_size_readable
2,CCLE,CCLE-RERF-LC-Ad2-RNA-08,LUSC,Lung squamous cell carcinoma,CELL,Cell Lines,RNA,RNA-Seq,BI,,ILLUMINA,Illumina,HG19_Broad_variant,G27541.RERF-LC-Ad2.2.bam,16474500000.0,1b1cb7fa3724c114e9d69e2289f2ae81,00bc84f8-6053-440f-a27d-f2dbd6701eb2,6f4b6f7e-0b49-4138-acb9-65fe0378da1d,,,,,2013-04-29,2013-04-16,2013-05-16,Live,50,R,Illumina HiSeq 2000,bam,,,,,,,,15.3 GB
3,CCLE,CCLE-Hs 172.T-RNA-08,BLCA,Bladder urothelial carcinoma,CELL,Cell Lines,RNA,RNA-Seq,BI,,ILLUMINA,Illumina,HG19_Broad_variant,G28902.Hs_172.T.3.bam,9494814000.0,fb57d20026ac6eaeed6d23f88fa99a9d,00cadf1a-480b-4404-b137-c875a77bd537,a45490f5-bd54-4b00-987a-eb2abd64eaea,,,,,2013-04-30,2013-04-16,2013-05-16,Live,50,R,Illumina HiSeq 2000,bam,,,,,,,,8.8 GB
7,CCLE,CCLE-WM983B-RNA-08,SKCM,Skin cutaneous melanoma,CELL,Cell Lines,RNA,RNA-Seq,BI,,ILLUMINA,Illumina,HG19_Broad_variant,G30635.WM983B.1.bam,10311010000.0,98d589dbe70629210b3619e75f097a0a,01928742-c5a4-4585-ad40-8d58e7c5e577,569e8fda-7d74-4756-9991-4a308e2c714c,,,,,2013-04-29,2013-04-15,2013-05-16,Live,50,R,Illumina HiSeq 2000,bam,,,,,,,,9.6 GB
8,CCLE,CCLE-Hs 939.T-RNA-08,SKCM,Skin cutaneous melanoma,CELL,Cell Lines,RNA,RNA-Seq,BI,,ILLUMINA,Illumina,HG19_Broad_variant,G28854.Hs_939.T.3.bam,14303280000.0,2055c1ec9fd44ed2e1f976f8cd52d0a8,019a131b-730e-436e-90a8-e7b07adf8055,bdddc6ad-b815-4452-a0b7-f7f02cc31a8e,,,,,2013-04-30,2013-04-16,2013-05-16,Live,50,R,Illumina HiSeq 2000,bam,,,,,,,,13.3 GB
11,CCLE,CCLE-SNU-738-RNA-08,LGG,Brain lower grade glioma,CELL,Cell Lines,RNA,RNA-Seq,BI,,ILLUMINA,Illumina,HG19_Broad_variant,G27459.SNU-738.2.bam,16708580000.0,430674874c77dd1f4c060d4ea9606e62,025cd659-08e0-442f-b7b1-1565ffbad59c,2a806e8d-35a1-41b6-b514-b88537fbc82a,,,,,2013-04-29,2013-04-16,2013-05-16,Live,50,R,Illumina HiSeq 2000,bam,,,,,,,,15.6 GB


In [16]:
ndf.sample_type_name.value_counts()

Cell Lines    934
Name: sample_type_name, dtype: int64

In [38]:
ndf[ndf.disease == 'CESC'].head()

Unnamed: 0,study,barcode,disease,disease_name,sample_type,sample_type_name,analyte_type,library_type,center,center_name,platform,platform_name,assembly,filename,files_size,checksum,analysis_id,aliquot_id,participant_id,sample_id,tss_id,sample_accession,published,uploaded,modified,state,sample_type_code,analyte_type_code,platform_full_name,file_type,reason,reagent_vendor,reagent_name,catalog_number,is_custom,target_file_url,probe_file_url,files_size_readable
201,CCLE,CCLE-Ishikawa (Heraklio) 02 ER--RNA-08,CESC,Cervical squamous cell carcinoma and endocervical adenocarcinoma,CELL,Cell Lines,RNA,RNA-Seq,BI,,ILLUMINA,Illumina,HG19_Broad_variant,G28875.Ishikawa__Heraklio__02_ER-.3.bam,14551030000.0,d844d51616d997a0341afacc5a3889b2,1e3b3fcf-6729-43db-837f-9f298105b041,480c76be-e380-48e8-b05c-6e2b6f5c760a,,,,,2013-04-29,2013-04-17,2013-05-16,Live,50,R,Illumina HiSeq 2000,bam,,,,,,,,13.6 GB
310,CCLE,CCLE-HEC-59-RNA-08,CESC,Cervical squamous cell carcinoma and endocervical adenocarcinoma,CELL,Cell Lines,RNA,RNA-Seq,BI,,ILLUMINA,Illumina,HG19_Broad_variant,G28871.HEC-59.3.bam,12764250000.0,63b00fcf9fc38e22b3249285243faf7e,2d329536-0223-49a1-a9a7-789ad0b9abfa,ca9a8928-e00d-4a71-be2e-1900ef88bea3,,,,,2013-04-30,2013-04-16,2013-05-16,Live,50,R,Illumina HiSeq 2000,bam,,,,,,,,11.9 GB
331,CCLE,CCLE-HEC-265-RNA-08,CESC,Cervical squamous cell carcinoma and endocervical adenocarcinoma,CELL,Cell Lines,RNA,RNA-Seq,BI,,ILLUMINA,Illumina,HG19_Broad_variant,G28824.HEC-265.3.bam,14675620000.0,9b71b6b7d4de68052bc1c73e7e6982d6,3139e2bb-75e3-4a95-8070-45c4ce89e57b,28551ac3-8612-49df-8525-195790e53858,,,,,2013-04-30,2013-04-16,2013-05-16,Live,50,R,Illumina HiSeq 2000,bam,,,,,,,,13.7 GB
377,CCLE,CCLE-HEC-6-RNA-08,CESC,Cervical squamous cell carcinoma and endocervical adenocarcinoma,CELL,Cell Lines,RNA,RNA-Seq,BI,,ILLUMINA,Illumina,HG19_Broad_variant,G28823.HEC-6.3.bam,13962030000.0,428eab66db652e3b7bf616b0b3906d2c,35b5acca-e2ed-4db3-9870-eaf83e2bf4c0,898adc78-3fa1-4a3c-979c-6e76b9afef71,,,,,2013-04-30,2013-04-16,2013-05-16,Live,50,R,Illumina HiSeq 2000,bam,,,,,,,,13.0 GB
381,CCLE,CCLE-HEC-151-RNA-08,CESC,Cervical squamous cell carcinoma and endocervical adenocarcinoma,CELL,Cell Lines,RNA,RNA-Seq,BI,,ILLUMINA,Illumina,HG19_Broad_variant,G28826.HEC-151.3.bam,14457080000.0,22b5cdbdf1623821700954b5469dd915,36150b63-6e59-4f48-8329-8bedb8cf195c,f508423b-bd47-40ff-9050-80f6aff9e569,,,,,2013-04-30,2013-04-16,2013-05-16,Live,50,R,Illumina HiSeq 2000,bam,,,,,,,,13.5 GB


In [37]:
ndf.disease_name.value_counts()

Lung squamous cell carcinoma                                        184
Chronic lymphocytic leukemia                                        81 
Brain lower grade glioma                                            65 
Colon adenocarcinoma                                                58 
Lymphoid neoplasm diffuse large b-cell lymphoma                     57 
Breast invasive carcinoma                                           56 
Skin cutaneous melanoma                                             52 
Ovarian serous cystadenocarcinoma                                   45 
Pancreatic adenocarcinoma                                           41 
Stomach adenocarcinoma                                              41 
Sarcoma                                                             40 
Head and neck squamous cell carcinoma                               33 
Liver hepatocellular carcinoma                                      32 
Bladder urothelial carcinoma                                    