In [1]:
import pandas as pd
# Download the latest manifest from cghub.ucsc.edu. The file is tab-separated;
# low_memory=False makes pandas process the file in one pass instead of in
# internal chunks, which avoids mixed-dtype inference within a column.
MANIFEST_URL = 'https://cghub.ucsc.edu/reports/SUMMARY_STATS/LATEST_MANIFEST.tsv'
df = pd.read_csv(MANIFEST_URL, sep='\t', low_memory=False)

In [2]:
# Date of the most recent run of this notebook (local time):
import datetime
print(datetime.date.today())

2016-04-22


In [3]:
# Keep only the CCLE RNA-Seq entries whose state is 'Live'.
is_live_ccle_rnaseq = (
    (df.study == 'CCLE')
    & (df.library_type == 'RNA-Seq')
    & (df.state == 'Live')
)
# .copy() makes ndf an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
ndf = df[is_live_ccle_rnaseq].copy()

In [4]:
from misc.utils import count

## Library types

In [5]:
# Tabulate the filtered samples by library type.
# NOTE(review): `count` is a project helper from misc.utils; judging by the
# rendered table it reports a row count and a percentage per value — confirm in its source.
count(ndf, 'library_type')

Unnamed: 0_level_0,study,percent
library_type,Unnamed: 1_level_1,Unnamed: 2_level_1
RNA-Seq,935,100.00%


## File types

In [6]:
# Tabulate the filtered samples by file type.
# NOTE(review): `count` is a project helper from misc.utils; judging by the
# rendered table it reports a row count and a percentage per value — confirm in its source.
count(ndf, 'file_type')

Unnamed: 0_level_0,study,percent
file_type,Unnamed: 1_level_1,Unnamed: 2_level_1
bam,935,100.00%


## Disease types

In [7]:
# Tabulate the filtered samples by (disease code, disease name) pair.
# NOTE(review): `count` is a project helper from misc.utils; passing a list
# appears to group on both columns together (see the two-level index in the
# rendered table) — confirm in its source.
count(ndf, ['disease', 'disease_name'])

Unnamed: 0_level_0,Unnamed: 1_level_0,study,percent
disease,disease_name,Unnamed: 2_level_1,Unnamed: 3_level_1
MESO,MESOTHELIOMA,1,0.11%
UCEC,UTERINE CORPUS ENDOMETRIOID CARCINOMA,3,0.32%
PRAD,PROSTATE ADENOCARCINOMA,7,0.75%
THCA,THYROID CARCINOMA,12,1.28%
CESC,CERVICAL SQUAMOUS CELL CARCINOMA AND ENDOCERVICAL ADENOCARCINOMA,25,2.67%
MM,MULTIPLE MYELOMA,25,2.67%
KIRC,KIDNEY RENAL CLEAR CELL CARCINOMA,25,2.67%
BLCA,BLADDER UROTHELIAL CARCINOMA,26,2.78%
ESCA,ESOPHAGEAL CARCINOMA,26,2.78%
LIHC,LIVER HEPATOCELLULAR CARCINOMA,32,3.42%


## Number of samples

In [8]:
# Number of rows (one per sample file) left after filtering
len(ndf)

935

## File sizes

In [9]:
# Summary statistics of the file sizes, formatted as human-readable sizes.
# describe() also returns the row count, which is not a size (formatting it
# yields e.g. "935.0 Bytes"), so it is dropped before formatting; the number
# of samples is reported in its own section.
from misc.utils import sizeof_fmt
# args: pass a sep character for better readability. See sizeof_fmt?? for more details
size_stats = ndf.files_size.describe().drop('count')
size_stats.apply(sizeof_fmt, args=(' ',))

count    935.0 Bytes
mean         13.8 GB
std           3.3 GB
min           4.8 GB
25%          11.7 GB
50%          13.4 GB
75%          16.1 GB
max          34.2 GB
Name: files_size, dtype: object

#### It would be interesting to find out which files are the smallest and which is the largest

In [17]:
# Alphabetical list of the available columns
print(sorted(ndf.columns))

['aliquot_id', 'analysis_id', 'analyte_type', 'analyte_type_code', 'assembly', 'barcode', 'catalog_number', 'center', 'center_name', 'checksum', 'disease', 'disease_name', 'file_type', 'filename', 'files_size', 'files_size_readable', 'is_custom', 'library_type', 'modified', 'participant_id', 'platform', 'platform_full_name', 'platform_name', 'probe_file_url', 'published', 'reagent_name', 'reagent_vendor', 'reason', 'sample_accession', 'sample_id', 'sample_type', 'sample_type_code', 'sample_type_name', 'state', 'study', 'target_file_url', 'tss_id', 'uploaded']


In [18]:
# Columns worth displaying when inspecting individual files
cols_to_show = ['aliquot_id', 'disease_name', 'sample_type_name', 'filename', 'file_type',
                'files_size', 'files_size_readable', 'library_type', 'state']
# Add a human-readable size column. assign() returns a new frame instead of
# writing into a filtered slice, so no SettingWithCopyWarning is raised and
# the `is_copy = False` workaround is unnecessary. ndf is rebound so that it
# still carries the files_size_readable column afterwards.
ndf = ndf.assign(files_size_readable=ndf.files_size.apply(sizeof_fmt))
# Sort ascending by size; reset_index() gives positions 0..n-1 as labels
# (the previous index is kept as an 'index' column, which is not displayed).
sorted_ndf = ndf.sort_values('files_size').reset_index()
# Three smallest files plus the single largest one. Positional .iloc replaces
# the deprecated .ix indexer; -1 selects the last (largest) row.
sorted_ndf.iloc[[0, 1, 2, -1]][cols_to_show]

Unnamed: 0,aliquot_id,disease_name,sample_type_name,filename,file_type,files_size,files_size_readable,library_type,state
0,bdda4bb5-c124-4976-ab0f-5ed93325029e,BREAST INVASIVE CARCINOMA,Cell Lines,G28034.MDA-MB-361.1.bam,bam,5159984000.0,4.8 GB,RNA-Seq,Live
1,b39aceb7-b5a0-4d66-a258-5dab2e605093,BLADDER UROTHELIAL CARCINOMA,Cell Lines,G30630.VM-CUB1.3.bam,bam,5645505000.0,5.3 GB,RNA-Seq,Live
2,94df1b60-37a3-4bbc-8ab0-50d1462d9f20,KIDNEY RENAL CLEAR CELL CARCINOMA,Cell Lines,G30603.TUHR4TKB.1.bam,bam,6278609000.0,5.8 GB,RNA-Seq,Live
934,7bf1907c-bc47-47ee-9eeb-6d0ddf66aa28,UTERINE CORPUS ENDOMETRIOID CARCINOMA,Cell Lines,G41715.JHUEM-1.5.bam,bam,36760080000.0,34.2 GB,RNA-Seq,Live
