In [73]:
import pandas as pd
# Load the latest CGHub manifest, a tab-separated file fetched over HTTPS.
# low_memory=False asks pandas to parse the file in one pass instead of in
# chunks, which avoids mixed-dtype columns.
df = pd.read_csv(
    'https://cghub.ucsc.edu/reports/SUMMARY_STATS/LATEST_MANIFEST.tsv',
    sep='\t',
    low_memory=False,
)

In [74]:
# Record the date on which this notebook was last run:
import datetime
print(datetime.date.today())

2016-03-24


In [75]:
# Keep only CCLE RNA-Seq entries whose state is 'Live'.
# .copy() makes ndf an independent frame rather than a slice of df, so adding
# columns to it later does not trigger SettingWithCopyWarning.
ndf = df[(df['study'] == 'CCLE')
         & (df['library_type'] == 'RNA-Seq')
         & (df['state'] == 'Live')].copy()

In [76]:
def count(df, colname):
    """Count entries per distinct value of ``colname``, with percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'study' column and the ``colname`` column.
    colname : str
        Column to group by. Any column works, including 'study' itself.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values of ``colname`` and sorted by ascending
        count, with two columns: 'study' (number of non-null 'study' values
        in the group) and 'percent' (share of the total, formatted like
        '12.34%').
    """
    # Group first and pick 'study' afterwards: selecting ['study', colname]
    # up front would duplicate the column when colname == 'study'.
    per_group = df.groupby(colname)['study'].count().sort_values()
    res = per_group.to_frame()
    res['percent'] = (per_group / per_group.sum()).apply('{0:.2%}'.format)
    return res

## Library types

In [77]:
# Sanity check: ndf was filtered on library_type == 'RNA-Seq', so that should
# be the only library type listed.
count(ndf, 'library_type')

Unnamed: 0_level_0,study,percent
library_type,Unnamed: 1_level_1,Unnamed: 2_level_1
RNA-Seq,935,100.00%


## File types

In [78]:
# Break the filtered entries down by file type.
count(ndf, 'file_type')

Unnamed: 0_level_0,study,percent
file_type,Unnamed: 1_level_1,Unnamed: 2_level_1
bam,935,100.00%


## Number of samples

In [79]:
# Number of rows in the filtered manifest
len(ndf)

935

## File sizes

In [80]:
# Note: the 'count' row is not a size, so formatting it as one is meaningless;
# the number of entries is reported on its own in the section above.
from misc.utils import sizeof_fmt
# args: a separator character is forwarded to sizeof_fmt for better
# readability. See sizeof_fmt?? for more details
ndf['files_size'].describe().apply(sizeof_fmt, args=(' ',))

count     935.0  
mean     13.8 Gi 
std       3.3 Gi 
min       4.8 Gi 
25%      11.7 Gi 
50%      13.4 Gi 
75%      16.1 Gi 
max      34.2 Gi 
Name: files_size, dtype: object

#### It would be interesting to find out which files are the smallest and the biggest

In [81]:
cols_to_show = ['disease_name', 'sample_type_name', 'filename', 'file_type',
                'files_size', 'files_size_readable', 'library_type', 'state']
# Add the human-readable size with assign(), which returns a new frame. This
# avoids SettingWithCopyWarning without setting the `is_copy` attribute, which
# was removed in pandas 1.0.
ndf = ndf.assign(files_size_readable=ndf['files_size'].apply(sizeof_fmt))
# Sort ascending by size; reset_index() renumbers the rows from 0 and keeps the
# previous row labels in an 'index' column.
sorted_ndf = ndf.sort_values('files_size').reset_index()
# The first and last rows are the smallest and the biggest files. Positional
# .iloc replaces .ix, which was removed in pandas 1.0.
sorted_ndf.iloc[[0, -1]][cols_to_show]

Unnamed: 0,disease_name,sample_type_name,filename,file_type,files_size,files_size_readable,library_type,state
0,BREAST INVASIVE CARCINOMA,Cell Lines,G28034.MDA-MB-361.1.bam,bam,5159984000.0,4.8 GiB,RNA-Seq,Live
934,UTERINE CORPUS ENDOMETRIOID CARCINOMA,Cell Lines,G41715.JHUEM-1.5.bam,bam,36760080000.0,34.2 GiB,RNA-Seq,Live
