# Cancer Genomics Hub Summary Statistics

In [1]:
import os

import pandas as pd
pd.set_option('max_columns', 50)
pd.set_option('max_rows', 1000)

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (15, 9)


http://stackoverflow.com/questions/18770504/resize-ipython-notebook-output-window

In [2]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 1000;

<IPython.core.display.Javascript object>

The `LATEST_MANIFEST.tsv` file is being updated daily, see [here](https://cghub.ucsc.edu/summary_stats.html) for more information

In [3]:
df = pd.read_csv('https://cghub.ucsc.edu/reports/SUMMARY_STATS/LATEST_MANIFEST.tsv', low_memory=False, sep='\t')

##### Last time it's run:

In [4]:
import datetime
print(datetime.datetime.now())

2016-04-27 12:38:56.109711


## Available columns

See detailed description of each column [here](https://cghub.ucsc.edu/summary_stats.html)

In [5]:
interval = 3
for k, c in enumerate(sorted(df.columns)):
    if (k + 1) % 4 == 0:
        print
    else:
        print '{0:2d}: {1:25s}'.format(k, c),

 0: aliquot_id                 1: analysis_id                2: analyte_type             
 4: assembly                   5: barcode                    6: catalog_number           
 8: center_name                9: checksum                  10: disease                  
12: file_type                 13: filename                  14: files_size               
16: library_type              17: modified                  18: participant_id           
20: platform_full_name        21: platform_name             22: probe_file_url           
24: reagent_name              25: reagent_vendor            26: reason                   
28: sample_id                 29: sample_type               30: sample_type_code         
32: state                     33: study                     34: target_file_url          
36: uploaded                 


In [7]:
import math
def lower(s):
    if isinstance(s, float) and math.isnan(s):
        return s
    else:
        return s.lower()

# lower case for better readability
df.disease_name = df.disease_name.apply(lower)

## Available studies (e.g. TCGA, CCLE)

sorted by the number of entries

In [8]:
_df = df.study\
    .value_counts()\
    .to_frame()\
    .reset_index()\
    .rename(columns={'study': 'count'})
_df['%'] = (_df['count'] / df.shape[0]).apply('{0:.2%}'.format)
_df

Unnamed: 0,index,count,%
0,TCGA,99372,78.14%
1,TCGA MC3 VARIANT CALLING PROJECT,10532,8.28%
2,TARGET,10453,8.22%
3,PCAWG 2.0,3911,3.08%
4,CCLE,1296,1.02%
5,CGHUB GENERAL TEST,330,0.26%
6,"TCGA STUDY FOR HOLDING AMONG OTHERS, FILES WHI...",291,0.23%
7,CGCI,213,0.17%
8,REBC,202,0.16%
9,TARGET STUDY FOR HOLDING NOT FULLY REPAIRED CG...,194,0.15%


## Total number of samples

In [9]:
df.shape[0]

127173

## Sample distribution by disease name

In [10]:
def count_func(df, study_name):
    """helper function that does the counting"""
    _df = df[df.study == study_name].groupby(['study', 'disease_name'])\
    .count()[['barcode']]\
    .rename(columns={'barcode': 'count'})\
    .reset_index()\
    .sort_values(['study', 'count'])[::-1]\
    .set_index(['study', 'disease_name'])
    _df['%'] = (_df['count'] / df.shape[0]).apply('{0:.2%}'.format)
    return _df

### TCGA

In [11]:
count_func(df, 'TCGA')

Unnamed: 0_level_0,Unnamed: 1_level_0,count,%
study,disease_name,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA,kidney renal clear cell carcinoma,8946,7.03%
TCGA,breast invasive carcinoma,7642,6.01%
TCGA,ovarian serous cystadenocarcinoma,5964,4.69%
TCGA,stomach adenocarcinoma,5384,4.23%
TCGA,colon adenocarcinoma,5360,4.21%
TCGA,lung adenocarcinoma,5031,3.96%
TCGA,uterine corpus endometrioid carcinoma,4554,3.58%
TCGA,head and neck squamous cell carcinoma,4554,3.58%
TCGA,thyroid carcinoma,4428,3.48%
TCGA,lung squamous cell carcinoma,4290,3.37%


### CCLE

In [12]:
count_func(df, 'CCLE')

Unnamed: 0_level_0,Unnamed: 1_level_0,count,%
study,disease_name,Unnamed: 2_level_1,Unnamed: 3_level_1
CCLE,lung squamous cell carcinoma,220,0.17%
CCLE,chronic lymphocytic leukemia,124,0.10%
CCLE,breast invasive carcinoma,101,0.08%
CCLE,lymphoid neoplasm diffuse large b-cell lymphoma,86,0.07%
CCLE,brain lower grade glioma,83,0.07%
CCLE,colon adenocarcinoma,74,0.06%
CCLE,skin cutaneous melanoma,69,0.05%
CCLE,ovarian serous cystadenocarcinoma,60,0.05%
CCLE,stomach adenocarcinoma,57,0.04%
CCLE,pancreatic adenocarcinoma,57,0.04%


### TARGET

In [13]:
count_func(df, 'TARGET')

Unnamed: 0_level_0,Unnamed: 1_level_0,count,%
study,disease_name,Unnamed: 2_level_1,Unnamed: 3_level_1
TARGET,acute myeloid leukemia (non-tcga),3206,2.52%
TARGET,acute lymphoblastic leukemia,2984,2.35%
TARGET,neuroblastoma,2048,1.61%
TARGET,wilms tumor,1425,1.12%
TARGET,osteosarcoma,207,0.16%
TARGET,clear cell sarcoma of the kidney,39,0.03%
