# Cancer Genomics Hub Summary Statistics

In [1]:
import os

import numpy as np
import pandas as pd
pd.set_option('max_columns', 50)
pd.set_option('max_rows', 1000)

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (15, 9)

http://stackoverflow.com/questions/18770504/resize-ipython-notebook-output-window

In [2]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 1000;

<IPython.core.display.Javascript object>

The `LATEST_MANIFEST.tsv` file is being updated daily, see [here](https://cghub.ucsc.edu/summary_stats.html) for more information

In [3]:
df = pd.read_csv('./metadata/LATEST_MANIFEST.final.tsv.gz', low_memory=False, sep='\t', compression='gzip')

In [4]:
import datetime
print(datetime.datetime.now())

2018-05-09 10:27:27.950460


## Available columns

In [5]:
interval = 3
for k, c in enumerate(sorted(df.columns)):
    if (k + 1) % 4 == 0:
        print()
    else:
        print('{0:2d}: {1:25s}'.format(k, c), end='')

 0: aliquot_id                1: analysis_id               2: analyte_type             
 4: assembly                  5: barcode                   6: catalog_number           
 8: center_name               9: checksum                 10: disease                  
12: file_type                13: filename                 14: files_size               
16: library_type             17: modified                 18: participant_id           
20: platform_full_name       21: platform_name            22: probe_file_url           
24: reagent_name             25: reagent_vendor           26: reason                   
28: sample_id                29: sample_type              30: sample_type_code         
32: state                    33: study                    34: target_file_url          
36: uploaded                 

In [6]:
df.study.value_counts()

TCGA                                                                                                 99575
TARGET                                                                                               11392
TCGA MC3 VARIANT CALLING PROJECT                                                                     10532
PCAWG 2.0                                                                                             3911
CCLE                                                                                                  1296
CGHUB GENERAL TEST                                                                                     330
CGCI                                                                                                   297
TCGA STUDY FOR HOLDING AMONG OTHERS, FILES WHICH ARE SUPPRESSED FOR QUALITY REASONS BUT ARE CITED      291
REBC                                                                                                   202
TARGET STUDY FOR HOLDING NOT FULLY RE

In [7]:
df.query('sample_type_name == "Blood Derived Normal"').analyte_type.value_counts()

DNA      21695
WGA       2999
WGA X      131
Name: analyte_type, dtype: int64

In [8]:
df.sample_type_name.value_counts()

Primary solid Tumor                                73547
Blood Derived Normal                               24825
Solid Tissue Normal                                11434
Metastatic                                          2626
Primary Blood Derived Cancer - Peripheral Blood     2029
Cell Lines                                          1280
Recurrent Solid Tumor                                361
Control Analyte                                      143
Additional - New Primary                              61
EBV Immortalized Normal                               24
Bone Marrow Normal                                    15
Buccal Cell Normal                                    12
Additional Metastatic                                  3
Name: sample_type_name, dtype: int64

In [9]:
df[(df.study == 'TCGA') & (df.file_type == 'fasta')].head()

Unnamed: 0,study,barcode,disease,disease_name,sample_type,sample_type_name,analyte_type,library_type,center,center_name,platform,platform_name,assembly,filename,files_size,checksum,analysis_id,aliquot_id,participant_id,sample_id,tss_id,sample_accession,published,uploaded,modified,state,sample_type_code,analyte_type_code,platform_full_name,file_type,reason,reagent_vendor,reagent_name,catalog_number,is_custom,target_file_url,probe_file_url
1937,TCGA,TCGA-CS-6188-01A-11R-1896-07,LGG,BRAIN LOWER GRADE GLIOMA,TP,Primary solid Tumor,RNA,RNA-Seq,UNC-LCCC,,ILLUMINA,Illumina,unaligned,UNCID_2149605.66770b06-2cd6-4773-b8e8-5b38faa4...,6354301000.0,4181ac122b0a09f28cde79a9c3d5af39,3a8e6a74-137f-468f-8987-fa0acdde2836,66770b06-2cd6-4773-b8e8-5b38faa4f5a4,3f70c3e3-0131-466f-92aa-0a63ab3d4258,d35539d2-209c-4782-84b5-04b7d10db7d8,CS,,2013-08-14,2013-08-14,2013-08-14,Live,1,R,Illumina HiSeq 2000,fasta,,,,,,,
1990,TCGA,TCGA-EO-A22Y-01A-11R-A180-07,UCEC,UTERINE CORPUS ENDOMETRIOID CARCINOMA,TP,Primary solid Tumor,RNA,RNA-Seq,UNC-LCCC,,ILLUMINA,Illumina,unaligned,UNCID_2166575.24ee58d3-60c9-4808-b35a-2767594b...,5497637000.0,e0f0b269bcbf35f41ab8d73167ac943d,c63d44f1-c9ad-4391-b1aa-0feb3713441b,24ee58d3-60c9-4808-b35a-2767594b83f8,e6183a43-24b9-4fe1-95c3-4cff67e6c1ed,1fcdc9c7-5448-4a87-9ad5-cfb60837c296,EO,,2013-08-29,2013-08-29,2013-08-29,Live,1,R,Illumina HiSeq 2000,fasta,,,,,,,
1991,TCGA,TCGA-94-A5I4-01A-11R-A26W-07,LUSC,LUNG SQUAMOUS CELL CARCINOMA,TP,Primary solid Tumor,RNA,RNA-Seq,UNC-LCCC,,ILLUMINA,Illumina,unaligned,UNCID_2167520.ee589fc8-daf6-4da9-9708-f18a86b9...,4600318000.0,53d77012055aee11b845da6f5cb9dea9,21912089-1e42-4bcc-9ad9-fe9a9b88fb09,ee589fc8-daf6-4da9-9708-f18a86b96b22,442c412d-6cc5-4ffd-810c-c94c8e643bd6,bf85a46b-d7c8-4753-a4c0-8f774dba9e06,94,,2013-08-29,2013-08-29,2013-08-29,Live,1,R,Illumina HiSeq 2000,fasta,,,,,,,
1992,TCGA,TCGA-N5-A4RV-01A-21R-A28V-07,UCS,UTERINE CARCINOSARCOMA,TP,Primary solid Tumor,RNA,RNA-Seq,UNC-LCCC,,ILLUMINA,Illumina,unaligned,UNCID_2167728.08dbecd4-ba05-4d62-b76c-62fddccd...,4964388000.0,634de81aca55cf48b3058f205e97ccb5,6b86e544-0372-434a-9f28-6fa4075dd228,08dbecd4-ba05-4d62-b76c-62fddccdb167,3af5b391-e72f-463d-a086-a86c6c30a51a,fafb2550-3101-45b8-bfcd-4550eb031621,N5,,2013-08-31,2013-08-30,2013-08-31,Live,1,R,Illumina HiSeq 2000,fasta,,,,,,,
1993,TCGA,TCGA-PG-A5BC-01A-12R-A27V-07,UCEC,UTERINE CORPUS ENDOMETRIOID CARCINOMA,TP,Primary solid Tumor,RNA,RNA-Seq,UNC-LCCC,,ILLUMINA,Illumina,unaligned,UNCID_2168981.47e8bbd2-fd10-4dc8-882e-cb88ca27...,5691728000.0,175a1ea9fc8cef4c4a09403d3fc70b52,9e38e9bd-05fe-4257-b5ea-a23fc1d827d9,47e8bbd2-fd10-4dc8-882e-cb88ca27338d,22669a01-21d3-404c-ac19-cf0a6c03bb1d,0a63bf84-a720-4b8b-8312-4ec59d14982e,PG,,2013-08-31,2013-08-31,2013-08-31,Live,1,R,Illumina HiSeq 2000,fasta,,,,,,,


In [10]:
def lower(s):
    if isinstance(s, float) and np.isnan(s):
        return s
    else:
        return s.lower()

# lower case for better readability
df.disease_name = df.disease_name.apply(lower)

# Number of samples

In [11]:
assert df.analysis_id.unique().shape[0] == 128189

## Available studies (e.g. TCGA, CCLE)

In [12]:
_df = df.study\
    .value_counts()\
    .to_frame()\
    .reset_index()\
    .rename(columns={'study': 'count'})
_df['%'] = (_df['count'] / df.shape[0]).apply('{0:.2%}'.format)
_df

Unnamed: 0,index,count,%
0,TCGA,99575,77.55%
1,TARGET,11392,8.87%
2,TCGA MC3 VARIANT CALLING PROJECT,10532,8.20%
3,PCAWG 2.0,3911,3.05%
4,CCLE,1296,1.01%
5,CGHUB GENERAL TEST,330,0.26%
6,CGCI,297,0.23%
7,"TCGA STUDY FOR HOLDING AMONG OTHERS, FILES WHI...",291,0.23%
8,REBC,202,0.16%
9,TARGET STUDY FOR HOLDING NOT FULLY REPAIRED CG...,194,0.15%


## Sample distribution by disease name

In [13]:
def count_func(df, study_name):
    """helper function that does the counting"""
    _df = df[df.study == study_name].groupby(['study', 'disease_name'])\
    .count()[['barcode']]\
    .rename(columns={'barcode': 'count'})\
    .reset_index()\
    .sort_values(['study', 'count'])[::-1]\
    .set_index(['study', 'disease_name'])
    _df['%'] = (_df['count'] / df.shape[0]).apply('{0:.2%}'.format)
    return _df

### TCGA

In [14]:
count_func(df, 'TCGA')

Unnamed: 0_level_0,Unnamed: 1_level_0,count,%
study,disease_name,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA,kidney renal clear cell carcinoma,8946,6.97%
TCGA,breast invasive carcinoma,7642,5.95%
TCGA,ovarian serous cystadenocarcinoma,5964,4.64%
TCGA,stomach adenocarcinoma,5384,4.19%
TCGA,colon adenocarcinoma,5360,4.17%
TCGA,lung adenocarcinoma,5193,4.04%
TCGA,uterine corpus endometrioid carcinoma,4554,3.55%
TCGA,head and neck squamous cell carcinoma,4554,3.55%
TCGA,thyroid carcinoma,4428,3.45%
TCGA,lung squamous cell carcinoma,4290,3.34%


### CCLE

In [15]:
count_func(df, 'CCLE')

Unnamed: 0_level_0,Unnamed: 1_level_0,count,%
study,disease_name,Unnamed: 2_level_1,Unnamed: 3_level_1
CCLE,lung squamous cell carcinoma,220,0.17%
CCLE,chronic lymphocytic leukemia,124,0.10%
CCLE,breast invasive carcinoma,101,0.08%
CCLE,lymphoid neoplasm diffuse large b-cell lymphoma,86,0.07%
CCLE,brain lower grade glioma,83,0.06%
CCLE,colon adenocarcinoma,74,0.06%
CCLE,skin cutaneous melanoma,69,0.05%
CCLE,ovarian serous cystadenocarcinoma,60,0.05%
CCLE,stomach adenocarcinoma,57,0.04%
CCLE,pancreatic adenocarcinoma,57,0.04%


### TARGET

In [16]:
count_func(df, 'TARGET')

Unnamed: 0_level_0,Unnamed: 1_level_0,count,%
study,disease_name,Unnamed: 2_level_1,Unnamed: 3_level_1
TARGET,acute myeloid leukemia (non-tcga),3630,2.83%
TARGET,acute lymphoblastic leukemia,2997,2.33%
TARGET,neuroblastoma,2057,1.60%
TARGET,wilms tumor,1436,1.12%
TARGET,osteosarcoma,207,0.16%
TARGET,clear cell sarcoma of the kidney,39,0.03%
