In [1]:
#load in packages
import pandas as pd
import numpy as np

###This notebook searches the datahub files/portal API for non-compliant clinical attribute IDs 
####An attribute is compliant if all of the characters are alpha-numeric (contain only r'^[A-Za-z0-9_]+$' characters)

##First check files on the datahub

In [3]:
#load the data
file_list = np.loadtxt("data_clinical_files_all.txt", dtype=str)
study_names = np.asarray([i.split('/', 3)[-2] for i in file_list])

df_list = []
n_attributes = []
i=0
study_data = []
for clinical_file in file_list:
    #if 'sclc_ucologne_2015' in clinical_file:
    #load in individual data from files and append to a list
    df = pd.read_table(clinical_file, skiprows=0)
    if (list(df)[0]!='SAMPLE_ID' and list(df)[0]!='PATIENT_ID'):
        rows_to_skip=df[(df[df.columns[0]]=="SAMPLE_ID") | (df[df.columns[0]]=="PATIENT_ID") | (df[df.columns[0]]=="OTHER_PATIENT_ID") | (df[df.columns[0]]=="OTHER_SAMPLE_ID")].index[0]+1
        df = pd.read_table(clinical_file, skiprows=rows_to_skip)
    df.columns = map(str.upper, df.columns)
    n_attributes.append(df.shape[1])
    df_list.append(df)
    study_name = clinical_file.split('/', 3)[-2]
    col_names = list(df)
    study_data.append((study_name, col_names))
    i+=1

#combine individual dataframes into a single list
all_data = pd.concat(df_list, ignore_index=True)

In [4]:
study_names_data = []
for i in study_data[0:len(study_data)]:
    for j in i[1]:
        study_names_data.append((i[0],j))

study_data_df = pd.DataFrame.from_records(study_names_data, columns = ['study name','col name'])
study_data_df2 = pd.get_dummies(study_data_df['col name'])
study_data_combined = pd.concat([study_data_df['study name'], study_data_df2], axis=1)
study_data_combined = study_data_combined.groupby('study name').sum()
study_data_combined.drop([col for col, val in study_data_combined.sum().iteritems() if val < 0], axis=1, inplace=True)

In [5]:
import re
attributes=list(study_data_combined)

irregular_attributes = []
irregular_studies = []
irregular_data = np.empty([0, 2])
for attribute in attributes:
    if not re.match( r'^[A-Za-z0-9_]+$', attribute):
        studies = study_data_combined[study_data_combined[attribute]>0].index
        for study in studies:
            irregular_data = np.vstack([irregular_data, [study, attribute]])

##Non compliant studies/attributes on the datahub files

In [6]:
pd.DataFrame(np.sort(irregular_data, axis=0), columns=['study', 'attribute'])

Unnamed: 0,study,attribute
0,brca_broad,AGE (Y)
1,brca_tcga_pub,AJCC STAGE
2,brca_tcga_pub,ALB_(G/DL)
3,brca_tcga_pub,ALB_(G/DL)
4,brca_tcga_pub,ALP_(U/L)
5,brca_tcga_pub,ALP_(U/L)
6,brca_tcga_pub,BRS: BRAFV600E_RAF_SCORE
7,brca_tcga_pub,CN CLUSTERS
8,brca_tcga_pub,CONVERTED STAGE
9,brca_tcga_pub,ER STATUS


##Now check files on the api for clompiance

In [9]:
#read in study names
import urllib, json
url = "http://www.cbioportal.org/api/studies"
response = urllib.urlopen(url)
studies = json.loads(response.read())

studyIDs=[]
for study in studies:
    studyIDs.append(study['studyId'])

studies=[]
study_attributes = []

#loop through studies and read the data
for study in studyIDs:
    study = study.strip('\n')
    api_url = 'http://cbioportal.org/api/studies/'+study+'/clinical-attributes'
    df = pd.read_json(api_url)
    if not df.empty:              
        studies.append((study,df)) 
        study_attributes.append((study,df['clinicalAttributeId'].tolist()))

In [11]:
#transform data
study_attribute = []
for i in study_attributes:#[0:studies_limit]:
    for j in i[1]:
        study_attribute.append((i[0],j))

study_data_df = pd.DataFrame.from_records(study_attribute, columns = ['study name','col name'])
study_data_df2 = pd.get_dummies(study_data_df['col name'])
study_data_combined = pd.concat([study_data_df['study name'], study_data_df2], axis=1)
study_data_combined = study_data_combined.groupby('study name').sum()
study_data_combined.drop([col for col, val in study_data_combined.sum().iteritems() if val < 0], axis=1, inplace=True)

In [12]:
import re
attributes=list(study_data_combined)

irregular_attributes = []
irregular_studies = []
irregular_data = np.empty([0, 2])
for attribute in attributes:
    if not re.match(r'^[A-Za-z0-9_]+$', attribute):
        studies = study_data_combined[study_data_combined[attribute]>0].index
        for study in studies:
            irregular_data = np.vstack([irregular_data, [study, attribute]])

##Non compliant studies/attributes on the portal API

In [13]:
pd.DataFrame(np.sort(irregular_data, axis=0), columns=['study', 'attribute'])

Unnamed: 0,study,attribute
0,brca_broad,ALB_(G/DL)
1,gbm_tcga_pub2013,ALP_(U/L)
2,hnc_mskcc_2016,EC-SMG
3,prad_mskcc_cheny1_organoids_2014,G-CIMP_METHYLATION
4,prad_mskcc_cheny1_organoids_2014,GEA-CIN
5,prad_mskcc_cheny1_organoids_2014,GEA-CIN-SMG
6,prad_mskcc_cheny1_organoids_2014,GEA-GEJ
7,prad_mskcc_cheny1_organoids_2014,GEA-SMG
8,stes_tcga_pub,HGB_(G/DL)
9,stes_tcga_pub,LDH_(U/L)
