In [1]:
import pandas as pd
import numpy as np

from pybioportal import studies as std
from pybioportal import clinical_data as cd
from pybioportal import clinical_attributes as ca

In [2]:
path = '/mnt/c/Users/u1531276/Desktop/coursework/bmi6015_applied_ml/final_project/raw_data/'

In [3]:
#Study Information
luad_studies = std.get_all_studies(keyword="luad", projection="DETAILED")
tcga_luad_study = std.get_study(study_id="luad_tcga_pan_can_atlas_2018")
tcga_pan_lung_study = std.get_study(study_id="nsclc_tcga_broad_2016")    # TCGA study
oncosg_luad_study = std.get_study(study_id="luad_oncosg_2020")    # Asian LUAD
sherlock_luad_study = std.get_study(study_id="lung_nci_2022")     # non-smoker Caucasian LUAD

In [4]:
tcga_clin_att = ca.get_all_clinical_attributes_in_study(study_id="nsclc_tcga_broad_2016", projection="DETAILED")
oncosg_clin_att = ca.get_all_clinical_attributes_in_study(study_id="luad_oncosg_2020", projection="DETAILED")
sherlock_clin_att = ca.get_all_clinical_attributes_in_study(study_id="lung_nci_2022", projection="DETAILED")

## Clinical data: TCGA lung cancer study

In [5]:
tcga_df1 = cd.fetch_all_clinical_data_in_study(study_id="luad_tcga_pan_can_atlas_2018",
                                               attribute_ids= ['AGE', 'OS_MONTHS', 'OS_STATUS', 'SEX', 'RACE'] ,
                                               clinical_data_type="PATIENT",
                                               ret_format="WIDE")

tcga_df2 = cd.fetch_all_clinical_data_in_study(study_id="nsclc_tcga_broad_2016",
                                               attribute_ids= ['SMOKING_HISTORY', 'STAGE'] ,
                                               clinical_data_type="PATIENT",
                                               ret_format="WIDE")

tcga_df3 = cd.fetch_all_clinical_data_in_study(study_id="nsclc_tcga_broad_2016",
                                               attribute_ids= ["CANCER_TYPE_DETAILED"],
                                               clinical_data_type="SAMPLE",
                                               ret_format="WIDE")

In [6]:
tcga_df = tcga_df1.merge(tcga_df2, on='patientId', how='inner')
tcga_df = tcga_df.merge(tcga_df3, on='patientId', how='inner')

cols = ['patientId', 'studyId', 'SEX', "AGE", 'RACE', "CANCER_TYPE_DETAILED", 'SMOKING_HISTORY', 'STAGE', "OS_MONTHS", 'OS_STATUS']
tcga_df = tcga_df[cols]

# Rename the columns
tcga_df.rename(columns={
    'patientId': 'patient_id',
    'studyId': 'study_id',
    'SEX': 'sex',
    'AGE': 'age',
    'RACE': 'race',
    'CANCER_TYPE_DETAILED': 'histology',
    'SMOKING_HISTORY': 'smoking_status',
    'STAGE': 'stage',
    'OS_MONTHS': 'survival_months',
    'OS_STATUS': 'survival_status'
}, inplace=True)

tcga_df


clinicalAttributeId,patient_id,study_id,sex,age,race,histology,smoking_status,stage,survival_months,survival_status
0,TCGA-44-4112,nsclc_tcga_broad_2016,Female,60,White,Lung Adenocarcinoma,Current Reformed Smoker For > 15 Years,IB,26.56409245,1:DECEASED
1,TCGA-44-5644,nsclc_tcga_broad_2016,Female,51,White,Lung Adenocarcinoma,Current Smoker,IB,28.37229181,0:LIVING
2,TCGA-44-5645,nsclc_tcga_broad_2016,Female,61,Black or African American,Lung Adenocarcinoma,Current Reformed Smoker For > 15 Years,IA,28.01065194,0:LIVING
3,TCGA-44-5643,nsclc_tcga_broad_2016,Male,53,Black or African American,Lung Adenocarcinoma,Current Smoker,IIIA,33.30374462,0:LIVING
4,TCGA-44-6144,nsclc_tcga_broad_2016,Male,58,White,Lung Adenocarcinoma,Current Smoker,IA,23.76960253,0:LIVING
...,...,...,...,...,...,...,...,...,...,...
490,TCGA-NJ-A55A,nsclc_tcga_broad_2016,Female,76,White,Lung Adenocarcinoma,Current Reformed Smoker For > 15 Years,IB,0.493145281,0:LIVING
491,TCGA-NJ-A55O,nsclc_tcga_broad_2016,Female,56,White,Lung Adenocarcinoma,Current Reformed Smoker For < Or = 15 Years,IIA,0.427392577,0:LIVING
492,TCGA-NJ-A55R,nsclc_tcga_broad_2016,Male,67,White,Lung Adenocarcinoma,Current Reformed Smoker For > 15 Years,IA,19.82444028,0:LIVING
493,TCGA-O1-A52J,nsclc_tcga_broad_2016,Female,74,White,Lung Adenocarcinoma,Current Reformed Smoker For > 15 Years,IA,59.11168097,1:DECEASED


## Clinical data: OncoSG lung cancer study

In [7]:
oncosg_df = cd.fetch_all_clinical_data_in_study(study_id="luad_oncosg_2020",
                                           clinical_data_type="PATIENT", ret_format="WIDE")

oncosg_df["CANCER_TYPE_DETAILED"] = 'LUAD'
cols = ['patientId', 'studyId', 'SEX', "AGE", 'ETHNICITY', "CANCER_TYPE_DETAILED", 'SMOKING_STATUS', 'STAGE', "OS_MONTHS", 'OS_STATUS']


oncosg_df = oncosg_df[cols]

oncosg_df.rename(columns={
    'patientId': 'patient_id',
    'studyId': 'study_id',
    'SEX': 'sex',
    'AGE': 'age',
    'ETHNICITY': 'race',
    'CANCER_TYPE_DETAILED': 'histology',
    'SMOKING_STATUS': 'smoking_status',
    'STAGE': 'stage',
    'OS_MONTHS': 'survival_months',
    'OS_STATUS': 'survival_status'
}, inplace=True)

oncosg_df

clinicalAttributeId,patient_id,study_id,sex,age,race,histology,smoking_status,stage,survival_months,survival_status
0,A062,luad_oncosg_2020,Male,74,Chinese,LUAD,No,IV,53.46666667,0:LIVING
1,A063,luad_oncosg_2020,Female,70,Chinese,LUAD,No,I,51.13333333,0:LIVING
2,A066,luad_oncosg_2020,Female,72,Chinese,LUAD,No,I,51.33333333,0:LIVING
3,A068,luad_oncosg_2020,Female,79,Chinese,LUAD,No,I,51.0,0:LIVING
4,A071,luad_oncosg_2020,Female,76,Chinese,LUAD,No,I,52.36666667,0:LIVING
...,...,...,...,...,...,...,...,...,...,...
300,BGI-WG20,luad_oncosg_2020,Male,62,Chinese,LUAD,No,III,7.0,0:LIVING
301,BGI-WG21,luad_oncosg_2020,Female,41,Chinese,LUAD,No,II,6.0,0:LIVING
302,BGI-WG22,luad_oncosg_2020,Female,34,Chinese,LUAD,No,III,24.0,0:LIVING
303,BGI-WG23,luad_oncosg_2020,Male,73,Chinese,LUAD,No,III,16.0,0:LIVING


## Clinical data: Sherlock lung cancer study

In [8]:
sherlock_df = cd.fetch_all_clinical_data_in_study(study_id="lung_nci_2022",
                                           clinical_data_type="PATIENT", ret_format="WIDE")

In [9]:
sherlock_df['ETHNICITY'] = 'caucasian'
sherlock_df['SMOKING_STATUS'] = 'non_smoker'
cols = ['patientId', 'studyId', 'SEX', "AGE", 'ETHNICITY', "HISTOLOGY", 'SMOKING_STATUS', 'TUMOR_STAGE', "OS_MONTHS", 'OS_STATUS']
sherlock_df = sherlock_df[cols]

sherlock_df.rename(columns={
    'patientId': 'patient_id',
    'studyId': 'study_id',
    'SEX': 'sex',
    'AGE': 'age',
    'ETHNICITY': 'race',
    'HISTOLOGY': 'histology',
    'SMOKING_STATUS': 'smoking_status',
    'TUMOR_STAGE': 'stage',
    'OS_MONTHS': 'survival_months',
    'OS_STATUS': 'survival_status'
}, inplace=True)

sherlock_df

clinicalAttributeId,patient_id,study_id,sex,age,race,histology,smoking_status,stage,survival_months,survival_status
0,NSLC-0004,lung_nci_2022,Female,69,caucasian,Adenocarcinomas,non_smoker,IA,197.9,0:LIVING
1,NSLC-0005,lung_nci_2022,Female,66,caucasian,Adenocarcinomas,non_smoker,III,26.6,1:DECEASED
2,NSLC-0006,lung_nci_2022,Male,36,caucasian,Carcinoids,non_smoker,IA,194.4,0:LIVING
3,NSLC-0007,lung_nci_2022,Female,72,caucasian,Adenocarcinomas,non_smoker,IB,114.3,1:DECEASED
4,NSLC-0008,lung_nci_2022,Female,77,caucasian,Adenocarcinomas,non_smoker,IA,18.6,1:DECEASED
...,...,...,...,...,...,...,...,...,...,...
227,NSLC-0255,lung_nci_2022,Male,,caucasian,Adenocarcinomas,non_smoker,IB,,0:LIVING
228,NSLC-0250,lung_nci_2022,Female,71,caucasian,Adenocarcinomas,non_smoker,IB,,0:LIVING
229,NSLC-0251,lung_nci_2022,Male,59,caucasian,Adenocarcinomas,non_smoker,III,,0:LIVING
230,NSLC-0252,lung_nci_2022,Female,,caucasian,Adenocarcinomas,non_smoker,IA,,0:LIVING


In [10]:
sherlock_df['stage'].value_counts()

stage
IA     94
IB     49
II     44
III    38
Name: count, dtype: int64

## Combine Clinical data

In [80]:
clinical_df = pd.concat([tcga_df, oncosg_df, sherlock_df], axis=0, ignore_index=True)
clinical_df = clinical_df.applymap(lambda x: x.lower() if isinstance(x, str) else x)
clinical_df

  clinical_df = clinical_df.applymap(lambda x: x.lower() if isinstance(x, str) else x)


clinicalAttributeId,patient_id,study_id,sex,age,race,histology,smoking_status,stage,survival_months,survival_status
0,tcga-44-4112,nsclc_tcga_broad_2016,female,60,white,lung adenocarcinoma,current reformed smoker for > 15 years,ib,26.56409245,1:deceased
1,tcga-44-5644,nsclc_tcga_broad_2016,female,51,white,lung adenocarcinoma,current smoker,ib,28.37229181,0:living
2,tcga-44-5645,nsclc_tcga_broad_2016,female,61,black or african american,lung adenocarcinoma,current reformed smoker for > 15 years,ia,28.01065194,0:living
3,tcga-44-5643,nsclc_tcga_broad_2016,male,53,black or african american,lung adenocarcinoma,current smoker,iiia,33.30374462,0:living
4,tcga-44-6144,nsclc_tcga_broad_2016,male,58,white,lung adenocarcinoma,current smoker,ia,23.76960253,0:living
...,...,...,...,...,...,...,...,...,...,...
1027,nslc-0255,lung_nci_2022,male,,caucasian,adenocarcinomas,non_smoker,ib,,0:living
1028,nslc-0250,lung_nci_2022,female,71,caucasian,adenocarcinomas,non_smoker,ib,,0:living
1029,nslc-0251,lung_nci_2022,male,59,caucasian,adenocarcinomas,non_smoker,iii,,0:living
1030,nslc-0252,lung_nci_2022,female,,caucasian,adenocarcinomas,non_smoker,ia,,0:living


In [81]:
clinical_df['stage'].value_counts()

stage
ia      225
ib      180
i       141
iii     129
ii      102
iiia     71
iib      68
iia      50
iv       44
iiib     11
Name: count, dtype: int64

In [82]:
clinical_df['race'] = clinical_df['race'].replace({
    'white': 'caucasian',
    'caucasian': 'caucasian',
    'black or african american': 'african_american',
    'chinese': 'asian',
    'asian': 'asian',
    'american indian or alaska native': 'other',
    'hispanic': 'other',
    'other': 'other'
})

clinical_df['histology'] = clinical_df['histology'].replace({
    'lung adenocarcinoma': 'luad',
    'luad': 'luad',
    'adenocarcinomas': 'luad'
})

clinical_df['smoking_status'] = clinical_df['smoking_status'].replace({
    'current reformed smoker for > 15 years': 'smoker',
    'current smoker': 'smoker',
    'current reformed smoker for < or = 15 years': 'smoker',
    'current reformed smoker, duration not specified': 'smoker',
    'yes': 'smoker',
    'lifelong non-smoker': 'non_smoker',
    'no': 'non_smoker'
})

clinical_df['survival_status'] = clinical_df['survival_status'].replace({
    '1:deceased': 1,
    '0:living': 0
})

clinical_df['stage'] = clinical_df['stage'].replace({
    'i': 'stage_1',
    'ia': 'stage_1',
    'ib': 'stage_1',
    'ii': 'stage_2',
    'iia': 'stage_2',
    'iib': 'stage_2',
    'iiia': 'stage_3',
    'iiib': 'stage_3',
    'iii': 'stage_3',
    'iv': 'stage_4'
})

clinical_df = clinical_df[~clinical_df['histology'].str.contains('carcinoids|others', na=False)]


  clinical_df['survival_status'] = clinical_df['survival_status'].replace({


In [83]:
clinical_df['patient_id'] = clinical_df['patient_id'].astype(str)

clinical_df['age'] = pd.to_numeric(clinical_df['age'], errors='coerce')
clinical_df['survival_months'] = pd.to_numeric(clinical_df['survival_months'], errors='coerce')

categorical_columns = ['sex', 'race', 'histology', 'smoking_status', 'stage', 'survival_status']
clinical_df[categorical_columns] = clinical_df[categorical_columns].apply(lambda x: x.astype('category'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinical_df['patient_id'] = clinical_df['patient_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinical_df['age'] = pd.to_numeric(clinical_df['age'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinical_df['survival_months'] = pd.to_numeric(clinical_df['surviva

In [84]:
for col in categorical_columns:
    print(f"Unique values in {col}:")
    print(clinical_df[col].unique())
    print() 

Unique values in sex:
['female', 'male']
Categories (2, object): ['female', 'male']

Unique values in race:
['caucasian', 'african_american', NaN, 'asian', 'other']
Categories (4, object): ['african_american', 'asian', 'caucasian', 'other']

Unique values in histology:
['luad']
Categories (1, object): ['luad']

Unique values in smoking_status:
['smoker', 'non_smoker', NaN]
Categories (2, object): ['non_smoker', 'smoker']

Unique values in stage:
['stage_1', 'stage_3', 'stage_2', 'stage_4', NaN]
Categories (4, object): ['stage_1', 'stage_2', 'stage_3', 'stage_4']

Unique values in survival_status:
[1.0, 0.0, NaN]
Categories (2, float64): [0.0, 1.0]



## Add EGFR positivity data

In [85]:
egfr = pd.read_csv(path + 'EGFR_pos.tsv', sep='\t')
egfr_pos = egfr['Patient ID'].str.lower().tolist()

kras = pd.read_csv(path + 'kras_pos.tsv', sep='\t')
kras_pos = kras['Patient ID'].str.lower().tolist()

tp53 = pd.read_csv(path + 'tp53_pos.tsv', sep='\t')
tp53_pos = tp53['Patient ID'].str.lower().tolist()

In [86]:
clinical_df['egfr_mutated'] = clinical_df['patient_id'].apply(lambda x: 'egfr_pos' if x in egfr_pos else 'egfr_neg')
clinical_df['kras_mutated'] = clinical_df['patient_id'].apply(lambda x: 'kras_pos' if x in kras_pos else 'kras_neg')
clinical_df['tp53_mutated'] = clinical_df['patient_id'].apply(lambda x: 'tp53_pos' if x in tp53_pos else 'tp53_neg')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinical_df['egfr_mutated'] = clinical_df['patient_id'].apply(lambda x: 'egfr_pos' if x in egfr_pos else 'egfr_neg')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clinical_df['kras_mutated'] = clinical_df['patient_id'].apply(lambda x: 'kras_pos' if x in kras_pos else 'kras_neg')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu

In [88]:
clinical_df['egfr_mutated'].value_counts()

egfr_mutated
egfr_neg    716
egfr_pos    273
Name: count, dtype: int64

In [89]:
clinical_df['kras_mutated'].value_counts()

kras_mutated
kras_neg    786
kras_pos    203
Name: count, dtype: int64

In [90]:
clinical_df['tp53_mutated'].value_counts()

tp53_mutated
tp53_neg    593
tp53_pos    396
Name: count, dtype: int64

## Save data

In [91]:
clinical_df.head()

clinicalAttributeId,patient_id,study_id,sex,age,race,histology,smoking_status,stage,survival_months,survival_status,egfr_mutated,kras_mutated,tp53_mutated
0,tcga-44-4112,nsclc_tcga_broad_2016,female,60.0,caucasian,luad,smoker,stage_1,26.564092,1.0,egfr_neg,kras_neg,tp53_neg
1,tcga-44-5644,nsclc_tcga_broad_2016,female,51.0,caucasian,luad,smoker,stage_1,28.372292,0.0,egfr_neg,kras_neg,tp53_pos
2,tcga-44-5645,nsclc_tcga_broad_2016,female,61.0,african_american,luad,smoker,stage_1,28.010652,0.0,egfr_pos,kras_neg,tp53_pos
3,tcga-44-5643,nsclc_tcga_broad_2016,male,53.0,african_american,luad,smoker,stage_3,33.303745,0.0,egfr_neg,kras_neg,tp53_neg
4,tcga-44-6144,nsclc_tcga_broad_2016,male,58.0,caucasian,luad,smoker,stage_1,23.769603,0.0,egfr_neg,kras_pos,tp53_pos


In [92]:
clinical_df.to_csv(path + 'clinical_data.csv', index=False, na_rep='NA')
