# NCI GDC Data Analysis

Datasets were pulled from https://portal.gdc.cancer.gov/analysis_page?app=CohortBuilder&tab=general_diagnosis 
They were filtered for Breast Cancer before being pulled. Need to check with Arjita on the exact process used.

## Load Packages

In [163]:
import pandas as pd
import plotly.express as px
import numpy as np
import requests
from io import StringIO

# Opt in to pandas' future behavior of not silently downcasting dtypes
# (relevant to operations such as .replace() used later in this notebook).
pd.set_option('future.no_silent_downcasting', True)

## Import Data from Github

Pull in gene dataset. 

In [166]:
# Download the tab-separated gene table from the project's GitHub repository.
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(
    'https://raw.githubusercontent.com/aaditya0106/cancer-dashboard/main/Data/HierCluster.2024-11-19.tsv',
    timeout=30,
)

# Fail loudly: if the download did not succeed, `gene` would otherwise be left
# undefined (or stale from an earlier run) and later cells would break confusingly.
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch data: {response.status_code}")

gene = pd.read_csv(StringIO(response.text), sep='\t')
print("Data loaded successfully!")

Data loaded successfully!


Pull in clinical dataset.

In [168]:
# Download the tab-separated clinical table from the project's GitHub repository.
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(
    'https://raw.githubusercontent.com/aaditya0106/cancer-dashboard/main/Data/clinical.tsv',
    timeout=30,
)

# Fail loudly: if the download did not succeed, `clinical` would otherwise be left
# undefined (or stale from an earlier run) and later cells would break confusingly.
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch data: {response.status_code}")

clinical = pd.read_csv(StringIO(response.text), sep='\t')
print("Data loaded successfully!")

Data loaded successfully!


## Basic gene data exploration

In [170]:
# Preview the first rows of the gene table (head() defaults to five rows).
gene.head()

Unnamed: 0,Case,ANO1,CTTN,GAB2,TSKU,PEG10,TSPAN12,VSIG2,INHBB,CLU,...,DSG2,PGGHG,MYH14,PTPRU,CEP170B,TRPM4,AGRN,LAMA5,SLC17A9,SMIM24
0,TCGA-AC-A2QJ,-0.850784,-0.06779,-0.373197,0.353471,-0.424534,-0.627105,-0.22579,-0.428333,-0.396251,...,-1.360506,-0.405373,-1.069204,-0.6424,-0.293741,-0.317442,1.454789,-0.886521,2.645691,0.438836
1,TCGA-FA-A7DS,-0.903327,-0.984412,-0.434487,-0.533308,-0.510474,-0.649774,-0.417569,-0.59152,-0.41501,...,-1.479773,-0.243668,-1.159843,-1.03882,-1.792239,2.025728,-1.518411,-0.80108,4.244228,-0.274578
2,TCGA-A2-A4S1,-0.749352,-0.266099,-0.379331,0.198941,-0.155934,-0.53095,-0.325875,-0.373839,-0.306217,...,-1.429917,0.136463,-1.169063,-0.786902,-0.390244,-0.130008,0.732697,-0.683839,0.121208,-0.107634
3,TCGA-AR-A5QQ,0.292535,-0.049525,-0.475821,0.243683,-0.502936,-0.565909,-0.318526,-0.471878,-0.377541,...,-0.485598,-0.440984,-0.992049,-0.228702,-0.04655,-1.02694,1.468554,-0.556163,-0.100514,-0.221282
4,TCGA-E9-A5FL,-0.7871,0.226868,-0.429163,0.435614,-0.384482,-0.50795,0.562074,-0.48045,-0.251577,...,-0.915629,1.06832,-0.543074,-0.115328,0.107458,-0.455221,0.150628,0.788012,-0.333208,1.21065


In [171]:
# Dimensions of the gene table as (rows, columns).
gene.shape

(1000, 1001)

In [172]:
# Data type of every column in the gene table.
gene.dtypes

Case        object
ANO1       float64
CTTN       float64
GAB2       float64
TSKU       float64
            ...   
TRPM4      float64
AGRN       float64
LAMA5      float64
SLC17A9    float64
SMIM24     float64
Length: 1001, dtype: object

Determine if any values are Null. There are no null values and there are no duplicate samples. 

In [174]:
# Per column: does it contain at least one missing value? Then tally the
# True/False answers across all columns.
gene.isna().any().value_counts()

False    1001
Name: count, dtype: int64

In [175]:
# Number of rows whose Case value repeats an earlier row's Case.
gene.duplicated(subset='Case').sum()

0

## Basic clinical data exploration

clinical.head()

In [178]:
# Column labels of the clinical table.
clinical.columns

Index(['case_id', 'case_submitter_id', 'project_id', 'age_at_index',
       'age_is_obfuscated', 'cause_of_death', 'cause_of_death_source',
       'country_of_birth', 'country_of_residence_at_enrollment',
       'days_to_birth',
       ...
       'treatment_dose_units', 'treatment_duration', 'treatment_effect',
       'treatment_effect_indicator', 'treatment_frequency',
       'treatment_intent_type', 'treatment_or_therapy', 'treatment_outcome',
       'treatment_outcome_duration', 'treatment_type'],
      dtype='object', length=219)

In [179]:
# Dimensions of the clinical table as (rows, columns).
clinical.shape

(5268, 219)

Make sure tissue of origin is breast related. These are the types of tissues currently included. 

In [181]:
# Tally how many clinical rows carry each tissue_or_organ_of_origin label
summary_table = clinical.loc[:, 'tissue_or_organ_of_origin'].value_counts()

# Show the tally, most frequent label first
print(summary_table)

tissue_or_organ_of_origin
Breast, NOS                       5175
Not Reported                        72
Lower-inner quadrant of breast       6
Upper-inner quadrant of breast       4
Upper-outer quadrant of breast       4
Overlapping lesion of breast         4
Lower-outer quadrant of breast       2
'--                                  1
Name: count, dtype: int64


Remove rows where tissue or organ of origin is Not Reported or '--

In [183]:
# Keep only rows whose tissue label is an actual tissue, i.e. drop the
# 'Not Reported' and '-- placeholder values.
clinicalBreast = clinical.loc[
    lambda frame: ~frame['tissue_or_organ_of_origin'].isin(['Not Reported', "'--"])
]

# Confirm which tissue labels remain after filtering
print(clinicalBreast['tissue_or_organ_of_origin'].value_counts())


tissue_or_organ_of_origin
Breast, NOS                       5175
Lower-inner quadrant of breast       6
Upper-inner quadrant of breast       4
Upper-outer quadrant of breast       4
Overlapping lesion of breast         4
Lower-outer quadrant of breast       2
Name: count, dtype: int64


In [184]:
# Preview the first rows of the filtered clinical table.
clinicalBreast.head()

Unnamed: 0,case_id,case_submitter_id,project_id,age_at_index,age_is_obfuscated,cause_of_death,cause_of_death_source,country_of_birth,country_of_residence_at_enrollment,days_to_birth,...,treatment_dose_units,treatment_duration,treatment_effect,treatment_effect_indicator,treatment_frequency,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_outcome_duration,treatment_type
0,00016c8f-a0be-4319-9c42-4f3bcd90ac92,AD1602,FM-AD,'--,'--,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
1,001cef41-ff86-4d3f-a140-a647ac4b10a1,TCGA-E2-A1IU,TCGA-BRCA,60,'--,'--,'--,'--,'--,-22279,...,'--,'--,'--,'--,'--,'--,no,'--,'--,"Radiation Therapy, NOS"
2,001cef41-ff86-4d3f-a140-a647ac4b10a1,TCGA-E2-A1IU,TCGA-BRCA,60,'--,'--,'--,'--,'--,-22279,...,'--,'--,'--,'--,'--,'--,yes,'--,'--,"Pharmaceutical Therapy, NOS"
3,002cdb51-32c0-40be-b92f-60961f091bdf,AD16494,FM-AD,'--,'--,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--
4,0045349c-69d9-4306-a403-c9c1fa836644,TCGA-A1-A0SB,TCGA-BRCA,70,'--,'--,'--,'--,'--,-25833,...,'--,'--,'--,'--,'--,'--,not reported,'--,'--,"Radiation Therapy, NOS"


Check that 73 invalid rows were removed

In [186]:
# Dimensions of the filtered clinical table as (rows, columns).
clinicalBreast.shape

(5195, 219)

Get number of duplicate case_submitter_id numbers. These are patients with multiple rows of data.

In [188]:
# Number of rows whose case_submitter_id repeats an earlier row's value.
clinicalBreast.duplicated(subset='case_submitter_id').sum()

1148

Get a table of the duplicates

In [190]:
# Collect every row belonging to a case_submitter_id that appears more than
# once; keep=False flags all occurrences rather than only the repeats.
all_duplicates = clinicalBreast.loc[
    lambda frame: frame['case_submitter_id'].duplicated(keep=False)
]

## Merge gene and clinicalBreast

In [192]:
# Inner-join gene expression onto the clinical rows by case identifier.
# Clinical data can hold several rows per case, so each gene row may be
# repeated in the result. validate='one_to_many' makes pandas raise a
# MergeError if `Case` is ever not unique in `gene`, which would otherwise
# silently pair one case with more than one expression profile.
geneClinical = gene.merge(
    clinicalBreast,
    left_on='Case',
    right_on='case_submitter_id',
    how='inner',
    validate='one_to_many',
)

In [193]:
# Dimensions of the merged table as (rows, columns).
geneClinical.shape

(1854, 1220)

Check results for a duplicate case_submitter_id. We want to make sure each case_submitter_id/case always gets the same gene expression data

In [195]:
# Number of merged rows whose case_submitter_id repeats an earlier row's value.
geneClinical.duplicated(subset='case_submitter_id').sum()

855

Get a table of the duplicates

In [197]:
# Collect every merged row belonging to a case_submitter_id that appears more
# than once; keep=False flags all occurrences rather than only the repeats.
all_duplicatesMerge = geneClinical.loc[
    lambda frame: frame['case_submitter_id'].duplicated(keep=False)
]

In [198]:
# Preview the first rows of the duplicated-case subset of the merged table.
all_duplicatesMerge.head()

Unnamed: 0,Case,ANO1,CTTN,GAB2,TSKU,PEG10,TSPAN12,VSIG2,INHBB,CLU,...,treatment_dose_units,treatment_duration,treatment_effect,treatment_effect_indicator,treatment_frequency,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_outcome_duration,treatment_type
0,TCGA-AC-A2QJ,-0.850784,-0.06779,-0.373197,0.353471,-0.424534,-0.627105,-0.22579,-0.428333,-0.396251,...,'--,'--,'--,'--,'--,'--,yes,'--,'--,"Radiation Therapy, NOS"
1,TCGA-AC-A2QJ,-0.850784,-0.06779,-0.373197,0.353471,-0.424534,-0.627105,-0.22579,-0.428333,-0.396251,...,'--,'--,'--,'--,'--,'--,yes,'--,'--,"Pharmaceutical Therapy, NOS"
2,TCGA-FA-A7DS,-0.903327,-0.984412,-0.434487,-0.533308,-0.510474,-0.649774,-0.417569,-0.59152,-0.41501,...,'--,'--,'--,'--,'--,'--,no,'--,'--,"Radiation Therapy, NOS"
3,TCGA-FA-A7DS,-0.903327,-0.984412,-0.434487,-0.533308,-0.510474,-0.649774,-0.417569,-0.59152,-0.41501,...,'--,'--,'--,'--,'--,'--,yes,'--,'--,"Pharmaceutical Therapy, NOS"
4,TCGA-A2-A4S1,-0.749352,-0.266099,-0.379331,0.198941,-0.155934,-0.53095,-0.325875,-0.373839,-0.306217,...,'--,'--,'--,'--,'--,'--,yes,'--,'--,"Radiation Therapy, NOS"


If I drop the clinical columns and then drop duplicates, do I get a table 1000 rows long?

In [200]:
# The merge places the columns of `gene` (the left frame) first, so the gene
# portion is the leading gene.shape[1] columns. Deriving the width from `gene`
# avoids a hard-coded 1001 that would silently go wrong if the gene table changes.
short = geneClinical.iloc[:, :gene.shape[1]]
short.shape

(1854, 1001)

In [201]:
# Column labels of the gene-only slice of the merged table.
short.columns

Index(['Case', 'ANO1', 'CTTN', 'GAB2', 'TSKU', 'PEG10', 'TSPAN12', 'VSIG2',
       'INHBB', 'CLU',
       ...
       'DSG2', 'PGGHG', 'MYH14', 'PTPRU', 'CEP170B', 'TRPM4', 'AGRN', 'LAMA5',
       'SLC17A9', 'SMIM24'],
      dtype='object', length=1001)

In [202]:
# Collapse rows that are identical in every column, keeping the first of each
# group, then report the resulting dimensions.
shortNoDup = short.drop_duplicates(keep='first')
shortNoDup.shape

(999, 1001)

I have fewer than the 1000 samples I started with. Check what happened to the missing sample.

In [204]:
# Flag gene rows whose Case is absent from shortNoDup, then take their index labels
missing_case_mask = ~gene['Case'].isin(shortNoDup['Case'])
indices_not_in_shortNoDup = gene.loc[missing_case_mask].index

# Show which index labels dropped out
print(indices_not_in_shortNoDup)


Index([697], dtype='int64')


In [205]:
# Show the gene rows that have no case in the merged dataset.
# `indices_not_in_shortNoDup` holds index *labels* taken from `gene`, so select
# with .loc. The positional .iloc only returns the same rows while `gene` keeps
# its default 0..n-1 index.
gene.loc[indices_not_in_shortNoDup, :]

Unnamed: 0,Case,ANO1,CTTN,GAB2,TSKU,PEG10,TSPAN12,VSIG2,INHBB,CLU,...,DSG2,PGGHG,MYH14,PTPRU,CEP170B,TRPM4,AGRN,LAMA5,SLC17A9,SMIM24
697,TCGA-BH-A0B2,-0.335215,-0.533468,-0.303364,-0.265946,-0.48188,-0.443199,-0.26824,-0.563447,-0.012233,...,-0.189918,-0.225109,-0.636121,-0.661413,0.303512,-0.515584,0.059583,-0.343663,-0.444788,0.077625


In [206]:
# Look up the case in the gene table.
gene.loc[gene['Case'].eq('TCGA-BH-A0B2')]

Unnamed: 0,Case,ANO1,CTTN,GAB2,TSKU,PEG10,TSPAN12,VSIG2,INHBB,CLU,...,DSG2,PGGHG,MYH14,PTPRU,CEP170B,TRPM4,AGRN,LAMA5,SLC17A9,SMIM24
697,TCGA-BH-A0B2,-0.335215,-0.533468,-0.303364,-0.265946,-0.48188,-0.443199,-0.26824,-0.563447,-0.012233,...,-0.189918,-0.225109,-0.636121,-0.661413,0.303512,-0.515584,0.059583,-0.343663,-0.444788,0.077625


In [207]:
# Look up the same case in the filtered clinical table.
clinicalBreast.loc[clinicalBreast['case_submitter_id'].eq('TCGA-BH-A0B2')]

Unnamed: 0,case_id,case_submitter_id,project_id,age_at_index,age_is_obfuscated,cause_of_death,cause_of_death_source,country_of_birth,country_of_residence_at_enrollment,days_to_birth,...,treatment_dose_units,treatment_duration,treatment_effect,treatment_effect_indicator,treatment_frequency,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_outcome_duration,treatment_type


In [208]:
# Look up the same case in the unfiltered clinical table.
clinical.loc[clinical['case_submitter_id'].eq('TCGA-BH-A0B2')]

Unnamed: 0,case_id,case_submitter_id,project_id,age_at_index,age_is_obfuscated,cause_of_death,cause_of_death_source,country_of_birth,country_of_residence_at_enrollment,days_to_birth,...,treatment_dose_units,treatment_duration,treatment_effect,treatment_effect_indicator,treatment_frequency,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_outcome_duration,treatment_type
1810,57a1604c-60b7-4b30-a75e-f70939532c5c,TCGA-BH-A0B2,TCGA-BRCA,'--,'--,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--


## Explore the merged data

In [210]:
# Convert the '-- placeholder into a real missing value so pandas null
# handling can see it. Use np.nan: the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
geneClinical2 = geneClinical.replace("'--", np.nan)

In [211]:
# Count the missing values in each column
null_counts = geneClinical2.isna().sum()


In [212]:
# Keep a column only if it has at most MAX_NULLS_PER_COLUMN missing values.
# The threshold is an absolute row count, not a percentage.
MAX_NULLS_PER_COLUMN = 1800
columns_to_keep = null_counts[null_counts <= MAX_NULLS_PER_COLUMN].index

# Select only the columns to keep
filtered_geneClinical = geneClinical2[columns_to_keep]

# Sort the removed names so the printed list is the same on every run
# (iteration order of a set of strings is not stable across sessions).
removed_columns = sorted(set(geneClinical2.columns) - set(filtered_geneClinical.columns))
print(f"Columns removed: {removed_columns}")
print(f"Filtered DataFrame shape: {filtered_geneClinical.shape}")


Columns removed: {'clark_level', 'tumor_stage', 'iss_stage', 'breslow_thickness', 'treatment_dose', 'metastasis_at_diagnosis_site', 'double_expressor_lymphoma', 'clinical_trial_indicator', 'route_of_administration', 'melanoma_known_primary', 'therapeutic_level_achieved', 'gleason_grade_tertiary', 'ann_arbor_clinical_stage', 'lymph_node_involved_site', 'secondary_gleason_grade', 'regimen_or_line_of_therapy', 'margin_distance', 'uicc_pathologic_stage', 'lymphatic_invasion_present', 'ajcc_clinical_stage', 'anaplasia_present_type', 'premature_at_birth', 'uicc_pathologic_m', 'tumor_of_origin', 'mitotic_count', 'pretreatment', 'prescribed_dose_units', 'uicc_pathologic_n', 'therapeutic_agents', 'inrg_stage', 'treatment_effect', 'enneking_msts_tumor_site', 'treatment_duration', 'treatment_outcome', 'eln_risk_classification', 'treatment_anatomic_sites', 'international_prognostic_index', 'therapeutic_target_level', 'best_overall_response', 'number_of_fractions', 'percent_tumor_invasion', 'medull

There are 1854 rows. Removing the columns with more than 1800 nulls dropped 184 columns. Now check whether any of the remaining columns have a high percentage of nulls.

In [214]:
# Missing-value count per remaining column, worst offenders first.
filtered_geneClinical.isna().sum().sort_values(ascending=False)

year_of_death                  1690
days_to_death                  1615
ajcc_staging_system_edition     391
days_to_last_follow_up          332
ajcc_pathologic_stage           180
                               ... 
COL7A1                            0
ITGB4                             0
RGS2                              0
IGSF3                             0
THBD                              0
Length: 1036, dtype: int64

I am going to remove year_of_death and days_to_death, as these are at least 87% null. After those two, the number of nulls drops sharply.

In [216]:
# Remove the two mostly-empty death-related columns
filtered_geneClinical2 = filtered_geneClinical.drop(
    columns=['year_of_death', 'days_to_death']
)

# Report the dimensions after the drop
print(filtered_geneClinical2.shape)


(1854, 1034)


In [217]:
## Output to csv to plot in Tableau

In [253]:
# Write the cleaned table for plotting in Tableau. index=False leaves out the
# row index, which here is only the positional row number produced by the
# merge and would otherwise appear as an extra unnamed first column.
filtered_geneClinical2.to_csv('nciClean.csv', index=False)