## ETL Process for CDC data

### Dependencies

In [1]:
import pandas as pd
import pymongo
from pymongo import MongoClient
# pd.set_option('display.max_rows', 1000)

### Extract Data

In [2]:
# Relative path of the CSV file that the Extract step reads
input_file = "../static/data/final_df.csv"

In [3]:
# Load the CSV into a DataFrame; low_memory=False makes pandas parse the file
# in one pass rather than in chunks, which avoids mixed-type column inference
raw_df = pd.read_csv(filepath_or_buffer=input_file, low_memory=False)

In [4]:
# Preview the first five rows of the raw data
raw_df.head(n=5)

Unnamed: 0,Sex,Year of diagnosis,"Race and origin recode (NHW, NHB, NHAIAN, NHAPI, Hispanic)",Survival months,Vital status recode (study cutoff used),Site recode ICD-O-3/WHO 2008,SEER Combined Summary Stage 2000 (2004-2017),Lymphoma - Ann Arbor Stage (1983-2015),COD to site recode,SEER cause-specific death classification,...,Number of Cores Positive Recode (2010+),Number of Cores Examined Recode (2010+),Number of Examined Pelvic Nodes Recode (2010+),Number of Positive Pelvic Nodes Recode (2010+),Separate Tumor Nodules Ipsilateral Lung Recode (2010+),Tumor Deposits Recode (2010+),Visceral and Parietal Pleural Invasion Recode (2010+),EOD Regional Nodes (2018+),Tumor Size Summary (2016+),Regional nodes examined (1988+)
0,Female,2003,Non-Hispanic White,14,Dead,NHL - Extranodal,Blank(s),Stage II,Breast,Alive or dead of other cause,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),99
1,Female,2018,Non-Hispanic White,11,Alive,Breast,Blank(s),Blank(s),Alive,Alive or dead of other cause,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),070,024,3
2,Female,2016,Non-Hispanic White,35,Alive,Ovary,Regional by direct extension only,Blank(s),Alive,Alive or dead of other cause,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),075,10
3,Male,2016,Non-Hispanic White,35,Alive,Melanoma of the Skin,Localized only,Blank(s),Alive,Alive or dead of other cause,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),008,0
4,Male,2016,Non-Hispanic Black,24,Alive,Other Endocrine including Thymus,Localized only,Blank(s),Alive,Alive or dead of other cause,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),040,0


In [5]:
# List the column labels available in the raw data
raw_df.columns

Index(['Sex', 'Year of diagnosis',
       'Race and origin recode (NHW, NHB, NHAIAN, NHAPI, Hispanic)',
       'Survival months', 'Vital status recode (study cutoff used)',
       'Site recode ICD-O-3/WHO 2008',
       'SEER Combined Summary Stage 2000 (2004-2017)',
       'Lymphoma - Ann Arbor Stage (1983-2015)', 'COD to site recode',
       'SEER cause-specific death classification', 'Survival months.1',
       'Vital status recode (study cutoff used).1',
       'First malignant primary indicator',
       'Total number of in situ/malignant tumors for patient',
       'Total number of benign/borderline tumors for patient',
       'Age recode with single ages and 85+', 'Race/ethnicity', 'Patient ID',
       'Rural-Urban Continuum Code',
       'Age Standard for Survival (15-44,45-54,55-64,65-74,75+)',
       'RX Summ--Surg Prim Site (1998+)', 'RX Summ--Scope Reg LN Sur (2003+)',
       'RX Summ--Surg Oth Reg/Dis (2003+)',
       'Reason no cancer-directed surgery', 'Breslow Thickness R

### Transform Data

In [6]:
# Give the columns used downstream short snake_case names;
# every column not listed here keeps its original label
renamed_df = raw_df.rename(
    {
        "Year of diagnosis": "diagnosis_year",
        "Race and origin recode (NHW, NHB, NHAIAN, NHAPI, Hispanic)": "race_origin",
        "Survival months": "survival_months_str",
        "Vital status recode (study cutoff used)": "status",
        "Site recode ICD-O-3/WHO 2008": "tumour_classification",
        "COD to site recode": "death_cause",
        "SEER cause-specific death classification": "death_classification",
        "Tumor Size Summary (2016+)": "tumor_size_str",
    },
    axis="columns",
)
renamed_df.head()

Unnamed: 0,Sex,diagnosis_year,race_origin,survival_months_str,status,tumour_classification,SEER Combined Summary Stage 2000 (2004-2017),Lymphoma - Ann Arbor Stage (1983-2015),death_cause,death_classification,...,Number of Cores Positive Recode (2010+),Number of Cores Examined Recode (2010+),Number of Examined Pelvic Nodes Recode (2010+),Number of Positive Pelvic Nodes Recode (2010+),Separate Tumor Nodules Ipsilateral Lung Recode (2010+),Tumor Deposits Recode (2010+),Visceral and Parietal Pleural Invasion Recode (2010+),EOD Regional Nodes (2018+),tumor_size_str,Regional nodes examined (1988+)
0,Female,2003,Non-Hispanic White,14,Dead,NHL - Extranodal,Blank(s),Stage II,Breast,Alive or dead of other cause,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),99
1,Female,2018,Non-Hispanic White,11,Alive,Breast,Blank(s),Blank(s),Alive,Alive or dead of other cause,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),070,024,3
2,Female,2016,Non-Hispanic White,35,Alive,Ovary,Regional by direct extension only,Blank(s),Alive,Alive or dead of other cause,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),075,10
3,Male,2016,Non-Hispanic White,35,Alive,Melanoma of the Skin,Localized only,Blank(s),Alive,Alive or dead of other cause,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),008,0
4,Male,2016,Non-Hispanic Black,24,Alive,Other Endocrine including Thymus,Localized only,Blank(s),Alive,Alive or dead of other cause,...,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),Blank(s),040,0


In [7]:
# Keep only the eight renamed columns that the rest of the notebook works with
selected_df = renamed_df.loc[
    :,
    [
        "diagnosis_year",
        "race_origin",
        "survival_months_str",
        "status",
        "tumour_classification",
        "death_cause",
        "death_classification",
        "tumor_size_str",
    ],
]
selected_df.head()

Unnamed: 0,diagnosis_year,race_origin,survival_months_str,status,tumour_classification,death_cause,death_classification,tumor_size_str
0,2003,Non-Hispanic White,14,Dead,NHL - Extranodal,Breast,Alive or dead of other cause,Blank(s)
1,2018,Non-Hispanic White,11,Alive,Breast,Alive,Alive or dead of other cause,024
2,2016,Non-Hispanic White,35,Alive,Ovary,Alive,Alive or dead of other cause,075
3,2016,Non-Hispanic White,35,Alive,Melanoma of the Skin,Alive,Alive or dead of other cause,008
4,2016,Non-Hispanic Black,24,Alive,Other Endocrine including Thymus,Alive,Alive or dead of other cause,040


In [8]:
# Inspect the dtype pandas inferred for each selected column
selected_df.dtypes

diagnosis_year            int64
race_origin              object
survival_months_str      object
status                   object
tumour_classification    object
death_cause              object
death_classification     object
tumor_size_str           object
dtype: object

In [9]:
# Count the rows per diagnosis year, most frequent first
selected_df.diagnosis_year.value_counts()

2017    426169
2018    424400
2016    419876
2003     16800
2002     16469
2001     16089
2000     15744
Name: diagnosis_year, dtype: int64

In [10]:
# Count the rows per race/origin group, most frequent first
selected_df.race_origin.value_counts()

Non-Hispanic White                            917102
Hispanic (All Races)                          153966
Non-Hispanic Black                            139808
Non-Hispanic Asian or Pacific Islander         95882
Non-Hispanic Unknown Race                      20977
Non-Hispanic American Indian/Alaska Native      7812
Name: race_origin, dtype: int64

In [11]:
# Count the rows per vital status, most frequent first
selected_df.status.value_counts()

Alive    992819
Dead     342728
Name: status, dtype: int64

In [12]:
# Count the rows per tumour classification, most frequent first
selected_df.tumour_classification.value_counts()

Breast                              206525
Prostate                            169436
Lung and Bronchus                   156449
NHL - Nodal                          75098
Melanoma of the Skin                 71834
                                     ...  
Other Lymphocytic Leukemia              74
Other Acute Leukemia                    72
Other Myeloid/Monocytic Leukemia        64
Pleura                                  64
Acute Monocytic Leukemia                25
Name: tumour_classification, Length: 80, dtype: int64

In [13]:
# Count the rows per recorded cause of death, most frequent first
selected_df.death_cause.value_counts()

Alive                                                 992819
Lung and Bronchus                                      70223
Non-Hodgkin Lymphoma                                   28329
Pancreas                                               25664
Miscellaneous Malignant Cancer                         22315
                                                       ...  
Other Male Genital Organs                                 34
Tuberculosis                                              29
Acute Monocytic Leukemia                                  21
Certain Conditions Originating in Perinatal Period         3
Syphilis                                                   1
Name: death_cause, Length: 94, dtype: int64

In [14]:
# Count the rows per death classification, most frequent first
selected_df.death_classification.value_counts()

Alive or dead of other cause             1066921
Dead (attributable to this cancer dx)     263913
Dead (missing/unknown COD)                  4611
N/A not seq 0-59                             102
Name: death_classification, dtype: int64

In [15]:
# Count the rows per raw tumor-size code (still text at this point), most frequent first
selected_df.tumor_size_str.value_counts()

999         460708
Blank(s)     65102
030          32045
020          30308
015          29494
             ...  
386              1
932              1
741              1
339              1
870              1
Name: tumor_size_str, Length: 486, dtype: int64

In [16]:
# Convert the tumor-size codes to integers and keep only rows whose code is at most 988.
# pd.to_numeric(..., errors='coerce') turns the 'Blank(s)' placeholder (and any other
# non-numeric token) into NaN, so no stand-in number is needed and an unexpected token
# cannot crash the integer cast.
# Codes above the limit (e.g. the very common '999') are excluded; presumably these are
# special/unknown-size codes — confirm against the SEER data dictionary.
MAX_TUMOR_SIZE_CODE = 988
tumor_size_num = pd.to_numeric(selected_df['tumor_size_str'], errors='coerce')
# NaN compares as False, so rows without a numeric size are dropped here as well
filter_ = tumor_size_num <= MAX_TUMOR_SIZE_CODE
cleaned_df = selected_df.loc[filter_].copy()
# Safe to cast: every value that passed the filter is a finite number
cleaned_df['tumor_size_int'] = tumor_size_num.loc[filter_].astype(int)
cleaned_df

Unnamed: 0,diagnosis_year,race_origin,survival_months_str,status,tumour_classification,death_cause,death_classification,tumor_size_str,tumor_size_int
1,2018,Non-Hispanic White,0011,Alive,Breast,Alive,Alive or dead of other cause,024,24
2,2016,Non-Hispanic White,0035,Alive,Ovary,Alive,Alive or dead of other cause,075,75
3,2016,Non-Hispanic White,0035,Alive,Melanoma of the Skin,Alive,Alive or dead of other cause,008,8
4,2016,Non-Hispanic Black,0024,Alive,Other Endocrine including Thymus,Alive,Alive or dead of other cause,040,40
5,2016,Non-Hispanic White,0028,Alive,Hodgkin - Nodal,Alive,Alive or dead of other cause,988,988
...,...,...,...,...,...,...,...,...,...
1335538,2018,Non-Hispanic Black,0000,Alive,Cecum,Alive,Alive or dead of other cause,060,60
1335540,2018,Non-Hispanic Black,0000,Alive,Splenic Flexure,Alive,Alive or dead of other cause,027,27
1335541,2018,Non-Hispanic Black,0001,Alive,Sigmoid Colon,Alive,Alive or dead of other cause,115,115
1335543,2018,Non-Hispanic Black,0000,Alive,Thyroid,Alive,Alive or dead of other cause,032,32


In [17]:
# Narrow the cleaned data to breast tumours diagnosed in 2018
cleaned_df = cleaned_df[
    (cleaned_df['diagnosis_year'] == 2018)
    & (cleaned_df['tumour_classification'] == 'Breast')
]
# Report how much of the selected data survives the filters
print(f"cleaned_df shape: {cleaned_df.shape}")
print(f"selected_df shape: {selected_df.shape}")
cleaned_df

cleaned_df shape: (65723, 9)
selected_df shape: (1335547, 8)


Unnamed: 0,diagnosis_year,race_origin,survival_months_str,status,tumour_classification,death_cause,death_classification,tumor_size_str,tumor_size_int
1,2018,Non-Hispanic White,0011,Alive,Breast,Alive,Alive or dead of other cause,024,24
19,2018,Non-Hispanic White,0006,Alive,Breast,Alive,Alive or dead of other cause,008,8
33,2018,Hispanic (All Races),0008,Alive,Breast,Alive,Alive or dead of other cause,020,20
34,2018,Non-Hispanic White,0001,Alive,Breast,Alive,Alive or dead of other cause,021,21
67,2018,Non-Hispanic White,0006,Alive,Breast,Alive,Alive or dead of other cause,005,5
...,...,...,...,...,...,...,...,...,...
1335513,2018,Non-Hispanic White,0009,Alive,Breast,Alive,Alive or dead of other cause,016,16
1335516,2018,Non-Hispanic White,0000,Alive,Breast,Alive,Alive or dead of other cause,006,6
1335522,2018,Non-Hispanic White,0003,Alive,Breast,Alive,Alive or dead of other cause,015,15
1335526,2018,Non-Hispanic White,0003,Alive,Breast,Alive,Alive or dead of other cause,015,15


In [18]:
# Count the rows per raw survival-months value (still text at this point)
cleaned_df.survival_months_str.value_counts()

0002       5899
0004       5703
0001       5377
0000       5195
0005       5103
0006       5097
0008       5082
0007       5020
0010       4916
0003       4769
0009       4762
0011       4536
2           404
4           373
10          364
1           363
8           358
11          356
7           351
0           347
9           346
6           343
5           333
3           318
Unknown       8
Name: survival_months_str, dtype: int64

In [19]:
# Swap the 'Unknown' survival marker for the numeric-looking placeholder '9999'
# so that every value in the column can be parsed as an integer afterwards
cleaned_df = cleaned_df.assign(
    survival_months_str=cleaned_df['survival_months_str'].replace({'Unknown': '9999'})
)
# Check the distribution again: 'Unknown' should now show up as '9999'
cleaned_df['survival_months_str'].value_counts() #.sort_values()

0002    5899
0004    5703
0001    5377
0000    5195
0005    5103
0006    5097
0008    5082
0007    5020
0010    4916
0003    4769
0009    4762
0011    4536
2        404
4        373
10       364
1        363
8        358
11       356
7        351
0        347
9        346
6        343
5        333
3        318
9999       8
Name: survival_months_str, dtype: int64

In [20]:
# Three steps in one chain:
#   1. left-pad the survival-months text with zeros to four characters
#   2. derive an integer column from the padded text
#   3. discard the rows that carry the 9999 placeholder
cleaned_df = (
    cleaned_df
    .assign(survival_months_str=lambda frame: frame['survival_months_str'].str.zfill(4))
    .assign(survival_months_int=lambda frame: frame['survival_months_str'].astype(int))
    .loc[lambda frame: frame['survival_months_int'] != 9999]
    .copy()
)
# Check the distribution again: only four-character values should remain
cleaned_df['survival_months_str'].value_counts() #.sort_values()

0002    6303
0004    6076
0001    5740
0000    5542
0008    5440
0006    5440
0005    5436
0007    5371
0010    5280
0009    5108
0003    5087
0011    4892
Name: survival_months_str, dtype: int64

In [21]:
# Group the cleaned rows by death classification
death_classification_group = cleaned_df.groupby(by="death_classification")

# Within each classification, count the rows per cause of death and hold the counts in a DataFrame
grouped_work_df = death_classification_group["death_cause"].value_counts().to_frame()
grouped_work_df

Unnamed: 0_level_0,Unnamed: 1_level_0,death_cause
death_classification,death_cause,Unnamed: 2_level_1
Alive or dead of other cause,Alive,64609
Alive or dead of other cause,Diseases of Heart,88
Alive or dead of other cause,Other Cause of Death,72
Alive or dead of other cause,Cerebrovascular Diseases,23
Alive or dead of other cause,Lung and Bronchus,20
Alive or dead of other cause,Diabetes Mellitus,18
Alive or dead of other cause,Chronic Obstructive Pulmonary Disease and Allied Cond,17
Alive or dead of other cause,Accidents and Adverse Effects,15
Alive or dead of other cause,Septicemia,13
Alive or dead of other cause,Pneumonia and Influenza,11


In [22]:
# Number of non-missing values per column; a column with fewer than the others has gaps
cleaned_df.notna().sum()

diagnosis_year           65715
race_origin              65715
survival_months_str      65715
status                   65715
tumour_classification    65715
death_cause              65715
death_classification     65715
tumor_size_str           65715
tumor_size_int           65715
survival_months_int      65715
dtype: int64

In [23]:
# Drop the text versions of the two measures and let the integer versions take over the plain names
cleaned_df = (
    cleaned_df
    .drop(columns=["survival_months_str", "tumor_size_str"])
    .rename(columns={
        "survival_months_int": "survival_months",
        "tumor_size_int": "tumor_size",
    })
)
cleaned_df.head()

Unnamed: 0,diagnosis_year,race_origin,status,tumour_classification,death_cause,death_classification,tumor_size,survival_months
1,2018,Non-Hispanic White,Alive,Breast,Alive,Alive or dead of other cause,24,11
19,2018,Non-Hispanic White,Alive,Breast,Alive,Alive or dead of other cause,8,6
33,2018,Hispanic (All Races),Alive,Breast,Alive,Alive or dead of other cause,20,8
34,2018,Non-Hispanic White,Alive,Breast,Alive,Alive or dead of other cause,21,1
67,2018,Non-Hispanic White,Alive,Breast,Alive,Alive or dead of other cause,5,6


In [24]:
# Number of non-missing values per column in the final frame
cleaned_df.notna().sum()

diagnosis_year           65715
race_origin              65715
status                   65715
tumour_classification    65715
death_cause              65715
death_classification     65715
tumor_size               65715
survival_months          65715
dtype: int64

### Load Data

Load the cleaned records into a MongoDB collection.

In [25]:
import sys
from pymongo import MongoClient

# Put the parent directory first on the import path so the `private` module can be imported
sys.path.insert(0, '../')
from private import usr, pwd, cluster, db_name # private information, cannot be shared

# Build the SRV connection string from the private settings and open the client
# NOTE(review): usr and pwd are interpolated verbatim; PyMongo expects reserved characters
# in credentials to be percent-escaped — confirm the values in private.py already are
MONGO_URI = f"mongodb+srv://{usr}:{pwd}@{cluster}/{db_name}"
client = MongoClient(MONGO_URI)

# Handles to the 'cancer_db' database and its 'seer_data' collection
db = client['cancer_db']
collection = db['seer_data']

In [26]:
# Renumber the rows from zero (the old index is discarded), then turn each row
# into a dict keyed by column name; data_dict is the resulting list of dicts
new_df = cleaned_df.reset_index(drop=True)
data_dict = new_df.to_dict(orient="records")
new_df

Unnamed: 0,diagnosis_year,race_origin,status,tumour_classification,death_cause,death_classification,tumor_size,survival_months
0,2018,Non-Hispanic White,Alive,Breast,Alive,Alive or dead of other cause,24,11
1,2018,Non-Hispanic White,Alive,Breast,Alive,Alive or dead of other cause,8,6
2,2018,Hispanic (All Races),Alive,Breast,Alive,Alive or dead of other cause,20,8
3,2018,Non-Hispanic White,Alive,Breast,Alive,Alive or dead of other cause,21,1
4,2018,Non-Hispanic White,Alive,Breast,Alive,Alive or dead of other cause,5,6
...,...,...,...,...,...,...,...,...
65710,2018,Non-Hispanic White,Alive,Breast,Alive,Alive or dead of other cause,16,9
65711,2018,Non-Hispanic White,Alive,Breast,Alive,Alive or dead of other cause,6,0
65712,2018,Non-Hispanic White,Alive,Breast,Alive,Alive or dead of other cause,15,3
65713,2018,Non-Hispanic White,Alive,Breast,Alive,Alive or dead of other cause,15,3


In [27]:
# Bulk-insert every record into the 'seer_data' collection
# NOTE(review): nothing here guards against duplicates — running the load again presumably
# inserts the same records a second time; confirm whether the collection should be emptied first
collection.insert_many(data_dict)

<pymongo.results.InsertManyResult at 0x1d821c11280>