In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import time
import random

### Import Dataset 1
Using Pandas to import the CSV file from data.gov with information on Chronic Disease Indicators, which was created March 9th, 2024 and had the metadata updated on February 3rd, 2025.

Then displaying the dataset, and using .dtypes to show the data types in order to create the data dictionary for the project proposal. 

In [2]:
chronic_data = pd.read_csv('U.S._Chronic_Disease_Indicators.csv')

In [3]:
display(chronic_data)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,TopicID,QuestionID,ResponseID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2020,2020,US,United States,BRFSS,Health Status,Recent activity limitation among adults,,Number,Age-adjusted Mean,...,HEA,HEA04,,AGEADJMEAN,SEX,SEXF,,,,
1,2015,2019,AR,Arkansas,US Cancer DVT,Cancer,"Invasive cancer (all sites combined), incidence",,Number,Number,...,CAN,CAN07,,NMBR,SEX,SEXM,,,,
2,2015,2019,CA,California,US Cancer DVT,Cancer,"Cervical cancer mortality among all females, u...",,Number,Number,...,CAN,CAN03,,NMBR,OVERALL,OVR,,,,
3,2015,2019,CO,Colorado,US Cancer DVT,Cancer,"Invasive cancer (all sites combined), incidence",,Number,Number,...,CAN,CAN07,,NMBR,RACE,HIS,,,,
4,2015,2019,GA,Georgia,US Cancer DVT,Cancer,"Prostate cancer mortality among all males, und...",,Number,Number,...,CAN,CAN05,,NMBR,RACE,WHT,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309210,2022,2022,VI,Virgin Islands,BRFSS,Tobacco,Quit attempts in the past year among adult cur...,,%,Age-adjusted Prevalence,...,TOB,TOB06,,AGEADJPREV,RACE,AIAN,,,,
309211,2022,2022,WV,West Virginia,BRFSS,Chronic Obstructive Pulmonary Disease,Chronic obstructive pulmonary disease among ad...,,%,Crude Prevalence,...,COPD,COPD01,,CRDPREV,OVERALL,OVR,,,,
309212,2022,2022,WI,Wisconsin,BRFSS,Immunization,Pneumococcal vaccination among adults aged 65 ...,,%,Crude Prevalence,...,IMM,IMM04,,CRDPREV,RACE,BLK,,,,
309213,2022,2022,VT,Vermont,BRFSS,Social Determinants of Health,Lack of health insurance among adults aged 18-...,,%,Crude Prevalence,...,SDOH,SDH09,,CRDPREV,RACE,HAPI,,,,


In [24]:
print(chronic_data.dtypes)

YearStart                      int64
YearEnd                        int64
LocationAbbr                  object
LocationDesc                  object
DataSource                    object
Topic                         object
Question                      object
Response                     float64
DataValueUnit                 object
DataValueType                 object
DataValue                    float64
DataValueAlt                 float64
DataValueFootnoteSymbol       object
DataValueFootnote             object
LowConfidenceLimit           float64
HighConfidenceLimit          float64
StratificationCategory1       object
Stratification1               object
StratificationCategory2      float64
Stratification2              float64
StratificationCategory3      float64
Stratification3              float64
Geolocation                   object
LocationID                     int64
TopicID                       object
QuestionID                    object
ResponseID                   float64
D

In [28]:
print("Missing Data Summary: \n", "-" *50)
print(chronic_data.isna().sum())
print('\n')
print(f'Total missing data: {chronic_data.isna().sum().sum()}')

Missing Data Summary: 
 --------------------------------------------------
YearStart                         0
YearEnd                           0
LocationAbbr                      0
LocationDesc                      0
DataSource                        0
Topic                             0
Question                          0
Response                     309215
DataValueUnit                     0
DataValueType                     0
DataValue                    100019
DataValueAlt                 100019
DataValueFootnoteSymbol      207499
DataValueFootnote            207499
LowConfidenceLimit           120330
HighConfidenceLimit          120325
StratificationCategory1           0
Stratification1                   0
StratificationCategory2      309215
Stratification2              309215
StratificationCategory3      309215
Stratification3              309215
Geolocation                    5763
LocationID                        0
TopicID                           0
QuestionID               

Dropping the missing values in DataValue so that it can be used for analysis without error. 

In [42]:
clean_chronic_data = chronic_data.dropna(subset = ['DataValue','LowConfidenceLimit','HighConfidenceLimit'])

In [44]:
print("Missing Data Summary: \n", "-" *50)
print(clean_chronic_data.isna().sum())
print('\n')
print(f'Total missing data: {clean_chronic_data.isna().sum().sum()}')

Missing Data Summary: 
 --------------------------------------------------
YearStart                         0
YearEnd                           0
LocationAbbr                      0
LocationDesc                      0
DataSource                        0
Topic                             0
Question                          0
Response                     188885
DataValueUnit                     0
DataValueType                     0
DataValue                         0
DataValueAlt                      0
DataValueFootnoteSymbol      187361
DataValueFootnote            187361
LowConfidenceLimit                0
HighConfidenceLimit               0
StratificationCategory1           0
Stratification1                   0
StratificationCategory2      188885
Stratification2              188885
StratificationCategory3      188885
Stratification3              188885
Geolocation                    4537
LocationID                        0
TopicID                           0
QuestionID               

Dropping columns with exclusively missing values. 

In [46]:
clean_chronic_data = clean_chronic_data.dropna(axis=1)
display(clean_chronic_data)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,...,LowConfidenceLimit,HighConfidenceLimit,StratificationCategory1,Stratification1,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
0,2019,2019,AR,Arkansas,BRFSS,Diabetes,Diabetes among adults,%,Crude Prevalence,13.6,...,12.1,15.4,Sex,Male,5,DIA,DIA01,CRDPREV,SEX,SEXM
1,2019,2019,ID,Idaho,BRFSS,Diabetes,Diabetes among adults,%,Crude Prevalence,10.6,...,9.1,12.2,Sex,Male,16,DIA,DIA01,CRDPREV,SEX,SEXM
4,2019,2019,IA,Iowa,BRFSS,Asthma,Current asthma among adults,%,Crude Prevalence,10.3,...,9.1,11.7,Age,Age 18-44,19,AST,AST02,CRDPREV,AGE,AGE1844
6,2019,2019,IA,Iowa,BRFSS,Health Status,Recent activity limitation among adults,Number,Crude Mean,2.3,...,2.1,2.5,Sex,Female,19,HEA,HEA04,CRDMEAN,SEX,SEXF
7,2019,2019,IA,Iowa,BRFSS,Mental Health,Depression among adults,%,Crude Prevalence,31.0,...,20.6,43.7,Race/Ethnicity,"Multiracial, non-Hispanic",19,MEN,MEN02,CRDPREV,RACE,MRC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309208,2022,2022,VT,Vermont,BRFSS,Sleep,Short sleep duration among adults,%,Crude Prevalence,26.5,...,16.8,39.3,Race/Ethnicity,"Asian, non-Hispanic",50,SLEP,SLP03,CRDPREV,RACE,ASN
309209,2022,2022,VI,Virgin Islands,BRFSS,Immunization,Influenza vaccination among adults,%,Crude Prevalence,34.2,...,24.2,45.8,Age,Age >=65,78,IMM,IMM01,CRDPREV,AGE,AGE65P
309211,2022,2022,WV,West Virginia,BRFSS,Chronic Obstructive Pulmonary Disease,Chronic obstructive pulmonary disease among ad...,%,Crude Prevalence,14.0,...,12.8,15.2,Overall,Overall,54,COPD,COPD01,CRDPREV,OVERALL,OVR
309212,2022,2022,WI,Wisconsin,BRFSS,Immunization,Pneumococcal vaccination among adults aged 65 ...,%,Crude Prevalence,64.2,...,52.2,74.6,Race/Ethnicity,"Black, non-Hispanic",55,IMM,IMM04,CRDPREV,RACE,BLK


In [48]:
print(clean_chronic_data.dtypes)

YearStart                      int64
YearEnd                        int64
LocationAbbr                  object
LocationDesc                  object
DataSource                    object
Topic                         object
Question                      object
DataValueUnit                 object
DataValueType                 object
DataValue                    float64
DataValueAlt                 float64
LowConfidenceLimit           float64
HighConfidenceLimit          float64
StratificationCategory1       object
Stratification1               object
LocationID                     int64
TopicID                       object
QuestionID                    object
DataValueTypeID               object
StratificationCategoryID1     object
StratificationID1             object
dtype: object
