In [66]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import time
import random

### Import Dataset 1
Using Pandas to import the CSV file from data.gov with information on Chronic Disease Indicators, which was created March 9th, 2024 and had the metadata updated on February 3rd, 2025.

Then displaying the dataset, and using .dtypes to show the data types in order to create the data dictionary for the project proposal. 

In [69]:
chronic_data = pd.read_csv('U.S._Chronic_Disease_Indicators.csv')

In [70]:
display(chronic_data)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,TopicID,QuestionID,ResponseID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2019,2019,AR,Arkansas,BRFSS,Diabetes,Diabetes among adults,,%,Crude Prevalence,...,DIA,DIA01,,CRDPREV,SEX,SEXM,,,,
1,2019,2019,ID,Idaho,BRFSS,Diabetes,Diabetes among adults,,%,Crude Prevalence,...,DIA,DIA01,,CRDPREV,SEX,SEXM,,,,
2,2019,2019,IN,Indiana,YRBSS,Sleep,Short sleep duration among high school students,,%,Crude Prevalence,...,SLEP,SLP02,,CRDPREV,GRADE,GRD12,,,,
3,2019,2019,IA,Iowa,NVSS,Asthma,"Asthma mortality among all people, underlying ...",,Number,Number,...,AST,AST01,,NMBR,OVERALL,OVR,,,,
4,2019,2019,IA,Iowa,BRFSS,Asthma,Current asthma among adults,,%,Crude Prevalence,...,AST,AST02,,CRDPREV,AGE,AGE1844,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309210,2022,2022,VI,Virgin Islands,BRFSS,Tobacco,Quit attempts in the past year among adult cur...,,%,Age-adjusted Prevalence,...,TOB,TOB06,,AGEADJPREV,RACE,AIAN,,,,
309211,2022,2022,WV,West Virginia,BRFSS,Chronic Obstructive Pulmonary Disease,Chronic obstructive pulmonary disease among ad...,,%,Crude Prevalence,...,COPD,COPD01,,CRDPREV,OVERALL,OVR,,,,
309212,2022,2022,WI,Wisconsin,BRFSS,Immunization,Pneumococcal vaccination among adults aged 65 ...,,%,Crude Prevalence,...,IMM,IMM04,,CRDPREV,RACE,BLK,,,,
309213,2022,2022,VT,Vermont,BRFSS,Social Determinants of Health,Lack of health insurance among adults aged 18-...,,%,Crude Prevalence,...,SDOH,SDH09,,CRDPREV,RACE,HAPI,,,,


In [72]:
print(chronic_data.dtypes)

YearStart                      int64
YearEnd                        int64
LocationAbbr                  object
LocationDesc                  object
DataSource                    object
Topic                         object
Question                      object
Response                     float64
DataValueUnit                 object
DataValueType                 object
DataValue                    float64
DataValueAlt                 float64
DataValueFootnoteSymbol       object
DataValueFootnote             object
LowConfidenceLimit           float64
HighConfidenceLimit          float64
StratificationCategory1       object
Stratification1               object
StratificationCategory2      float64
Stratification2              float64
StratificationCategory3      float64
Stratification3              float64
Geolocation                   object
LocationID                     int64
TopicID                       object
QuestionID                    object
ResponseID                   float64
D

In [75]:
print("Missing Data Summary: \n", "-" *50)
print(chronic_data.isna().sum())
print('\n')
print(f'Total missing data: {chronic_data.isna().sum().sum()}')

Missing Data Summary: 
 --------------------------------------------------
YearStart                         0
YearEnd                           0
LocationAbbr                      0
LocationDesc                      0
DataSource                        0
Topic                             0
Question                          0
Response                     309215
DataValueUnit                     0
DataValueType                     0
DataValue                    100019
DataValueAlt                 100019
DataValueFootnoteSymbol      207499
DataValueFootnote            207499
LowConfidenceLimit           120330
HighConfidenceLimit          120325
StratificationCategory1           0
Stratification1                   0
StratificationCategory2      309215
Stratification2              309215
StratificationCategory3      309215
Stratification3              309215
Geolocation                    5763
LocationID                        0
TopicID                           0
QuestionID               

Dropping the missing values in DataValue so that it can be used for analysis without error. 

In [78]:
clean_chronic_data = chronic_data.dropna(subset = ['DataValue','LowConfidenceLimit','HighConfidenceLimit'])

In [80]:
print("Missing Data Summary: \n", "-" *50)
print(clean_chronic_data.isna().sum())
print('\n')
print(f'Total missing data: {clean_chronic_data.isna().sum().sum()}')

Missing Data Summary: 
 --------------------------------------------------
YearStart                         0
YearEnd                           0
LocationAbbr                      0
LocationDesc                      0
DataSource                        0
Topic                             0
Question                          0
Response                     188885
DataValueUnit                     0
DataValueType                     0
DataValue                         0
DataValueAlt                      0
DataValueFootnoteSymbol      187361
DataValueFootnote            187361
LowConfidenceLimit                0
HighConfidenceLimit               0
StratificationCategory1           0
Stratification1                   0
StratificationCategory2      188885
Stratification2              188885
StratificationCategory3      188885
Stratification3              188885
Geolocation                    4537
LocationID                        0
TopicID                           0
QuestionID               

Dropping columns with exclusively missing values. 

In [83]:
clean_chronic_data = clean_chronic_data.dropna(axis=1)
display(clean_chronic_data)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,...,LowConfidenceLimit,HighConfidenceLimit,StratificationCategory1,Stratification1,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
0,2019,2019,AR,Arkansas,BRFSS,Diabetes,Diabetes among adults,%,Crude Prevalence,13.6,...,12.1,15.4,Sex,Male,5,DIA,DIA01,CRDPREV,SEX,SEXM
1,2019,2019,ID,Idaho,BRFSS,Diabetes,Diabetes among adults,%,Crude Prevalence,10.6,...,9.1,12.2,Sex,Male,16,DIA,DIA01,CRDPREV,SEX,SEXM
4,2019,2019,IA,Iowa,BRFSS,Asthma,Current asthma among adults,%,Crude Prevalence,10.3,...,9.1,11.7,Age,Age 18-44,19,AST,AST02,CRDPREV,AGE,AGE1844
6,2019,2019,IA,Iowa,BRFSS,Health Status,Recent activity limitation among adults,Number,Crude Mean,2.3,...,2.1,2.5,Sex,Female,19,HEA,HEA04,CRDMEAN,SEX,SEXF
7,2019,2019,IA,Iowa,BRFSS,Mental Health,Depression among adults,%,Crude Prevalence,31.0,...,20.6,43.7,Race/Ethnicity,"Multiracial, non-Hispanic",19,MEN,MEN02,CRDPREV,RACE,MRC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309208,2022,2022,VT,Vermont,BRFSS,Sleep,Short sleep duration among adults,%,Crude Prevalence,26.5,...,16.8,39.3,Race/Ethnicity,"Asian, non-Hispanic",50,SLEP,SLP03,CRDPREV,RACE,ASN
309209,2022,2022,VI,Virgin Islands,BRFSS,Immunization,Influenza vaccination among adults,%,Crude Prevalence,34.2,...,24.2,45.8,Age,Age >=65,78,IMM,IMM01,CRDPREV,AGE,AGE65P
309211,2022,2022,WV,West Virginia,BRFSS,Chronic Obstructive Pulmonary Disease,Chronic obstructive pulmonary disease among ad...,%,Crude Prevalence,14.0,...,12.8,15.2,Overall,Overall,54,COPD,COPD01,CRDPREV,OVERALL,OVR
309212,2022,2022,WI,Wisconsin,BRFSS,Immunization,Pneumococcal vaccination among adults aged 65 ...,%,Crude Prevalence,64.2,...,52.2,74.6,Race/Ethnicity,"Black, non-Hispanic",55,IMM,IMM04,CRDPREV,RACE,BLK


In [85]:
print(clean_chronic_data.dtypes)

YearStart                      int64
YearEnd                        int64
LocationAbbr                  object
LocationDesc                  object
DataSource                    object
Topic                         object
Question                      object
DataValueUnit                 object
DataValueType                 object
DataValue                    float64
DataValueAlt                 float64
LowConfidenceLimit           float64
HighConfidenceLimit          float64
StratificationCategory1       object
Stratification1               object
LocationID                     int64
TopicID                       object
QuestionID                    object
DataValueTypeID               object
StratificationCategoryID1     object
StratificationID1             object
dtype: object


In [87]:
display(clean_chronic_data[clean_chronic_data['Topic']=='Diabetes'])

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,...,LowConfidenceLimit,HighConfidenceLimit,StratificationCategory1,Stratification1,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
0,2019,2019,AR,Arkansas,BRFSS,Diabetes,Diabetes among adults,%,Crude Prevalence,13.6,...,12.1,15.4,Sex,Male,5,DIA,DIA01,CRDPREV,SEX,SEXM
1,2019,2019,ID,Idaho,BRFSS,Diabetes,Diabetes among adults,%,Crude Prevalence,10.6,...,9.1,12.2,Sex,Male,16,DIA,DIA01,CRDPREV,SEX,SEXM
17,2019,2019,NE,Nebraska,BRFSS,Diabetes,Diabetes among adults,%,Crude Prevalence,9.4,...,8.7,10.3,Sex,Female,31,DIA,DIA01,CRDPREV,SEX,SEXF
33,2019,2019,OK,Oklahoma,BRFSS,Diabetes,Diabetes among adults,%,Crude Prevalence,11.8,...,8.7,15.8,Race/Ethnicity,Hispanic,40,DIA,DIA01,CRDPREV,RACE,HIS
40,2019,2019,RI,Rhode Island,BRFSS,Diabetes,Diabetes among adults,%,Crude Prevalence,21.4,...,19.2,23.7,Age,Age >=65,44,DIA,DIA01,CRDPREV,AGE,AGE65P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309077,2022,2022,WA,Washington,BRFSS,Diabetes,Diabetes among adults,%,Crude Prevalence,9.7,...,9.2,10.1,Overall,Overall,53,DIA,DIA01,CRDPREV,OVERALL,OVR
309079,2022,2022,WY,Wyoming,BRFSS,Diabetes,Diabetes among adults,%,Crude Prevalence,2.4,...,1.6,3.5,Age,Age 18-44,56,DIA,DIA01,CRDPREV,AGE,AGE1844
309102,2022,2022,WA,Washington,BRFSS,Diabetes,Diabetes among adults,%,Crude Prevalence,13.2,...,10.3,16.7,Race/Ethnicity,"Black, non-Hispanic",53,DIA,DIA01,CRDPREV,RACE,BLK
309108,2022,2022,VA,Virginia,BRFSS,Diabetes,Diabetes among adults,%,Age-adjusted Prevalence,11.4,...,7.2,17.7,Race/Ethnicity,"Asian, non-Hispanic",51,DIA,DIA01,AGEADJPREV,RACE,ASN


In [89]:
display(clean_chronic_data[clean_chronic_data['Topic']=='Cardiovascular Disease'])

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,...,LowConfidenceLimit,HighConfidenceLimit,StratificationCategory1,Stratification1,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
14192,2019,2019,AK,Alaska,NVSS,Cardiovascular Disease,Diseases of the heart mortality among all peop...,"cases per 100,000",Age-adjusted Rate,159.70,...,144.60,174.80,Sex,Male,2,CVD,CVD09,AGEADJRATE,SEX,SEXM
14246,2019,2019,CA,California,BRFSS,Cardiovascular Disease,Taking medicine to control high blood pressure...,%,Age-adjusted Prevalence,47.20,...,34.50,60.30,Race/Ethnicity,"Multiracial, non-Hispanic",6,CVD,CVD02,AGEADJPREV,RACE,MRC
14249,2019,2019,AL,Alabama,CMS Part A Claims Data,Cardiovascular Disease,Hospitalization for heart failure as principal...,"cases per 1,000",Crude Rate,71.43,...,44.97,97.89,Race/Ethnicity,"American Indian or Alaska Native, non-Hispanic",1,CVD,CVD06,CRDRATE,RACE,AIAN
14251,2019,2019,CA,California,BRFSS,Cardiovascular Disease,Taking medicine for high cholesterol among adults,%,Crude Prevalence,31.60,...,30.00,33.20,Sex,Male,6,CVD,CVD04,CRDPREV,SEX,SEXM
14283,2019,2019,AZ,Arizona,NVSS,Cardiovascular Disease,Cerebrovascular disease (stroke) mortality amo...,"cases per 100,000",Crude Rate,26.10,...,20.80,32.30,Race/Ethnicity,"Black, non-Hispanic",4,CVD,CVD07,CRDRATE,RACE,BLK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274653,2021,2021,WY,Wyoming,BRFSS,Cardiovascular Disease,High cholesterol among adults who have been sc...,%,Crude Prevalence,32.80,...,29.60,36.00,Age,Age 45-64,56,CVD,CVD03,CRDPREV,AGE,AGE4564
274656,2021,2021,WY,Wyoming,BRFSS,Cardiovascular Disease,High blood pressure among adults,%,Age-adjusted Prevalence,39.90,...,31.20,49.30,Race/Ethnicity,Hispanic,56,CVD,CVD01,AGEADJPREV,RACE,HIS
274761,2021,2021,WY,Wyoming,BRFSS,Cardiovascular Disease,High blood pressure among adults,%,Crude Prevalence,32.20,...,19.60,48.00,Race/Ethnicity,"Multiracial, non-Hispanic",56,CVD,CVD01,CRDPREV,RACE,MRC
274773,2021,2021,WY,Wyoming,NVSS,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,50.70,...,34.50,72.00,Race/Ethnicity,Hispanic,56,CVD,CVD08,CRDRATE,RACE,HIS


In [91]:
clean_chronic_data = clean_chronic_data.drop(clean_chronic_data[clean_chronic_data['Topic'] != 'Cardiovascular Disease'].index)

In [93]:
display(clean_chronic_data)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,...,LowConfidenceLimit,HighConfidenceLimit,StratificationCategory1,Stratification1,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
14192,2019,2019,AK,Alaska,NVSS,Cardiovascular Disease,Diseases of the heart mortality among all peop...,"cases per 100,000",Age-adjusted Rate,159.70,...,144.60,174.80,Sex,Male,2,CVD,CVD09,AGEADJRATE,SEX,SEXM
14246,2019,2019,CA,California,BRFSS,Cardiovascular Disease,Taking medicine to control high blood pressure...,%,Age-adjusted Prevalence,47.20,...,34.50,60.30,Race/Ethnicity,"Multiracial, non-Hispanic",6,CVD,CVD02,AGEADJPREV,RACE,MRC
14249,2019,2019,AL,Alabama,CMS Part A Claims Data,Cardiovascular Disease,Hospitalization for heart failure as principal...,"cases per 1,000",Crude Rate,71.43,...,44.97,97.89,Race/Ethnicity,"American Indian or Alaska Native, non-Hispanic",1,CVD,CVD06,CRDRATE,RACE,AIAN
14251,2019,2019,CA,California,BRFSS,Cardiovascular Disease,Taking medicine for high cholesterol among adults,%,Crude Prevalence,31.60,...,30.00,33.20,Sex,Male,6,CVD,CVD04,CRDPREV,SEX,SEXM
14283,2019,2019,AZ,Arizona,NVSS,Cardiovascular Disease,Cerebrovascular disease (stroke) mortality amo...,"cases per 100,000",Crude Rate,26.10,...,20.80,32.30,Race/Ethnicity,"Black, non-Hispanic",4,CVD,CVD07,CRDRATE,RACE,BLK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274653,2021,2021,WY,Wyoming,BRFSS,Cardiovascular Disease,High cholesterol among adults who have been sc...,%,Crude Prevalence,32.80,...,29.60,36.00,Age,Age 45-64,56,CVD,CVD03,CRDPREV,AGE,AGE4564
274656,2021,2021,WY,Wyoming,BRFSS,Cardiovascular Disease,High blood pressure among adults,%,Age-adjusted Prevalence,39.90,...,31.20,49.30,Race/Ethnicity,Hispanic,56,CVD,CVD01,AGEADJPREV,RACE,HIS
274761,2021,2021,WY,Wyoming,BRFSS,Cardiovascular Disease,High blood pressure among adults,%,Crude Prevalence,32.20,...,19.60,48.00,Race/Ethnicity,"Multiracial, non-Hispanic",56,CVD,CVD01,CRDPREV,RACE,MRC
274773,2021,2021,WY,Wyoming,NVSS,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,50.70,...,34.50,72.00,Race/Ethnicity,Hispanic,56,CVD,CVD08,CRDRATE,RACE,HIS


In [95]:
clean_chronic_data.to_csv('Cleaned_Chronic_Data.csv', header=True, index=False, encoding="utf-8")