# Project Report: Webscraping and Dataset Merging
Cassady Jackson and Baylie Schnieder

In [44]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import time
import random

### Import Dataset 1
Using Pandas to import the CSV file from data.gov with information on Chronic Disease Indicators, which was created March 9th, 2024 and had the metadata updated on February 3rd, 2025.

Then displaying the dataset, and using .dtypes to show the data types in order to create the data dictionary for the project proposal. 

In [46]:
clean_chronic_data = pd.read_csv('Cleaned_Chronic_Data.csv')

Dropping the columns that contain years that we do not have education ranking data for. This means keeping only the data from 2022. 

In [49]:
clean_chronic_data = clean_chronic_data.drop(columns='YearStart')

In [50]:
display(clean_chronic_data)

Unnamed: 0,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,LowConfidenceLimit,HighConfidenceLimit,StratificationCategory1,Stratification1,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
0,2021,AL,Alabama,NVSS,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,5.6,5.6,4.7,6.4,Age,Age 0-44,1,CVD,CVD08,CRDRATE,AGE,AGE0_44
1,2021,AL,Alabama,NVSS,Cardiovascular Disease,Diseases of the heart mortality among all peop...,"cases per 100,000",Crude Rate,22.1,22.1,20.4,23.9,Age,Age 0-44,1,CVD,CVD09,CRDRATE,AGE,AGE0_44
2,2021,AL,Alabama,NVSS,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Age-adjusted Rate,85.8,85.8,80.6,90.9,Race/Ethnicity,"Black, non-Hispanic",1,CVD,CVD08,AGEADJRATE,RACE,BLK
3,2021,AL,Alabama,NVSS,Cardiovascular Disease,Cerebrovascular disease (stroke) mortality amo...,"cases per 100,000",Age-adjusted Rate,58.0,58.0,55.0,61.0,Sex,Male,1,CVD,CVD07,AGEADJRATE,SEX,SEXM
4,2021,AL,Alabama,NVSS,Cardiovascular Disease,Diseases of the heart mortality among all peop...,"cases per 100,000",Crude Rate,273.2,273.2,264.4,282.1,Race/Ethnicity,"Black, non-Hispanic",1,CVD,CVD09,CRDRATE,RACE,BLK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2697,2021,WY,Wyoming,NVSS,Cardiovascular Disease,Cerebrovascular disease (stroke) mortality amo...,"cases per 100,000",Crude Rate,44.6,44.6,36.8,52.4,Sex,Female,56,CVD,CVD07,CRDRATE,SEX,SEXF
2698,2021,WI,Wisconsin,NVSS,Cardiovascular Disease,Diseases of the heart mortality among all peop...,"cases per 100,000",Crude Rate,18.2,18.2,11.1,28.1,Race/Ethnicity,"Multiracial, non-Hispanic",55,CVD,CVD09,CRDRATE,RACE,MRC
2699,2021,WY,Wyoming,NVSS,Cardiovascular Disease,Diseases of the heart mortality among all peop...,"cases per 100,000",Age-adjusted Rate,159.4,159.4,149.8,169.0,Overall,Overall,56,CVD,CVD09,AGEADJRATE,OVERALL,OVR
2700,2021,WI,Wisconsin,NVSS,Cardiovascular Disease,Cerebrovascular disease (stroke) mortality amo...,"cases per 100,000",Age-adjusted Rate,34.6,34.6,33.2,36.1,Race/Ethnicity,"White, non-Hispanic",55,CVD,CVD07,AGEADJRATE,RACE,WHT


In [51]:
clean_chronic_data.to_csv('Cleaned_Chronic_Data.csv', header=True, index=False, encoding="utf-8")

### Scraping for the Second Data Set

In [53]:
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = "https://web.archive.org/web/20210317083238/https://worldpopulationreview.com/state-rankings/public-school-rankings-by-state"
browser.get(url)
time.sleep(random.uniform(1,3))

In [54]:
table = browser.find_element(By.TAG_NAME, "table")
rows = table.find_elements(By.TAG_NAME, "tr")

data = []
for row in rows:
    cols = row.find_elements(By.TAG_NAME, "td")
    cols = [col.text.strip() for col in cols]
    if cols:
        data.append(cols)

public_ed_data = pd.DataFrame(data)
display(public_ed_data)

Unnamed: 0,0,1,2,3
0,Massachusetts,1,1,1
1,Connecticut,2,2,19
2,New Jersey,3,3,15
3,Virginia,4,4,3
4,Vermont,5,8,4
5,New Hampshire,6,7,6
6,Minnesota,7,6,22
7,Wisconsin,8,5,24
8,Delaware,9,15,2
9,Maryland,10,11,9


In [55]:
browser.quit()

In [56]:
public_ed_data.columns = ['State','Overall Public School Rank 2021', 'Higher Ed Quality 2021', 'School Safety Rank 2021']

In [57]:
display(public_ed_data)

Unnamed: 0,State,Overall Public School Rank 2021,Higher Ed Quality 2021,School Safety Rank 2021
0,Massachusetts,1,1,1
1,Connecticut,2,2,19
2,New Jersey,3,3,15
3,Virginia,4,4,3
4,Vermont,5,8,4
5,New Hampshire,6,7,6
6,Minnesota,7,6,22
7,Wisconsin,8,5,24
8,Delaware,9,15,2
9,Maryland,10,11,9


Printing the datatypes of the scraped dataset to add to the data dictionary

In [59]:
print(public_ed_data.dtypes)

State                              object
Overall Public School Rank 2021    object
Higher Ed Quality 2021             object
School Safety Rank 2021            object
dtype: object


In [60]:
public_ed_data.to_csv('State_Ed_Rankings.csv', header=True, index=False, encoding="utf-8")

# Integrating the Datasets

In order to integrate the datasets we will need to ensure both dataframes include columns that are similar enough to combine. Based on the data in each set, the easiest method would be to combine the data based on the 'State' column in the scraped dataset, and the 'LocationDesc' column in the other dataset.
- The first thing that will need to be done to combine these two datasets is the renaming of the 'LocationDesc' column to match the 'State' column.
- They will then need to be combined, preserving the data in both sets using merge.

In [63]:
clean_chronic_data = clean_chronic_data.rename(columns={'LocationDesc' : 'State'})

In [64]:
data_inner = pd.merge(clean_chronic_data, public_ed_data, on='State', how='inner')
display(data_inner)

Unnamed: 0,YearEnd,LocationAbbr,State,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,...,Stratification1,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,Overall Public School Rank 2021,Higher Ed Quality 2021,School Safety Rank 2021
0,2021,AL,Alabama,NVSS,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Crude Rate,5.6,5.6,...,Age 0-44,1,CVD,CVD08,CRDRATE,AGE,AGE0_44,44,46,17
1,2021,AL,Alabama,NVSS,Cardiovascular Disease,Diseases of the heart mortality among all peop...,"cases per 100,000",Crude Rate,22.1,22.1,...,Age 0-44,1,CVD,CVD09,CRDRATE,AGE,AGE0_44,44,46,17
2,2021,AL,Alabama,NVSS,Cardiovascular Disease,Coronary heart disease mortality among all peo...,"cases per 100,000",Age-adjusted Rate,85.8,85.8,...,"Black, non-Hispanic",1,CVD,CVD08,AGEADJRATE,RACE,BLK,44,46,17
3,2021,AL,Alabama,NVSS,Cardiovascular Disease,Cerebrovascular disease (stroke) mortality amo...,"cases per 100,000",Age-adjusted Rate,58.0,58.0,...,Male,1,CVD,CVD07,AGEADJRATE,SEX,SEXM,44,46,17
4,2021,AL,Alabama,NVSS,Cardiovascular Disease,Diseases of the heart mortality among all peop...,"cases per 100,000",Crude Rate,273.2,273.2,...,"Black, non-Hispanic",1,CVD,CVD09,CRDRATE,RACE,BLK,44,46,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2587,2021,WY,Wyoming,NVSS,Cardiovascular Disease,Cerebrovascular disease (stroke) mortality amo...,"cases per 100,000",Crude Rate,44.6,44.6,...,Female,56,CVD,CVD07,CRDRATE,SEX,SEXF,19,18,29
2588,2021,WI,Wisconsin,NVSS,Cardiovascular Disease,Diseases of the heart mortality among all peop...,"cases per 100,000",Crude Rate,18.2,18.2,...,"Multiracial, non-Hispanic",55,CVD,CVD09,CRDRATE,RACE,MRC,8,5,24
2589,2021,WY,Wyoming,NVSS,Cardiovascular Disease,Diseases of the heart mortality among all peop...,"cases per 100,000",Age-adjusted Rate,159.4,159.4,...,Overall,56,CVD,CVD09,AGEADJRATE,OVERALL,OVR,19,18,29
2590,2021,WI,Wisconsin,NVSS,Cardiovascular Disease,Cerebrovascular disease (stroke) mortality amo...,"cases per 100,000",Age-adjusted Rate,34.6,34.6,...,"White, non-Hispanic",55,CVD,CVD07,AGEADJRATE,RACE,WHT,8,5,24


In [65]:
data_inner.to_csv('State_Unique_Data.csv', header=True, index=False, encoding="utf-8")

In [66]:
print(data_inner.dtypes)

YearEnd                              int64
LocationAbbr                        object
State                               object
DataSource                          object
Topic                               object
Question                            object
DataValueUnit                       object
DataValueType                       object
DataValue                          float64
DataValueAlt                       float64
LowConfidenceLimit                 float64
HighConfidenceLimit                float64
StratificationCategory1             object
Stratification1                     object
LocationID                           int64
TopicID                             object
QuestionID                          object
DataValueTypeID                     object
StratificationCategoryID1           object
StratificationID1                   object
Overall Public School Rank 2021     object
Higher Ed Quality 2021              object
School Safety Rank 2021             object
dtype: obje