In [20]:
import pandas as pd
import numpy as np
import scipy.stats as st
import math
from sklearn.neighbors import KNeighborsClassifier

In [24]:
# File Paths
nyt_colleges_path = "../datasets/nyt/covid-19-data-master/colleges/colleges.csv"
mask_path = "../datasets/nyt/covid-19-data-master/mask-use/mask-use-by-county.csv"
ipeds_path = "../datasets/illinois/ipeds/ipeds.csv"
fips_path = "../datasets/misc/FIPS_county_codes.csv"

# Read CSVs
df_nyt_colleges = pd.read_csv(nyt_colleges_path)
df_mask = pd.read_csv(mask_path)
df_ipeds = pd.read_csv(ipeds_path)
df_fips = pd.read_csv(fips_path)

# Add county names to mask usage via merge
df_mask_counties = pd.merge(df_mask, df_fips, left_on="COUNTYFP", right_on="FIPS")

# Remove redundant columns, rename county name column for easier use
df_mask_counties = df_mask_counties.drop(columns=["COUNTYFP"]).rename(columns={"Name": "county"})

# Preparing EDA DataFrame
df_case_percapita = pd.DataFrame()

#categories I think will be useful for EDA here
df_case_percapita["cases"] = df_nyt_colleges["cases"]
df_case_percapita["Enrollment"] = df_ipeds["Enrollment"]
df_case_percapita["Urbanization"] = df_ipeds["Urbanization"]
df_case_percapita["college"]= df_nyt_IL_colleges["college"]
df_case_percapita["cases per capita"] = df_case_percapita["cases"] / df_case_percapita["Enrollment"]

# Merging mask data with college data
df_mask_usage = pd.merge(df_nyt_colleges, df_mask_counties, on="county").drop(columns=["State", "FIPS"])

# Also bringing in additional IPEDS data
df_mask_usage = pd.merge(df_mask_usage, df_ipeds, on="ipeds_id")

# Also adding the per capita data calculated earlier
df_mask_usage = pd.merge(df_mask_usage, df_case_percapita[["cases per capita", "college"]], on="college")

In [29]:
df_mask_usage.head(1)

Unnamed: 0,date,state,county,city,ipeds_id,college,cases,notes,NEVER,RARELY,...,"Tuition and fees, 2019-20",Institution size category,Sector of institution,Level of institution,Control of institution,Degree-granting status,Degree of urbanization,Urbanization,Enrollment,cases per capita
0,2020-12-11,Illinois,Madison,Edwardsville,149231.0,Southern Illinois University Edwardsville,238.0,,0.062,0.05,...,12219.0,"10,000 - 19,999","Public, 4-year or above",Four or more years,Public,Degree-granting,Suburb: Large,Suburb,11754,0.030117


In [16]:
df_mask_corr = pd.DataFrame()
df_mask_corr["NEVER"] = df_mask_usage["NEVER"]
df_mask_corr["RARELY"] = df_mask_usage["RARELY"]
df_mask_corr["SOMETIMES"] = df_mask_usage["SOMETIMES"]
df_mask_corr["FREQUENTLY"] = df_mask_usage["FREQUENTLY"]
df_mask_corr["ALWAYS"] = df_mask_usage["ALWAYS"]
df_mask_corr["cases per capita"] = df_mask_usage["cases per capita"]
df_mask_corr["per capita > 0.1"] = df_mask_usage["cases per capita"] > 0.1
df_mask_corr = df_mask_corr.dropna()

df_mask_corr.head(1)

Unnamed: 0,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS,cases per capita,per capita > 0.1
0,0.078,0.025,0.066,0.235,0.596,0.0514,False


In [15]:
never_percapita = st.spearmanr(df_mask_corr["NEVER"], df_mask_corr["cases per capita"])
rarely_percapita = st.spearmanr(df_mask_corr["RARELY"], df_mask_corr["cases per capita"])
sometimes_percapita = st.spearmanr(df_mask_corr["SOMETIMES"], df_mask_corr["cases per capita"])
frequently_percapita = st.spearmanr(df_mask_corr["FREQUENTLY"], df_mask_corr["cases per capita"])
always_percapita = st.spearmanr(df_mask_corr["ALWAYS"], df_mask_corr["cases per capita"])

print(never_percapita)
print(rarely_percapita)
print(sometimes_percapita)
print(frequently_percapita)
print(always_percapita)

SpearmanrResult(correlation=0.05643936682889105, pvalue=0.4308422095197091)
SpearmanrResult(correlation=0.14039483745259557, pvalue=0.049096541894164174)
SpearmanrResult(correlation=0.14122734181546376, pvalue=0.047755397631542094)
SpearmanrResult(correlation=0.07485421604366518, pvalue=0.29583331927689577)
SpearmanrResult(correlation=-0.1218315936838477, pvalue=0.08810777995531095)


In [17]:
never_percapita_p = st.pearsonr(df_mask_corr["NEVER"], df_mask_corr["cases per capita"])
rarely_percapita_p = st.pearsonr(df_mask_corr["RARELY"], df_mask_corr["cases per capita"])
sometimes_percapita_p = st.pearsonr(df_mask_corr["SOMETIMES"], df_mask_corr["cases per capita"])
frequently_percapita_p = st.pearsonr(df_mask_corr["FREQUENTLY"], df_mask_corr["cases per capita"])
always_percapita_p = st.pearsonr(df_mask_corr["ALWAYS"], df_mask_corr["cases per capita"])

print(never_percapita_p)
print(rarely_percapita_p)
print(sometimes_percapita_p)
print(frequently_percapita_p)
print(always_percapita_p)

(0.10698275016233674, 0.13457122021883167)
(0.17937199173712073, 0.011666538589512747)
(0.17067258687282805, 0.016490279759656005)
(0.05464695564779808, 0.4456449263447581)
(-0.20735041350123992, 0.0034598362056174983)


In [58]:
df_knn = df_mask_usage[["ALWAYS", "Enrollment", "Urbanization"]].copy()
df_knn["Urbanization"] = df_knn["Urbanization"].map({"City": 3, "Suburb": 2, "Town": 1})
df_knn["cases per capita"] = df_mask_usage["cases per capita"] > 0.1

df_knn = df_knn.dropna()

df_knn.head(1)

Unnamed: 0,ALWAYS,Enrollment,Urbanization,cases per capita
0,0.589,11754,2.0,False


In [95]:
df_knn = df_knn.sample(frac=1)

x = df_knn[["ALWAYS", "Urbanization", "Enrollment"]]
y = df_knn["cases per capita"]
knn_size = len(df_knn)
split = int(knn_size * 2/3)

knn = KNeighborsClassifier(n_neighbors=3, weights="uniform")
knn.fit(x[:split], y[:split])
knn.score(x[split:], y[split:])

0.9666666666666667