In [14]:
import pandas as pd
import numpy as np
import scipy.stats as st
import math
from sklearn.neighbors import KNeighborsClassifier

In [15]:
# File Paths
nyt_colleges_path = "../datasets/nyt/covid-19-data-master/colleges/colleges.csv"
mask_path = "../datasets/nyt/covid-19-data-master/mask-use/mask-use-by-county.csv"
ipeds_path = "../datasets/ipeds_nationwide/ipeds_usa.csv"
fips_path = "../datasets/misc/FIPS_county_codes.csv"

# Read CSVs
df_nyt_colleges = pd.read_csv(nyt_colleges_path)
df_mask = pd.read_csv(mask_path)
df_ipeds = pd.read_csv(ipeds_path)
df_fips = pd.read_csv(fips_path)

# Add county names to mask usage via merge
df_mask_counties = pd.merge(df_mask, df_fips, left_on="COUNTYFP", right_on="FIPS")

# Remove redundant columns, rename county name column for easier use
df_mask_counties = df_mask_counties.drop(columns=["COUNTYFP"]).rename(columns={"Name": "county", "State": "state"})

# Map state acronyms to state names
state_map = {
    "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas", "CA": "California", "CO": "Colorado",
    "CT": "Connecticut", "DE": "Delaware", "DC": "Washington, D.C.", "FL": "Florida", "GA": "Georgia",
    "HI": "Hawaii", "ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa", "KS": "Kansas", 
    "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MD": "Maryland", "MA": "Massachusetts",   
    "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi", "MO": "Missouri", "MT": "Montana", 
    "NE": "Nebraska", "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey", "NM": "New Mexico", "NY": "New York",   
    "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio", "OK": "Oklahoma", "OR": "Oregon",
    "PA": "Pennsylvania", "RI": "Rhode Island", "SC": "South Carolina", "SD": "South Dakota", 
    "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VT": "Vermont", "VA": "Virginia", "WA": "Washington",
    "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming"
}

df_mask_counties["state"] = df_mask_counties["state"].map(state_map)

# Preparing EDA DataFrame
df_case_percapita = pd.DataFrame()

#categories I think will be useful for EDA here
df_case_percapita["cases"] = df_nyt_colleges["cases"]
df_case_percapita["Enrollment"] = df_ipeds["Enrollment"]
df_case_percapita["Urbanization"] = df_ipeds["Urbanization"]
df_case_percapita["college"]= df_nyt_colleges["college"]
df_case_percapita["cases per capita"] = df_case_percapita["cases"] / df_case_percapita["Enrollment"]

# Merging mask data with college data
df_mask_usage = pd.merge(df_nyt_colleges, df_mask_counties, on=["county", "state"]).drop(columns=["FIPS"])

# Also bringing in additional IPEDS data
df_mask_usage = pd.merge(df_mask_usage, df_ipeds, on="ipeds_id")

# Also adding the per capita data calculated earlier
df_mask_usage = pd.merge(df_mask_usage, df_case_percapita[["cases per capita", "college"]], on="college")

In [16]:
df_mask_usage

Unnamed: 0,date,state,county,city,ipeds_id,college,cases,notes,NEVER,RARELY,...,"Tuition and fees, 2019-20",Institution size category,State abbreviation,Sector of institution,Level of institution,Control of institution,Urbanization,Enrollment,City,cases per capita
0,2020-12-11,Alabama,Madison,Huntsville,100654.0,Alabama A&M University,41.0,,0.062,0.050,...,10024.0,"5,000 - 9,999",Alabama,"Public, 4-year or above",Four or more years,Public,City,5943.0,Normal,0.006899
1,2020-12-11,Alabama,Madison,Huntsville,100706.0,University of Alabama in Huntsville,210.0,,0.062,0.050,...,11122.0,"5,000 - 9,999",Alabama,"Public, 4-year or above",Four or more years,Public,City,8400.0,Huntsville,0.029732
2,2020-12-11,Alabama,Montgomery,Montgomery,100724.0,Alabama State University,2.0,,0.056,0.095,...,11068.0,"1,000 - 4,999",Alabama,"Public, 4-year or above",Four or more years,Public,City,4243.0,Montgomery,0.000101
3,2020-12-11,Alabama,Montgomery,Montgomery,100830.0,Auburn University at Montgomery,140.0,,0.056,0.095,...,8620.0,"5,000 - 9,999",Alabama,"Public, 4-year or above",Four or more years,Public,City,5004.0,Montgomery,0.032996
4,2020-12-11,Alabama,Montgomery,Montgomery,101435.0,Huntingdon College,,,0.056,0.095,...,27400.0,"1,000 - 4,999",Alabama,"Private not-for-profit, 4-year or above",Four or more years,Private not-for-profit,City,990.0,Montgomery,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1751,2020-12-11,Wisconsin,Walworth,Whitewater,240189.0,University of Wisconsin-Whitewater,739.0,,0.116,0.146,...,7695.0,"10,000 - 19,999",Wisconsin,"Public, 4-year or above",Four or more years,Public,Town,11530.0,Whitewater,0.048216
1752,2020-12-11,Wyoming,Natrona,Casper,240505.0,Casper College,299.0,,0.100,0.084,...,3834.0,"1,000 - 4,999",Wyoming,"Public, 2-year",At least 2 but less than 4 years,Public,City,2364.0,Casper,0.302326
1753,2020-12-11,Wyoming,Goshen,Torrington,240596.0,Eastern Wyoming College,12.0,,0.201,0.169,...,4110.0,"1,000 - 4,999",Wyoming,"Public, 2-year",At least 2 but less than 4 years,Public,Town,886.0,Torrington,0.015605
1754,2020-12-11,Wyoming,Albany,Laramie,240727.0,University of Wyoming,1795.0,,0.136,0.100,...,5581.0,"10,000 - 19,999",Wyoming,"Public, 4-year or above",Four or more years,Public,Town,10878.0,Laramie,1.338553


In [17]:
df_mask_corr = pd.DataFrame()
df_mask_corr["NEVER"] = df_mask_usage["NEVER"]
df_mask_corr["RARELY"] = df_mask_usage["RARELY"]
df_mask_corr["SOMETIMES"] = df_mask_usage["SOMETIMES"]
df_mask_corr["FREQUENTLY"] = df_mask_usage["FREQUENTLY"]
df_mask_corr["ALWAYS"] = df_mask_usage["ALWAYS"]
df_mask_corr["cases per capita"] = df_mask_usage["cases per capita"]
df_mask_corr["per capita > 0.1"] = df_mask_usage["cases per capita"] > 0.1
df_mask_corr = df_mask_corr.dropna()

df_mask_corr

Unnamed: 0,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS,cases per capita,per capita > 0.1
0,0.062,0.050,0.123,0.177,0.589,0.006899,False
1,0.062,0.050,0.123,0.177,0.589,0.029732,False
2,0.056,0.095,0.123,0.212,0.513,0.000101,False
3,0.056,0.095,0.123,0.212,0.513,0.032996,False
5,0.102,0.034,0.133,0.336,0.395,0.069444,False
...,...,...,...,...,...,...,...
1751,0.116,0.146,0.137,0.191,0.410,0.048216,False
1752,0.100,0.084,0.094,0.325,0.398,0.302326,True
1753,0.201,0.169,0.111,0.223,0.296,0.015605,False
1754,0.136,0.100,0.151,0.181,0.432,1.338553,True


In [18]:
never_percapita = st.spearmanr(df_mask_corr["NEVER"], df_mask_corr["cases per capita"])
rarely_percapita = st.spearmanr(df_mask_corr["RARELY"], df_mask_corr["cases per capita"])
sometimes_percapita = st.spearmanr(df_mask_corr["SOMETIMES"], df_mask_corr["cases per capita"])
frequently_percapita = st.spearmanr(df_mask_corr["FREQUENTLY"], df_mask_corr["cases per capita"])
always_percapita = st.spearmanr(df_mask_corr["ALWAYS"], df_mask_corr["cases per capita"])

print(never_percapita)
print(rarely_percapita)
print(sometimes_percapita)
print(frequently_percapita)
print(always_percapita)

SpearmanrResult(correlation=0.07210391701389125, pvalue=0.0033748207793625456)
SpearmanrResult(correlation=0.12583387301718912, pvalue=2.905542057223833e-07)
SpearmanrResult(correlation=0.1069901664703313, pvalue=1.3216175961964785e-05)
SpearmanrResult(correlation=0.159273407650974, pvalue=7.597420631224243e-11)
SpearmanrResult(correlation=-0.13846129319269632, pvalue=1.6138697596476346e-08)


In [19]:
never_percapita_p = st.pearsonr(df_mask_corr["NEVER"], df_mask_corr["cases per capita"])
rarely_percapita_p = st.pearsonr(df_mask_corr["RARELY"], df_mask_corr["cases per capita"])
sometimes_percapita_p = st.pearsonr(df_mask_corr["SOMETIMES"], df_mask_corr["cases per capita"])
frequently_percapita_p = st.pearsonr(df_mask_corr["FREQUENTLY"], df_mask_corr["cases per capita"])
always_percapita_p = st.pearsonr(df_mask_corr["ALWAYS"], df_mask_corr["cases per capita"])

print(never_percapita_p)
print(rarely_percapita_p)
print(sometimes_percapita_p)
print(frequently_percapita_p)
print(always_percapita_p)

(-0.012205860011397774, 0.6201784645323444)
(0.005947027194235785, 0.8091972988833592)
(0.006605469101134773, 0.7885481721872571)
(0.060262693929067454, 0.014325534168661155)
(-0.022987316651518232, 0.350588682449259)


In [20]:
df_knn = df_mask_usage[["NEVER", "RARELY", "SOMETIMES", "FREQUENTLY", "ALWAYS", "Enrollment", "Urbanization"]].copy()
df_knn["Urbanization"] = df_knn["Urbanization"].map({"City": 3, "Suburb": 2, "Town": 1})
df_knn["cases per capita"] = df_mask_usage["cases per capita"] > 0.1

df_knn = df_knn.dropna()

In [21]:
df_knn = df_knn.sample(frac=1)

x = df_knn[["NEVER", "RARELY", "SOMETIMES", "FREQUENTLY", "ALWAYS", "Urbanization", "Enrollment"]]
y = df_knn["cases per capita"]
knn_size = len(df_knn)
split = int(knn_size * 2/3)

knn = KNeighborsClassifier(n_neighbors=5, weights="distance")
knn.fit(x[:split], y[:split])
knn.score(x[split:], y[split:])

0.6224299065420561

In [22]:
df_knn_simple = df_knn.sample(frac=1)

x = df_knn_simple[["Enrollment"]]
y = df_knn_simple["cases per capita"]
knn_size_simple = len(df_knn_simple)
split = int(knn_size_simple * 2/3)

knn_simple = KNeighborsClassifier(n_neighbors=5, weights="distance")
knn_simple.fit(x[:split], y[:split])
knn_simple.score(x[split:], y[split:])

0.6018691588785047

### Linear SVM

Prepare target where cases per capita > 0.1

In [11]:
case_percapita = df_case_percapita["cases per capita"]
case_percapita = case_percapita.dropna()
train_percapita =[i > .1 for i in case_percapita]

### Prepare features
'mask used always' 'urbanization' 'enrollment'

In [26]:
#df_mask_urban_enroll = df_mask_usage[["ALWAYS"]]
#df_mask_urban_enroll = y

df_mask_urban_enroll= x[["Enrollment"]]

df_mask_urban_enroll=df_mask_urban_enroll.join(df_mask_corr["ALWAYS"])
df_mask_urban_enroll=df_mask_urban_enroll.join(df_knn["Urbanization"])
df_mask_urban_enroll=df_mask_urban_enroll.join(y)


df_mask_urban_enroll=df_mask_urban_enroll.dropna()
df_mask_class_target = df_mask_urban_enroll[["cases per capita"]]
del df_mask_urban_enroll['cases per capita']




Search for best parameters

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

parameters={'C':[200,300,400,500,1000],'penalty':['l1','l2'],'tol':[1e-2,1e-3,1e-4,1e-5]}
gs_model = GridSearchCV(LinearSVC(),parameters,scoring='roc_auc',cv=5)
gs_model.fit(df_mask_urban_enroll,df_mask_class_target)

  return f(*args, **kwargs)
Traceback (most recent call last):
  File "/Library/Python/3.7/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Python/3.7/site-packages/sklearn/svm/_classes.py", line 238, in fit
    self.loss, sample_weight=sample_weight)
  File "/Library/Python/3.7/site-packages/sklearn/svm/_base.py", line 974, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File "/Library/Python/3.7/site-packages/sklearn/svm/_base.py", line 832, in _get_liblinear_solver_type
    % (error_string, penalty, loss, dual))
ValueError: Unsupported set of arguments: The combination of penalty='l1' and loss='squared_hinge' are not supported when dual=True, Parameters: penalty='l1', loss='squared_hinge', dual=True

  return f(*args, **kwargs)
Traceback (most recent call last):
  File "/Library/Python/3.7/site-packages/sklearn/model_selection/_valid

GridSearchCV(cv=5, estimator=LinearSVC(),
             param_grid={'C': [200, 300, 400, 500, 1000],
                         'penalty': ['l1', 'l2'],
                         'tol': [0.01, 0.001, 0.0001, 1e-05]},
             scoring='roc_auc')

In [28]:
print(gs_model.cv_results_)
print("")
print(gs_model.best_score_)
print("")
print(gs_model.best_params_)

{'mean_fit_time': array([0.00724382, 0.00463424, 0.00425086, 0.00400882, 0.07274065,
       0.08770571, 0.09933977, 0.12035604, 0.00834661, 0.01235385,
       0.01324911, 0.0105782 , 0.12748814, 0.13505006, 0.14466324,
       0.12046485, 0.00953779, 0.00836883, 0.00745091, 0.00704002,
       0.0974966 , 0.08463659, 0.07373633, 0.081003  , 0.0046433 ,
       0.00565424, 0.00725145, 0.00670176, 0.08253183, 0.07616811,
       0.08103251, 0.08672342, 0.00568638, 0.00607781, 0.00695562,
       0.00628638, 0.08922348, 0.12060509, 0.09159822, 0.08313446]), 'std_fit_time': array([0.00378608, 0.00085356, 0.00078098, 0.00043776, 0.01260272,
       0.01172499, 0.02254661, 0.01421944, 0.00077141, 0.00236145,
       0.00260847, 0.00107744, 0.01356297, 0.01285827, 0.02210693,
       0.02611538, 0.00133853, 0.000855  , 0.00105736, 0.00083338,
       0.01153678, 0.00613623, 0.0053591 , 0.00478515, 0.00018316,
       0.00142908, 0.00068152, 0.00068176, 0.00339135, 0.00698309,
       0.00480113, 0.00712