# COVID Modeling 

In [1040]:
# data handling
import pandas as pd
import numpy as np

# seaborn and matplotlib for visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [1041]:
train = pd.read_csv('../data/training_data.csv', index_col="Unnamed: 0")
train['Date'] = pd.to_datetime(train['Date'])
val = pd.read_csv('../data/validation_data.csv', index_col="Unnamed: 0")
test = pd.read_csv('../data/testing_data.csv', index_col="Unnamed: 0")

In [1042]:
train.columns

Index(['Date', 'State', 'Total Pop', 'Day_of_Wk', 'Confirmed',
       'Confirmed_diff', 'Confirmed_rate', 'Confirmed_rate_diff', 'Deaths',
       'Deaths_diff', 'Deaths_rate', 'Deaths_rate_diff', 'Recovered',
       'Recovered_rate', 'Recovered_diff', 'Recovered_rate_diff', 'Active',
       'Active_diff', 'Active_rate_diff', 'Active_rate', 'Case_Fatality_Ratio',
       'Administered', 'Series_Complete_Yes', 'Month', 'Year',
       'Monthly Temp (F)', 'Monthly Avg Temp (F)'],
      dtype='object')

In [1043]:
train.head()

Unnamed: 0,Date,State,Total Pop,Day_of_Wk,Confirmed,Confirmed_diff,Confirmed_rate,Confirmed_rate_diff,Deaths,Deaths_diff,...,Active_diff,Active_rate_diff,Active_rate,Case_Fatality_Ratio,Administered,Series_Complete_Yes,Month,Year,Monthly Temp (F),Monthly Avg Temp (F)
0,2020-04-12,Alabama,4903185,Sunday,3667,,0.000748,,93,,...,,,0.000708,2.61016,0.0,0.0,4,2020,61.55,63.096875
1,2020-04-13,Alabama,4903185,Monday,3870,203.0,0.000789,4.1e-05,99,6.0,...,165.0,3.4e-05,0.000741,2.651312,0.0,0.0,4,2020,61.55,63.096875
2,2020-04-14,Alabama,4903185,Tuesday,4041,171.0,0.000824,3.5e-05,114,15.0,...,204.0,4.2e-05,0.000783,2.883886,0.0,0.0,4,2020,61.55,63.096875
3,2020-04-15,Alabama,4903185,Wednesday,4307,266.0,0.000878,5.4e-05,118,4.0,...,118.0,2.4e-05,0.000807,2.895706,0.0,0.0,4,2020,61.55,63.096875
4,2020-04-16,Alabama,4903185,Thursday,4465,158.0,0.000911,3.2e-05,133,15.0,...,255.0,5.2e-05,0.000859,3.06099,0.0,0.0,4,2020,61.55,63.096875


## Modeling Methods and Metrics

Since our modeling will take and predict data at the _state_ level, we want our metrics to also be computed at the state level. In this case, we want to compute the root mean squared error, so we aggregate our real and predicted values by state, compute the RSME within the state, and then average the RSME accross all states. 

In [1044]:
def RSME_df(df, col_names): 
    '''
    df has two columns, one with predictions and one with actual values,
    passed as `col_names` (order irrelevant)
    Returns the RSME of the predictions w.r.t the actual values 
    '''
    return np.sqrt(np.mean((df[col_names[0]] - df[col_names[1]])**2))

def compute_RSME_by_state(model, X, Y):
    '''
    Y must have 'State' in the index 
    '''
    Y_pred = pd.DataFrame(data=model.predict(X), index=Y.index, columns=Y.columns)
    combined_data = Y_pred.merge(Y, left_index=True, right_index=True, suffixes=('_pred','_actual'))    
    
    state_pred = combined_data.groupby('State').agg(RSME_df, col_names = combined_data.columns)
    state_pred.columns = ['RSME',"_"]
    state_pred = state_pred[['RSME']]
    return state_pred

def avg_state_RSME(model, X, Y):
    state_RSMEs = compute_RSME_by_state(model, X, Y)
    return state_RSMEs.mean()

In [1045]:
metrics = ['Avg RSME']
datasets = ["Training", "Validation"]


def compute_stats(model, X, Y):
    avg_RSME = avg_state_RSME(model, X, Y)
    # more computed values go here
    return (avg_RSME)

def compute_model_stats(model, X_train, Y_train, X_val, Y_val):
    model_stats = {}
    train_metrics = compute_stats(model, X_train, Y_train)
    model_stats['training'] = dict(zip(metrics, train_metrics))
    val_metrics = compute_stats(model, X_val, Y_val)
    model_stats['validation'] =  dict(zip(metrics, val_metrics))
    return model_stats
    
def print_model_stats(model_stats):
    print("Model Statistics:")
    print('                | ',' | '.join(list(model_stats.keys()), ))
    print('-----------------------------------------')
    for var in model_stats['training'].keys():
        print("{var:<15} |   {train:.3f}   |   {val:.3f}".format(var = var,
                                      train = model_stats['training'][var], 
                                      val   = model_stats['validation'][var]))


In [1046]:
iterables = [datasets, metrics]
col_idx = pd.MultiIndex.from_product(iterables, names=["", ""])
    
def make_fresh_record():
    record = pd.DataFrame(columns=col_idx)
    record.index.name = "Model"
    return record 

def record_model_stats(record, model_stats, model_name, override=False): 
    model_stats_df = pd.json_normalize(model_stats, sep='_')
    model_stats_df.columns = col_idx
    model_data = model_stats_df.iloc[0]
    model_data.name = model_name
    new_record = record.copy()
    # override or new entry
    if override or model_name not in record.index:
        new_record.loc[model_name,:] = model_data
    #exists and don't overide 
    else:
        print("Warning: A model with the name '{}' already exists in this record.".format(model_name))
        print("         Either change model_name or set 'override=True'.")
        return record
    return new_record

def make_and_record_model(record, processing_fun, train, val, name, override=False):
    model, (X_train, Y_train), (X_val, Y_val) = processing_fun(train, val)
    model_stats = compute_model_stats(model, X_train, Y_train, X_val, Y_val)
    record = record_model_stats(record, model_stats, name, override)
    return (record, model, model_stats, {"train_data": (X_train, Y_train), "val_data":(X_val, Y_val)})

### COVID Cases Modeling



In [1047]:
def relabel_timeseries_data(X, Y, W, col_name="input"):
    timeseries_names = [col_name+'_day_'+str(i) for i in range(1-W,1)]

    target_day = Y.name
    Y.name = 'target_'+col_name
    Y = Y.reset_index()
    X = X.set_axis(timeseries_names, axis=1, inplace=False)
    X['Target_day'] = target_day
    Y['Target_day'] = target_day
    X = X.reset_index()
    X = X.set_index(['Target_day','State'])
    Y = Y.set_index(['Target_day','State'])
    return (X, Y)

def create_timeseries(df, col):
    return df.pivot_table(index = 'State', columns='Date',
                   values=col).sort_values(by = 'Date', axis='columns')


def convert_timeseries_to_data(df, W, col_name='input'):
    '''
    df is a dataframe, with columns sorted in increasing order by date
    splits rows into timeseries data with W columns of 'input' associated 
    with the W+1 column of 'output' and combined for all rows 
    '''
    d = df.shape[1]
    X = df.iloc[:, 0:W]
    Y = df.iloc[:,W]
    X, Y = relabel_timeseries_data(X, Y, W, col_name)

    for i in range(1,d-W):#1,3,..., d-W-1
        X_data = df.iloc[:, i:i+W] # i+W-1 = W+1,W+2,... d-1
        Y_data = df.iloc[:,i+W] # i+W = W+2,W+3,..., W+d-W = d
        X_data, Y_data = relabel_timeseries_data(X_data, Y_data, W, col_name)
        X = X.append(X_data)
        Y = Y.append(Y_data)

    return (X, Y)

### Model 1 

Feed in confirmed cases for the previous 14 days (since 2 weeks is a standard COVID incubation period) and predict confirmed cases for the next day. 

In [1048]:
from sklearn.linear_model import LinearRegression

def model_1_pipeline(data, test_data=False): 
    window_size = 14
    conf_timeseries_data = create_timeseries(data,'Confirmed_diff')
    X_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_diff")
    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_1_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_1_pipeline(train)
    X_val, Y_val = model_1_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1049]:
model_record = make_fresh_record()

In [1050]:
model_record, m1, m1_stats, m1_data = make_and_record_model(model_record, 
                                                  model_1_processing, train, val, 
                                                  "Confirmed_diff only, 14D")
model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only, 14D",866.541095,412.442779


### Model 2 

In [1051]:
def model_2_pipeline(data, test_data=False): 
    window_size = 14
    active_timeseries_data = create_timeseries(data,'Active_diff')
    X_active_data, _  = convert_timeseries_to_data(active_timeseries_data, 
                                                window_size, 
                                                col_name="Active_diff")
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    _, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    #X_data = X_active_data.merge(X_conf_data, left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_active_data, Y_data)

def model_2_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_2_pipeline(train)
    X_val, Y_val = model_2_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1052]:
model_record, m2, m2_stats, m2_data = make_and_record_model(model_record, 
                                                  model_2_processing, train, val, 
                                                  "Active_diff only, 14D", override=True)
model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only, 14D",866.541095,412.442779
"Active_diff only, 14D",1604.214612,2885.770755


### Model 3

In [1053]:
def model_3_pipeline(data, test_data=False): 
    window_size = 14
    active_timeseries_data = create_timeseries(data,'Active_diff')
    X_active_data, _  = convert_timeseries_to_data(active_timeseries_data, 
                                                window_size, 
                                                col_name="Active_diff")
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    X_data = X_active_data.merge(X_conf_data, left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_3_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_3_pipeline(train)
    X_val, Y_val     = model_3_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1054]:
model_record, m3, m3_stats, m3_data = make_and_record_model(model_record, 
                                                  model_3_processing, train, val, 
                                                  "Active_diff & Conf_diff, 14D", override=True)
model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only, 14D",866.541095,412.442779
"Active_diff only, 14D",1604.214612,2885.770755
"Active_diff & Conf_diff, 14D",866.674825,513.717232


### Model 4

In [1055]:
def model_4_pipeline(data, test_data=False): 
    window_size = 14
    recovered_timeseries_data = create_timeseries(data.fillna(0),'Recovered')
    X_rec_data, _  = convert_timeseries_to_data(recovered_timeseries_data, 
                                                window_size, 
                                                col_name="Recovered")
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    X_data = X_rec_data.merge(X_conf_data, left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_4_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_4_pipeline(train)
    X_val, Y_val     = model_4_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1056]:
model_record, m4, m4_stats, m4_data = make_and_record_model(model_record, 
                                                  model_4_processing, train, val, 
                                                  "Recovered & Conf_diff, 14D", override=True)
model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only, 14D",866.541095,412.442779
"Active_diff only, 14D",1604.214612,2885.770755
"Active_diff & Conf_diff, 14D",866.674825,513.717232
"Recovered & Conf_diff, 14D",865.920995,522.681359


### Model 5 

In [1057]:
train.head()

Unnamed: 0,Date,State,Total Pop,Day_of_Wk,Confirmed,Confirmed_diff,Confirmed_rate,Confirmed_rate_diff,Deaths,Deaths_diff,...,Active_diff,Active_rate_diff,Active_rate,Case_Fatality_Ratio,Administered,Series_Complete_Yes,Month,Year,Monthly Temp (F),Monthly Avg Temp (F)
0,2020-04-12,Alabama,4903185,Sunday,3667,,0.000748,,93,,...,,,0.000708,2.61016,0.0,0.0,4,2020,61.55,63.096875
1,2020-04-13,Alabama,4903185,Monday,3870,203.0,0.000789,4.1e-05,99,6.0,...,165.0,3.4e-05,0.000741,2.651312,0.0,0.0,4,2020,61.55,63.096875
2,2020-04-14,Alabama,4903185,Tuesday,4041,171.0,0.000824,3.5e-05,114,15.0,...,204.0,4.2e-05,0.000783,2.883886,0.0,0.0,4,2020,61.55,63.096875
3,2020-04-15,Alabama,4903185,Wednesday,4307,266.0,0.000878,5.4e-05,118,4.0,...,118.0,2.4e-05,0.000807,2.895706,0.0,0.0,4,2020,61.55,63.096875
4,2020-04-16,Alabama,4903185,Thursday,4465,158.0,0.000911,3.2e-05,133,15.0,...,255.0,5.2e-05,0.000859,3.06099,0.0,0.0,4,2020,61.55,63.096875


In [1058]:
def model_5_pipeline(data, test_data=False): 
    window_size = 14
    series_timeseries_data = create_timeseries(data.fillna(0),'Series_Complete_Yes')
    X_series_data, _  = convert_timeseries_to_data(series_timeseries_data, 
                                                window_size, 
                                                col_name="Series_Complete_Yes")
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    X_data = X_series_data.merge(X_conf_data, left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_5_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_5_pipeline(train)
    X_val, Y_val     = model_5_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1059]:
model_record, m5, m5_stats, m5_data = make_and_record_model(model_record, 
                                                  model_5_processing, train, val, 
                                                  "Series_complete & Conf_diff, 14D", override=True)
model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only, 14D",866.541095,412.442779
"Active_diff only, 14D",1604.214612,2885.770755
"Active_diff & Conf_diff, 14D",866.674825,513.717232
"Recovered & Conf_diff, 14D",865.920995,522.681359
"Series_complete & Conf_diff, 14D",866.541095,412.442779


### Model 6

In [1060]:
def model_6_pipeline(data, test_data=False): 
    window_size = 14
    admin_timeseries_data = create_timeseries(data.fillna(0),'Administered')
    X_vax_data, _  = convert_timeseries_to_data(admin_timeseries_data, 
                                                window_size, 
                                                col_name="Administered")
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    X_data = X_vax_data.merge(X_conf_data, left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_6_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_6_pipeline(train)
    X_val, Y_val     = model_6_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1061]:
model_record, m6, m6_stats, m6_data = make_and_record_model(model_record, 
                                                  model_6_processing, train, val, 
                                                  "Administered & Conf_diff, 14D", override=True)
model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only, 14D",866.541095,412.442779
"Active_diff only, 14D",1604.214612,2885.770755
"Active_diff & Conf_diff, 14D",866.674825,513.717232
"Recovered & Conf_diff, 14D",865.920995,522.681359
"Series_complete & Conf_diff, 14D",866.541095,412.442779
"Administered & Conf_diff, 14D",859.673793,1439.257229


In [1062]:
train.head()
train[['Administered', 'Series_Complete_Yes']].describe()

Unnamed: 0,Administered,Series_Complete_Yes
count,14455.0,14455.0
mean,31035.23,0.0
std,149466.4,0.0
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,3285351.0,0.0


Both of these variables are entirely empty for the time covered by "training" data, so neither changes the model... from just having "Confirmed_diff". 

### Model 7

In [1063]:
def model_7_pipeline(data, test_data=False): 
    window_size = 14
    recovered_timeseries_data = create_timeseries(data.fillna(0),'Recovered')
    X_rec_data, _  = convert_timeseries_to_data(recovered_timeseries_data, 
                                                window_size, 
                                                col_name="Recovered")
    
    active_timeseries_data = create_timeseries(data,'Active_diff')
    X_active_data, _  = convert_timeseries_to_data(active_timeseries_data, 
                                                window_size, 
                                                col_name="Active_diff")
    
    timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    X_data = X_conf_data.merge(X_rec_data, left_index=True, right_index=True)
    X_data = X_data.merge(X_active_data, left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_7_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_7_pipeline(train)
    X_val, Y_val     = model_7_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1064]:
model_record, m7, m7_stats, m7_data = make_and_record_model(model_record, 
                                                  model_7_processing, train, val, 
                                                  "Active, Recovered, & Conf_diff, 14D", override=True)
model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only, 14D",866.541095,412.442779
"Active_diff only, 14D",1604.214612,2885.770755
"Active_diff & Conf_diff, 14D",866.674825,513.717232
"Recovered & Conf_diff, 14D",865.920995,522.681359
"Series_complete & Conf_diff, 14D",866.541095,412.442779
"Administered & Conf_diff, 14D",859.673793,1439.257229
"Active, Recovered, & Conf_diff, 14D",865.318707,561.007786


Doesn't really help to include more, this is likely because of colinearity and not much new data being added. 

## Weather Models

First we should check for null values in our weather data.

In [1065]:
null_temps_by_state = train[['State','Monthly Avg Temp (F)','Monthly Temp (F)']].set_index('State').isna().groupby(level=0).sum()
null_temps_by_state[null_temps_by_state.any(1)]

Unnamed: 0_level_0,Monthly Avg Temp (F),Monthly Temp (F)
State,Unnamed: 1_level_1,Unnamed: 2_level_1
District of Columbia,295,295


We see that D.C. has lots of null values, but all other states have data for all rows. We saw in our weather data analysis that D.C. was not included as a Division for weather collection. For our weather models we will exclude D.C. from analysis, and restrict to proper states. 

In [1066]:
weather_train = train[train['State']!='District of Columbia']
weather_val   = val[val['State']!='District of Columbia']
weather_test  = test[test['State']!='District of Columbia']
null_temps_by_state = weather_train[['State','Monthly Avg Temp (F)','Monthly Temp (F)']].set_index('State').isna().groupby(level=0).sum()
display(null_temps_by_state[null_temps_by_state.any(1)])
print("Missing Weather Training Values:   ",weather_train[['State','Monthly Avg Temp (F)','Monthly Temp (F)']].isna().sum().sum())
print("Missing Weather Validation Values: ",weather_val[['State','Monthly Avg Temp (F)','Monthly Temp (F)']].isna().sum().sum())


Unnamed: 0_level_0,Monthly Avg Temp (F),Monthly Temp (F)
State,Unnamed: 1_level_1,Unnamed: 2_level_1


Missing Weather Training Values:    0
Missing Weather Validation Values:  0


After dropping D.C. we no longer have missing weather data. 

In [1067]:
weather_model_record = make_fresh_record()
weather_model_record, m1_weather, m1_weather_stats, m1_weather_data = make_and_record_model(weather_model_record, 
                                                  model_1_processing, weather_train, weather_val, 
                                                  "Confirmed_diff only (weather), 14D")
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967


### Model 8 

Let's naively plug in the average monthly temperatures for each state based on the recent monthly averages (2000-2019). 

In [1068]:
def model_8_pipeline(data, test_data=False): 
    window_size = 14
    
    X_temp_data = data[['State','Date','Monthly Avg Temp (F)']].set_index(['Date','State'])
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_8_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_8_pipeline(train)
    X_val, Y_val     = model_8_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1069]:
weather_model_record, m8, m8_stats, m8_data = make_and_record_model(weather_model_record, 
                                                  model_8_processing, weather_train, weather_val, 
                                                  "Avg Mon. Temp & Conf_diff, 14D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384


### Model 9 
As we saw from our EDA (`analysis/data_merge_and_eda.ipynb`), absolute temperatures do not seem to follow a linear relationship with confirmed cases, however we did see increases for particularly cold or hot temperatures. To model this is our data, we can compute how far a temperature is from "room temperature", which we will take to be 70 degrees Fahrenheit. 

In [1070]:
weather_train[['State','Monthly Avg Temp (F)','Monthly Temp (F)']].head()

Unnamed: 0,State,Monthly Avg Temp (F),Monthly Temp (F)
0,Alabama,63.096875,61.55
1,Alabama,63.096875,61.55
2,Alabama,63.096875,61.55
3,Alabama,63.096875,61.55
4,Alabama,63.096875,61.55


In [1071]:
weather_train['month_diff_rm_temp'] = np.abs(weather_train['Monthly Temp (F)'] - 70)
weather_train['month_avg_diff_rm_temp'] = weather_train['Monthly Avg Temp (F)'] - 70
weather_train[['State','Monthly Avg Temp (F)','Monthly Temp (F)','month_diff_rm_temp','month_avg_diff_rm_temp']].head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_train['month_diff_rm_temp'] = np.abs(weather_train['Monthly Temp (F)'] - 70)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_train['month_avg_diff_rm_temp'] = weather_train['Monthly Avg Temp (F)'] - 70


Unnamed: 0,State,Monthly Avg Temp (F),Monthly Temp (F),month_diff_rm_temp,month_avg_diff_rm_temp
0,Alabama,63.096875,61.55,8.45,-6.903125
1,Alabama,63.096875,61.55,8.45,-6.903125
2,Alabama,63.096875,61.55,8.45,-6.903125
3,Alabama,63.096875,61.55,8.45,-6.903125
4,Alabama,63.096875,61.55,8.45,-6.903125


In [1072]:
def model_9_pipeline(data, test_data=False): 
    window_size = 14
    
    data.loc[:,'month_diff_rm_temp']     = np.abs(data['Monthly Temp (F)'] - 70)
    data.loc[:,'month_avg_diff_rm_temp'] = np.abs(data['Monthly Avg Temp (F)'] - 70)

    X_temp_data = data[['State','Date','month_avg_diff_rm_temp']].set_index(['Date','State'])
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_9_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_9_pipeline(train)
    X_val, Y_val     = model_9_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1073]:
weather_model_record, m9, m9_stats, m9_data = make_and_record_model(weather_model_record, 
                                                  model_9_processing, weather_train, weather_val, 
                                                  "Abs Mon Avg Temp diff & Conf_diff, 14D", override=True)
weather_model_record

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047


This doesn't really seem to improve the model. 

### Model 10 

We also saw that different states had different sensitivity to "cold" (e.g. Calfornia saw the cold effect between 40 and 50 degrees, but Vermont saw a less pronounced effect only below 40 degrees). We can incporate this into the data by computing the annual average temperature for each state and computing the difference for each month from the _state's_ annual average. 

In [1074]:
hist_weather_data = pd.read_csv('../data/historical_monthly_temp_avgs_by_state.csv', index_col=0)
hist_weather_data.head()
state_avg_temps = hist_weather_data.groupby('State').mean().rename(columns={'Monthly Avg Temp (F)':"State Avg Temp"})[['State Avg Temp']]
state_avg_temps.head()

Unnamed: 0_level_0,State Avg Temp
State,Unnamed: 1_level_1
Alabama,63.428698
Arizona,63.623631
Arkansas,61.144722
California,57.130774
Colorado,46.451083


In [1075]:
with_state_avg = weather_train.merge(state_avg_temps, how="left", left_on='State',right_index=True)
with_state_avg[['State','Monthly Avg Temp (F)','Monthly Temp (F)','State Avg Temp']]

Unnamed: 0,State,Monthly Avg Temp (F),Monthly Temp (F),State Avg Temp
0,Alabama,63.096875,61.55,63.428698
1,Alabama,63.096875,61.55,63.428698
2,Alabama,63.096875,61.55,63.428698
3,Alabama,63.096875,61.55,63.428698
4,Alabama,63.096875,61.55,63.428698
...,...,...,...,...
17282,Wyoming,22.051000,23.39,42.090167
17283,Wyoming,22.051000,23.39,42.090167
17284,Wyoming,22.051000,23.39,42.090167
17285,Wyoming,22.051000,23.39,42.090167


In [1076]:
with_state_avg['Diff from Avg'] = np.abs(with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp'])
with_state_avg.sample(5)

Unnamed: 0,Date,State,Total Pop,Day_of_Wk,Confirmed,Confirmed_diff,Confirmed_rate,Confirmed_rate_diff,Deaths,Deaths_diff,...,Administered,Series_Complete_Yes,Month,Year,Monthly Temp (F),Monthly Avg Temp (F),month_diff_rm_temp,month_avg_diff_rm_temp,State Avg Temp,Diff from Avg
13986,2020-10-09,South Dakota,884659,Friday,27215,774.0,0.030763,0.000875,277,5.0,...,0.0,0.0,10,2020,41.155556,47.130556,28.844444,22.869444,45.714259,1.416296
8227,2020-07-06,Missouri,6137428,Monday,24870,460.0,0.004052,7.5e-05,1062,0.0,...,0.0,0.0,7,2020,79.516667,77.88,9.516667,7.88,55.945486,21.934514
5431,2020-08-11,Kentucky,4467673,Tuesday,35793,539.0,0.008012,0.000121,783,8.0,...,0.0,0.0,8,2020,74.85,75.6775,4.85,5.6775,56.549375,19.128125
8942,2020-07-13,Nebraska,1934408,Monday,21399,227.0,0.011062,0.000117,288,3.0,...,0.0,0.0,7,2020,75.65,75.454375,5.65,5.454375,50.119792,25.334583
9577,2020-05-01,New Hampshire,1359711,Friday,2310,164.0,0.001699,0.000121,81,9.0,...,0.0,0.0,5,2020,52.65,53.365,17.35,16.635,43.458542,9.906458


In [1077]:
def model_10_pipeline(data, test_data=False): 
    window_size = 14
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = np.abs(with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp'])


    X_temp_data = with_state_avg[['State','Date','Diff from Avg']].set_index(['Date','State'])
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_10_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_10_pipeline(train)
    X_val, Y_val     = model_10_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1078]:
weather_model_record, m10, m10_stats, m10_data = make_and_record_model(weather_model_record, 
                                                  model_10_processing, weather_train, weather_val, 
                                                  "Temp diff from State Avg & Conf_diff, 14D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085


In [1079]:
m10_data['train_data'][0].columns

Index(['Confirmed_Diff_day_-13', 'Confirmed_Diff_day_-12',
       'Confirmed_Diff_day_-11', 'Confirmed_Diff_day_-10',
       'Confirmed_Diff_day_-9', 'Confirmed_Diff_day_-8',
       'Confirmed_Diff_day_-7', 'Confirmed_Diff_day_-6',
       'Confirmed_Diff_day_-5', 'Confirmed_Diff_day_-4',
       'Confirmed_Diff_day_-3', 'Confirmed_Diff_day_-2',
       'Confirmed_Diff_day_-1', 'Confirmed_Diff_day_0', 'Diff from Avg'],
      dtype='object')

In [1080]:
m10.coef_

array([[ 0.03737115, -0.04240132, -0.06089067, -0.05297856, -0.0371887 ,
        -0.08170519, -0.01487443,  0.34846662,  0.15481001,  0.1517594 ,
         0.09222063,  0.14026837,  0.1891298 ,  0.15931647, -5.12407079]])

### Model 11 

We saw that colder weather seemed to have a larger impact that warmer weather. To try and capture this in our model, let's compute a feature that measures how distance below average temperature, with values above average zeroed out. 

In [1081]:
def zero_positive_values(val):
    if val < 0:
        return val
    else:
        return 0
    
def model_11_pipeline(data, test_data=False): 
    window_size = 14
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp']
    with_state_avg['Diff below Avg'] = with_state_avg['Diff from Avg'].apply(zero_positive_values)

    X_temp_data = with_state_avg[['State','Date','Diff below Avg']].set_index(['Date','State'])
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_11_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_11_pipeline(train)
    X_val, Y_val     = model_11_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1082]:
weather_model_record, m11, m11_stats, m11_data = make_and_record_model(weather_model_record, 
                                                  model_11_processing, weather_train, weather_val, 
                                                  "Temp diff below State Avg & Conf_diff, 14D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106


This also doesn't seem to improve the model much...  out of curiosity, lets reduce the number of days being used for confirmed case data. 

In [1083]:
def model_12_pipeline(data, test_data=False): 
    window_size = 7
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp']
    with_state_avg['Diff below Avg'] = with_state_avg['Diff from Avg'].apply(zero_positive_values)

    X_temp_data = with_state_avg[['State','Date','Diff below Avg']].set_index(['Date','State'])
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")

    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_12_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_12_pipeline(train)
    X_val, Y_val     = model_12_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1084]:
weather_model_record, m12, m12_stats, m12_data = make_and_record_model(weather_model_record, 
                                                  model_12_processing, weather_train, weather_val, 
                                                  "Temp diff below State Avg & Conf_diff, 7D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 7D",890.387396,497.623461


As another approach, let's add some regularization to the model. 

In [1085]:
from sklearn.linear_model import Ridge
def model_13_pipeline(data, test_data=False): 
    window_size = 7
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp']
    with_state_avg['Diff below Avg'] = with_state_avg['Diff from Avg'].apply(zero_positive_values)

    X_temp_data = with_state_avg[['State','Date','Diff below Avg']].set_index(['Date','State'])
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")

    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_13_processing(train, val):
    model = Ridge(alpha=1)
    X_train, Y_train = model_13_pipeline(train)
    X_val, Y_val     = model_13_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1086]:
weather_model_record, m13, m13_stats, m13_data = make_and_record_model(weather_model_record, 
                                                  model_13_processing, weather_train, weather_val, 
                                                  "Temp diff below State Avg & Conf_diff, 7D, RIDGE", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 7D",890.387396,497.623461
"Temp diff below State Avg & Conf_diff, 7D, RIDGE",890.387396,497.62347


In [1087]:
m13.coef_


array([[0.30843933, 0.10073388, 0.10032248, 0.05143328, 0.10953492,
        0.15998837, 0.16407515, 1.490917  ]])

In [1088]:
m12.coef_

array([[0.30843933, 0.10073388, 0.10032248, 0.05143328, 0.10953492,
        0.15998837, 0.16407515, 1.49091871]])

In [1089]:
from sklearn.linear_model import Lasso

def model_14_processing(train, val):
    model = Lasso(alpha=5)
    X_train, Y_train = model_13_pipeline(train)
    X_val, Y_val     = model_13_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1090]:
weather_model_record, m14, m14_stats, m14_data = make_and_record_model(weather_model_record, 
                                                  model_14_processing, weather_train, weather_val, 
                                                  "Temp diff below State Avg & Conf_diff, 7D, Lasso", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 7D",890.387396,497.623461
"Temp diff below State Avg & Conf_diff, 7D, RIDGE",890.387396,497.62347
"Temp diff below State Avg & Conf_diff, 7D, Lasso",890.393648,498.037558


In [1091]:
print(m14_data['train_data'][0].columns)
m14.coef_

Index(['Confirmed_Diff_day_-6', 'Confirmed_Diff_day_-5',
       'Confirmed_Diff_day_-4', 'Confirmed_Diff_day_-3',
       'Confirmed_Diff_day_-2', 'Confirmed_Diff_day_-1',
       'Confirmed_Diff_day_0', 'Diff below Avg'],
      dtype='object')


array([0.30841822, 0.10071897, 0.10031553, 0.051426  , 0.10952721,
       0.15998001, 0.16406904, 1.41200397])

In [1092]:
print(m13_data['train_data'][0].columns)
m13.coef_

Index(['Confirmed_Diff_day_-6', 'Confirmed_Diff_day_-5',
       'Confirmed_Diff_day_-4', 'Confirmed_Diff_day_-3',
       'Confirmed_Diff_day_-2', 'Confirmed_Diff_day_-1',
       'Confirmed_Diff_day_0', 'Diff below Avg'],
      dtype='object')


array([[0.30843933, 0.10073388, 0.10032248, 0.05143328, 0.10953492,
        0.15998837, 0.16407515, 1.490917  ]])

In [1093]:
print(m11_data['train_data'][0].columns)
m11.coef_

Index(['Confirmed_Diff_day_-13', 'Confirmed_Diff_day_-12',
       'Confirmed_Diff_day_-11', 'Confirmed_Diff_day_-10',
       'Confirmed_Diff_day_-9', 'Confirmed_Diff_day_-8',
       'Confirmed_Diff_day_-7', 'Confirmed_Diff_day_-6',
       'Confirmed_Diff_day_-5', 'Confirmed_Diff_day_-4',
       'Confirmed_Diff_day_-3', 'Confirmed_Diff_day_-2',
       'Confirmed_Diff_day_-1', 'Confirmed_Diff_day_0', 'Diff below Avg'],
      dtype='object')


array([[ 0.03623187, -0.04336849, -0.06166351, -0.05356423, -0.03768618,
        -0.08214346, -0.01523711,  0.34887922,  0.15529221,  0.15231912,
         0.09259055,  0.14077615,  0.18973595,  0.16007107, -0.47790624]])

Let's try a model with lots of weather data, then again with regulariazation. 

In [1094]:
def model_15_pipeline(data, test_data=False): 
    window_size = 7
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp']
    with_state_avg['Diff below Avg'] = with_state_avg['Diff from Avg'].apply(zero_positive_values)

    with_state_avg['month_diff_rm_temp']     = np.abs(data['Monthly Temp (F)'] - 70)
    with_state_avg['month_avg_diff_rm_temp'] = np.abs(data['Monthly Avg Temp (F)'] - 70)
    X_temp_data = with_state_avg[['State','Date',
                                  'Diff below Avg','month_diff_rm_temp', 
                                  'month_avg_diff_rm_temp', 'State Avg Temp', 
                                  'Diff from Avg', 'Diff below Avg']].set_index(['Date','State'])
     
    
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")

    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_15_linear_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_15_pipeline(train)
    X_val, Y_val     = model_15_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1095]:
weather_model_record, m15_lin, m15_lin_stats, m15_data = make_and_record_model(weather_model_record, 
                                                  model_15_linear_processing, weather_train, weather_val, 
                                                  "lots of weather & Conf_diff, 7D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 7D",890.387396,497.623461
"Temp diff below State Avg & Conf_diff, 7D, RIDGE",890.387396,497.62347
"Temp diff below State Avg & Conf_diff, 7D, Lasso",890.393648,498.037558
"lots of weather & Conf_diff, 7D",889.490097,502.976026


In [1096]:
def model_15_ridge_processing(train, val):
    model = Ridge(alpha=1000000000)
    X_train, Y_train = model_15_pipeline(train)
    X_val, Y_val     = model_15_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1097]:
weather_model_record, m15_ridge, m15_ridge_stats, m15_data = make_and_record_model(weather_model_record, 
                                                  model_15_ridge_processing, weather_train, weather_val, 
                                                  "lots of weather & Conf_diff, 7D, RIDGE", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 7D",890.387396,497.623461
"Temp diff below State Avg & Conf_diff, 7D, RIDGE",890.387396,497.62347
"Temp diff below State Avg & Conf_diff, 7D, Lasso",890.393648,498.037558
"lots of weather & Conf_diff, 7D",889.490097,502.976026
"lots of weather & Conf_diff, 7D, RIDGE",891.252446,506.82556


In [1098]:
m15_ridge.coef_

array([[ 0.29985441,  0.1024381 ,  0.10212371,  0.05536974,  0.11036439,
         0.1589347 ,  0.16306243,  0.00115764, -0.00154613, -0.00151603,
         0.00370082, -0.00263536,  0.00115764]])

In [1099]:
def model_15_lasso_processing(train, val):
    model = Lasso(alpha=100000)
    X_train, Y_train = model_15_pipeline(train)
    X_val, Y_val     = model_15_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1100]:
weather_model_record, m15_lasso, m15_lasso_stats, m15_data = make_and_record_model(weather_model_record, 
                                                  model_15_lasso_processing, weather_train, weather_val, 
                                                  "lots of weather & Conf_diff, 7D, LASSO", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 7D",890.387396,497.623461
"Temp diff below State Avg & Conf_diff, 7D, RIDGE",890.387396,497.62347
"Temp diff below State Avg & Conf_diff, 7D, Lasso",890.393648,498.037558
"lots of weather & Conf_diff, 7D",889.490097,502.976026
"lots of weather & Conf_diff, 7D, RIDGE",891.252446,506.82556


In [1101]:
m15_lasso.coef_

array([ 0.30629636,  0.09902594,  0.09886218,  0.04986287,  0.10802291,
        0.15873683,  0.16313514,  0.        , -0.        , -0.        ,
        0.        , -0.        ,  0.        ])

Tweaking the regularization constant until we get improvements on the validation score without significant loss to training, we get a LASSO model that completely ignores the weather data we input into the model and only utilizes the confirmed cases data. One interpretation of this is that the weather effects we saw were coincidental, aligning with external factors such as holiday travel or arrival of variants. Another potential issue with this model is that monthly weather temperatures would have already affected the confirmed case data going into the model. That is the cases from recent days already mediates the effect of temperature. To investigate this second idea further, let's compose a model that drops the case data entirely and only uses weather, and consider the effectiveness of such a model. 

In [1102]:
def model_16_pipeline(data, test_data=False): 
    window_size = 7
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp']
    with_state_avg['Diff below Avg'] = with_state_avg['Diff from Avg'].apply(zero_positive_values)

    with_state_avg['month_diff_rm_temp']     = np.abs(data['Monthly Temp (F)'] - 70)
    with_state_avg['month_avg_diff_rm_temp'] = np.abs(data['Monthly Avg Temp (F)'] - 70)
    X_temp_data = with_state_avg[['State','Date',
                                  'Diff below Avg','month_diff_rm_temp', 
                                  'month_avg_diff_rm_temp', 'State Avg Temp', 
                                  'Diff from Avg', 'Diff below Avg']].set_index(['Date','State'])
     
    
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")

    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)
    X_data = X_data[['Diff below Avg','month_diff_rm_temp', 
            'month_avg_diff_rm_temp', 'State Avg Temp', 'Diff from Avg', 'Diff below Avg']]
    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_16_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_16_pipeline(train)
    X_val, Y_val     = model_16_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

def model_16_ridge_processing(train, val, alpha = 1):
    model = Ridge(alpha=alpha)
    X_train, Y_train = model_16_pipeline(train)
    X_val, Y_val     = model_16_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

def model_16_lasso_processing(train, val, alpha = 1):
    model = Lasso(alpha=alpha)
    X_train, Y_train = model_16_pipeline(train)
    X_val, Y_val     = model_16_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [1103]:
def regularization_plot(alphas, model_processing, train, val):
    for alpha in alphas:
        model, (X_train, Y_train), (X_val, Y_val) = model_processing(train, val, alpha)
        stats = compute_model_stats(model, X_train, Y_train, X_val, Y_val)
        print(stats)
        

In [1104]:
alphas = [1,10,100,1000,3000,10000,30000, 100000, 1000000]
regularization_plot(alphas, model_16_lasso_processing, weather_train, weather_val)

{'training': {'Avg RSME': 2202.353209846267}, 'validation': {'Avg RSME': 3250.00619238348}}
{'training': {'Avg RSME': 2201.7247324489376}, 'validation': {'Avg RSME': 3224.4728312426228}}
{'training': {'Avg RSME': 2198.2088848111125}, 'validation': {'Avg RSME': 2970.6684013391914}}
{'training': {'Avg RSME': 2186.598758286038}, 'validation': {'Avg RSME': 2653.4304156193575}}
{'training': {'Avg RSME': 2191.4489406476628}, 'validation': {'Avg RSME': 2455.6361248141443}}
{'training': {'Avg RSME': 2282.9173344544142}, 'validation': {'Avg RSME': 2087.3966636692053}}
{'training': {'Avg RSME': 2472.7956201612965}, 'validation': {'Avg RSME': 1581.691988203184}}
{'training': {'Avg RSME': 2472.7956201612965}, 'validation': {'Avg RSME': 1581.691988203184}}
{'training': {'Avg RSME': 2472.7956201612965}, 'validation': {'Avg RSME': 1581.691988203184}}


In [1105]:
weather_model_record, m16, m16_stats, m16_data = make_and_record_model(weather_model_record, 
                                                  model_16_processing, weather_train, weather_val, 
                                                  "lots of weather only", override=True)
weather_model_record
weather_model_record, m16_ridge, m16_ridge_stats, m16_data = make_and_record_model(weather_model_record, 
                                                  model_16_ridge_processing, weather_train, weather_val, 
                                                  "lots of weather only, RIDGE", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 7D",890.387396,497.623461
"Temp diff below State Avg & Conf_diff, 7D, RIDGE",890.387396,497.62347
"Temp diff below State Avg & Conf_diff, 7D, Lasso",890.393648,498.037558
"lots of weather & Conf_diff, 7D",889.490097,502.976026
"lots of weather & Conf_diff, 7D, RIDGE",891.252446,506.82556


In [1106]:
train[['State','Confirmed_diff']].groupby('State').std().mean()

Confirmed_diff    1845.130758
dtype: float64

The weather only models have significantly higher RSME and when compared to the inherent variability of the the confirmed_diff data, these don't seem to have a significant effect on the model's predictive capability. In particular we also see a larger gap between the training and validation RSMEs. 

### Weather Interaction Models 

In [1133]:
def model_17_pipeline(data, test_data=False): 
    window_size = 14
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp']
    with_state_avg['Diff below Avg'] = np.abs(with_state_avg['Diff from Avg'].apply(zero_positive_values))

    X_temp_data = with_state_avg[['State', 'Date', 'Diff below Avg']].set_index(['Date','State'])
     
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")

    X_conf_data_scaled = X_conf_data.mul(X_temp_data['Diff below Avg'], axis=0)
    X_data = X_conf_data.merge(X_conf_data_scaled, how="left", 
                               left_index=True, right_index=True,
                              suffixes=('_orig','_temp_scaled'))
    #X_data = X_data[['Diff below Avg','month_diff_rm_temp', 
    #        'month_avg_diff_rm_temp', 'State Avg Temp', 'Diff from Avg', 'Diff below Avg']]
    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_17_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_17_pipeline(train)
    X_val, Y_val     = model_17_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

def model_17_Lasso_processing(train, val, alpha = 1000):
    model = Lasso(alpha=alpha)
    X_train, Y_train = model_17_pipeline(train)
    X_val, Y_val     = model_17_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))


def model_17_Ridge_processing(train, val, alpha = 10000000):
    model = Ridge(alpha=alpha)
    X_train, Y_train = model_17_pipeline(train)
    X_val, Y_val     = model_17_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))


In [1134]:
weather_model_record, m17_ridge, m17_ridge_stats, m17_data = make_and_record_model(weather_model_record, 
                                                  model_17_Ridge_processing, weather_train, weather_val, 
                                                  "conf diff, and conf diff temp scaled, 14D, RIDGE", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 7D",890.387396,497.623461
"Temp diff below State Avg & Conf_diff, 7D, RIDGE",890.387396,497.62347
"Temp diff below State Avg & Conf_diff, 7D, Lasso",890.393648,498.037558
"lots of weather & Conf_diff, 7D",889.490097,502.976026
"lots of weather & Conf_diff, 7D, RIDGE",891.252446,506.82556


In [1117]:
weather_model_record, m17, m17_stats, m17_data = make_and_record_model(weather_model_record, 
                                                  model_17_processing, weather_train, weather_val, 
                                                  "conf diff, and conf diff temp scaled, 14D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 7D",890.387396,497.623461
"Temp diff below State Avg & Conf_diff, 7D, RIDGE",890.387396,497.62347
"Temp diff below State Avg & Conf_diff, 7D, Lasso",890.393648,498.037558
"lots of weather & Conf_diff, 7D",889.490097,502.976026
"lots of weather & Conf_diff, 7D, RIDGE",891.252446,506.82556


In [1128]:
weather_model_record, m17_lasso, m17_lasso_stats, m17_data = make_and_record_model(weather_model_record, 
                                                  model_17_Lasso_processing, weather_train, weather_val, 
                                                  "conf diff, and conf diff temp scaled, 14D, LASSO", override=True)
weather_model_record


  model = cd_fast.enet_coordinate_descent(


Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 7D",890.387396,497.623461
"Temp diff below State Avg & Conf_diff, 7D, RIDGE",890.387396,497.62347
"Temp diff below State Avg & Conf_diff, 7D, Lasso",890.393648,498.037558
"lots of weather & Conf_diff, 7D",889.490097,502.976026
"lots of weather & Conf_diff, 7D, RIDGE",891.252446,506.82556


Still doesn't seem to help. Confusingly, the weather models seem to do worse on the training and validation datasets, despite having _more_ variables available. 

In [1119]:
m17_lasso.coef_

array([ 5.15879893e-02, -6.39230764e-02, -1.56405811e-02, -4.65233384e-02,
       -3.67352297e-02, -1.20290557e-01,  4.62596923e-02,  1.92255667e-01,
        1.75088889e-01,  1.25533640e-01,  1.64201180e-01,  1.60700114e-01,
        1.72500169e-01,  1.85729548e-01, -1.56160800e-03,  2.00763870e-03,
       -3.37230229e-03, -7.79916684e-05, -3.00026143e-05,  2.59126197e-03,
       -4.55574689e-03,  1.13311918e-02, -1.70849798e-03,  1.94195098e-03,
       -5.59621739e-03, -1.23655318e-03,  1.20451002e-03, -1.49694352e-03])