# Weather Modeling

In [125]:
# data handling
import pandas as pd
import numpy as np

# seaborn and matplotlib for visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [126]:
train = pd.read_csv('../data/training_data.csv', index_col="Unnamed: 0")
train['Date'] = pd.to_datetime(train['Date'])
val = pd.read_csv('../data/validation_data.csv', index_col="Unnamed: 0")
test = pd.read_csv('../data/testing_data.csv', index_col="Unnamed: 0")

In [127]:
train.columns

Index(['Date', 'State', 'Total Pop', 'Day_of_Wk', 'Confirmed',
       'Confirmed_diff', 'Confirmed_rate', 'Confirmed_rate_diff', 'Deaths',
       'Deaths_diff', 'Deaths_rate', 'Deaths_rate_diff', 'Recovered',
       'Recovered_rate', 'Recovered_diff', 'Recovered_rate_diff', 'Active',
       'Active_diff', 'Active_rate_diff', 'Active_rate', 'Case_Fatality_Ratio',
       'Administered', 'Series_Complete_Yes', 'Month', 'Year',
       'Monthly Temp (F)', 'Monthly Avg Temp (F)'],
      dtype='object')

In [128]:
train.head()

Unnamed: 0,Date,State,Total Pop,Day_of_Wk,Confirmed,Confirmed_diff,Confirmed_rate,Confirmed_rate_diff,Deaths,Deaths_diff,...,Active_diff,Active_rate_diff,Active_rate,Case_Fatality_Ratio,Administered,Series_Complete_Yes,Month,Year,Monthly Temp (F),Monthly Avg Temp (F)
0,2020-04-12,Alabama,4903185,Sunday,3667,,0.000748,,93,,...,,,0.000708,2.61016,0.0,0.0,4,2020,61.55,63.096875
1,2020-04-13,Alabama,4903185,Monday,3870,203.0,0.000789,4.1e-05,99,6.0,...,165.0,3.4e-05,0.000741,2.651312,0.0,0.0,4,2020,61.55,63.096875
2,2020-04-14,Alabama,4903185,Tuesday,4041,171.0,0.000824,3.5e-05,114,15.0,...,204.0,4.2e-05,0.000783,2.883886,0.0,0.0,4,2020,61.55,63.096875
3,2020-04-15,Alabama,4903185,Wednesday,4307,266.0,0.000878,5.4e-05,118,4.0,...,118.0,2.4e-05,0.000807,2.895706,0.0,0.0,4,2020,61.55,63.096875
4,2020-04-16,Alabama,4903185,Thursday,4465,158.0,0.000911,3.2e-05,133,15.0,...,255.0,5.2e-05,0.000859,3.06099,0.0,0.0,4,2020,61.55,63.096875


## Modeling Methods and Metrics

Since our modeling will take and predict data at the _state_ level, we want our metrics to also be computed at the state level. In this case, we want to compute the root mean squared error, so we aggregate our real and predicted values by state, compute the RSME within the state, and then average the RSME accross all states. 

In [129]:
def RSME_df(df, col_names): 
    '''
    df has two columns, one with predictions and one with actual values,
    passed as `col_names` (order irrelevant)
    Returns the RSME of the predictions w.r.t the actual values 
    '''
    return np.sqrt(np.mean((df[col_names[0]] - df[col_names[1]])**2))

def compute_RSME_by_state(model, X, Y):
    '''
    Y must have 'State' in the index 
    '''
    Y_pred = pd.DataFrame(data=model.predict(X), index=Y.index, columns=Y.columns)
    combined_data = Y_pred.merge(Y, left_index=True, right_index=True, suffixes=('_pred','_actual'))    
    
    state_pred = combined_data.groupby('State').agg(RSME_df, col_names = combined_data.columns)
    state_pred.columns = ['RSME',"_"]
    state_pred = state_pred[['RSME']]
    return state_pred

def avg_state_RSME(model, X, Y):
    state_RSMEs = compute_RSME_by_state(model, X, Y)
    return state_RSMEs.mean()

In [130]:
metrics = ['Avg RSME']
datasets = ["Training", "Validation"]


def compute_stats(model, X, Y):
    avg_RSME = avg_state_RSME(model, X, Y)
    # more computed values go here
    return (avg_RSME)

def compute_model_stats(model, X_train, Y_train, X_val, Y_val):
    model_stats = {}
    train_metrics = compute_stats(model, X_train, Y_train)
    model_stats['training'] = dict(zip(metrics, train_metrics))
    val_metrics = compute_stats(model, X_val, Y_val)
    model_stats['validation'] =  dict(zip(metrics, val_metrics))
    return model_stats
    
def print_model_stats(model_stats):
    print("Model Statistics:")
    print('                | ',' | '.join(list(model_stats.keys()), ))
    print('-----------------------------------------')
    for var in model_stats['training'].keys():
        print("{var:<15} |   {train:.3f}   |   {val:.3f}".format(var = var,
                                      train = model_stats['training'][var], 
                                      val   = model_stats['validation'][var]))


In [131]:
iterables = [datasets, metrics]
col_idx = pd.MultiIndex.from_product(iterables, names=["", ""])
    
def make_fresh_record():
    record = pd.DataFrame(columns=col_idx)
    record.index.name = "Model"
    return record 

def record_model_stats(record, model_stats, model_name, override=False): 
    model_stats_df = pd.json_normalize(model_stats, sep='_')
    model_stats_df.columns = col_idx
    model_data = model_stats_df.iloc[0]
    model_data.name = model_name
    new_record = record.copy()
    # override or new entry
    if override or model_name not in record.index:
        new_record.loc[model_name,:] = model_data
    #exists and don't overide 
    else:
        print("Warning: A model with the name '{}' already exists in this record.".format(model_name))
        print("         Either change model_name or set 'override=True'.")
        return record
    return new_record

def make_and_record_model(record, processing_fun, train, val, name, override=False):
    model, (X_train, Y_train), (X_val, Y_val) = processing_fun(train, val)
    model_stats = compute_model_stats(model, X_train, Y_train, X_val, Y_val)
    record = record_model_stats(record, model_stats, name, override)
    return (record, model, model_stats, {"train_data": (X_train, Y_train), "val_data":(X_val, Y_val)})

### COVID Cases Modeling



In [132]:
def relabel_timeseries_data(X, Y, W, col_name="input"):
    timeseries_names = [col_name+'_day_'+str(i) for i in range(1-W,1)]

    target_day = Y.name
    Y.name = 'target_'+col_name
    Y = Y.reset_index()
    X = X.set_axis(timeseries_names, axis=1, inplace=False)
    X['Target_day'] = target_day
    Y['Target_day'] = target_day
    X = X.reset_index()
    X = X.set_index(['Target_day','State'])
    Y = Y.set_index(['Target_day','State'])
    return (X, Y)

def create_timeseries(df, col):
    return df.pivot_table(index = 'State', columns='Date',
                   values=col).sort_values(by = 'Date', axis='columns')


def convert_timeseries_to_data(df, W, col_name='input'):
    '''
    df is a dataframe, with columns sorted in increasing order by date
    splits rows into timeseries data with W columns of 'input' associated 
    with the W+1 column of 'output' and combined for all rows 
    '''
    d = df.shape[1]
    X = df.iloc[:, 0:W]
    Y = df.iloc[:,W]
    X, Y = relabel_timeseries_data(X, Y, W, col_name)

    for i in range(1,d-W):#1,3,..., d-W-1
        X_data = df.iloc[:, i:i+W] # i+W-1 = W+1,W+2,... d-1
        Y_data = df.iloc[:,i+W] # i+W = W+2,W+3,..., W+d-W = d
        X_data, Y_data = relabel_timeseries_data(X_data, Y_data, W, col_name)
        X = X.append(X_data)
        Y = Y.append(Y_data)

    return (X, Y)

### Model 1 

A repeat of Model 1 from initial modeling. 

In [133]:
from sklearn.linear_model import LinearRegression

def model_1_pipeline(data, test_data=False): 
    window_size = 14
    conf_timeseries_data = create_timeseries(data,'Confirmed_diff')
    X_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_diff")
    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_1_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_1_pipeline(train)
    X_val, Y_val = model_1_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [134]:
model_record = make_fresh_record()

In [135]:
model_record, m1, m1_stats, m1_data = make_and_record_model(model_record, 
                                                  model_1_processing, train, val, 
                                                  "Confirmed_diff only, 14D")
model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only, 14D",866.541095,412.442779


## Weather Models

First we should check for null values in our weather data.

In [136]:
null_temps_by_state = train[['State','Monthly Avg Temp (F)','Monthly Temp (F)']].set_index('State').isna().groupby(level=0).sum()
null_temps_by_state[null_temps_by_state.any(1)]

Unnamed: 0_level_0,Monthly Avg Temp (F),Monthly Temp (F)
State,Unnamed: 1_level_1,Unnamed: 2_level_1
District of Columbia,295,295


We see that D.C. has lots of null values, but all other states have data for all rows. We saw in our weather data analysis that D.C. was not included as a Division for weather collection. For our weather models we will exclude D.C. from analysis, and restrict to proper states. 

In [137]:
weather_train = train[train['State']!='District of Columbia']
weather_val   = val[val['State']!='District of Columbia']
weather_test  = test[test['State']!='District of Columbia']
null_temps_by_state = weather_train[['State','Monthly Avg Temp (F)','Monthly Temp (F)']].set_index('State').isna().groupby(level=0).sum()
display(null_temps_by_state[null_temps_by_state.any(1)])
print("Missing Weather Training Values:   ",weather_train[['State','Monthly Avg Temp (F)','Monthly Temp (F)']].isna().sum().sum())
print("Missing Weather Validation Values: ",weather_val[['State','Monthly Avg Temp (F)','Monthly Temp (F)']].isna().sum().sum())


Unnamed: 0_level_0,Monthly Avg Temp (F),Monthly Temp (F)
State,Unnamed: 1_level_1,Unnamed: 2_level_1


Missing Weather Training Values:    0
Missing Weather Validation Values:  0


After dropping D.C. we no longer have missing weather data. 

In [138]:
weather_model_record = make_fresh_record()
weather_model_record, m1_weather, m1_weather_stats, m1_weather_data = make_and_record_model(weather_model_record, 
                                                  model_1_processing, weather_train, weather_val, 
                                                  "Confirmed_diff only (weather), 14D")
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967


### Model 8 

Let's naively plug in the average monthly temperatures for each state based on the recent monthly averages (2000-2019). 

In [139]:
def model_8_pipeline(data, test_data=False): 
    window_size = 14
    
    X_temp_data = data[['State','Date','Monthly Avg Temp (F)']].set_index(['Date','State'])
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_8_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_8_pipeline(train)
    X_val, Y_val     = model_8_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [140]:
weather_model_record, m8, m8_stats, m8_data = make_and_record_model(weather_model_record, 
                                                  model_8_processing, weather_train, weather_val, 
                                                  "Avg Mon. Temp & Conf_diff, 14D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384


### Model 9 
As we saw from our EDA (`analysis/data_merge_and_eda.ipynb`), absolute temperatures do not seem to follow a linear relationship with confirmed cases, however we did see increases for particularly cold or hot temperatures. To model this is our data, we can compute how far a temperature is from "room temperature", which we will take to be 70 degrees Fahrenheit. 

In [141]:
weather_train[['State','Monthly Avg Temp (F)','Monthly Temp (F)']].head()

Unnamed: 0,State,Monthly Avg Temp (F),Monthly Temp (F)
0,Alabama,63.096875,61.55
1,Alabama,63.096875,61.55
2,Alabama,63.096875,61.55
3,Alabama,63.096875,61.55
4,Alabama,63.096875,61.55


In [142]:
weather_train['month_diff_rm_temp'] = np.abs(weather_train['Monthly Temp (F)'] - 70)
weather_train['month_avg_diff_rm_temp'] = weather_train['Monthly Avg Temp (F)'] - 70
weather_train[['State','Monthly Avg Temp (F)','Monthly Temp (F)','month_diff_rm_temp','month_avg_diff_rm_temp']].head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_train['month_diff_rm_temp'] = np.abs(weather_train['Monthly Temp (F)'] - 70)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  weather_train['month_avg_diff_rm_temp'] = weather_train['Monthly Avg Temp (F)'] - 70


Unnamed: 0,State,Monthly Avg Temp (F),Monthly Temp (F),month_diff_rm_temp,month_avg_diff_rm_temp
0,Alabama,63.096875,61.55,8.45,-6.903125
1,Alabama,63.096875,61.55,8.45,-6.903125
2,Alabama,63.096875,61.55,8.45,-6.903125
3,Alabama,63.096875,61.55,8.45,-6.903125
4,Alabama,63.096875,61.55,8.45,-6.903125


In [143]:
def model_9_pipeline(data, test_data=False): 
    window_size = 14
    
    data.loc[:,'month_diff_rm_temp']     = np.abs(data['Monthly Temp (F)'] - 70)
    data.loc[:,'month_avg_diff_rm_temp'] = np.abs(data['Monthly Avg Temp (F)'] - 70)

    X_temp_data = data[['State','Date','month_avg_diff_rm_temp']].set_index(['Date','State'])
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_9_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_9_pipeline(train)
    X_val, Y_val     = model_9_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [144]:
weather_model_record, m9, m9_stats, m9_data = make_and_record_model(weather_model_record, 
                                                  model_9_processing, weather_train, weather_val, 
                                                  "Abs Mon Avg Temp diff & Conf_diff, 14D", override=True)
weather_model_record

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047


This doesn't really seem to improve the model. 

### Model 10 

We also saw that different states had different sensitivity to "cold" (e.g. Calfornia saw the cold effect between 40 and 50 degrees, but Vermont saw a less pronounced effect only below 40 degrees). We can incporate this into the data by computing the annual average temperature for each state and computing the difference for each month from the _state's_ annual average. 

In [145]:
hist_weather_data = pd.read_csv('../data/historical_monthly_temp_avgs_by_state.csv', index_col=0)
hist_weather_data.head()
state_avg_temps = hist_weather_data.groupby('State').mean().rename(columns={'Monthly Avg Temp (F)':"State Avg Temp"})[['State Avg Temp']]
state_avg_temps.head()

Unnamed: 0_level_0,State Avg Temp
State,Unnamed: 1_level_1
Alabama,63.428698
Arizona,63.623631
Arkansas,61.144722
California,57.130774
Colorado,46.451083


In [146]:
with_state_avg = weather_train.merge(state_avg_temps, how="left", left_on='State',right_index=True)
with_state_avg[['State','Monthly Avg Temp (F)','Monthly Temp (F)','State Avg Temp']]

Unnamed: 0,State,Monthly Avg Temp (F),Monthly Temp (F),State Avg Temp
0,Alabama,63.096875,61.55,63.428698
1,Alabama,63.096875,61.55,63.428698
2,Alabama,63.096875,61.55,63.428698
3,Alabama,63.096875,61.55,63.428698
4,Alabama,63.096875,61.55,63.428698
...,...,...,...,...
17282,Wyoming,22.051000,23.39,42.090167
17283,Wyoming,22.051000,23.39,42.090167
17284,Wyoming,22.051000,23.39,42.090167
17285,Wyoming,22.051000,23.39,42.090167


In [147]:
with_state_avg['Diff from Avg'] = np.abs(with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp'])
with_state_avg.sample(5)

Unnamed: 0,Date,State,Total Pop,Day_of_Wk,Confirmed,Confirmed_diff,Confirmed_rate,Confirmed_rate_diff,Deaths,Deaths_diff,...,Administered,Series_Complete_Yes,Month,Year,Monthly Temp (F),Monthly Avg Temp (F),month_diff_rm_temp,month_avg_diff_rm_temp,State Avg Temp,Diff from Avg
10834,2020-11-12,New York,19453561,Thursday,550678,5225.0,0.028307,0.000269,33975,14.0,...,0.0,0.0,11,2020,42.18,38.563,27.82,31.437,46.551417,7.988417
1322,2020-12-28,California,39512223,Monday,2222669,43735.0,0.056253,0.001107,24546,257.0,...,259059.0,0.0,12,2020,44.514286,43.107143,25.485714,26.892857,57.130774,14.023631
7648,2020-11-12,Minnesota,5639632,Thursday,201795,7225.0,0.035782,0.001281,2849,39.0,...,0.0,0.0,11,2020,34.766667,31.498889,35.233333,38.501111,42.497824,10.998935
8002,2020-11-12,Mississippi,2976149,Thursday,130665,1271.0,0.043904,0.000427,3514,17.0,...,0.0,0.0,11,2020,58.36,54.4625,11.64,15.5375,64.242042,9.779542
15370,2020-09-07,Vermont,623989,Monday,1651,3.0,0.002646,5e-06,58,0.0,...,0.0,0.0,9,2020,57.366667,58.773333,12.633333,11.226667,43.532639,15.240694


In [148]:
def model_10_pipeline(data, test_data=False): 
    window_size = 14
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = np.abs(with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp'])


    X_temp_data = with_state_avg[['State','Date','Diff from Avg']].set_index(['Date','State'])
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_10_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_10_pipeline(train)
    X_val, Y_val     = model_10_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [149]:
weather_model_record, m10, m10_stats, m10_data = make_and_record_model(weather_model_record, 
                                                  model_10_processing, weather_train, weather_val, 
                                                  "Temp diff from State Avg & Conf_diff, 14D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085


In [150]:
m10_data['train_data'][0].columns

Index(['Confirmed_Diff_day_-13', 'Confirmed_Diff_day_-12',
       'Confirmed_Diff_day_-11', 'Confirmed_Diff_day_-10',
       'Confirmed_Diff_day_-9', 'Confirmed_Diff_day_-8',
       'Confirmed_Diff_day_-7', 'Confirmed_Diff_day_-6',
       'Confirmed_Diff_day_-5', 'Confirmed_Diff_day_-4',
       'Confirmed_Diff_day_-3', 'Confirmed_Diff_day_-2',
       'Confirmed_Diff_day_-1', 'Confirmed_Diff_day_0', 'Diff from Avg'],
      dtype='object')

In [151]:
m10.coef_

array([[ 0.03737115, -0.04240132, -0.06089067, -0.05297856, -0.0371887 ,
        -0.08170519, -0.01487443,  0.34846662,  0.15481001,  0.1517594 ,
         0.09222063,  0.14026837,  0.1891298 ,  0.15931647, -5.12407079]])

### Model 11 

We saw that colder weather seemed to have a larger impact that warmer weather. To try and capture this in our model, let's compute a feature that measures how distance below average temperature, with values above average zeroed out. 

In [152]:
def zero_positive_values(val):
    if val < 0:
        return val
    else:
        return 0
    
def model_11_pipeline(data, test_data=False): 
    window_size = 14
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp']
    with_state_avg['Diff below Avg'] = with_state_avg['Diff from Avg'].apply(zero_positive_values)

    X_temp_data = with_state_avg[['State','Date','Diff below Avg']].set_index(['Date','State'])
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")
    
    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_11_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_11_pipeline(train)
    X_val, Y_val     = model_11_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [153]:
weather_model_record, m11, m11_stats, m11_data = make_and_record_model(weather_model_record, 
                                                  model_11_processing, weather_train, weather_val, 
                                                  "Temp diff below State Avg & Conf_diff, 14D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106


This also doesn't seem to improve the model much...  out of curiosity, lets reduce the number of days being used for confirmed case data. 

In [154]:
def model_12_pipeline(data, test_data=False): 
    window_size = 14
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp']
    with_state_avg['Diff below Avg'] = with_state_avg['Diff from Avg'].apply(zero_positive_values)

    X_temp_data = with_state_avg[['State','Date','Diff below Avg']].set_index(['Date','State'])
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")

    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_12_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_12_pipeline(train)
    X_val, Y_val     = model_12_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [155]:
weather_model_record, m12, m12_stats, m12_data = make_and_record_model(weather_model_record, 
                                                  model_12_processing, weather_train, weather_val, 
                                                  "Temp diff below State Avg & Conf_diff, 14D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106


As another approach, let's add some regularization to the model. 

In [156]:
from sklearn.linear_model import Ridge
def model_13_pipeline(data, test_data=False): 
    window_size = 14
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp']
    with_state_avg['Diff below Avg'] = with_state_avg['Diff from Avg'].apply(zero_positive_values)

    X_temp_data = with_state_avg[['State','Date','Diff below Avg']].set_index(['Date','State'])
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")

    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_13_processing(train, val):
    model = Ridge(alpha=1)
    X_train, Y_train = model_13_pipeline(train)
    X_val, Y_val     = model_13_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [157]:
weather_model_record, m13, m13_stats, m13_data = make_and_record_model(weather_model_record, 
                                                  model_13_processing, weather_train, weather_val, 
                                                  "Temp diff below State Avg & Conf_diff, 14D, RIDGE", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 14D, RIDGE",883.292379,422.591057


In [158]:
m13.coef_


array([[ 0.03623187, -0.04336849, -0.06166351, -0.05356423, -0.03768618,
        -0.08214345, -0.01523711,  0.34887922,  0.15529221,  0.15231912,
         0.09259055,  0.14077615,  0.18973595,  0.16007107, -0.47790568]])

In [159]:
m12.coef_

array([[ 0.03623187, -0.04336849, -0.06166351, -0.05356423, -0.03768618,
        -0.08214346, -0.01523711,  0.34887922,  0.15529221,  0.15231912,
         0.09259055,  0.14077615,  0.18973595,  0.16007107, -0.47790624]])

In [160]:
from sklearn.linear_model import Lasso

def model_14_processing(train, val):
    model = Lasso(alpha=5)
    X_train, Y_train = model_13_pipeline(train)
    X_val, Y_val     = model_13_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [161]:
weather_model_record, m14, m14_stats, m14_data = make_and_record_model(weather_model_record, 
                                                  model_14_processing, weather_train, weather_val, 
                                                  "Temp diff below State Avg & Conf_diff, 14D, Lasso", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 14D, RIDGE",883.292379,422.591057
"Temp diff below State Avg & Conf_diff, 14D, Lasso",883.270871,422.204921


In [162]:
print(m14_data['train_data'][0].columns)
m14.coef_

Index(['Confirmed_Diff_day_-13', 'Confirmed_Diff_day_-12',
       'Confirmed_Diff_day_-11', 'Confirmed_Diff_day_-10',
       'Confirmed_Diff_day_-9', 'Confirmed_Diff_day_-8',
       'Confirmed_Diff_day_-7', 'Confirmed_Diff_day_-6',
       'Confirmed_Diff_day_-5', 'Confirmed_Diff_day_-4',
       'Confirmed_Diff_day_-3', 'Confirmed_Diff_day_-2',
       'Confirmed_Diff_day_-1', 'Confirmed_Diff_day_0', 'Diff below Avg'],
      dtype='object')


array([ 0.03624719, -0.04334855, -0.06164782, -0.05355211, -0.03767555,
       -0.08213463, -0.01522932,  0.34887365,  0.15528584,  0.15231383,
        0.0925899 ,  0.14077536,  0.18973753,  0.16007362, -0.39970055])

In [163]:
print(m13_data['train_data'][0].columns)
m13.coef_

Index(['Confirmed_Diff_day_-13', 'Confirmed_Diff_day_-12',
       'Confirmed_Diff_day_-11', 'Confirmed_Diff_day_-10',
       'Confirmed_Diff_day_-9', 'Confirmed_Diff_day_-8',
       'Confirmed_Diff_day_-7', 'Confirmed_Diff_day_-6',
       'Confirmed_Diff_day_-5', 'Confirmed_Diff_day_-4',
       'Confirmed_Diff_day_-3', 'Confirmed_Diff_day_-2',
       'Confirmed_Diff_day_-1', 'Confirmed_Diff_day_0', 'Diff below Avg'],
      dtype='object')


array([[ 0.03623187, -0.04336849, -0.06166351, -0.05356423, -0.03768618,
        -0.08214345, -0.01523711,  0.34887922,  0.15529221,  0.15231912,
         0.09259055,  0.14077615,  0.18973595,  0.16007107, -0.47790568]])

In [164]:
print(m11_data['train_data'][0].columns)
m11.coef_

Index(['Confirmed_Diff_day_-13', 'Confirmed_Diff_day_-12',
       'Confirmed_Diff_day_-11', 'Confirmed_Diff_day_-10',
       'Confirmed_Diff_day_-9', 'Confirmed_Diff_day_-8',
       'Confirmed_Diff_day_-7', 'Confirmed_Diff_day_-6',
       'Confirmed_Diff_day_-5', 'Confirmed_Diff_day_-4',
       'Confirmed_Diff_day_-3', 'Confirmed_Diff_day_-2',
       'Confirmed_Diff_day_-1', 'Confirmed_Diff_day_0', 'Diff below Avg'],
      dtype='object')


array([[ 0.03623187, -0.04336849, -0.06166351, -0.05356423, -0.03768618,
        -0.08214346, -0.01523711,  0.34887922,  0.15529221,  0.15231912,
         0.09259055,  0.14077615,  0.18973595,  0.16007107, -0.47790624]])

Let's try a model with lots of weather data, then again with regulariazation. 

In [165]:
def model_15_pipeline(data, test_data=False): 
    window_size = 14
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp']
    with_state_avg['Diff below Avg'] = with_state_avg['Diff from Avg'].apply(zero_positive_values)

    with_state_avg['month_diff_rm_temp']     = np.abs(data['Monthly Temp (F)'] - 70)
    with_state_avg['month_avg_diff_rm_temp'] = np.abs(data['Monthly Avg Temp (F)'] - 70)
    X_temp_data = with_state_avg[['State','Date',
                                  'Diff below Avg','month_diff_rm_temp', 
                                  'month_avg_diff_rm_temp', 'State Avg Temp', 
                                  'Diff from Avg', 'Diff below Avg']].set_index(['Date','State'])
     
    
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")

    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)

    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_15_linear_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_15_pipeline(train)
    X_val, Y_val     = model_15_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [166]:
weather_model_record, m15_lin, m15_lin_stats, m15_data = make_and_record_model(weather_model_record, 
                                                  model_15_linear_processing, weather_train, weather_val, 
                                                  "lots of weather & Conf_diff, 14D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 14D, RIDGE",883.292379,422.591057
"Temp diff below State Avg & Conf_diff, 14D, Lasso",883.270871,422.204921
"lots of weather & Conf_diff, 14D",882.256788,428.120726


In [167]:
def model_15_ridge_processing(train, val):
    model = Ridge(alpha=1000000000)
    X_train, Y_train = model_15_pipeline(train)
    X_val, Y_val     = model_15_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [168]:
weather_model_record, m15_ridge, m15_ridge_stats, m15_data = make_and_record_model(weather_model_record, 
                                                  model_15_ridge_processing, weather_train, weather_val, 
                                                  "lots of weather & Conf_diff, 14D, RIDGE", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 14D, RIDGE",883.292379,422.591057
"Temp diff below State Avg & Conf_diff, 14D, Lasso",883.270871,422.204921
"lots of weather & Conf_diff, 14D",882.256788,428.120726
"lots of weather & Conf_diff, 14D, RIDGE",883.78223,421.371089


In [169]:
m15_ridge.coef_

array([[ 0.03656945, -0.03959625, -0.05660682, -0.05062052, -0.03398304,
        -0.07413812, -0.01091059,  0.33480186,  0.15078512,  0.14898587,
         0.09302799,  0.1385007 ,  0.18570424,  0.16012497, -0.00037359,
         0.00040843,  0.00038726,  0.00362051, -0.00442389, -0.00037359]])

In [170]:
def model_15_lasso_processing(train, val):
    model = Lasso(alpha=100000)
    X_train, Y_train = model_15_pipeline(train)
    X_val, Y_val     = model_15_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [171]:
weather_model_record, m15_lasso, m15_lasso_stats, m15_data = make_and_record_model(weather_model_record, 
                                                  model_15_lasso_processing, weather_train, weather_val, 
                                                  "lots of weather & Conf_diff, 14D, LASSO", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 14D, RIDGE",883.292379,422.591057
"Temp diff below State Avg & Conf_diff, 14D, Lasso",883.270871,422.204921
"lots of weather & Conf_diff, 14D",882.256788,428.120726
"lots of weather & Conf_diff, 14D, RIDGE",883.78223,421.371089
"lots of weather & Conf_diff, 14D, LASSO",898.625775,444.559726


In [172]:
m15_lasso.coef_

array([ 0.        , -0.        , -0.        , -0.        , -0.        ,
       -0.        ,  0.        ,  0.30607035,  0.09842798,  0.10094277,
        0.0505964 ,  0.10744342,  0.15916923,  0.16185657,  0.        ,
       -0.        , -0.        ,  0.        , -0.        ,  0.        ])

Tweaking the regularization constant until we get improvements on the validation score without significant loss to training, we get a LASSO model that completely ignores the weather data we input into the model and only utilizes the confirmed cases data. One interpretation of this is that the weather effects we saw were coincidental, aligning with external factors such as holiday travel or arrival of variants. Another potential issue with this model is that monthly weather temperatures would have already affected the confirmed case data going into the model. That is the cases from recent days already mediates the effect of temperature. To investigate this second idea further, let's compose a model that drops the case data entirely and only uses weather, and consider the effectiveness of such a model. 

In [173]:
def model_16_pipeline(data, test_data=False): 
    window_size = 14
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp']
    with_state_avg['Diff below Avg'] = with_state_avg['Diff from Avg'].apply(zero_positive_values)

    with_state_avg['month_diff_rm_temp']     = np.abs(data['Monthly Temp (F)'] - 70)
    with_state_avg['month_avg_diff_rm_temp'] = np.abs(data['Monthly Avg Temp (F)'] - 70)
    X_temp_data = with_state_avg[['State','Date',
                                  'Diff below Avg','month_diff_rm_temp', 
                                  'month_avg_diff_rm_temp', 'State Avg Temp', 
                                  'Diff from Avg', 'Diff below Avg']].set_index(['Date','State'])
     
    
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")

    X_data = X_conf_data.merge(X_temp_data, how="left", left_index=True, right_index=True)
    X_data = X_data[['Diff below Avg','month_diff_rm_temp', 
            'month_avg_diff_rm_temp', 'State Avg Temp', 'Diff from Avg', 'Diff below Avg']]
    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_16_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_16_pipeline(train)
    X_val, Y_val     = model_16_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

def model_16_ridge_processing(train, val, alpha = 1):
    model = Ridge(alpha=alpha)
    X_train, Y_train = model_16_pipeline(train)
    X_val, Y_val     = model_16_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

def model_16_lasso_processing(train, val, alpha = 1):
    model = Lasso(alpha=alpha)
    X_train, Y_train = model_16_pipeline(train)
    X_val, Y_val     = model_16_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [174]:
def regularization_plot(alphas, model_processing, train, val):
    for alpha in alphas:
        model, (X_train, Y_train), (X_val, Y_val) = model_processing(train, val, alpha)
        stats = compute_model_stats(model, X_train, Y_train, X_val, Y_val)
        print(stats)
        

In [175]:
alphas = [1,10,100,1000,3000,10000,30000, 100000, 1000000]
regularization_plot(alphas, model_16_lasso_processing, weather_train, weather_val)

  model = cd_fast.enet_coordinate_descent(


{'training': {'Avg RSME': 2218.7521455257825}, 'validation': {'Avg RSME': 3297.915215239234}}
{'training': {'Avg RSME': 2218.0573707589606}, 'validation': {'Avg RSME': 3271.6061605503096}}
{'training': {'Avg RSME': 2214.2940576316337}, 'validation': {'Avg RSME': 3013.373071692484}}
{'training': {'Avg RSME': 2205.551403422754}, 'validation': {'Avg RSME': 2644.070924326353}}
{'training': {'Avg RSME': 2203.142744819649}, 'validation': {'Avg RSME': 2480.652040166757}}
{'training': {'Avg RSME': 2294.426983063541}, 'validation': {'Avg RSME': 2094.537089128639}}
{'training': {'Avg RSME': 2494.963944608399}, 'validation': {'Avg RSME': 1511.0879424453153}}
{'training': {'Avg RSME': 2494.963944608399}, 'validation': {'Avg RSME': 1511.0879424453153}}
{'training': {'Avg RSME': 2494.963944608399}, 'validation': {'Avg RSME': 1511.0879424453153}}


In [176]:
weather_model_record, m16, m16_stats, m16_data = make_and_record_model(weather_model_record, 
                                                  model_16_processing, weather_train, weather_val, 
                                                  "lots of weather only", override=True)
weather_model_record
weather_model_record, m16_ridge, m16_ridge_stats, m16_data = make_and_record_model(weather_model_record, 
                                                  model_16_ridge_processing, weather_train, weather_val, 
                                                  "lots of weather only, RIDGE", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 14D, RIDGE",883.292379,422.591057
"Temp diff below State Avg & Conf_diff, 14D, Lasso",883.270871,422.204921
"lots of weather & Conf_diff, 14D",882.256788,428.120726
"lots of weather & Conf_diff, 14D, RIDGE",883.78223,421.371089
"lots of weather & Conf_diff, 14D, LASSO",898.625775,444.559726


In [177]:
train[['State','Confirmed_diff']].groupby('State').std().mean()

Confirmed_diff    1845.130758
dtype: float64

The weather only models have significantly higher RSME and when compared to the inherent variability of the the confirmed_diff data, these don't seem to have a significant effect on the model's predictive capability. In particular we also see a larger gap between the training and validation RSMEs. 

### Weather Interaction Models 

In [178]:
def model_17_pipeline(data, test_data=False): 
    window_size = 14
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp']
    with_state_avg['Diff below Avg'] = np.abs(with_state_avg['Diff from Avg'].apply(zero_positive_values))

    X_temp_data = with_state_avg[['State', 'Date', 'Diff below Avg']].set_index(['Date','State'])
     
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")

    X_conf_data_scaled = X_conf_data.mul(X_temp_data['Diff below Avg'], axis=0)
    X_data = X_conf_data.merge(X_conf_data_scaled, how="left", 
                               left_index=True, right_index=True,
                              suffixes=('_orig','_temp_scaled'))
    #X_data = X_data[['Diff below Avg','month_diff_rm_temp', 
    #        'month_avg_diff_rm_temp', 'State Avg Temp', 'Diff from Avg', 'Diff below Avg']]
    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_17_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_17_pipeline(train)
    X_val, Y_val     = model_17_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

def model_17_Lasso_processing(train, val, alpha = 1000):
    model = Lasso(alpha=alpha)
    X_train, Y_train = model_17_pipeline(train)
    X_val, Y_val     = model_17_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))


def model_17_Ridge_processing(train, val, alpha = 10000000):
    model = Ridge(alpha=alpha)
    X_train, Y_train = model_17_pipeline(train)
    X_val, Y_val     = model_17_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))


In [179]:
weather_model_record, m17_ridge, m17_ridge_stats, m17_data = make_and_record_model(weather_model_record, 
                                                  model_17_Ridge_processing, weather_train, weather_val, 
                                                  "conf diff, and conf diff temp scaled, 14D, RIDGE", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 14D, RIDGE",883.292379,422.591057
"Temp diff below State Avg & Conf_diff, 14D, Lasso",883.270871,422.204921
"lots of weather & Conf_diff, 14D",882.256788,428.120726
"lots of weather & Conf_diff, 14D, RIDGE",883.78223,421.371089
"lots of weather & Conf_diff, 14D, LASSO",898.625775,444.559726


In [180]:
weather_model_record, m17, m17_stats, m17_data = make_and_record_model(weather_model_record, 
                                                  model_17_processing, weather_train, weather_val, 
                                                  "conf diff, and conf diff temp scaled, 14D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 14D, RIDGE",883.292379,422.591057
"Temp diff below State Avg & Conf_diff, 14D, Lasso",883.270871,422.204921
"lots of weather & Conf_diff, 14D",882.256788,428.120726
"lots of weather & Conf_diff, 14D, RIDGE",883.78223,421.371089
"lots of weather & Conf_diff, 14D, LASSO",898.625775,444.559726


In [181]:
weather_model_record, m17_lasso, m17_lasso_stats, m17_data = make_and_record_model(weather_model_record, 
                                                  model_17_Lasso_processing, weather_train, weather_val, 
                                                  "conf diff, and conf diff temp scaled, 14D, LASSO", override=True)
weather_model_record


  model = cd_fast.enet_coordinate_descent(


Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 14D, RIDGE",883.292379,422.591057
"Temp diff below State Avg & Conf_diff, 14D, Lasso",883.270871,422.204921
"lots of weather & Conf_diff, 14D",882.256788,428.120726
"lots of weather & Conf_diff, 14D, RIDGE",883.78223,421.371089
"lots of weather & Conf_diff, 14D, LASSO",898.625775,444.559726


Still doesn't seem to help. Confusingly, the weather models seem to do worse on the training and validation datasets, despite having _more_ variables available. 

In [182]:
m17_lasso.coef_

array([ 5.15879893e-02, -6.39230764e-02, -1.56405811e-02, -4.65233384e-02,
       -3.67352297e-02, -1.20290557e-01,  4.62596923e-02,  1.92255667e-01,
        1.75088889e-01,  1.25533640e-01,  1.64201180e-01,  1.60700114e-01,
        1.72500169e-01,  1.85729548e-01, -1.56160800e-03,  2.00763870e-03,
       -3.37230229e-03, -7.79916684e-05, -3.00026143e-05,  2.59126197e-03,
       -4.55574689e-03,  1.13311918e-02, -1.70849798e-03,  1.94195098e-03,
       -5.59621739e-03, -1.23655318e-03,  1.20451002e-03, -1.49694352e-03])

In [183]:
def model_18_pipeline(data, test_data=False): 
    window_size = 14
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Abs Diff from Room Temp'] = np.abs(with_state_avg['Monthly Avg Temp (F)'] - 70)

    X_temp_data = with_state_avg[['State', 'Date', 'Abs Diff from Room Temp']].set_index(['Date','State'])
     
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")

    X_conf_data_scaled = X_conf_data.mul(X_temp_data['Abs Diff from Room Temp'], axis=0)
    X_data = X_conf_data.merge(X_conf_data_scaled, how="left", 
                               left_index=True, right_index=True,
                              suffixes=('_orig','_temp_scaled'))
    #X_data = X_data[['Diff below Avg','month_diff_rm_temp', 
    #        'month_avg_diff_rm_temp', 'State Avg Temp', 'Diff from Avg', 'Diff below Avg']]
    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_18_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_18_pipeline(train)
    X_val, Y_val     = model_18_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))

In [184]:
weather_model_record, m18, m18_stats, m18_data = make_and_record_model(weather_model_record, 
                                                  model_18_processing, weather_train, weather_val, 
                                                  "conf diff, and conf diff temp scaled abs from room temp, 14D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 14D, RIDGE",883.292379,422.591057
"Temp diff below State Avg & Conf_diff, 14D, Lasso",883.270871,422.204921
"lots of weather & Conf_diff, 14D",882.256788,428.120726
"lots of weather & Conf_diff, 14D, RIDGE",883.78223,421.371089
"lots of weather & Conf_diff, 14D, LASSO",898.625775,444.559726


In [185]:
def model_19_pipeline(data, test_data=False): 
    window_size = 14
    
    with_state_avg = data.merge(state_avg_temps, how="left", left_on='State',right_index=True)

    with_state_avg['Diff from Avg'] = with_state_avg['Monthly Avg Temp (F)'] - with_state_avg['State Avg Temp']
    with_state_avg['cold scaling'] = 1+np.abs(with_state_avg['Diff from Avg'].apply(zero_positive_values))

    X_temp_data = with_state_avg[['State', 'Date', 'cold scaling']].set_index(['Date','State'])
     
    X_temp_data.index = X_temp_data.index.set_names('Target_day', level=0)
    
    conf_timeseries_data = create_timeseries(data,"Confirmed_diff")
    X_conf_data, Y_data  = convert_timeseries_to_data(conf_timeseries_data, 
                                                window_size, 
                                                col_name="Confirmed_Diff")

    X_conf_data_scaled = X_conf_data.mul(X_temp_data['cold scaling'], axis=0)
    X_data = X_conf_data.merge(X_conf_data_scaled, how="left", 
                               left_index=True, right_index=True,
                              suffixes=('_orig','_temp_scaled'))
    #X_data = X_data[['Diff below Avg','month_diff_rm_temp', 
    #        'month_avg_diff_rm_temp', 'State Avg Temp', 'Diff from Avg', 'Diff below Avg']]
    if test_data:
        return X_data
    else: 
        return (X_data, Y_data)

def model_19_processing(train, val):
    model = LinearRegression()
    X_train, Y_train = model_19_pipeline(train)
    X_val, Y_val     = model_19_pipeline(val)
    model.fit(X_train, Y_train)
    return (model, (X_train, Y_train), (X_val, Y_val))


In [186]:
weather_model_record, m19, m19_stats, m19_data = make_and_record_model(weather_model_record, 
                                                  model_19_processing, weather_train, weather_val, 
                                                  "conf diff, and conf diff temp scaled (1+cold scaling), 14D", override=True)
weather_model_record

Unnamed: 0_level_0,Training,Validation
Unnamed: 0_level_1,Avg RSME,Avg RSME
Model,Unnamed: 1_level_2,Unnamed: 2_level_2
"Confirmed_diff only (weather), 14D",883.179033,420.309967
"Avg Mon. Temp & Conf_diff, 14D",883.317137,422.144384
"Abs Mon Avg Temp diff & Conf_diff, 14D",883.251881,421.503047
"Temp diff from State Avg & Conf_diff, 14D",882.861374,414.593085
"Temp diff below State Avg & Conf_diff, 14D",883.292379,422.59106
"Temp diff below State Avg & Conf_diff, 14D, RIDGE",883.292379,422.591057
"Temp diff below State Avg & Conf_diff, 14D, Lasso",883.270871,422.204921
"lots of weather & Conf_diff, 14D",882.256788,428.120726
"lots of weather & Conf_diff, 14D, RIDGE",883.78223,421.371089
"lots of weather & Conf_diff, 14D, LASSO",898.625775,444.559726


In [192]:
weather_model_record.index

Index(['Confirmed_diff only (weather), 14D', 'Avg Mon. Temp & Conf_diff, 14D',
       'Abs Mon Avg Temp diff & Conf_diff, 14D',
       'Temp diff from State Avg & Conf_diff, 14D',
       'Temp diff below State Avg & Conf_diff, 14D',
       'Temp diff below State Avg & Conf_diff, 14D, RIDGE',
       'Temp diff below State Avg & Conf_diff, 14D, Lasso',
       'lots of weather & Conf_diff, 14D',
       'lots of weather & Conf_diff, 14D, RIDGE',
       'lots of weather & Conf_diff, 14D, LASSO', 'lots of weather only',
       'lots of weather only, RIDGE',
       'conf diff, and conf diff temp scaled, 14D, RIDGE',
       'conf diff, and conf diff temp scaled, 14D',
       'conf diff, and conf diff temp scaled, 14D, LASSO',
       'conf diff, and conf diff temp scaled abs from room temp, 14D',
       'conf diff, and conf diff temp scaled (1+cold scaling), 14D'],
      dtype='object', name='Model')

In [227]:
report_model1 = weather_model_record.copy().astype(float).round(2)
report_model = report_model1.loc[['Confirmed_diff only (weather), 14D', 
                  'Avg Mon. Temp & Conf_diff, 14D',
                  'Temp diff below State Avg & Conf_diff, 14D',
                  'conf diff, and conf diff temp scaled, 14D'],:]
report_model.index = ['Model 1',#: Case Rates (CR) Only', 
                      'Model 2',#: CR+Average Monthly Temperatures',
                      'Model 3',#: CR+Temperature Below State Average',
                      'Model 4']#: CR+Scaled Case Rates by Temperature Below State Average']
report_model['Variables'] = ['Case Rates (CR) Only','CR+Average Monthly Temperatures',
                             'CR+Temperature Below State Average',
                            'CR+Scaled Case Rates by Temperature Below State Average']


pd.set_option("display.colheader_justify","left")
report_model.style.set_properties(**{'text-align': 'left'})
report_model

Unnamed: 0_level_0,Training,Validation,Variables
Unnamed: 0_level_1,Avg RSME,Avg RSME,Unnamed: 3_level_1
Model 1,883.18,420.31,Case Rates (CR) Only
Model 2,883.32,422.14,CR+Average Monthly Temperatures
Model 3,883.29,422.59,CR+Temperature Below State Average
Model 4,880.1,414.61,CR+Scaled Case Rates by Temperature Below Stat...


In [220]:
weather_model_record.loc[:,('Validation','Avg RSME')].astype(float)

Model
Confirmed_diff only (weather), 14D                               420.309967
Avg Mon. Temp & Conf_diff, 14D                                   422.144384
Abs Mon Avg Temp diff & Conf_diff, 14D                           421.503047
Temp diff from State Avg & Conf_diff, 14D                        414.593085
Temp diff below State Avg & Conf_diff, 14D                       422.591060
Temp diff below State Avg & Conf_diff, 14D, RIDGE                422.591057
Temp diff below State Avg & Conf_diff, 14D, Lasso                422.204921
lots of weather & Conf_diff, 14D                                 428.120726
lots of weather & Conf_diff, 14D, RIDGE                          421.371089
lots of weather & Conf_diff, 14D, LASSO                          444.559726
lots of weather only                                            3260.832104
lots of weather only, RIDGE                                     3300.432641
conf diff, and conf diff temp scaled, 14D, RIDGE                 414.622788
conf d