In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, cross_val_score

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import regularizers
from keras.callbacks import EarlyStopping

from drivendata_validator import *

In [175]:
# the pred_maker!

def pred_maker(test_set, model, csv_name, polyfunc=None, scalerfunc=None,
               sample_csv="./data/submission_format.csv", submission_dir="./submissions/"):
    """Build a submission file from a test set and a fitted model.

    The test set is prepared the same way the training cells prepare ``train``:
    rows with nulls and the ``week_start_date`` / ``Unnamed: 0`` columns are
    dropped, the remaining non-numeric columns are dummy-encoded, and the
    optional polynomial expansion and scaler are applied before predicting.

    Parameters
    ----------
    test_set : pandas.DataFrame
        Test data containing at least ``week_start_date`` and ``Unnamed: 0``.
        It is NOT modified; all work happens on a derived copy.
    model : object
        Anything exposing ``predict(X)``.
    csv_name : str
        File name of the submission written inside ``submission_dir``.
    polyfunc : fitted transformer, optional
        Applied (``transform`` only) to the weather columns; must expose
        ``get_feature_names_out`` or the older ``get_feature_names``.
    scalerfunc : fitted transformer, optional
        Applied (``transform`` only) to the full feature matrix.
    sample_csv : str, optional
        Path to the submission template whose ``total_cases`` column is filled.
    submission_dir : str, optional
        Directory the submission CSV is written to.

    Returns
    -------
    pandas.DataFrame
        The submission template with ``total_cases`` replaced by the predictions.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of template rows
        (for instance because ``dropna`` removed test rows).
    """
    import os  # local import: only needed to build the output path

    # derive a fresh frame instead of mutating the caller's DataFrame in place
    prepared = test_set.dropna().drop(["week_start_date", "Unnamed: 0"], axis=1)

    # dummying cities
    prepared = pd.get_dummies(prepared)

    # every column that is not listed here is treated as a weather feature
    test_non_weather = "year weekofyear ndvi_ne ndvi_nw ndvi_se ndvi_sw city_iq city_sj".split()
    test_weather_features = [col for col in prepared.columns if col not in test_non_weather]

    if polyfunc is not None:
        # use the transformer that was passed in, never a notebook-level global
        pf_test = polyfunc.transform(prepared[test_weather_features])
        # get_feature_names was replaced by get_feature_names_out in newer scikit-learn
        name_features = getattr(polyfunc, "get_feature_names_out", None) or polyfunc.get_feature_names
        test_weather = pd.DataFrame(data=pf_test, columns=name_features(test_weather_features))

        # both frames get a fresh 0..n-1 index so they line up row by row
        remaining = prepared.drop(labels=test_weather_features, axis=1).reset_index(drop=True)
        prepared = pd.concat([remaining, test_weather], axis=1)

    if scalerfunc is not None:
        prepared = scalerfunc.transform(prepared)

    # Keras returns an (n, 1) array; flatten so it fits a single column
    pred_array = np.asarray(model.predict(prepared)).ravel()

    # Start from the official sample submission so the layout is exactly what
    # DrivenData's validator expects, and only replace the target column.
    pred_df = pd.read_csv(sample_csv)
    if len(pred_array) != len(pred_df):
        raise ValueError(
            "got {} predictions for {} submission rows; check for rows removed by dropna".format(
                len(pred_array), len(pred_df)
            )
        )
    pred_df["total_cases"] = pred_array
    # NOTE(review): astype truncates toward zero rather than rounding - confirm this is intended
    pred_df["total_cases"] = pred_df["total_cases"].astype("int64")

    pred_df.to_csv(os.path.join(submission_dir, csv_name), index=False)

    return pred_df

### Loading up and prepping

In [160]:
# locations of the cleaned training data and the competition test data
train_csv, test_csv = "./data/cleaned_train.csv", "./data/test.csv"

In [161]:
# read the training data from the path held in train_csv
train = pd.read_csv(train_csv)

In [162]:
# widen pandas' display limits so whole frames can be inspected in the notebook
for option_name, limit in (("display.max_columns", 50), ("display.max_rows", 200)):
    pd.set_option(option_name, limit)

In [163]:
# preview the first rows of the training frame
train.head()

Unnamed: 0.1,Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
0,0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,299.8,295.9,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,4
1,1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,300.9,296.4,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,5
2,2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,300.5,297.3,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,4
3,3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,301.4,297.0,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,3
4,4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,301.9,297.5,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,6


In [164]:
# remove the leftover index column and the date string in one pass
train = train.drop(columns=["Unnamed: 0", "week_start_date"])

In [165]:
# dummy-encode the remaining non-numeric columns (later cells refer to city_iq / city_sj)
train = pd.get_dummies(train)

### 1. Recreating Neural Network with Poly Weather Feats

In [166]:
# columns that are NOT weather measurements: calendar, vegetation indices, target, city dummies
non_weather = "year weekofyear ndvi_ne ndvi_nw ndvi_se ndvi_sw total_cases city_iq city_sj".split()
# everything else in the frame is treated as a weather feature, in original column order
weather_features = [
    column
    for column in train.columns
    if column not in non_weather
]

In [167]:
# degree-2 expansion (squares and pairwise products) of the weather features, without a bias column
pf = PolynomialFeatures(degree=2, include_bias=False)
pf_values = pf.fit_transform(train[weather_features])
# get_feature_names was replaced by get_feature_names_out in newer scikit-learn;
# fall back to the old name so the cell runs on either version
pf_name_features = getattr(pf, "get_feature_names_out", None) or pf.get_feature_names
pf_features = pd.DataFrame(data=pf_values, columns=pf_name_features(weather_features))

In [168]:
# swap the raw weather columns for their polynomial expansion, aligned on the row index
train_with_weather = (
    train
    .drop(labels=weather_features, axis=1)
    .join(pf_features, how="outer")
)

In [169]:
# split the engineered frame into the feature matrix and the target column
target = "total_cases"
engineered_feats = [column for column in train_with_weather.columns if column != target]
X, y = train_with_weather[engineered_feats], train_with_weather[target]

In [170]:
# fixed seed so the split (and every result that depends on it) is the same after a kernel restart
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [171]:
ss = StandardScaler()
# learn the mean/variance from the training split only ...
Xs_train = ss.fit_transform(X_train)
# ... and reuse those statistics for the held-out split. Re-fitting here would scale the
# validation data differently and leave `ss` (later handed to pred_maker) fitted on X_test.
Xs_test = ss.transform(X_test)

In [172]:
# Dense regressor on the scaled polynomial features: hidden ReLU layers of 976, 976 and 31 units,
# each with an L1 kernel penalty and followed by 50% dropout, then a single linear output unit.
network = Sequential([
    Dense(976, activation="relu", input_dim=Xs_train.shape[1], kernel_regularizer=regularizers.l1(0.01)),
    Dropout(0.5),
    Dense(976, activation="relu", kernel_regularizer=regularizers.l1(0.01)),
    Dropout(0.5),
    Dense(31, activation="relu", kernel_regularizer=regularizers.l1(0.01)),
    Dropout(0.5),
    Dense(1, activation=None, kernel_regularizer=regularizers.l1(0.01)),
])

# optimise mean absolute error with Adam
network.compile(optimizer="adam", loss="mean_absolute_error")

# train for 500 epochs, reporting loss on the held-out split after each one
network.fit(Xs_train, y_train, epochs=500, validation_data=(Xs_test, y_test))

Train on 946 samples, validate on 316 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
E

Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500


Epoch 155/500
Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 160/500
Epoch 161/500
Epoch 162/500
Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 224/500
Epoch 225/500
Epoch 

Epoch 308/500
Epoch 309/500
Epoch 310/500
Epoch 311/500
Epoch 312/500
Epoch 313/500
Epoch 314/500
Epoch 315/500
Epoch 316/500
Epoch 317/500
Epoch 318/500
Epoch 319/500
Epoch 320/500
Epoch 321/500
Epoch 322/500
Epoch 323/500
Epoch 324/500
Epoch 325/500
Epoch 326/500
Epoch 327/500
Epoch 328/500
Epoch 329/500
Epoch 330/500
Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376/500
Epoch 377/500
Epoch 378/500
Epoch 

Epoch 460/500
Epoch 461/500
Epoch 462/500
Epoch 463/500
Epoch 464/500
Epoch 465/500
Epoch 466/500
Epoch 467/500
Epoch 468/500
Epoch 469/500
Epoch 470/500
Epoch 471/500
Epoch 472/500
Epoch 473/500
Epoch 474/500
Epoch 475/500
Epoch 476/500
Epoch 477/500
Epoch 478/500
Epoch 479/500
Epoch 480/500
Epoch 481/500
Epoch 482/500
Epoch 483/500
Epoch 484/500
Epoch 485/500
Epoch 486/500
Epoch 487/500
Epoch 488/500
Epoch 489/500
Epoch 490/500
Epoch 491/500
Epoch 492/500
Epoch 493/500
Epoch 494/500
Epoch 495/500
Epoch 496/500
Epoch 497/500
Epoch 498/500
Epoch 499/500
Epoch 500/500


<keras.callbacks.History at 0x10a6d84a8>

In [176]:
# read the test CSV and write the submission for the polynomial-feature network
test = pd.read_csv(test_csv)
pred_maker(
    test_set=test,
    model=network,
    csv_name="weather_poly_nn_2.csv",
    polyfunc=pf,
    scalerfunc=ss,
)

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,5
1,sj,2008,19,7
2,sj,2008,20,6
3,sj,2008,21,6
4,sj,2008,22,6
5,sj,2008,23,11
6,sj,2008,24,9
7,sj,2008,25,14
8,sj,2008,26,15
9,sj,2008,27,20


Finally got the submission to validate after a few hours

- With first null-cleaning method: MAE Score: 29.6   ---   Rank: 1500
- With second method : MAE Score: 29.1   ---   Rank: 1472

### 2. Neural Network without poly'd features

In [178]:
# same target as before, but the features are the plain (non-polynomial) training columns
target = "total_cases"
features = [column for column in train.columns if column != target]
X2, y2 = train[features], train[target]

In [179]:
# fixed seed so the split (and every result that depends on it) is the same after a kernel restart
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=42)

In [180]:
ss2 = StandardScaler()
# learn the mean/variance from the training split only ...
X2s_train = ss2.fit_transform(X2_train)
# ... and reuse those statistics for the held-out split. Re-fitting here would scale the
# validation data differently and leave `ss2` (later handed to pred_maker) fitted on X2_test.
X2s_test = ss2.transform(X2_test)

In [181]:
# check the (rows, columns) shape of the training split
X2_train.shape

(946, 24)

In [182]:
# Dense regressor on the scaled raw features: hidden ReLU layers of 24, 576 and 24 units,
# each with an L1 kernel penalty and followed by 50% dropout, then a single linear output unit.
network2 = Sequential([
    Dense(24, activation="relu", input_dim=X2s_train.shape[1], kernel_regularizer=regularizers.l1(0.01)),
    Dropout(0.5),
    Dense(576, activation="relu", kernel_regularizer=regularizers.l1(0.01)),
    Dropout(0.5),
    Dense(24, activation="relu", kernel_regularizer=regularizers.l1(0.01)),
    Dropout(0.5),
    Dense(1, activation=None, kernel_regularizer=regularizers.l1(0.01)),
])

# optimise mean absolute error with Adam
network2.compile(optimizer="adam", loss="mean_absolute_error")

# train for 500 epochs, reporting loss on the held-out split after each one
network2.fit(X2s_train, y2_train, epochs=500, validation_data=(X2s_test, y2_test))

Train on 946 samples, validate on 316 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
E

Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 160/500
Epoch 161/500
Epoch 162/500
Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 224/500
Epoch 225/500
Epoch 226/500
Epoch 

Epoch 310/500
Epoch 311/500
Epoch 312/500
Epoch 313/500
Epoch 314/500
Epoch 315/500
Epoch 316/500
Epoch 317/500
Epoch 318/500
Epoch 319/500
Epoch 320/500
Epoch 321/500
Epoch 322/500
Epoch 323/500
Epoch 324/500
Epoch 325/500
Epoch 326/500
Epoch 327/500
Epoch 328/500
Epoch 329/500
Epoch 330/500
Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376/500
Epoch 377/500
Epoch 378/500
Epoch 379/500
Epoch 380/500
Epoch 

Epoch 464/500
Epoch 465/500
Epoch 466/500
Epoch 467/500
Epoch 468/500
Epoch 469/500
Epoch 470/500
Epoch 471/500
Epoch 472/500
Epoch 473/500
Epoch 474/500
Epoch 475/500
Epoch 476/500
Epoch 477/500
Epoch 478/500
Epoch 479/500
Epoch 480/500
Epoch 481/500
Epoch 482/500
Epoch 483/500
Epoch 484/500
Epoch 485/500
Epoch 486/500
Epoch 487/500
Epoch 488/500
Epoch 489/500
Epoch 490/500
Epoch 491/500
Epoch 492/500
Epoch 493/500
Epoch 494/500
Epoch 495/500
Epoch 496/500
Epoch 497/500
Epoch 498/500
Epoch 499/500
Epoch 500/500


<keras.callbacks.History at 0x1a2aab50b8>

In [185]:
# read the test CSV and write the submission for the network trained without polynomial features
test = pd.read_csv(test_csv)
pred_maker(
    test_set=test,
    model=network2,
    csv_name="no_poly_nn.csv",
    scalerfunc=ss2,
)

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,6
1,sj,2008,19,6
2,sj,2008,20,6
3,sj,2008,21,5
4,sj,2008,22,6
5,sj,2008,23,7
6,sj,2008,24,8
7,sj,2008,25,8
8,sj,2008,26,9
9,sj,2008,27,9


MAE Score: 30.3