In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, cross_val_score

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import regularizers
from keras.callbacks import EarlyStopping

Notes:
- At first I went through and replaced every null value with the average of all entries for the same week of the year, but that actually made scores worse. So now I'm going to drop all rows with missing ndvi_ne data, impute the remaining missing values with that same weekly average (it should be a lot less imputed data), and retry
- The second method was much better on the train/test split (~5 MAE points). My next step will be to separate San Juan from Iquitos, because there's probably so little geographic relation between the two that one shouldn't have any influence on the other

In [51]:
# Load the training data from the local data directory.
train_csv = "./data/train.csv"
train = pd.read_csv(train_csv)

In [52]:
 # Remove the "Unnamed: 0" column that comes along when the CSV is read
 # (presumably a row index saved into the file -- confirm against the CSV).
 # errors="ignore" keeps this cell safe to re-run: without it, a second run
 # (or a CSV without that column) raises KeyError.
 train.drop("Unnamed: 0", axis=1, errors="ignore", inplace=True)

In [53]:
# Inspect column dtypes and non-null counts to see where values are missing.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1456 entries, 0 to 1455
Data columns (total 25 columns):
city                                     1456 non-null object
year                                     1456 non-null int64
weekofyear                               1456 non-null int64
week_start_date                          1456 non-null object
ndvi_ne                                  1262 non-null float64
ndvi_nw                                  1404 non-null float64
ndvi_se                                  1434 non-null float64
ndvi_sw                                  1434 non-null float64
precipitation_amt_mm                     1443 non-null float64
reanalysis_air_temp_k                    1446 non-null float64
reanalysis_avg_temp_k                    1446 non-null float64
reanalysis_dew_point_temp_k              1446 non-null float64
reanalysis_max_air_temp_k                1446 non-null float64
reanalysis_min_air_temp_k                1446 non-null float64
reanalysis_precip

In [54]:
# Raise pandas' display limits so frames of up to 1500 rows and 30 columns
# are printed in full instead of being elided.
pd.set_option('display.max_rows', 1500)
pd.set_option('display.max_columns', 30)

Plan: try to fill each missing value with the average of that column over all other entries from the same week of the year.

In [55]:
# Preview the first ten rows of the training frame.
train.head(10)

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,299.8,295.9,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,4
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,300.9,296.4,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,5
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,300.5,297.3,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,301.4,297.0,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,3
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,301.9,297.5,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,6
5,sj,1990,23,1990-06-04,,0.17485,0.254314,0.181743,9.58,299.63,299.764286,295.851429,302.4,298.1,26.49,79.891429,9.58,17.212857,2.1,28.114286,6.942857,34.4,23.9,39.1,2
6,sj,1990,24,1990-06-11,0.1129,0.0928,0.205071,0.210271,3.48,299.207143,299.221429,295.865714,301.3,297.7,38.6,82.0,3.48,17.234286,2.042857,27.414286,6.771429,32.2,23.3,29.7,4
7,sj,1990,25,1990-06-18,0.0725,0.0725,0.151471,0.133029,151.12,299.591429,299.528571,296.531429,300.6,298.4,30.0,83.375714,151.12,17.977143,1.571429,28.371429,7.685714,33.9,22.8,21.1,5
8,sj,1990,26,1990-06-25,0.10245,0.146175,0.125571,0.1236,19.32,299.578571,299.557143,296.378571,302.1,297.7,37.51,82.768571,19.32,17.79,1.885714,28.328571,7.385714,33.9,22.8,21.1,10
9,sj,1990,27,1990-07-02,,0.12155,0.160683,0.202567,14.41,300.154286,300.278571,296.651429,302.3,298.7,28.4,81.281429,14.41,18.071429,2.014286,28.328571,6.514286,33.9,24.4,1.1,6


In [56]:
# Re-check the non-null counts per column before any rows are dropped or imputed.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1456 entries, 0 to 1455
Data columns (total 25 columns):
city                                     1456 non-null object
year                                     1456 non-null int64
weekofyear                               1456 non-null int64
week_start_date                          1456 non-null object
ndvi_ne                                  1262 non-null float64
ndvi_nw                                  1404 non-null float64
ndvi_se                                  1434 non-null float64
ndvi_sw                                  1434 non-null float64
precipitation_amt_mm                     1443 non-null float64
reanalysis_air_temp_k                    1446 non-null float64
reanalysis_avg_temp_k                    1446 non-null float64
reanalysis_dew_point_temp_k              1446 non-null float64
reanalysis_max_air_temp_k                1446 non-null float64
reanalysis_min_air_temp_k                1446 non-null float64
reanalysis_precip

In [57]:
# a function/loop that can take null values and replace them with column averages

In [58]:
# Scratch check: read the weekofyear value of the row at position 10.
train.iloc[10]["weekofyear"]

28

In [59]:
# Scratch check: mean precipitation over every row that falls in week 10.
train.loc[train["weekofyear"] == 10, "precipitation_amt_mm"].mean()

37.88678571428572

In [60]:
# Scratch check: is the ndvi_ne value of the row labelled 9 missing?
pd.isnull(train.loc[9, "ndvi_ne"])

True

In [61]:
# Drop every row whose ndvi_ne is missing instead of imputing it; the frame is
# modified in place.
train.dropna(axis=0, subset=["ndvi_ne"], inplace=True)

In [62]:
# Renumber the rows 0..n-1 after the drop (the old index is discarded) so the
# index labels are contiguous again and coincide with row positions.
train.reset_index(drop=True, inplace=True)

In [63]:
# Baseline before imputation: number of distinct values held by each column.
distinct_counts = [(name, len(train[name].unique())) for name in train.columns]
for name, n_distinct in distinct_counts:
    print(name, ":", n_distinct)

city : 2
year : 21
weekofyear : 52
week_start_date : 928
ndvi_ne : 1214
ndvi_nw : 1221
ndvi_se : 1228
ndvi_sw : 1227
precipitation_amt_mm : 1030
reanalysis_air_temp_k : 1057
reanalysis_avg_temp_k : 571
reanalysis_dew_point_temp_k : 1053
reanalysis_max_air_temp_k : 142
reanalysis_min_air_temp_k : 118
reanalysis_precip_amt_kg_per_m2 : 952
reanalysis_relative_humidity_percent : 1198
reanalysis_sat_precip_amt_mm : 1030
reanalysis_specific_humidity_g_per_kg : 1044
reanalysis_tdtr_k : 511
station_avg_temp_c : 472
station_diur_temp_rng_c : 451
station_max_temp_c : 74
station_min_temp_c : 74
station_precip_mm : 624
total_cases : 118


In [64]:
# Impute each missing value with the mean of its column taken over all rows
# that share the same week of the year (rows of both cities are pooled).
#
# This is the vectorized form of the earlier cell-by-cell loop. That loop used
# positions from enumerate() as .loc labels, which is only correct on a default
# 0..n-1 index, and it recomputed a filtered mean for every single missing cell.
# groupby/transform yields one mean per row, aligned on the frame's own index.
#
# "city" and "week_start_date" are not numeric and "weekofyear" is the grouping
# key, so they are never imputed. A week with no observed value in a column
# has a NaN mean, so such cells stay missing.
passthrough_columns = ["city", "week_start_date", "weekofyear"]
for column_name in train.columns:
    if column_name in passthrough_columns or not train[column_name].isnull().any():
        continue  # nothing to impute in this column
    weekly_mean = train.groupby("weekofyear")[column_name].transform("mean")
    train[column_name] = train[column_name].fillna(weekly_mean)

In [65]:
# Check the non-null counts again to confirm the imputation filled the gaps.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1262 entries, 0 to 1261
Data columns (total 25 columns):
city                                     1262 non-null object
year                                     1262 non-null int64
weekofyear                               1262 non-null int64
week_start_date                          1262 non-null object
ndvi_ne                                  1262 non-null float64
ndvi_nw                                  1262 non-null float64
ndvi_se                                  1262 non-null float64
ndvi_sw                                  1262 non-null float64
precipitation_amt_mm                     1262 non-null float64
reanalysis_air_temp_k                    1262 non-null float64
reanalysis_avg_temp_k                    1262 non-null float64
reanalysis_dew_point_temp_k              1262 non-null float64
reanalysis_max_air_temp_k                1262 non-null float64
reanalysis_min_air_temp_k                1262 non-null float64
reanalysis_precip

In [66]:
# After imputation: number of distinct values per column, to compare against
# the counts printed before the fill.
distinct_counts = [(name, len(train[name].unique())) for name in train.columns]
for name, n_distinct in distinct_counts:
    print(name, ":", n_distinct)

city : 2
year : 21
weekofyear : 52
week_start_date : 928
ndvi_ne : 1214
ndvi_nw : 1234
ndvi_se : 1228
ndvi_sw : 1227
precipitation_amt_mm : 1033
reanalysis_air_temp_k : 1058
reanalysis_avg_temp_k : 571
reanalysis_dew_point_temp_k : 1054
reanalysis_max_air_temp_k : 142
reanalysis_min_air_temp_k : 118
reanalysis_precip_amt_kg_per_m2 : 954
reanalysis_relative_humidity_percent : 1199
reanalysis_sat_precip_amt_mm : 1033
reanalysis_specific_humidity_g_per_kg : 1046
reanalysis_tdtr_k : 513
station_avg_temp_c : 505
station_diur_temp_rng_c : 481
station_max_temp_c : 85
station_min_temp_c : 78
station_precip_mm : 633
total_cases : 118


In [67]:
# not so many new values, good

In [68]:
# Load the test-set features from the local data directory.
test_csv = "./data/dengue_features_test.csv"
test = pd.read_csv(test_csv)

In [69]:
# Inspect column dtypes and non-null counts of the test features.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416 entries, 0 to 415
Data columns (total 24 columns):
city                                     416 non-null object
year                                     416 non-null int64
weekofyear                               416 non-null int64
week_start_date                          416 non-null object
ndvi_ne                                  373 non-null float64
ndvi_nw                                  405 non-null float64
ndvi_se                                  415 non-null float64
ndvi_sw                                  415 non-null float64
precipitation_amt_mm                     414 non-null float64
reanalysis_air_temp_k                    414 non-null float64
reanalysis_avg_temp_k                    414 non-null float64
reanalysis_dew_point_temp_k              414 non-null float64
reanalysis_max_air_temp_k                414 non-null float64
reanalysis_min_air_temp_k                414 non-null float64
reanalysis_precip_amt_kg_per_m2  

In [70]:
# First imputation pass on the test features: fill each missing value with the
# mean of its column over all test rows from the same week of the year.
#
# This is the vectorized form of the earlier cell-by-cell loop. That loop used
# positions from enumerate() as .loc labels, which is only correct on a default
# 0..n-1 index, and it recomputed a filtered mean for every single missing cell.
#
# "city" and "week_start_date" are not numeric and "weekofyear" is the grouping
# key, so they are never imputed. A week with no observed value in a column
# has a NaN mean, so such cells stay missing after this pass.
passthrough_columns = ["city", "week_start_date", "weekofyear"]
for column_name in test.columns:
    if column_name in passthrough_columns or not test[column_name].isnull().any():
        continue  # nothing to impute in this column
    weekly_mean = test.groupby("weekofyear")[column_name].transform("mean")
    test[column_name] = test[column_name].fillna(weekly_mean)

In [71]:
# Check the non-null counts to see whether anything is still missing after the fill.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416 entries, 0 to 415
Data columns (total 24 columns):
city                                     416 non-null object
year                                     416 non-null int64
weekofyear                               416 non-null int64
week_start_date                          416 non-null object
ndvi_ne                                  415 non-null float64
ndvi_nw                                  415 non-null float64
ndvi_se                                  415 non-null float64
ndvi_sw                                  415 non-null float64
precipitation_amt_mm                     415 non-null float64
reanalysis_air_temp_k                    415 non-null float64
reanalysis_avg_temp_k                    415 non-null float64
reanalysis_dew_point_temp_k              415 non-null float64
reanalysis_max_air_temp_k                415 non-null float64
reanalysis_min_air_temp_k                415 non-null float64
reanalysis_precip_amt_kg_per_m2  

In [72]:
# Show the test rows whose ndvi_ne is still missing after the imputation pass.
test[test["ndvi_ne"].isnull()]
# The only row left is a week-53 entry with no feature values at all.

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
87,sj,2010,53,2010-01-01,,,,,,,,,,,,,,,,,,,,


In [73]:
# Look at the training rows labelled week 1 (their start dates fall in early January).
train[train["weekofyear"]==1]

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
29,sj,1991,1,1991-01-01,0.1465,0.140467,0.1515,0.167314,0.0,297.06,297.135714,291.907143,299.0,293.9,4.8,72.938571,0.0,13.511429,2.528571,24.385714,7.457143,29.4,18.9,5.3,21
71,sj,1992,1,1992-01-01,0.1251,0.157867,0.185283,0.181367,26.192273,298.228377,298.708117,294.442532,302.672727,295.086364,30.076364,80.621494,26.192273,15.93013,4.831818,26.201077,7.627721,31.15,21.336364,40.37,81
115,sj,1993,1,1993-01-08,0.02835,0.043667,0.078657,0.046457,46.79,297.467143,297.55,294.15,299.4,296.1,41.9,81.878571,46.79,15.52,1.9,25.871429,6.5,30.6,21.7,28.0,32
158,sj,1994,1,1994-01-08,-0.1209,0.0193,0.094986,0.178514,12.6,298.191429,298.285714,293.585714,300.1,296.3,19.5,75.742857,12.6,14.951429,2.057143,25.5,6.742857,29.4,21.1,41.0,31
191,sj,1995,1,1995-01-08,0.223075,0.219333,0.288617,0.258743,0.0,297.33,297.4,292.734286,300.1,294.7,12.5,75.838571,0.0,14.25,2.371429,25.7,7.685714,31.7,20.0,40.4,91
231,sj,1996,1,1996-01-01,0.014,0.010867,0.091929,0.120443,6.35,297.412857,297.457143,294.54,299.5,295.9,57.0,84.135714,6.35,15.888571,1.657143,25.185714,4.842857,28.3,21.7,46.5,23
269,sj,1997,1,1997-01-01,0.152633,0.1094,0.207743,0.173286,0.0,299.677143,299.821429,295.014286,301.5,297.7,3.24,75.678571,0.0,16.377143,2.357143,26.628571,7.714286,31.7,21.7,0.5,29
310,sj,1998,1,1998-01-01,0.0455,0.0478,0.123986,0.083443,26.192273,298.228377,298.708117,294.442532,302.672727,295.086364,30.076364,80.621494,26.192273,15.93013,4.831818,26.201077,7.627721,31.15,21.336364,40.37,64
390,sj,2000,1,2000-01-08,0.20635,0.1613,0.139883,0.095833,81.73,298.148571,298.164286,294.355714,299.9,296.5,32.3,79.607143,81.73,15.722857,1.9,25.128571,5.314286,29.4,21.7,28.6,16
434,sj,2001,1,2001-01-01,0.004833,0.006633,0.210814,0.202414,0.0,298.477143,298.442857,294.034286,301.2,295.6,1.3,76.505714,0.0,15.444286,3.285714,25.128571,7.685714,30.0,20.6,1.9,28


In [74]:
# Check for week-53 rows in the training data.
train[train["weekofyear"]==53]
# Filling such rows in probably isn't good practice, so any training row that
# is still incomplete gets dropped rather than imputed.

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases


In [75]:
# Drop, in place, any training row that still contains a null in any column.
train.dropna(inplace=True)

In [76]:
# Final check of the cleaned training frame before it is written to disk.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1262 entries, 0 to 1261
Data columns (total 25 columns):
city                                     1262 non-null object
year                                     1262 non-null int64
weekofyear                               1262 non-null int64
week_start_date                          1262 non-null object
ndvi_ne                                  1262 non-null float64
ndvi_nw                                  1262 non-null float64
ndvi_se                                  1262 non-null float64
ndvi_sw                                  1262 non-null float64
precipitation_amt_mm                     1262 non-null float64
reanalysis_air_temp_k                    1262 non-null float64
reanalysis_avg_temp_k                    1262 non-null float64
reanalysis_dew_point_temp_k              1262 non-null float64
reanalysis_max_air_temp_k                1262 non-null float64
reanalysis_min_air_temp_k                1262 non-null float64
reanalysis_precip

In [77]:
# Persist the cleaned training frame. No index argument is given, so pandas also
# writes the row index as an unnamed first column of the CSV.
train.to_csv("./data/cleaned_train.csv")

In [27]:
# i can't drop the test entry so i'll have to fill it in

In [78]:
# Find the week-53 rows in the test set; these cannot be dropped.
test[test["weekofyear"]==53]

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
87,sj,2010,53,2010-01-01,,,,,,,,,,,,,,,,,,,,


In [79]:
# Row 87 is the lone week-53 entry (dated 2010-01-01) and has no feature values,
# so there are no same-week rows to average over. Relabel it as week 1 so that
# the next imputation pass fills it from the week-1 means.
# NOTE(review): the label 87 is hard-coded and depends on the row order of the test CSV.
test.loc[87, "weekofyear"] = 1

In [80]:
# Second imputation pass on the test features: fill whatever is still missing
# with the mean of its column over all test rows from the same week of the year.
#
# This is the vectorized form of the earlier cell-by-cell loop. That loop used
# positions from enumerate() as .loc labels, which is only correct on a default
# 0..n-1 index, and it recomputed a filtered mean for every single missing cell.
#
# "city" and "week_start_date" are not numeric and "weekofyear" is the grouping
# key, so they are never imputed. A week with no observed value in a column
# has a NaN mean, so such cells stay missing.
passthrough_columns = ["city", "week_start_date", "weekofyear"]
for column_name in test.columns:
    if column_name in passthrough_columns or not test[column_name].isnull().any():
        continue  # nothing to impute in this column
    weekly_mean = test.groupby("weekofyear")[column_name].transform("mean")
    test[column_name] = test[column_name].fillna(weekly_mean)

In [81]:
# Confirm that every test column is now fully populated.
test.info()
# cool

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416 entries, 0 to 415
Data columns (total 24 columns):
city                                     416 non-null object
year                                     416 non-null int64
weekofyear                               416 non-null int64
week_start_date                          416 non-null object
ndvi_ne                                  416 non-null float64
ndvi_nw                                  416 non-null float64
ndvi_se                                  416 non-null float64
ndvi_sw                                  416 non-null float64
precipitation_amt_mm                     416 non-null float64
reanalysis_air_temp_k                    416 non-null float64
reanalysis_avg_temp_k                    416 non-null float64
reanalysis_dew_point_temp_k              416 non-null float64
reanalysis_max_air_temp_k                416 non-null float64
reanalysis_min_air_temp_k                416 non-null float64
reanalysis_precip_amt_kg_per_m2  

In [82]:
# Persist the imputed test features. No index argument is given, so pandas also
# writes the row index as an unnamed first column of the CSV.
test.to_csv("./data/test.csv")

### Separating cities