#Dengue Fever Prediction

In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

%matplotlib inline

In [2]:
train_label_df = pd.read_csv('https://s3.amazonaws.com/drivendata/data/44/public/dengue_labels_train.csv')
train_label_df.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,1990,18,4
1,sj,1990,19,5
2,sj,1990,20,4
3,sj,1990,21,3
4,sj,1990,22,6


In [3]:
train_features_df = pd.read_csv('https://s3.amazonaws.com/drivendata/data/44/public/dengue_features_train.csv')
train_features_df.head()

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


In [4]:
train_features_df.ndvi_ne = train_features_df.ndvi_ne.fillna(method='ffill')
train_features_df.ndvi_nw = train_features_df.ndvi_nw.fillna(method='ffill')
train_features_df.ndvi_se = train_features_df.ndvi_se.fillna(method='ffill')
train_features_df.ndvi_sw = train_features_df.ndvi_sw.fillna(method='ffill')

train_features_df.station_avg_temp_c = train_features_df.station_avg_temp_c.fillna(method='ffill')
train_features_df.station_diur_temp_rng_c = train_features_df.station_diur_temp_rng_c.fillna(method='ffill')
train_features_df.station_max_temp_c = train_features_df.station_max_temp_c.fillna(method='ffill')
train_features_df.station_min_temp_c = train_features_df.station_min_temp_c.fillna(method='ffill')
train_features_df.station_precip_mm = train_features_df.station_precip_mm.fillna(method='ffill')

train_features_df.precipitation_amt_mm = train_features_df.precipitation_amt_mm.fillna(np.mean(train_features_df.precipitation_amt_mm))

train_features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1456 entries, 0 to 1455
Data columns (total 24 columns):
city                                     1456 non-null object
year                                     1456 non-null int64
weekofyear                               1456 non-null int64
week_start_date                          1456 non-null object
ndvi_ne                                  1456 non-null float64
ndvi_nw                                  1456 non-null float64
ndvi_se                                  1456 non-null float64
ndvi_sw                                  1456 non-null float64
precipitation_amt_mm                     1456 non-null float64
reanalysis_air_temp_k                    1446 non-null float64
reanalysis_avg_temp_k                    1446 non-null float64
reanalysis_dew_point_temp_k              1446 non-null float64
reanalysis_max_air_temp_k                1446 non-null float64
reanalysis_min_air_temp_k                1446 non-null float64
reanalysis_precip

In [0]:
COLUMNS = ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c', 'station_min_temp_c', 'station_precip_mm']

In [6]:
test_features_df = pd.read_csv('https://s3.amazonaws.com/drivendata/data/44/public/dengue_features_test.csv')
test_features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416 entries, 0 to 415
Data columns (total 24 columns):
city                                     416 non-null object
year                                     416 non-null int64
weekofyear                               416 non-null int64
week_start_date                          416 non-null object
ndvi_ne                                  373 non-null float64
ndvi_nw                                  405 non-null float64
ndvi_se                                  415 non-null float64
ndvi_sw                                  415 non-null float64
precipitation_amt_mm                     414 non-null float64
reanalysis_air_temp_k                    414 non-null float64
reanalysis_avg_temp_k                    414 non-null float64
reanalysis_dew_point_temp_k              414 non-null float64
reanalysis_max_air_temp_k                414 non-null float64
reanalysis_min_air_temp_k                414 non-null float64
reanalysis_precip_amt_kg_per_m2  

In [7]:
submission_df = pd.read_csv('https://s3.amazonaws.com/drivendata/data/44/public/submission_format.csv')
submission_df.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,0
1,sj,2008,19,0
2,sj,2008,20,0
3,sj,2008,21,0
4,sj,2008,22,0


# Exploratory Data Analysis

In [8]:
train_features_df['city'].value_counts()

sj    936
iq    520
Name: city, dtype: int64

In [9]:
train_features_df['year'].value_counts()

2001    104
2003    104
2002    104
2007    104
2006    104
2005    104
2004    104
2000     78
2008     69
1991     52
2009     52
1999     52
1998     52
1997     52
1996     52
1995     52
1994     52
1993     52
1992     52
1990     35
2010     26
Name: year, dtype: int64

In [0]:
union_df = train_label_df.merge(train_features_df[['city', 'year', 'weekofyear', 'week_start_date','station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c', 'station_min_temp_c', 'station_precip_mm', 'precipitation_amt_mm']], on=['city', 'year', 'weekofyear'])


In [11]:
union_df.head()

Unnamed: 0,city,year,weekofyear,total_cases,week_start_date,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,precipitation_amt_mm
0,sj,1990,18,4,1990-04-30,25.442857,6.9,29.4,20.0,16.0,12.42
1,sj,1990,19,5,1990-05-07,26.714286,6.371429,31.7,22.2,8.6,22.82
2,sj,1990,20,4,1990-05-14,26.714286,6.485714,32.2,22.8,41.4,34.54
3,sj,1990,21,3,1990-05-21,27.471429,6.771429,33.3,23.3,4.0,15.36
4,sj,1990,22,6,1990-05-28,28.942857,9.371429,35.0,23.9,5.8,7.52


In [12]:
sj = union_df[union_df['city'] == 'sj']
sj_1990 = sj[sj['year'] == 1990]
sj_1991 = sj[sj['year'] == 1991]
sj_1992 = sj[sj['year'] == 1992]
sj_3_yrs = pd.concat([sj_1990, sj_1991, sj_1992])


_ = sj_3_yrs.plot(x='week_start_date', y='reanalysis_max_c')
_ = sj_3_yrs.plot(x='week_start_date', y='station_max_temp_c')
_ = plt.show()

KeyError: ignored

In [83]:
union_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1456 entries, 0 to 1455
Data columns (total 6 columns):
city                    1456 non-null object
year                    1456 non-null int64
weekofyear              1456 non-null int64
total_cases             1456 non-null int64
week_start_date         1456 non-null object
precipitation_amt_mm    1456 non-null float64
dtypes: float64(1), int64(3), object(2)
memory usage: 79.6+ KB


In [95]:
train_features_df[COLUMNS].describe()

Unnamed: 0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
count,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0,1456.0
mean,0.131271,0.128068,0.202606,0.201987,45.760388,27.180313,8.085646,32.443338,22.099863,39.194162
std,0.138527,0.119561,0.074409,0.083592,43.519806,1.280861,2.122836,1.960769,1.569113,47.363305
min,-0.40625,-0.4561,-0.015533,-0.063457,0.0,21.4,4.528571,26.7,14.7,0.0
25%,0.0391,0.04825,0.152795,0.144455,9.96,26.3,6.528571,31.1,21.1,8.7
50%,0.1139,0.115926,0.195664,0.190121,38.71,27.4,7.364286,32.8,22.2,23.8
75%,0.232018,0.213429,0.247461,0.246775,70.0475,28.132143,9.6,33.9,23.3,53.675
max,0.508357,0.454429,0.538314,0.546017,390.6,30.8,15.8,42.2,25.6,543.3


In [13]:
outliers_df = train_features_df[train_features_df['precipitation_amt_mm'] > 200]
outliers_df

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
88,sj,1992,2,1992-01-08,0.100833,0.073575,0.138714,0.112786,200.85,297.912857,...,18.3,81.3,200.85,15.878571,2.3,25.657143,7.5,30.6,20.6,58.2
264,sj,1995,21,1995-05-28,0.0322,0.162133,0.231829,0.240271,204.62,299.454286,...,17.3,81.447143,204.62,17.412857,2.328571,27.228571,7.342857,31.7,22.2,97.5
332,sj,1996,38,1996-09-16,0.1043,0.02845,0.150429,0.107817,243.55,299.482857,...,151.7,85.69,243.55,18.407143,1.828571,27.042857,5.828571,31.7,22.8,305.9
438,sj,1998,40,1998-10-01,-0.0022,0.0382,0.161886,0.168314,223.61,300.597143,...,124.4,78.782857,223.61,18.061429,2.8,27.914286,6.285714,32.8,23.9,24.7
498,sj,1999,47,1999-11-26,0.0005,0.0712,0.1077,0.08874,224.9,298.732857,...,140.2,86.405714,224.9,17.87,1.9,26.6,4.528571,31.7,23.3,88.9
600,sj,2001,46,2001-11-12,0.0941,0.1127,0.196171,0.2132,287.55,299.42,...,133.93,81.081429,287.55,17.308571,2.242857,26.585714,5.557143,33.3,22.2,96.0
623,sj,2002,17,2002-04-23,-0.0176,0.0519,0.113571,0.111629,214.76,297.97,...,31.0,83.657143,214.76,16.368571,2.514286,25.7,6.185714,30.6,21.7,69.7
675,sj,2003,17,2003-04-23,-0.078,-0.1252,0.148671,0.159471,389.6,298.445714,...,111.7,83.751429,389.6,16.885714,2.785714,25.357143,6.428571,30.6,20.6,88.2
705,sj,2003,47,2003-11-19,-0.21795,0.015,0.128143,0.135014,390.6,299.072857,...,181.74,87.575714,390.6,18.37,1.985714,26.3,4.685714,31.7,23.9,133.2
749,sj,2004,39,2004-09-23,0.04015,-0.06745,0.225257,0.2332,245.73,299.798571,...,254.95,85.33,245.73,18.645714,2.542857,27.228571,6.057143,33.9,23.3,158.2


# Baseline Linear Regression

In [14]:
COLUMNS

['ndvi_ne',
 'ndvi_nw',
 'ndvi_se',
 'ndvi_sw',
 'precipitation_amt_mm',
 'station_avg_temp_c',
 'station_diur_temp_rng_c',
 'station_max_temp_c',
 'station_min_temp_c',
 'station_precip_mm']

In [0]:
MERGE_COLS = ['city', 'year', 'weekofyear'] + COLUMNS
TF_COLS = ['total_cases'] + COLUMNS

training_data = train_label_df.merge(train_features_df[MERGE_COLS], on=['city', 'year', 'weekofyear'])

In [0]:
feature_cols = [
    tf.feature_column.numeric_column('ndvi_ne'),
    tf.feature_column.numeric_column('ndvi_nw'),
    tf.feature_column.numeric_column('ndvi_se'),
    tf.feature_column.numeric_column('ndvi_sw'),
    tf.feature_column.numeric_column('precipitation_amt_mm'),
    tf.feature_column.numeric_column('station_avg_temp_c'),
    tf.feature_column.numeric_column('station_diur_temp_rng_c'),
    tf.feature_column.numeric_column('station_max_temp_c'),
    tf.feature_column.numeric_column('station_min_temp_c'),
    tf.feature_column.numeric_column('station_precip_mm')
]

In [0]:
def train_input(df, batch_size, epochs):
  return tf.estimator.inputs.pandas_input_fn(
      x=df,
      y=df['total_cases'],
      batch_size= batch_size,
      num_epochs= epochs,
      shuffle= True
  )

In [0]:
def eval_input(df):
  return tf.estimator.inputs.pandas_input_fn(
      x=df,
      y=None,
      num_epochs= 1,
      shuffle=False
  )

In [19]:
estimator = tf.estimator.LinearRegressor(
    feature_columns=feature_cols,
    optimizer=tf.train.FtrlOptimizer(learning_rate=0.001)
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f4018130850>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_device_fn': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmp5Dy4Nm', '_train_distribute': None, '_save_summary_steps': 100}


In [33]:
estimator = tf.estimator.DNNRegressor(
    feature_columns = feature_cols,
    hidden_units = [10,20,10]
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_global_id_in_cluster': 0, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f401635fb50>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_device_fn': None, '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmpkA2IXS', '_train_distribute': None, '_save_summary_steps': 100}


In [0]:
estimator.train(
    input_fn=train_input(training_data[TF_COLS], 32, None),
    steps=10000
)

In [22]:
test_features_df.ndvi_ne = test_features_df.ndvi_ne.fillna(method='ffill')
test_features_df.ndvi_nw = test_features_df.ndvi_nw.fillna(method='ffill')
test_features_df.ndvi_se = test_features_df.ndvi_se.fillna(method='ffill')
test_features_df.ndvi_sw = test_features_df.ndvi_sw.fillna(method='ffill')

test_features_df.station_avg_temp_c = test_features_df.station_avg_temp_c.fillna(method='ffill')
test_features_df.station_diur_temp_rng_c = test_features_df.station_diur_temp_rng_c.fillna(method='ffill')
test_features_df.station_max_temp_c = test_features_df.station_max_temp_c.fillna(method='ffill')
test_features_df.station_min_temp_c = test_features_df.station_min_temp_c.fillna(method='ffill')
test_features_df.station_precip_mm = test_features_df.station_precip_mm.fillna(method='ffill')

test_features_df.precipitation_amt_mm = test_features_df.precipitation_amt_mm.fillna(np.mean(train_features_df.precipitation_amt_mm))

test_features_df[COLUMNS].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416 entries, 0 to 415
Data columns (total 10 columns):
ndvi_ne                    416 non-null float64
ndvi_nw                    416 non-null float64
ndvi_se                    416 non-null float64
ndvi_sw                    416 non-null float64
precipitation_amt_mm       416 non-null float64
station_avg_temp_c         416 non-null float64
station_diur_temp_rng_c    416 non-null float64
station_max_temp_c         416 non-null float64
station_min_temp_c         416 non-null float64
station_precip_mm          416 non-null float64
dtypes: float64(10)
memory usage: 32.6 KB


In [0]:
predictions = list(estimator.predict(input_fn=eval_input(test_features_df[COLUMNS])))

In [0]:
import math

In [0]:
output = [int(math.ceil(p['predictions'][0])) for p in predictions]
output

In [26]:
submission_df = test_features_df[['city', 'year', 'weekofyear']]
submission_df['total_cases'] = output
submission_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,18
1,sj,2008,19,10
2,sj,2008,20,8
3,sj,2008,21,8
4,sj,2008,22,13


In [0]:
submission_df.to_csv('submission_lin.csv', index=False)

In [28]:
!ls

sample_data  submission_lin.csv  submission_lin_floor.csv


In [0]:
from google.colab import files
files.download('submission_lin.csv') 

In [0]:
print(estimator.evaluate(input_fn=train_input(training_data[TF_COLS], 32, 1)))

# Bin values and create feature crosses