In [1]:
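# Implementation of a simple MLP network with two sigmoid hidden layers, applied to the DengAI
# dengue data set (this header originally described a one-hidden-layer network tested on iris).
# Requires: numpy, pandas, sklearn>=0.18.1, tensorflow>=1.0

# NOTE: In order to keep the code simple, x * W_1 + b_1 can be rewritten as x' * W_1',
# where x' = [x | 1] and W_1' is the matrix W_1 with one extra row holding the elements of b_1.
# Similarly for h * W_2 + b_2.
# NOTE(review): this notebook does not append the constant-1 column to its inputs, so the
# layers below are computed without bias terms -- confirm whether that is intended.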

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [3]:
# make function to preprocess data
def preprocess_data(data_path, labels_path=None):
    """Load the feature table (and optionally the labels) and split both by city.

    Parameters
    ----------
    data_path : str or file-like
        CSV with the features; it must contain a ``city`` column.
    labels_path : str or file-like, optional
        CSV with the labels; it must contain a ``city`` column. When it is
        omitted (or falsy) no labels are loaded.

    Returns
    -------
    tuple
        ``(sj_features, iq_features, sj_labels, iq_labels)``. The rows whose
        ``city`` is ``'sj'`` / ``'iq'`` respectively. Both label entries are
        ``None`` when ``labels_path`` is not given, instead of the call
        failing on an undefined ``labels`` variable.
    """
    df = pd.read_csv(data_path)

    # Forward-fill missing values. This runs on the whole table before the
    # city split, so a gap in the first row of one city is filled from the
    # last row of the city listed above it.
    df = df.ffill()

    # separate san juan and iquitos
    sj_features = df[df.city == 'sj']
    iq_features = df[df.city == 'iq']

    if not labels_path:
        return sj_features, iq_features, None, None

    labels = pd.read_csv(labels_path)
    sj_labels = labels[labels.city == 'sj']
    iq_labels = labels[labels.city == 'iq']
    return sj_features, iq_features, sj_labels, iq_labels

In [4]:
# Load the training features and labels, already split per city.
sj_features, iq_features, sj_labels, iq_labels = preprocess_data(
    'data/dengue_features_train.csv',
    labels_path="data/dengue_labels_train.csv",
)

In [5]:
# Drop the columns at positions 0 and 3 (city and date, per the original note);
# the city is redundant now that each city has its own table.
sj_features = sj_features.drop(sj_features.columns[[0, 3]], axis='columns')
iq_features = iq_features.drop(iq_features.columns[[0, 3]], axis='columns')

# Keep only the total_cases column of each labels table
# (this removes city, year and weekofyear).
sj_labels = sj_labels['total_cases']
iq_labels = iq_labels['total_cases']

# Preview the San Juan labels.
sj_labels.head()

0    4
1    5
2    4
3    3
4    6
Name: total_cases, dtype: int64

In [None]:
# Sequential split: the leading rows of each city are used for training and
# the remaining rows for testing (rows appear to be in time order -- confirm).

# San Juan: first 800 rows train, the rest test
sj_train, sj_train_target = sj_features.head(800), sj_labels.head(800)
sj_test = sj_features.tail(len(sj_features) - 800)
sj_test_target = sj_labels.tail(len(sj_labels) - 800)

# Iquitos: first 400 rows train, the rest test
iq_train, iq_train_target = iq_features.head(400), iq_labels.head(400)
iq_test = iq_features.tail(len(iq_features) - 400)
iq_test_target = iq_labels.tail(len(iq_labels) - 400)

In [6]:
# Random split of each city's rows: 80% training, 20% held out.
# random_state is fixed so the split is the same on every run.
sj_train, sj_test, sj_train_target, sj_test_target = train_test_split(
    sj_features,
    sj_labels,
    test_size=0.2,
    random_state=41,
)

iq_train, iq_test, iq_train_target, iq_test_target = train_test_split(
    iq_features,
    iq_labels,
    test_size=0.2,
    random_state=41,
)

In [7]:
# Preview the first rows of the San Juan training features.
sj_train.head()

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
75,1991,41,0.093,0.093,0.145083,0.168167,32.1,299.562857,299.671429,295.887143,...,55.1,80.515714,32.1,17.344286,2.257143,27.657143,6.614286,31.7,22.8,8.3
119,1992,33,0.16395,0.12665,0.215057,0.219271,65.55,299.215714,299.221429,296.007143,...,81.8,82.75,65.55,17.415714,1.957143,27.671429,7.557143,31.7,22.8,41.9
882,2007,16,0.0689,0.044267,0.108357,0.085029,0.0,299.335714,299.557143,294.507143,...,2.22,74.832857,0.0,15.854286,3.0,27.728571,8.014286,33.3,22.8,1.8
319,1996,25,0.07125,0.0646,0.1461,0.156171,47.93,298.49,298.564286,295.435714,...,73.7,83.322857,47.93,16.81,1.928571,26.442857,5.8,30.6,22.8,84.6
693,2003,35,0.0726,0.1,0.204371,0.160357,98.68,300.668571,300.728571,297.048571,...,50.86,80.797143,98.68,18.571429,2.542857,27.914286,6.457143,31.7,23.3,69.6


In [8]:
# Preview the first San Juan training targets (total_cases).
sj_train_target.head()

75     116
119     30
882      4
319      6
693     32
Name: total_cases, dtype: int64

In [9]:
def init_weights(shape):
    """Create a weight variable of the given shape.

    The initial values are drawn from a normal distribution with a standard
    deviation of 0.1.
    """
    return tf.Variable(tf.random_normal(shape, stddev=0.1))

In [10]:
def forwardprop(X, w_1, w_2, w_3):
    """
    Forward pass through two sigmoid hidden layers and a linear output layer.

    Each layer is a plain matrix product with its weight matrix (no bias term
    is added here). The output is returned without any activation applied.
    """
    first_hidden = tf.nn.sigmoid(tf.matmul(X, w_1))
    second_hidden = tf.nn.sigmoid(tf.matmul(first_hidden, w_2))
    return tf.matmul(second_hidden, w_3)

In [11]:
# Layer sizes, taken from the San Juan training features
x_size = len(sj_train.columns)  # input nodes: one per feature column
h_size = 256                    # nodes per hidden layer
y_size = 1                      # output nodes: a single predicted value

In [12]:
# Graph inputs: a batch of feature rows and the matching target values
X = tf.placeholder(tf.float32, shape=[None, x_size])
y = tf.placeholder(tf.float32, shape=[None, y_size])

In [13]:
# Weights: input -> hidden 1, hidden 1 -> hidden 2, hidden 2 -> output
w_1 = init_weights(shape=(x_size, h_size))
w_2 = init_weights(shape=(h_size, h_size))
w_3 = init_weights(shape=(h_size, y_size))

In [14]:
# Build the forward pass and an integer-typed version of its output
yhat = forwardprop(X=X, w_1=w_1, w_2=w_2, w_3=w_3)
predict = tf.cast(yhat, tf.int64)

In [15]:
# Training objective: mean squared error between targets and the network
# output, minimised with plain gradient descent.
cost = tf.losses.mean_squared_error(labels=y, predictions=yhat)
updates = tf.train.GradientDescentOptimizer(learning_rate=3e-7).minimize(loss=cost)

In [16]:
# Turn every label vector into an (n, 1) column array so it matches the
# [None, y_size] shape of the target placeholder.
# np.asarray(...).reshape(-1, 1) is used instead of Series.as_matrix(), which
# was removed in pandas 1.0; it also gives the same result if this cell is run
# again on the arrays it already converted.
sj_train_target = np.asarray(sj_train_target).reshape(-1, 1)
iq_train_target = np.asarray(iq_train_target).reshape(-1, 1)

sj_test_target = np.asarray(sj_test_target).reshape(-1, 1)
iq_test_target = np.asarray(iq_test_target).reshape(-1, 1)


In [17]:
# Display the San Juan training targets after conversion to a column array.
# NOTE(review): this prints the whole array; a slice such as
# sj_train_target[:5] would keep the saved output short.
sj_train_target

array([[116],
       [ 30],
       [  4],
       [  6],
       [ 32],
       [  8],
       [  2],
       [ 14],
       [  3],
       [  8],
       [ 21],
       [ 18],
       [ 52],
       [ 42],
       [ 65],
       [  8],
       [  1],
       [ 59],
       [ 70],
       [ 17],
       [  5],
       [ 13],
       [  3],
       [ 22],
       [ 11],
       [ 13],
       [  4],
       [ 11],
       [  8],
       [ 23],
       [  4],
       [ 17],
       [ 51],
       [ 21],
       [ 10],
       [ 73],
       [ 13],
       [ 29],
       [ 75],
       [ 16],
       [ 16],
       [ 16],
       [ 13],
       [ 48],
       [ 30],
       [  4],
       [ 62],
       [ 22],
       [ 38],
       [ 27],
       [ 24],
       [  6],
       [ 17],
       [ 37],
       [ 37],
       [ 45],
       [ 17],
       [ 13],
       [ 34],
       [  9],
       [112],
       [  7],
       [ 14],
       [  6],
       [ 36],
       [  2],
       [ 11],
       [  2],
       [ 55],
       [  7],
       [ 17],
      

In [18]:
# Open a session and give every variable in the graph its initial value
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

In [None]:
# Train on San Juan with one gradient step per training example.
n_epochs = 100       # total passes over the training set
report_every = 10    # report on every 10th epoch and on the last one

for epoch in range(n_epochs):
    # Train with each example (batch size 1)
    for i in range(len(sj_train)):
        sess.run(updates, feed_dict={X: sj_train[i: i + 1], y: sj_train_target[i: i + 1]})

    # Mean absolute error is only needed for the epochs that are reported,
    # so the two evaluation passes are skipped on all other epochs.
    if epoch % report_every == 0 or epoch == n_epochs - 1:
        train_MAE = metrics.mean_absolute_error(sj_train_target,
                                                sess.run(predict, feed_dict={X: sj_train}))
        test_MAE = metrics.mean_absolute_error(sj_test_target,
                                               sess.run(predict, feed_dict={X: sj_test}))
        print("Epoch = %d, train loss = %.2f, test loss = %.2f"
              % (epoch + 1, train_MAE, test_MAE))

Epoch = 1, train loss = 35.17, test loss = 30.25
Epoch = 11, train loss = 27.24, test loss = 22.44
Epoch = 21, train loss = 25.57, test loss = 21.41
Epoch = 31, train loss = 26.32, test loss = 22.60


In [None]:
# Close the session that was used for the San Juan run.
sess.close()

In [None]:
# Run the same model again for Iquitos. x_size does not need to be recomputed because both cities have the same number of feature columns.


In [None]:
# Open a fresh session; running the initializer gives every variable a new
# initial value, so training starts from scratch again.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

In [None]:
# Train on Iquitos with one gradient step per training example.
n_epochs = 500       # total passes over the training set
report_every = 10    # report on every 10th epoch and on the last one

for epoch in range(n_epochs):
    # Train with each example (batch size 1)
    for i in range(len(iq_train)):
        sess.run(updates, feed_dict={X: iq_train[i: i + 1], y: iq_train_target[i: i + 1]})

    # The last-epoch check is derived from n_epochs; the previous hard-coded
    # 5999 could never match with 500 epochs, so the final epoch was never
    # reported. Evaluation is skipped on epochs that are not reported.
    if epoch % report_every == 0 or epoch == n_epochs - 1:
        train_MAE = metrics.mean_absolute_error(iq_train_target,
                                                sess.run(predict, feed_dict={X: iq_train}))
        test_MAE = metrics.mean_absolute_error(iq_test_target,
                                               sess.run(predict, feed_dict={X: iq_test}))
        print("Epoch = %d, train loss = %.2f, test loss = %.2f"
              % (epoch + 1, train_MAE, test_MAE))

In [None]:
#Final session to compile final predictions and put in submission file