In [2]:
# Implementation of a simple MLP network with one hidden layer. Tested on the iris data set.
# Requires: numpy, sklearn>=0.18.1, tensorflow>=1.0

# NOTE: In order to make the code simple, we rewrite x * W_1 + b_1 = x' * W_1'
# where x' = [x | 1] and W_1' is the matrix W_1 appended with a new row with elements b_1's.
# Similarly, for h * W_2 + b_2
# for dengai

In [6]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [7]:
# make function to preprocess data
def preprocess_data(data_path, labels_path=None):
    # load data and set index to city, year, weekofyear
    df = pd.read_csv(data_path)
    
    # select features we want
    '''
    features = ['ndvi_nw',
                'ndvi_se',
                'ndvi_sw',
                'precipitation_amt_mm',
                'reanalysis_air_temp_k',
                'reanalysis_avg_temp_k',
                'reanalysis_dew_point_temp_k',
                'reanalysis_max_air_temp_k',
                'reanalysis_min_air_temp_k',
                'reanalysis_precip_amt_kg_per_m2',
                'reanalysis_relative_humidity_percent',
                'reanalysis_sat_precip_amt_mm',
                'reanalysis_specific_humidity_g_per_kg',
                'reanalysis_tdtr_k',
                'station_avg_temp_c',
                'station_diur_temp_rng_c',
                'station_max_temp_c',
                'station_min_temp_c',
                'station_precip_mm']
    df = df[features]
    '''
    # fill missing values
    df.fillna(method='ffill', inplace=True)

    # add labels to dataframe
    if labels_path:
        labels = pd.read_csv(labels_path)
        #df = df.join(labels)
    
    # separate san juan and iquitos
    sj_features = df[df.city == 'sj']
    iq_features = df[df.city == 'iq']
    sj_labels = labels[labels.city == 'sj']
    iq_labels = labels[labels.city == 'iq']
    return sj_features, iq_features, sj_labels, iq_labels

In [62]:
sj_features, iq_features, sj_labels, iq_labels = preprocess_data(
                                                            'data/dengue_features_train.csv',
                                                            labels_path="data/dengue_labels_train.csv")

In [63]:
#dropping date and city as city already divided

iq_features = iq_features.drop(iq_features.columns[[0,3]], axis=1)
sj_features = sj_features.drop(sj_features.columns[[0,3]], axis=1)

sj_features.head()

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,1990,18,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,1990,19,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,1990,20,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,1990,21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,1990,22,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


In [75]:
#since data is linear it makes sense to separate data linearly
#split train and test data
sj_train = sj_features.head(800)
sj_train_target = sj_labels.head(800)
sj_test = sj_features.tail(sj_features.shape[0] - 800)
sj_test_target = sj_labels.tail(sj_labels.shape[0] - 800)



iq_train = iq_features.head(400)
iq_train_target = iq_labels.head(400)
iq_test = iq_features.tail(iq_features.shape[0] - 400)
iq_test_target = iq_labels.tail(iq_labels.shape[0] - 400)

In [65]:
RANDOM_SEED = 42
tf.set_random_seed(RANDOM_SEED)

In [66]:
def init_weights(shape):
    """ Weight initialization """
    weights = tf.random_normal(shape, stddev=0.1)
    return tf.Variable(weights)

In [67]:
def forwardprop(X, w_1, w_2, w_3):
    """
    Forward-propagation.
    IMPORTANT: yhat is not softmax since TensorFlow's softmax_cross_entropy_with_logits() does that internally.
    """
    h    = tf.nn.sigmoid(tf.matmul(X, w_1))  # The \sigma function
    h2 = tf.nn.sigmoid(tf.matmul(h, w_2))
    yhat = tf.matmul(h2, w_3)  # The \varphi function
    return yhat

In [68]:
# Layer's sizes San Juan
x_size = sj_train.shape[1]   # Number of input nodes
h_size = 256                # Number of hidden nodes
y_size = 1   # Number of outcomes (3 iris flowers)

In [69]:
# Symbols
X = tf.placeholder("float", shape=[None, x_size])
y = tf.placeholder("float", shape=[None, y_size])

In [70]:
# Weight initializations
w_1 = init_weights((x_size, h_size))
w_2 = init_weights((h_size, h_size))
w_3 = init_weights((h_size, y_size))

In [71]:
# Forward propagation
yhat    = forwardprop(X, w_1, w_2, w_3)
predict = tf.argmax(yhat, axis=1)

In [72]:
# Backward propagation
cost    = tf.reduce_mean(tf.losses.mean_squared_error(labels=y, predictions=yhat))
updates = tf.train.GradientDescentOptimizer(0.00003).minimize(cost)

In [73]:
# Run SGD
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

In [74]:
for epoch in range(6000):
        # Train with each example
        for i in range(len(sj_train)):
            sess.run(updates, feed_dict={X: sj_train[i: i + 1], y: sj_train_target[i: i + 1]})

        train_MAE = metrics.mean_absolute_error(sj_train_target,
                                                     sess.run(predict, feed_dict={X: sj_train}))
        test_MAE  = metrics.mean_absolute_error(sj_test_target,
                                                     sess.run(predict, feed_dict={X: sj_test}))

        if epoch % 10 == 0 or epoch == epochs-1:
            print("Epoch = %d, train loss = %.2f, test loss = %.2f"
                  % (epoch + 1, train_MAE, test_MAE))

ValueError: could not convert string to float: 'sj'

In [15]:
sess.close()