In [None]:
# coding: utf-8
%matplotlib inline

import matplotlib.pyplot as plt 
import matplotlib
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
from pandas.plotting import scatter_matrix
import tensorboard

In [None]:
# Load the pickled features and targets.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open('../../data/transformed_data.pkl', 'rb') as f:
    transformed_data = pickle.load(f)

with open('../../data/targets.pkl', 'rb') as f:
    targets = pickle.load(f)

# Put features and target side by side in a single frame for inspection;
# the target becomes the last column.
df = pd.DataFrame(np.c_[transformed_data, targets])

df.columns = ['start_date_unix', 'start_date_weekday', 'start_date_dayofyear', 'start_date_day', 
                'start_date_week', 'start_date_month', 'start_date_hour','time_delta',
                'comment_why_you_came_strlength',
                'comment_why_you_came_capsratio', 'comment_where_for_help_strlength',
                'comment_where_for_help_capsratio','comment_further_comments_strlength',
                'comment_further_comments_capsratio','target']

# Displayed as the cell output: (rows, columns).
df.shape

Add a bias term to the data by prepending a constant column of ones to the feature matrix.

In [None]:
# Prepend a constant column of ones (the bias term) as the first feature.
# NOTE: this cell is not idempotent -- every re-run prepends one more column.
bias_column = np.ones((transformed_data.shape[0], 1))
transformed_data = np.c_[bias_column, transformed_data]

Create stratified training and test sets, so that both sets keep the same proportion of targets to non-targets as the full data set.

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split; the fixed random_state makes it reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1337)

# With n_splits=1 the generator yields exactly one (train, test) pair of
# index arrays, so take it directly instead of looping over a single item.
train_index, test_index = next(split.split(transformed_data, targets))


In [None]:
from sklearn.utils import column_or_1d

# Number of rows assigned to each partition.
print('test_m =', len(test_index))
print('train_m =', len(train_index))  # was mislabelled as 'test_m ='

# Mean target value per partition (the share of positives if targets are
# 0/1 -- TODO confirm encoding). First line is the test set, second the
# training set; stratification should make the two nearly equal.
print('proportion of targets =',sum(targets[test_index])/len(targets[test_index]))
print('proportion of targets =',sum(targets[train_index])/len(targets[train_index]))

# Materialise the feature/label arrays for each partition.
train_X = transformed_data[train_index]
train_y = targets[train_index]
test_X = transformed_data[test_index]
test_y = targets[test_index]

# Flatten the labels to 1-D: the loss and accuracy ops expect one integer
# class id per example, and a column-shaped label array triggers a shape error.
train_y = column_or_1d(train_y)
test_y = column_or_1d(test_y)

In [None]:
# m: number of training examples; n: number of input features per example
# (n includes the prepended bias column).
m, n = train_X.shape[:2]

In [None]:
from datetime import datetime

# Start from an empty default graph so re-running the notebook does not
# stack a second copy of every op on top of the first.
tf.reset_default_graph()

# UTC timestamp (YYYYMMDDHHMMSS) used to label this run.
now = "{:%Y%m%d%H%M%S}".format(datetime.utcnow())

# Layer widths: two hidden layers and one output unit per class.
n_hidden1, n_hidden2, n_outputs = 100, 300, 2

# Optimisation / regularisation settings.
learning_rate = 0.01
dropout_rate = 0.5  # == 1 - keep_prob



In [None]:
# Event writer for TensorBoard, logging into the 'tf_logs' directory.
# NOTE(review): the graph is handed to the writer here, before any of the
# network ops are defined -- confirm TensorBoard shows the intended graph.
file_writer = tf.summary.FileWriter('tf_logs', tf.get_default_graph())

# Input features: any batch size, n columns.
X = tf.placeholder(tf.float32, shape=(None, n), name="X")
# Integer class labels, one per example. `(None)` is just `None` (shape left
# completely unspecified); `(None,)` declares the intended rank-1 vector so a
# wrongly shaped label array is rejected when it is fed.
y = tf.placeholder(tf.int64, shape=(None,), name="y")


In [None]:
def neuron_layer(X, n_neurons, name, activation=None):
    """Build one fully connected layer inside its own name scope.

    Args:
        X: 2-D input tensor; its second dimension must be statically known
            because it is read to size the weight matrix.
        n_neurons: number of units (output columns) in the layer.
        name: name scope under which the layer's ops are created.
        activation: optional callable applied to the pre-activations.

    Returns:
        ``activation(X @ W + b)`` when an activation is given, otherwise the
        raw pre-activations ``X @ W + b``.
    """
    with tf.name_scope(name):
        fan_in = int(X.get_shape()[1])
        # Truncated-normal weights with stddev 2 / sqrt(fan_in); zero biases.
        weights_init = tf.truncated_normal(
            (fan_in, n_neurons), stddev=2 / np.sqrt(fan_in))
        W = tf.Variable(weights_init, name="kernel")
        b = tf.Variable(tf.zeros([n_neurons]), name="bias")
        Z = tf.matmul(X, W) + b
        return Z if activation is None else activation(Z)

In [None]:
# Network: dropout -> dense(ELU) -> dropout -> dense(ELU) -> dropout -> linear
# output layer producing one logit per class.

# Defaults to False, so dropout is only applied when `training` is explicitly
# fed as True.
training = tf.placeholder_with_default(False, shape=(), name='training')

dropout_rate = 0.5  # == 1 - keep_prob


def _with_dropout(tensor):
    """Apply dropout at `dropout_rate`, gated by the `training` placeholder."""
    return tf.layers.dropout(tensor, dropout_rate, training=training)


X_drop = _with_dropout(X)

with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(
        X_drop, n_hidden1, activation=tf.nn.elu, name="hidden1")
    hidden1_drop = _with_dropout(hidden1)

    hidden2 = tf.layers.dense(
        hidden1_drop, n_hidden2, activation=tf.nn.elu, name="hidden2")
    hidden2_drop = _with_dropout(hidden2)

    # No activation here: the loss applies softmax to these raw logits.
    logits = tf.layers.dense(hidden2_drop, n_outputs, name="outputs")

In [None]:
with tf.name_scope("loss"):
    # Per-example cross-entropy between the integer class labels and the
    # raw (un-normalised) logits.
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=y)
    # Scalar training objective: the mean over the batch.
    loss = tf.reduce_mean(xentropy, name="loss")

In [None]:
with tf.name_scope("train"):
    # Adam with the configured step size; running `training_op` performs one
    # optimisation step on `loss`.
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

In [None]:
with tf.name_scope("eval"):
    # True for each example whose label is the top-scoring class.
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    # Fraction of correct predictions in the fed batch.
    accuracy = tf.reduce_mean(tf.cast(correct, dtype=tf.float32))

In [None]:
# Op that initialises every variable, and a saver for checkpointing them.
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# `file_writer` received the default graph when it was created, before any
# op had been added, so the graph it logged is empty. The graph is complete
# at this point; record it so TensorBoard can display the actual network.
file_writer.add_graph(tf.get_default_graph())

# Per-run log directory name built from the run timestamp.
root_logdir = "tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)

In [None]:
def fetch_batch(epoch, batch_index, batch_size):
    """Draw a reproducible random mini-batch from the training set.

    The random seed is derived from ``(epoch, batch_index)``, so the same pair
    always selects the same rows. Row indices are drawn independently, i.e.
    with replacement, so a batch may contain the same example more than once.

    Reads the module-level names ``m`` (number of training examples),
    ``n_batches`` (batches per epoch), ``train_X`` and ``train_y``.

    Args:
        epoch: zero-based epoch number.
        batch_index: zero-based index of the batch within the epoch.
        batch_size: number of examples to draw.

    Returns:
        Tuple ``(X_batch, y_batch)``: the selected feature rows and their
        labels flattened to 1-D.
    """
    # A private generator yields the same draws as seeding the global one,
    # but does not reset np.random's state for the rest of the notebook.
    rng = np.random.RandomState(epoch * n_batches + batch_index)

    # Pick batch_size row indices uniformly from [0, m).
    indices = rng.randint(m, size=batch_size)

    X_batch = train_X[indices]
    y_batch = column_or_1d(train_y[indices])

    return X_batch, y_batch

In [None]:
n_epochs = 50
batch_size = 100
# Batches per epoch, derived from the training-set size. fetch_batch() seeds
# its sampling with `epoch * n_batches + batch_index`, so this must match the
# number of iterations actually run per epoch; otherwise seeds (and the
# logging step below) repeat across epochs.
n_batches = int(np.ceil(m / float(batch_size)))

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(n_batches):
            X_batch, y_batch = fetch_batch(epoch, iteration, batch_size)
            # Feed training=True so the dropout layers are active during the
            # update; the placeholder defaults to False (inference mode).
            sess.run(training_op,
                     feed_dict={training: True, X: X_batch, y: y_batch})
            if iteration % 100 == 0:
                # Periodically log test accuracy for TensorBoard. `accuracy`
                # evaluates to a plain float, so wrap it in a Summary proto
                # before handing it to the writer.
                step = epoch * n_batches + iteration
                test_accuracy = accuracy.eval(feed_dict={X: test_X, y: test_y})
                accuracy_summary = tf.Summary(value=[
                    tf.Summary.Value(tag="test_accuracy",
                                     simple_value=float(test_accuracy))])
                file_writer.add_summary(accuracy_summary, step)

        # End-of-epoch report, evaluated with dropout off (training defaults
        # to False). Train accuracy is measured on the last mini-batch only.
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: test_X, y: test_y})

        print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)

    # Checkpoint the trained variables.
    save_path = saver.save(sess, "./saver/my_model_final.ckpt")

In [None]:
# Flush any pending events to disk and release the TensorBoard event file.
file_writer.close()