In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf
import random as rn

import pandas as pd
from matplotlib import pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

tf.logging.set_verbosity(tf.logging.INFO)

seed = 42
np.random.seed(seed)
rn.seed(seed)
TRAIN_FILE_PATH = "data/X_train.csv"
TARGET_FILE_PATH =  "data/y_train.csv"
TEST_FILE_PATH = "data/X_test.csv"

#print(tf.VERSION)

In [13]:
def model_fn(features, labels, mode):
    
    n_features = 1000
    n_classes = 3
    lambda_reg = 2.0
    
    
    # Input Layer
    # Reshape X to 2-D tensor: [batch_size, n_features]
    input_layer = tf.reshape(features["x"], [-1, n_features])
    

    # Input Tensor Shape: [batch_size, 1000]
    # Output Tensor Shape: [batch_size, 32]
    dense1 = tf.layers.dense(inputs=input_layer, 
                             units=32, 
                             activation=tf.nn.relu,
                             kernel_regularizer = tf.contrib.layers.l2_regularizer(scale=lambda_reg))
    
    # Input Tensor Shape: [batch_size, 32]
    # Output Tensor Shape: [batch_size, 16]
    dense2 = tf.layers.dense(inputs=dense1, 
                             units=16, 
                             activation=tf.nn.relu)
    
    # Input Tensor Shape: [batch_size, 16]
    # Output Tensor Shape: [batch_size, 8]
    dense3 = tf.layers.dense(inputs=dense2, 
                             units=8, 
                             activation=tf.nn.relu)
    
    # Input Tensor Shape: [batch_size, 8]
    # Output Tensor Shape: [batch_size, 4]
    dense4 = tf.layers.dense(inputs=dense3, 
                             units=4, 
                             activation=tf.nn.relu)
    
    # Logits layer
    # Input Tensor Shape: [batch_size, 4]
    # Output Tensor Shape: [batch_size, 3]
    logits = tf.layers.dense(inputs=dense4, units=n_classes)

    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    
    onehot_labels = tf.one_hot(labels, depth=n_classes)
    # Calculate Loss (for both TRAIN and EVAL modes)
    
    # V1 Unweighted Loss
    unweighted_loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    
    # V2 Weighted Loss
    class_weights = tf.constant([7.0, 1.0, 7.0])    
    sample_weights=tf.gather(class_weights, labels)
    weighted_loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, 
                                                            logits=logits, 
                                                            weights=sample_weights)
    
    loss = weighted_loss # use the weighted loss
    
    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
          
        train_op = optimizer.minimize( 
            loss=loss, 
            global_step=tf.train.get_global_step())
    
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    
    
    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
      "accuracy": tf.metrics.accuracy(labels=labels, 
                                      predictions=predictions["classes"]),
      "bmac": tf.metrics.mean_per_class_accuracy(labels=labels, 
                                                 predictions=predictions["classes"],
                                                 num_classes=n_classes)
    }
    
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

In [14]:
train_data = pd.read_csv(TRAIN_FILE_PATH)
train_data.drop("id", axis=1, inplace=True)

train_labels = pd.read_csv(TARGET_FILE_PATH).astype(int)
train_labels.drop("id", axis=1, inplace = True)

test_data =  pd.read_csv(TEST_FILE_PATH)
id_test = test_data.columns[0]
test_data.drop("id", axis=1, inplace=True)


train_data_scaled, test_data_scaled = scale_data(X_train=train_data, y_train=train_labels, X_test=test_data)
train_labels = train_labels.values

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

for train_idx, valid_idx in kfold.split(train_data_scaled, train_labels):
    X_train = train_data_scaled[train_idx]
    y_train = train_labels[train_idx]
    
    X_valid = train_data_scaled[valid_idx]
    y_valid = train_labels[valid_idx]
    valid_size = y_valid.shape[0]
    
    print(f"Training Set:\t X_train={X_train.shape} {X_train.dtype}\t y_train={y_train.shape} {y_train.dtype}\t  Class Distribution:  {np.bincount(y_train.astype('int64')[:,0])}")
    print(f"Validation Set:\t X_valid={X_valid.shape} {X_valid.dtype}\t y_valid={y_valid.shape} {y_valid.dtype} \t  Class Distribution:  {np.bincount(y_valid.astype('int64')[:,0])}")
   
    # Create the Estimator
    estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir="./tmp/task2_model11")
          
    # Setup logging hook
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=50)
    
    # Define Training Data Input Function
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": X_train},
                                                        y=y_train,
                                                        batch_size=32,
                                                        num_epochs=20,
                                                        shuffle=True)
       
    # Define Validationg Data Input Function   
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": X_valid},
                                                       y=y_valid,
                                                       #batch_size=valid_size,
                                                       num_epochs=None, # must be kept like this
                                                       shuffle=False)
          
    evaluator = tf.contrib.estimator.InMemoryEvaluatorHook(estimator=classifier, 
                                                           input_fn=eval_input_fn, 
                                                           every_n_iter=200, 
                                                           steps=1000)
          
    estimator.train(input_fn=train_input_fn, steps=500, hooks=[logging_hook, evaluator])
          
    break
          
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": X_valid}, shuffle=False)
 
          

          
    #p = classifier.predict(input_fn=predict_input_fn)
    #print(f"Prediction: ")          
    
          
    #y_pred_lst = []

    #for x in p:
     #   y_pred_lst.append(x['classes'])
          
    #y_pred =  np.asarray(y_pred_lst)
          
    #cur_BMAC = balanced_accuracy_score(y_valid, y_pred) 
    #print(f"Cur BMAC {cur_BMAC}")    
    
    break
    

X_train: (4800, 1000)
y_train: (4800, 1)
X_test: (4100, 1000)
Training Set:	 X_train=(4320, 1000) float64	 y_train=(4320, 1) int32	  Class Distribution:  [ 540 3240  540]
Validation Set:	 X_valid=(480, 1000) float64	 y_valid=(480, 1) int32 	  Class Distribution:  [ 60 360  60]
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './tmp/task2_model11', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002589DF28550>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
I

INFO:tensorflow:Evaluation [600/1000]
INFO:tensorflow:Evaluation [700/1000]
INFO:tensorflow:Evaluation [800/1000]
INFO:tensorflow:Evaluation [900/1000]
INFO:tensorflow:Evaluation [1000/1000]
INFO:tensorflow:Finished evaluation at 2018-10-23-13:27:44
INFO:tensorflow:Saving dict for global step 7: accuracy = 0.71664065, bmac = 0.38338962, global_step = 7, loss = 2.5446887
INFO:tensorflow:Starting evaluation at 2018-10-23-13:27:44
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [100/1000]
INFO:tensorflow:Evaluation [200/1000]
INFO:tensorflow:Evaluation [300/1000]
INFO:tensorflow:Evaluation [400/1000]
INFO:tensorflow:Evaluation [500/1000]
INFO:tensorflow:Evaluation [600/1000]
INFO:tensorflow:Evaluation [700/1000]
INFO:tensorflow:Evaluation [800/1000]
INFO:tensorflow:Evaluation [900/1000]
INFO:tensorflow:Evaluation [1000/1000]
INFO:tensorflow:Finished evaluation at 2018-10-23-13:27:46
INFO:ten

KeyboardInterrupt: 

In [3]:
## ================ FUNCTION DEFS ================ ##

#Zero mean unit variance for train and test data
def scale_data(X_train, y_train, X_test):
    
    print(f"X_train: {X_train.shape}")
    print(f"y_train: {y_train.shape}")
    print(f"X_test: {X_test.shape}")
    
    scaler = StandardScaler().fit(X_train, y_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
   
    return X_train_scaled, X_test_scaled

def make_submission(filename, predictions):
    test_data =  pd.read_csv(TEST_FILE_PATH)
    test_data["y"] = predictions
    test_data[["id", "y"]].to_csv("submissions/"+filename, index= False)