In [1]:
import random
import pandas as pd
import numpy as np
import tensorflow as tf
import skflow

from sklearn.linear_model import LogisticRegression
from sklearn.utils import check_array
from sklearn.cross_validation import train_test_split
from sklearn import metrics, cross_validation

random.seed(42)

In [43]:
%reload_ext watermark
%watermark -a "Ken Cavagnolo" -n -u -v -m -h -g -p numpy,pandas,tensorflow,skflow,scikit-learn

Ken Cavagnolo 
Last updated: Fri Mar 04 2016 

CPython 2.7.10
IPython 4.0.3

numpy 1.10.4
pandas 0.17.1
tensorflow 0.6.0
skflow 0.1.0
scikit-learn 0.17

compiler   : GCC 5.2.1 20151010
system     : Linux
release    : 4.2.0-23-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 4
interpreter: 64bit
host name  : ubuntu
Git hash   : f8c2c79984809a1a8d0d0f33f6556d59b9d385bd


In [25]:
data = pd.read_csv('data/titanic_train.csv')
X = data[['Age', 'SibSp', 'Fare']].fillna(0)
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
print("Accuracy: {0}".format(metrics.accuracy_score(lr.predict(X_test), y_test)))
print("ROC: {0}".format(metrics.roc_auc_score(lr.predict(X_test), y_test)))

Accuracy: 0.664804469274
ROC: 0.730746960046


In [27]:
# Linear classifier.
tflr = skflow.TensorFlowLinearClassifier(n_classes=2,
                                         batch_size=128,
                                         steps=500,
                                         learning_rate=0.05)
tflr.fit(X_train, y_train)
print("Accuracy: {0}".format(metrics.accuracy_score(tflr.predict(X_test), y_test)))
print("ROC: {0}".format(metrics.roc_auc_score(tflr.predict(X_test), y_test)))

Step #1, avg. loss: 9.65833
Step #51, epoch #8, avg. loss: 3.02035
Step #101, epoch #16, avg. loss: 3.04305
Step #151, epoch #25, avg. loss: 2.88455
Step #201, epoch #33, avg. loss: 2.90514
Step #251, epoch #41, avg. loss: 2.92437
Step #301, epoch #50, avg. loss: 2.86000
Step #351, epoch #58, avg. loss: 2.88256
Step #401, epoch #66, avg. loss: 2.85180
Step #451, epoch #75, avg. loss: 2.86843
Accuracy: 0.608938547486
ROC: 0.651775147929


In [28]:
# 3 layer neural network with rectified linear activation.
classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
                                            n_classes=2,
                                            batch_size=128,
                                            steps=500,
                                            learning_rate=0.05)
classifier.fit(X_train, y_train)
print("Accuracy: {0}".format(metrics.accuracy_score(classifier.predict(X_test), y_test)))
print("ROC: {0}".format(metrics.roc_auc_score(classifier.predict(X_test), y_test)))

Step #1, avg. loss: 10.06707
Step #51, epoch #8, avg. loss: 0.73915
Step #101, epoch #16, avg. loss: 0.62324
Step #151, epoch #25, avg. loss: 0.62795
Step #201, epoch #33, avg. loss: 0.62312
Step #251, epoch #41, avg. loss: 0.62120
Step #301, epoch #50, avg. loss: 0.61702
Step #351, epoch #58, avg. loss: 0.61531
Step #401, epoch #66, avg. loss: 0.61496
Step #451, epoch #75, avg. loss: 0.61467
Accuracy: 0.653631284916
ROC: 0.69435483871


In [29]:
# 3 layer neural network with hyperbolic tangent activation.
def dnn_tanh(X, y):
    layers = skflow.ops.dnn(X, [10, 20, 10], tf.tanh)
    return skflow.models.logistic_regression(layers, y)

In [31]:
classifier = skflow.TensorFlowEstimator(model_fn=dnn_tanh,
                                        n_classes=2,
                                        batch_size=128,
                                        steps=500,
                                        learning_rate=0.05)
classifier.fit(X_train, y_train)
print("Accuracy: {0}".format(metrics.accuracy_score(classifier.predict(X_test), y_test)))
print("ROC: {0}".format(metrics.roc_auc_score(classifier.predict(X_test), y_test)))

Step #1, avg. loss: 0.68093
Step #51, epoch #8, avg. loss: 0.64784
Step #101, epoch #16, avg. loss: 0.63315
Step #151, epoch #25, avg. loss: 0.63321
Step #201, epoch #33, avg. loss: 0.62441
Step #251, epoch #41, avg. loss: 0.61640
Step #301, epoch #50, avg. loss: 0.61377
Step #351, epoch #58, avg. loss: 0.61024
Step #401, epoch #66, avg. loss: 0.61550
Step #451, epoch #75, avg. loss: 0.61014
Accuracy: 0.692737430168
ROC: 0.695760799484


In [35]:
data = pd.read_csv('data/titanic_train.csv')
X = data[['Embarked']]
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
embarked_classes = X_train["Embarked"].unique()
print('Embarked has next classes: ', embarked_classes)

cat_processor = skflow.preprocessing.CategoricalProcessor()
X_train = np.array(list(cat_processor.fit_transform(X_train)))
X_test = np.array(list(cat_processor.transform(X_test)))

# Total number of classes for this variable from Categorical Processor.
# Includes unknown token and unique classes for variable.
n_classes = len(cat_processor.vocabularies_[0])

# embeddings
EMBEDDING_SIZE = 3

def categorical_model(X, y):
    features = skflow.ops.categorical_variable(
        X, n_classes, embedding_size=EMBEDDING_SIZE, name='embarked')
    return skflow.models.logistic_regression(tf.squeeze(features, [1]), y)

('Embarked has next classes: ', array(['S', 'C', 'Q', nan], dtype=object))


In [33]:
classifier = skflow.TensorFlowEstimator(model_fn=categorical_model,
                                        n_classes=2)
classifier.fit(X_train, y_train)
print("Accuracy: {0}".format(metrics.accuracy_score(classifier.predict(X_test), y_test)))
print("ROC: {0}".format(metrics.roc_auc_score(classifier.predict(X_test), y_test)))

('Embarked has next classes: ', array(['S', 'C', 'Q', nan], dtype=object))
Step #1, avg. loss: 0.76733
Step #21, avg. loss: 0.67197
Step #41, epoch #1, avg. loss: 0.67355
Step #61, epoch #2, avg. loss: 0.65313
Step #81, epoch #3, avg. loss: 0.65903
Step #101, epoch #4, avg. loss: 0.65579
Step #121, epoch #5, avg. loss: 0.64565
Step #141, epoch #6, avg. loss: 0.65923
Step #161, epoch #7, avg. loss: 0.65823
Step #181, epoch #7, avg. loss: 0.65129
Accuracy: 0.625698324022
ROC: 0.610550615595


In [37]:
def one_hot_categorical_model(X, y):
    features = skflow.ops.one_hot_matrix(X, n_classes)
    return skflow.models.logistic_regression(tf.squeeze(features, [1]), y)

In [38]:
classifier = skflow.TensorFlowEstimator(model_fn=one_hot_categorical_model,
    n_classes=2, steps=1000, learning_rate=0.01)
classifier.fit(X_train, y_train)

print("Accuracy: {0}".format(metrics.accuracy_score(classifier.predict(X_test), y_test)))
print("ROC: {0}".format(metrics.roc_auc_score(classifier.predict(X_test), y_test)))

Step #1, avg. loss: 0.82722
Step #101, epoch #4, avg. loss: 0.82588
Step #201, epoch #8, avg. loss: 0.68599
Step #301, epoch #13, avg. loss: 0.66034
Step #401, epoch #17, avg. loss: 0.65153
Step #501, epoch #21, avg. loss: 0.65113
Step #601, epoch #26, avg. loss: 0.65095
Step #701, epoch #30, avg. loss: 0.65017
Step #801, epoch #34, avg. loss: 0.65184
Step #901, epoch #39, avg. loss: 0.64966
Accuracy: 0.625698324022
ROC: 0.610550615595
