# Imports

In [2]:
##########
# basics #
##########

import warnings
warnings.filterwarnings('ignore')
import collections
import datetime
import glob
import hashlib
import itertools
import math
import operator
import os
import pickle
import random
import re
import string
import sys
import time

###########
# science #
###########

import scipy as sp
import numpy as np
import pandas as pd
rseed = random.seed(42)

######
# ml #
######

import tensorflow as tf
import tensorflow.contrib.learn as tfsk

###################
# sklearn tooling #
###################

from sklearn import decomposition
from sklearn import preprocessing
from sklearn import metrics
from sklearn import model_selection
from sklearn import grid_search
from sklearn import pipeline
from sklearn import feature_selection

#################
# visualization #
#################

# plotly
import plotly.plotly as py
import plotly.tools as tls
from plotly.graph_objs import *
import cufflinks as cf
tls.set_credentials_file(username=os.environ.get('PLOTLY_USERNAME'), api_key=os.environ.get('PLOTLY_APIKEY'))
cf.set_config_file(offline=False, world_readable=True, theme='pearl')

# matplotlib
import matplotlib.pyplot as plt
import matplotlib as mpl
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'svg')
mpl.rcParams['figure.figsize']=(12.0,4.0)
%matplotlib inline

############
# sys info #
############

%reload_ext watermark
%watermark -a "Ken Cavagnolo" -n -u -v -m -h -g -p numpy,scipy,pandas,sklearn,\
matplotlib,plotly

Ken Cavagnolo 
last updated: Tue Oct 04 2016 

CPython 2.7.12
IPython 5.1.0

numpy 1.11.1
scipy 0.18.1
pandas 0.19.0
sklearn 0.18
matplotlib 1.5.3
plotly 1.12.9

compiler   : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)
system     : Linux
release    : 4.4.0-38-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 4
interpreter: 64bit
host name  : ubuntu
Git hash   : e59184294aa18096437f6b3c6c910045b2a2f6d1


# Data

In [3]:
df = pd.read_csv('/home/kcavagnolo/ml_fun/datasets/titanic_train.csv')

In [4]:
df.shape

(891, 12)

In [5]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
y, X = df['Survived'], df[['Age', 'SibSp', 'Fare']].fillna(0)
X_tr, X_te, y_tr, y_te = model_selection.train_test_split(X, y, test_size=0.2, random_state=rseed)

# DNN w/ ReLu

In [11]:
# build dnn
classifier = tfsk.DNNClassifier(feature_columns=tfsk.infer_real_valued_columns_from_input(X_tr),
                                hidden_units=[10, 20, 10],
                                n_classes=2)



**BELOW IS BROKEN, DON'T RUN**

Crashes system after core dump

In [45]:
# run grid search
param_grid = {"optimizer": [tf.train.GradientDescentOptimizer(learning_rate=0.01),
                            tf.train.GradientDescentOptimizer(learning_rate=0.03)],
              "hidden_units": [[10,20,10], [10,50,20]]}

# actual search
from sklearn.grid_search import GridSearchCV as GSC
grid_search = GSC(classifier, param_grid=param_grid, scoring = 'accuracy', verbose=5, n_jobs=-1, cv=2)
grid_search.fit(X_tr, y_tr)

# results
print(grid_search)
print(grid_search.best_score_)
print(grid_search.best_params_)

In [12]:
# Fit and predict
classifier.fit(X_tr, y_tr, batch_size=128, steps=500)



DNNClassifier(hidden_units=[10, 20, 10], dropout=None, optimizer=None, feature_columns=[_RealValuedColumn(column_name='', dimension=3, default_value=None, dtype=tf.float32)])

In [13]:
predictions = classifier.predict(X_te)
score = metrics.accuracy_score(y_te, predictions)
print('Accuracy: {0:f}'.format(score))

Accuracy: 0.597765


# DNN w/ tanh

In [22]:
from tensorflow.contrib import layers

def dnn_tanh(features, target):
    target = tf.one_hot(target, 2, 1.0, 0.0)
    logits = layers.stack(features, layers.fully_connected, [10, 20, 10],
                          activation_fn=tf.tanh)
    prediction, loss = tfsk.models.logistic_regression(logits, target)
    train_op = layers.optimize_loss(loss,
                                    tf.contrib.framework.get_global_step(),
                                    optimizer='SGD',
                                    learning_rate=0.05)
    return tf.argmax(prediction, dimension=1), loss, train_op

In [23]:
classifier = tfsk.Estimator(model_fn=dnn_tanh)
classifier.fit(X_tr, y_tr, batch_size=128, steps=100)
print("Accuracy: %f" % score)



Accuracy: 0.597765


# Digits

In [25]:
from sklearn import datasets
digits = datasets.load_digits()
X = digits.images
y = digits.target

In [26]:
X_tr, X_te, y_tr, y_te = model_selection.train_test_split(X, y, test_size=0.33, random_state=rseed)

In [31]:
def conv_model(features, target):
    target = tf.one_hot(target, 10, 1.0, 0.0)
    features = tf.expand_dims(features, 3)
    features = tf.reduce_max(layers.conv2d(features, 12, [3, 3]), [1, 2])
    features = tf.reshape(features, [-1, 12])
    prediction, loss = tfsk.models.logistic_regression(features, target)
    train_op = layers.optimize_loss(loss,
                                    tf.contrib.framework.get_global_step(),
                                    optimizer='SGD',
                                    learning_rate=0.01)
    return tf.argmax(prediction, dimension=1), loss, train_op

In [35]:
classifier = tfsk.Estimator(model_fn=conv_model)
classifier.fit(X_tr, y_tr, steps=1000, batch_size=128)
score = metrics.accuracy_score(classifier.predict(X_te), y_te)
print('Accuracy: %f' % score)



Accuracy: 0.648148
