In [146]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [147]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [190]:
from pandas.tools.plotting import scatter_matrix

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression

from sklearn.dummy import DummyClassifier

from sklearn.model_selection import ShuffleSplit

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import confusion_matrix

# Class, for use in pipelines, to select certain columns from a DataFrame and convert to a numpy array
# From A. Geron: Hands-On Machine Learning with Scikit-Learn & TensorFlow, O'Reilly, 2017
# Modified by Derek Bridge to allow for casting in the same ways as pandas.DatFrame.astype
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names, dtype=None):
        self.attribute_names = attribute_names
        self.dtype = dtype
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_selected = X[self.attribute_names]
        if self.dtype:
            return X_selected.astype(self.dtype).values
        return X_selected.values

# Class, for use in pipelines, to binarize nominal-valued features (while avoiding the dummy variabe trap)
# By Derek Bridge, 2017
class FeatureBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, features_values):
        self.features_values = features_values
        self.num_features = len(features_values)
        self.labelencodings = [LabelEncoder().fit(feature_values) for feature_values in features_values]
        self.onehotencoder = OneHotEncoder(sparse=False,
            n_values=[len(feature_values) for feature_values in features_values])
        self.last_indexes = np.cumsum([len(feature_values) - 1 for feature_values in self.features_values])
    def fit(self, X, y=None):
        for i in range(0, self.num_features):
            X[:, i] = self.labelencodings[i].transform(X[:, i])
        return self.onehotencoder.fit(X)
    def transform(self, X, y=None):
        for i in range(0, self.num_features):
            X[:, i] = self.labelencodings[i].transform(X[:, i])
        onehotencoded = self.onehotencoder.transform(X)
        return np.delete(onehotencoded, self.last_indexes, axis=1)
    def fit_transform(self, X, y=None):
        onehotencoded = self.fit(X).transform(X)
        return np.delete(onehotencoded, self.last_indexes, axis=1)
    def get_params(self, deep=True):
        return {"features_values" : self.features_values}
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            self.setattr(parameter, value)
        return self

In [149]:
df = pd.read_csv('CS4618Resources/datasets/dataset_alcohol.csv')

In [150]:
df.shape

(76, 9)

In [151]:
df.columns

Index(['age_yrs', 'height_cm', 'weight_kg', 'duration_mins', 'elapsed_mins',
       'sex', 'last_meal', 'units', 'over_limit'],
      dtype='object')

We can see that some numeric values appear as nominal values (maybe missing values): duration_mins, elapsed_mins

In [152]:
df.dtypes

age_yrs            int64
height_cm          int64
weight_kg          int64
duration_mins     object
elapsed_mins      object
sex               object
last_meal         object
units            float64
over_limit        object
dtype: object

In [153]:
df[:10]

Unnamed: 0,age_yrs,height_cm,weight_kg,duration_mins,elapsed_mins,sex,last_meal,units,over_limit
0,40,170,75,?,?,Male,Lunch,0.0,No
1,26,177,76,60,10,Male,Full,2.9,No
2,24,160,60,60,10,Female,Full,2.6,No
3,29,160,63,90,10,Female,Full,1.2,No
4,23,182,63,120,10,Male,Full,5.2,No
5,19,165,51,120,10,Female,Lunch,5.2,Yes
6,20,185,66,120,10,Male,Snack,5.2,No
7,23,185,84,150,10,Male,Lunch,15.0,No
8,23,170,66,60,30,Male,?,4.5,No
9,18,177,82,60,10,Male,Full,2.6,No


In [154]:
df.describe(include='all')

Unnamed: 0,age_yrs,height_cm,weight_kg,duration_mins,elapsed_mins,sex,last_meal,units,over_limit
count,76.0,76.0,76.0,76.0,76.0,76,76,76.0,76
unique,,,,16.0,9.0,2,5,,2
top,,,,120.0,10.0,Male,Full,,No
freq,,,,15.0,61.0,60,33,,46
mean,22.657895,176.644737,71.486842,,,,,8.632895,
std,5.627439,8.453329,11.474602,,,,,5.775567,
min,18.0,157.0,47.0,,,,,0.0,
25%,19.0,172.0,63.0,,,,,4.275,
50%,21.0,177.0,72.0,,,,,8.4,
75%,23.0,182.0,79.0,,,,,12.1,


In [155]:
(df['duration_mins'] == '?').sum()

2

In [161]:
(df['elapsed_mins'] == '?').sum()

2

In [187]:
(df['last_meal'] == '?').sum()

1

In [188]:
df[df['units'] == 0]

Unnamed: 0,age_yrs,height_cm,weight_kg,duration_mins,elapsed_mins,sex,last_meal,units,over_limit
0,40,170,75,?,?,Male,Lunch,0.0,No
31,18,177,57,?,?,Female,Full,0.0,No


Imputing these values would be a bad idea, since 
    1. they are not drinking 
    2. we should put duration 0, but imputing elapsed_mins does not make sense (how much since last drink)??


In [164]:
df_clean = (df[(df['duration_mins'] != '?') & (df['elapsed_mins'] != '?') & (df['last_meal'] != '?')]).copy()
df_clean.reset_index(inplace=True, drop=True)

In [165]:
df_clean[:10]

Unnamed: 0,age_yrs,height_cm,weight_kg,duration_mins,elapsed_mins,sex,last_meal,units,over_limit
0,26,177,76,60,10,Male,Full,2.9,No
1,24,160,60,60,10,Female,Full,2.6,No
2,29,160,63,90,10,Female,Full,1.2,No
3,23,182,63,120,10,Male,Full,5.2,No
4,19,165,51,120,10,Female,Lunch,5.2,Yes
5,20,185,66,120,10,Male,Snack,5.2,No
6,23,185,84,150,10,Male,Lunch,15.0,No
7,18,177,82,60,10,Male,Full,2.6,No
8,18,177,82,60,10,Male,Full,2.4,No
9,25,177,71,60,10,Male,Snack,10.0,No


In [166]:
df_clean.shape

(73, 9)

In [167]:
df_clean['duration_mins'] = df_clean['duration_mins'].astype(int)
df_clean['elapsed_mins'] = df_clean['elapsed_mins'].astype(int)

In [168]:
df_clean.describe(include='all')

Unnamed: 0,age_yrs,height_cm,weight_kg,duration_mins,elapsed_mins,sex,last_meal,units,over_limit
count,73.0,73.0,73.0,73.0,73.0,73,73,73.0,73
unique,,,,,,2,4,,2
top,,,,,,Male,Full,,No
freq,,,,,,58,32,,43
mean,22.479452,176.821918,71.712329,157.39726,16.369863,,,8.926027,
std,5.336222,8.554308,11.558402,100.588924,25.539461,,,5.687926,
min,18.0,157.0,47.0,5.0,5.0,,,1.2,
25%,19.0,172.0,63.0,90.0,10.0,,,4.8,
50%,21.0,177.0,72.0,120.0,10.0,,,9.1,
75%,23.0,182.0,79.0,240.0,10.0,,,12.4,


We could delete the whacky values from elapsed_mins and duration_mins

In [170]:
df_clean = df_clean.take(np.random.permutation(len(df_clean)))
df_clean.reset_index(drop=True, inplace=True)

# Scaling is MANDATORY

In [171]:
numeric_features = ['age_yrs', 'height_cm', 'weight_kg', 'duration_mins', 'elapsed_mins', 'units']
nominal_features = ['sex', 'last_meal']

numeric_pipeline = Pipeline([
    ('selector', DataFrameSelector(numeric_features)),
    ('scaler', StandardScaler())
])

nominal_pipeline = Pipeline([
    ('selector', DataFrameSelector(nominal_features)),
    ('binarizer', FeatureBinarizer([df_clean[feature].unique() for feature in nominal_features])),
])

pipeline = Pipeline([
    ('union', FeatureUnion([
        ('nominal_pipeline', nominal_pipeline),
        ('numeric_pipeline', numeric_pipeline),
    ])),
    ('classfier', LogisticRegression())
])

In [172]:
features = ['age_yrs', 'height_cm', 'weight_kg', 'duration_mins', 'elapsed_mins', 'units']

dummy_pipeline = Pipeline([
    ('selector', DataFrameSelector(features)),
    ('dummy', DummyClassifier(strategy='most_frequent')),
])

In [173]:
encoder = LabelEncoder()
y = df_clean['over_limit'].values
y_encoded = encoder.fit_transform(y)

Using Stratified Kfold with k = 3 to try to put around 30 elements in each fold

In [189]:
print('classifier accuracy:', np.mean(cross_val_score(pipeline, df_clean, y_encoded, scoring='accuracy', cv=3)))
print('dummy classifier accuracy:', np.mean(cross_val_score(dummy_pipeline, df_clean, y_encoded, scoring='accuracy', cv=3)))

classifier accuracy: 0.821111111111
dummy classifier accuracy: 0.588888888889


In [186]:
y_predicted = cross_val_predict(pipeline, df_clean, y_encoded, cv=3) # NB cross-val_predict, not cross_val_score
confusion_matrix(y_encoded, y_predicted)

array([[36,  7],
       [ 6, 24]])

Holdout, do it multiple times (4 in this case)

In [198]:
ss = ShuffleSplit(n_splits=40, train_size=0.8)
np.mean(cross_val_score(pipeline, df_clean, y_encoded, scoring='accuracy', cv=ss))



0.796875