In [None]:
# Notebook setup: plotting magic, shared helper functions and core libraries.
# NOTE(review): IPython discourages %pylab because it pulls numpy/matplotlib
# names into the namespace implicitly; `%matplotlib inline` is the usual
# replacement — confirm nothing relies on the implicit names before switching.
%pylab inline
# Executes helper_functions.py in this session; plot_confusion_matrix and
# get_auc used further down are presumably defined there — TODO confirm.
%run helper_functions.py

import pandas as pd
import seaborn as sns
import numpy as np
import sklearn
# Use seaborn's "whitegrid" style for the plots that follow.
sns.set_style("whitegrid")

# Loading the Titanic Dataset (cleaned)

In [None]:
# Read the cleaned Titanic data from disk
dataset = pd.read_csv("../data/titanic_clean.csv")

# Target vector is the `survived` column; the feature matrix is everything else
y = dataset["survived"]
X = dataset.drop(columns=["survived"])

# Names kept around for printing stats and labelling plots
class_labels = ["Died", "Survived"]
feature_names = X.columns

In [None]:
#printing

# Fitting a model with sklearn
(in this case we fit a Decision tree)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the full dataset.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split; without a seed, equally good splits can be chosen differently
# on each run, so the fitted tree and the metrics derived from it would not
# be reproducible.
model = DecisionTreeClassifier(random_state=0)
model.fit(X, y)


#### Predicting new data...

In [None]:
# Predict the class of the first five rows of X
model.predict(X.iloc[:5])

# Evaluation metrics

In [None]:
# Import the metrics by name instead of `from sklearn.metrics import *`, so it
# is obvious where each function comes from and the namespace stays clean.
# All four names are imported here because the cells below use them.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

# Predict the same data the model was trained on
y_pred = model.predict(X)

# Accuracy on the training data: an optimistic figure, since the model has
# already seen every one of these samples
print ("Accuracy (train set) : ", accuracy_score(y, y_pred))

In [None]:
# Confusion matrix of true vs. predicted labels, classes ordered 0 then 1
cm = confusion_matrix(y, y_pred, labels=[0, 1])
print(cm)

# Draw the matrix with the custom plotting helper, labelled with the class names
plot_confusion_matrix(cm, class_labels)

In [None]:
# Build the text report of per-class metrics, then print it
report = classification_report(y, y_pred)
print(report)

# AUC
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.auc.html

In [None]:
# Predicted probabilities, one column per class
y_probabilities = model.predict_proba(X)

# ROC AUC computed from the probabilities in column 1
positive_scores = y_probabilities[:, 1]
roc_auc_score(y, positive_scores)


In [None]:
# Plot the ROC curve / AUC for column 1 of the probabilities using the
# custom helper (presumably loaded from helper_functions.py)
get_auc(
    y,
    y_probabilities,
    class_labels,
    column=1,
    plot=True,
)

# Properly evaluating with a test set...

#### Splitting in test and train sets, train with train set only

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# Explicit names instead of `from sklearn.metrics import *`
from sklearn.metrics import accuracy_score, classification_report

# Hold out half of the data for testing; the split is seeded so it is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)


#### Train with train set
# random_state is fixed so the fitted tree (and the scores below) do not change
# between runs; scikit-learn permutes the features at every split otherwise.
model  = DecisionTreeClassifier(random_state=0) # max_leaf_nodes=2...10
model.fit(X_train, y_train)

#### Accuracy
print ("Accuracy (training set): ", model.score(X_train, y_train), "\n")

#### Evaluate with test set
# Predict the held-out samples the model has never seen
y_pred = model.predict(X_test)

#### Accuracy
print ("Accuracy (test set): ", accuracy_score(y_test, y_pred), "\n")

#### Classification report
print (classification_report(y_test, y_pred))

####  ROC/AUC
# Getting the probabilities per class
y_probabilities = model.predict_proba(X_test)
# Custom plot function
get_auc(y_test, y_probabilities, class_labels, column=1, plot=True) # Help function

# Cross validating
## Cross-validation score

In [None]:
from sklearn.model_selection import cross_val_score

# Seeded so that the per-fold scores are identical on every run; an unseeded
# tree can resolve tied splits differently each time it is fitted.
model = DecisionTreeClassifier(random_state=0) # we can now play with max_depth= 1, 10, 15

# One accuracy value per fold of a 10-fold cross-validation
scores = cross_val_score(model, X, y, cv=10)

print(scores)
# Mean accuracy across folds, with two standard deviations as the spread
print("Mean fold accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

## Cross-validation predict
Predict the whole dataset in a cross-validated way: every sample is predicted by a model that did not see it during training, so the scores are not inflated by overfitting.

In [None]:
from sklearn.model_selection import cross_val_predict

# Seeded on purpose: cross_val_predict is called twice below (labels, then
# probabilities) and refits the model each time. Without a fixed random_state
# the two calls could grow different trees, so the report and the ROC curve
# would describe different models.
model = DecisionTreeClassifier(random_state=0) # we can now play with max_depth= 1, 10, 15

####  We use cross-validation to predict all the data (no tuned model yet).
y_pred = cross_val_predict(model, X, y)

print ("Accuracy (cross-validated): ", accuracy_score(y, y_pred))

####  Classification report
print (classification_report(y, y_pred))

####  ROC/AUC
# Getting the out-of-fold probabilities per class
y_probabilities = cross_val_predict(model, X, y, method='predict_proba')
# Custom plot function
get_auc(y, y_probabilities, class_labels, column=1, plot=True) # Help function


# Parameter tuning and Cross Validating

In [None]:
from sklearn.model_selection import GridSearchCV
#### The parameters to tune (as a dictionary name:values_to_try)
# NOTE(review): max_features up to 9 assumes X has at least 9 columns — confirm.
params = { 
           'max_leaf_nodes':  range(2,10),
           'max_features'  :  range(1,10)
         }

#### Grid search
# Seeded so the search is reproducible: with max_features below the number of
# columns the tree samples candidate features at random, so an unseeded run
# can pick a different "best" combination each time.
model = DecisionTreeClassifier(random_state=0)
grid = GridSearchCV(estimator=model, cv = 10, param_grid=params )
grid.fit(X, y)

####  Summarize the results of the grid search
# best_params_ holds just the winning parameter combination, which is what the
# label promises (best_estimator_ is the refitted model object itself).
print("Best parameters: ", grid.best_params_)
print("Best score: ", grid.best_score_)

## Evaluating further the best found model
(cross validation using the best model)

In [None]:
# Explicit names instead of `from sklearn.metrics import *`
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_predict

#### Get the best model from grid search (previous run)
model = grid.best_estimator_

#### FOLLOW THE SAME PROCESS AS BEFORE
#### We use cross-validation to predict all the data with the best model. 
# Caveat: the parameters were tuned on this same data, so these scores are
# still somewhat optimistic.
y_pred = cross_val_predict(model, X, y)

#### Accuracy
print ("Accuracy (cross-validated): ", accuracy_score(y, y_pred))

####  Classification report
print (classification_report(y, y_pred))

####  ROC/AUC
# Getting the out-of-fold probabilities per class
y_probabilities = cross_val_predict(model, X, y, method='predict_proba')
# Custom plot function
get_auc(y, y_probabilities, class_labels, column=1, plot=True) # Help function

# Parameter tuning with final test-set evaluation.
- Cross validation to select the best model
- Test-set at the end to report the final accuracy

In [None]:
from sklearn.model_selection import GridSearchCV
# NOTE(review): max_features up to 9 assumes X has at least 9 columns — confirm.
params = { 
           'max_leaf_nodes':  range(2,10),
           'max_features'  :  range(1,10)
         }

#### Split the data. 
#### Train set to find the best model using grid search
#### Test set to report the final accuracy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

####  Grid search
# Seeded so the tuning result is reproducible: with max_features below the
# number of columns the tree samples candidate features at random.
model = DecisionTreeClassifier(random_state=0)
grid = GridSearchCV(estimator=model, cv = 10, param_grid=params )
grid.fit(X_train, y_train)
print("Best cross validated accuracy after tuning: ", grid.best_score_)

#### 
#### After tuning, let's see the performance on a separate test set
y_pred = grid.best_estimator_.predict(X_test)

#### Accuracy on test set
print ("Accuracy (test set): ", accuracy_score(y_test, y_pred))
# Classification report
print (classification_report(y_test, y_pred))

####  ROC/AUC on test set
# Getting the probabilities per class
y_probabilities =  grid.best_estimator_.predict_proba(X_test)
# Custom plot function
get_auc(y_test, y_probabilities, class_labels, column=1, plot=True) # Help function


# Feature importance

In [None]:
# Importance values stored on the model selected by the grid search
best_tree = grid.best_estimator_
best_tree.feature_importances_

In [None]:
# Label each importance with its feature name and rank them, highest first.
# The names are taken from X_train (the frame the grid search above was fitted
# on) rather than the global X: X is re-bound to the wine data later in this
# notebook, so re-running this cell afterwards would otherwise fail with an
# index-length mismatch.
important_features = pd.Series(grid.best_estimator_.feature_importances_, index=X_train.columns)
important_features.sort_values(ascending=False)


# How to handle class imbalance


In [None]:
# Download the red-wine quality data; the file is semicolon-separated
wine_df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
    sep=";",
)

In [None]:
# Peek at five randomly chosen rows
wine_df.sample(n=5)

In [None]:
# How many wines fall into each quality value
wine_df["quality"].value_counts()

## Let's make it more imbalanced than it is...

In [None]:
# Collapse the quality score into a binary target: 1 for scores >= 8, else 0.
# The column is overwritten in place, so a blind re-run of this cell would
# compare the already-binary 0/1 values against 8 and silently turn every
# label into 0. Convert only while the column is not yet binary.
if not wine_df["quality"].isin([0, 1]).all():
    wine_df["quality"] = [1 if q >= 8 else 0 for q in wine_df.quality]

In [None]:
# Class balance of the quality column after the previous step
wine_df["quality"].value_counts()

In [None]:
# prepare the data: features are every column except `quality`, which is the target
X = wine_df.drop('quality', axis =1)
y = wine_df.quality
# Shuffle X and y together. The shuffle is seeded because the row order decides
# which samples land in which cross-validation fold further down; an unseeded
# shuffle would change the reported scores on every run.
X, y = sklearn.utils.shuffle(X, y, random_state=0)

In [None]:
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression

# Plain logistic regression; swap in LogisticRegression(class_weight='balanced')
# to see the effect of re-weighting the classes
model = LogisticRegression()

# Cross-validated prediction for every sample
y_pred = cross_val_predict(model, X, y)

#### Accuracy
cv_accuracy = accuracy_score(y, y_pred)
print("Accuracy (cross-validated): ", cv_accuracy)

####  Classification report
print(classification_report(y, y_pred))

# Confusion matrix (classes ordered 0 then 1), drawn with the custom plotting helper
wine_labels = ["Bad/Average Wine", "Great Wine"]
cm = confusion_matrix(y, y_pred, labels=[0, 1])
plot_confusion_matrix(cm, wine_labels)

In [None]:
####  ROC/AUC (cross-validated)
# Out-of-fold class probabilities for every sample
y_probabilities = cross_val_predict(model, X, y, method="predict_proba")

# Plot the ROC curve / AUC for column 1 with the custom helper
auc_labels = [0, 1]
get_auc(y, y_probabilities, auc_labels, column=1, plot=True)
