In [None]:
##################################################
### Author: Anthony Igel                       ###
### Team: Category Management Transformation   ###
### Project: Developing practical Python Tools ###
### Purpose: Decision Tree                     ###
### Date: 05/24/2018                           ###
##################################################

# https://blog.socialcops.com/engineering/machine-learning-python/

######################################################################
########                     Import Modules                   ########
######################################################################
import py_effo as py_effo
### pandas
# Pandas is for structured data operations and manipulations, extensively used for data preparation,
import pandas as pd

### numpy
# NumPy stands for Numerical Python, a library contains basic linear algebra functions, Fourier Transforms and advanced random
# number capabilities
import numpy as np

### Scipy
# Scipy performs a host of scientific and statistical calculations and is built on top of Numpy. It does not replace Numpy,
# so Numpy is still imported separately above for array creation and basic array functions
# https://oneau.wordpress.com/2011/02/28/simple-statistics-with-scipy/
import scipy as sp

### sklearn
# Sklearn contains basic statistical models
from sklearn.datasets import load_iris
# As well as a module to calculate model performance statistics
from sklearn import tree, preprocessing, metrics, model_selection 
import sklearn.ensemble as ske

# Sklearn contains basic statistical models
import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# As well as a module to calculate model performance statistics

    
### Statsmodels
# Statsmodels provides statistical models, statistical tests and example data sets
import statsmodels.api as sm

### Matplotlib
# Matplotlib is a Python based plotting library with complete 2D support and limited 3D support
%matplotlib inline
import matplotlib as mlb
import matplotlib.pyplot as plt

### Seaborn
# Seaborn is a Python visualization library based on Matplotlib, providing a high-level interface for statistical graphing
# Seaborn supports numpy and pandas data structures as well as statistical routines from scipy and statsmodels
# Note: https://seaborn.pydata.org/introduction.html
import seaborn as sns

### String
# Allows for more flexible solutions for dealing with string characters
import string as st


In [None]:
######################################################################
########                     Import Data                      ########
######################################################################

### Import Titanic data set
# The source location is kept in one named constant so it is easy to find and re-point
TITANIC_CSV_PATH = "//nfs/analysis/analysis/kroger/category_management_transformation/mini_hack_days/python/titanic3.csv"
titanic_df = pd.read_csv(TITANIC_CSV_PATH)

### View top 3 records of the data frame
titanic_df.head(3)

### Variable Information
# pclass: Passenger Class (1 = first; 2 = second; 3 = third)
# survived: Survival (0 = no; 1 = yes)
# name: Name of passenger
# sex: Male or female
# age: Age of passenger
# sibsp: Number of siblings/spouses aboard
# parch: Number of parents/children aboard
# ticket: Ticket number
# fare: Passenger fare
# cabin: Cabin
# embarked: Port of embarkment (C = Cherbourg; Q = Queenstown; S = Southampton)
# boat: Lifeboat (if survived)
# body: Body number (if did not survive and body was recovered)
# home.dest: Home destination from titanic


In [None]:
######################################################################
########                 Data Exploration                     ########
######################################################################

######## Core Data Frame Info ########
### Let's determine how the data types look for this data frame
print('Data Type Information')
# DataFrame.info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output
titanic_df.info()
print()
### View descriptive statistics about data set
print('Descriptive Statistics of Titanic Data')
print(titanic_df.describe())
print()
### View if any of our data is null
print('Summary of Nulls in Titanic Data')
print(titanic_df.isnull().sum())
print()
### Dimension of df
print('Number of Records: ' + str(len(titanic_df)))
print()
### Lets view the overall chance for survival
# Scale to a percentage first and round last, so the printed value is not affected by
# floating-point noise introduced by multiplying an already rounded number
print('Average Survival Rate: ' + str(round(titanic_df['survived'].mean() * 100, 2)) + '%')

In [None]:
######## Variable Group Stats ########
### Lets view the general composition of each class
# numeric_only = True limits the average to numeric columns; text columns such as name or ticket
# cannot be averaged and make newer pandas versions raise a TypeError
class_grouping = titanic_df.groupby(['pclass']).mean(numeric_only = True)
print('Class Composition')
print(class_grouping)
print()

### Lets view the general composition of each class & sex combination
class_sex_grouping = titanic_df.groupby(['pclass','sex']).mean(numeric_only = True)
print('Class & Sex Composition')
print(class_sex_grouping)

In [None]:
######################################################################
########                  Data Preparation                    ########
######################################################################

######## Clean Feature Names ########
### Strip surrounding whitespace from every column name and convert it to lower case
### The result is re-assigned (instead of using inplace = True) so the step is an ordinary, re-runnable assignment
titanic_df = titanic_df.rename(columns = lambda x: x.strip().lower())

######## Clean Null Values ########
### Lets view the count of non-null values
print('Count of Records Pre-Clean')
print(titanic_df.count())
print()
### Most of the values in boat or cabin are missing, so we can delete these features (body is removed as well)
### errors = 'ignore' lets this cell be re-run after the columns are already gone instead of raising a KeyError
titanic_df = titanic_df.drop(columns = ['body','cabin','boat'], errors = 'ignore')
### Home destination isn't necessarily valuable, so its gaps are filled with a placeholder
### rather than letting them remove rows in the dropna step below
titanic_df["home.dest"] = titanic_df["home.dest"].fillna("NA")
### Remove every row that still has a missing value in any remaining column
titanic_df = titanic_df.dropna()
print('Count of Records Post-Clean')
print(titanic_df.count())

In [None]:
######## Format Data for ML Processing ########
### Sklearn's LabelEncoder converts our string value fields, "sex" & "embarked", into integer fields
### LabelEncoder numbers the distinct labels in sorted order, starting at 0. With the values listed in the
### variable information above, that would give:
# sex: female = 0; male = 1
# embarked: C (Cherbourg) = 0; Q (Queenstown) = 1; S (Southampton) = 2
### The features "name", "ticket" and "home.dest" are not categorical, thus difficult to use in a classification algorithm
### Therefore we will drop them
def preprocess_titanic_df(df):
    """Return a model-ready copy of df; the frame passed in is left untouched.

    The "sex" and "embarked" columns are replaced by integer codes from a
    LabelEncoder, and the "name", "ticket" and "home.dest" columns are removed.
    """
    label_encoder = preprocessing.LabelEncoder()
    encoded_df = df.copy()
    # The encoder is re-fitted for each column, so every column gets its own 0-based codes
    for column in ('sex', 'embarked'):
        encoded_df[column] = label_encoder.fit_transform(encoded_df[column])
    return encoded_df.drop(['name','ticket','home.dest'],axis=1)

processed_df = preprocess_titanic_df(titanic_df)

# DataFrame.info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output
processed_df.info()
print()
print(processed_df.head(5))

In [None]:
######################################################################
########                 Model Preparation                    ########
######################################################################

### Split the data set into dependent and independent variables
# x holds every column except the target; y holds the target column "survived"
x = processed_df.drop(['survived'], axis = 1).values
y = processed_df['survived'].values

### Split the data set into training and testing data sets; 80 - 20 split
### random_state pins the shuffle so the same rows land in the training and testing sets on every run
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size = 0.2, random_state = 0)

In [None]:
######################################################################
########                Decision Tree Modeling                ########
######################################################################
# http://scikit-learn.org/stable/modules/tree.html#tree

### Initialize Decision Tree Object, an untrained Decision Tree Classifier with maximum tree depth set to 10
### random_state makes the tree reproducible (the features are randomly permuted at each split)
clf_dt = tree.DecisionTreeClassifier(max_depth = 10, random_state = 0)

### Fit the Decision Tree with the training data sets
### This enables the Decision Tree model to "learn" how different features affect survivability
clf_dt.fit(x_train, y_train)

### Score the Decision Tree with the testing data sets
### The built-in round() is used because, depending on the scikit-learn version, score() may return a plain
### Python float, which has no .round() method; round() works for Python and NumPy floats alike
print('% of Correctly Predicted Survivals: ' + str(round(clf_dt.score(x_test, y_test) * 100, 2)) + '%')

In [None]:
### Now if you are under the assumption that resulting scores could differ depending on which rows are selected for the training
### and testing data sets, we can get around this by using a shuffle validator

### The shuffle validator will apply the same 80-20 split as before, but generates 20 unique permutations of this split
### By passing the shuffle validator as a parameter to the cross_val_score function we can score our classifer against each
### of the different splits and compute their accuracy

### The old sklearn.cross_validation module has been removed from scikit-learn; its replacement is
### sklearn.model_selection, which this notebook already imports and uses for train_test_split
### ShuffleSplit no longer takes the number of samples, and the number of re-shuffles is passed as n_splits
shuffle_validator = model_selection.ShuffleSplit(n_splits = 20, test_size = 0.2, train_size = 0.8, random_state = 0)

def test_classifier(clf):
    """Cross-validate clf on the full x / y data and print its mean score.

    clf is scored once per split produced by the module-level shuffle_validator.
    The mean of those scores and their standard deviation are printed; nothing
    is returned.
    """
    scores = model_selection.cross_val_score(clf, x, y, cv = shuffle_validator)
    print("Accuracy: %0.4f (+/- %0.2f)" % (scores.mean(), scores.std()))

### The figures printed below are the average share of correct predictions over the 20 splits and the
### standard deviation of that share between the splits
test_classifier(clf_dt)

In [None]:
### Build the vector of predicted classes, one entry per row of x
### Note: x is the full feature matrix, so the rows the tree was trained on are included
pred_dt = clf_dt.predict(X = x)
pred_dt

In [None]:
### Additionally, we can use scikit-learn to easily test other machine learning algorithms using the same syntax
### random_state = 0 (the same seed the shuffle validator uses) makes the forest and the boosted model
### reproducible, so the printed accuracies do not change from one run to the next
clf_rf = ske.RandomForestClassifier(n_estimators=50, random_state = 0)
print('Random Forest Classification')
test_classifier(clf_rf)
print()
print('Gradient Boosting Classifier')
clf_gb = ske.GradientBoostingClassifier(n_estimators=50, random_state = 0)
test_classifier(clf_gb)
print()
print('Voting Classifier')
### The voting ensemble combines the decision tree with the two models created above
eclf = ske.VotingClassifier([('dt', clf_dt), ('rf', clf_rf), ('gb', clf_gb)])
test_classifier(eclf)

######## Random Forest Classification ########
### The Random Forest algorithm will create a multitude of trees for the data set, each generally very poor on its own, using
## different random subsets of the input variables. It then returns whichever prediction was returned by the most trees to avoid "overfitting"

### Overfitting occurs when a model is so tightly fitted to arbitrary correlations in the training data that it performs
## poorly on the testing data

######## Gradient Boosting Classification ########
### The Gradient Boosting algorithm will again, generate many weak/shallow trees, and then combine (boost) them into a strong
## model
## GBM performs well on our df, but is often relatively slow and difficult to optimize since the model construction happens
## sequentially and cannot be performed in parallel

######## Voting Classification ########
### The Voting algorithm can be used to apply multiple conceptually divergent classification models to the same data set and
## will return the majority vote from all of the classifiers
## E.g. if the GBM predicts that a passenger will not survive, but the Decision Tree and Random Forest both predict that the 
## same passenger will survive, the Voting classifier will choose the latter

In [None]:
### Display level of Importance for each Feature
# Label each importance with its column name (same column order as the feature matrix x, which is
# processed_df without "survived") and list the most important features first
feature_names = processed_df.drop(['survived'], axis = 1).columns
pd.Series(clf_dt.feature_importances_, index = feature_names).sort_values(ascending = False)

In [None]:
### Show the parameter settings of the Decision Tree Model as a dictionary
### deep = True is the default; it also includes the parameters of any nested estimators
clf_dt.get_params(deep = True)

In [None]:
### Show the parameter settings of the Random Forest Model as a dictionary
### deep = True is the default; it also includes the parameters of any nested estimators
clf_rf.get_params(deep = True)

In [None]:
######################################################################
########                Logistic Modeling                     ########
######################################################################

######## Model Preparation ######## 
### An explicit copy keeps the frame used for the logistic regression independent of processed_df
logreg = processed_df.copy()

### Split the data set into dependent and independent variables
# x2 holds every column except the target; y2 holds the target column "survived"
x2 = logreg.drop(['survived'], axis = 1).values
y2 = logreg['survived'].values

### Split the data set into training and testing data sets; 80 - 20 split
### random_state pins the shuffle so the same rows land in the training and testing sets on every run
x2_train, x2_test, y2_train, y2_test = model_selection.train_test_split(x2, y2, test_size = 0.2, random_state = 0)


In [None]:
######## Modeling  With Full Data ########

### Lets run a logistic regression on the entire data set and see if it is accurate
# instantiate a logistic regression model, and fit it with x2 and y2
model = LogisticRegression()
model = model.fit(x2, y2)

# check the accuracy on the training set (the model is scored on the same rows it was fitted on)
ModelScore_1 = model.score(x2, y2)
# The built-in round() is used because, depending on the scikit-learn version, score() may return a plain
# Python float, which has no .round() method; round() works for Python and NumPy floats alike
print('Model Score: ' + str(round(ModelScore_1 * 100, 0)) + '%')
print()
# what percentage of passengers survived? (the mean of the 0/1 "survived" values)
PredMean_1 = y2.mean()
print('What % of Passengers Survived? '+ str(round(PredMean_1 * 100, 0)) + '%')

In [None]:
######## Modeling  With Train/Test Data ########

### Fit a fresh logistic regression on the training rows only
model2 = LogisticRegression()
model2.fit(x2_train, y2_train)

### Predict class labels for the test set
predicted = model2.predict(x2_test)

print('Predicted Values')
print(predicted)
print()
### Generate class probabilities
probs = model2.predict_proba(x2_test)
print('Prediction Probabilities')
print(probs)
print()
### Generate evaluation metrics
AccuracyScore_2 = metrics.accuracy_score(y2_test, predicted)
### The ROC AUC is computed from the second probability column, i.e. the probability of the class "1" (survived)
RocAucScore_2 = metrics.roc_auc_score(y2_test, probs[:, 1])
### The built-in round() is used because, depending on the scikit-learn version, the metric functions may return
### a plain Python float, which has no .round() method; round() works for Python and NumPy floats alike
print('Accuracy Score: ' + str(round(AccuracyScore_2 * 100, 0)) + '%')
print('ROC AUC Score: ' + str(round(RocAucScore_2 * 100, 0)) + '%')

In [None]:
######## Additional Reports ########
### Confusion Matrix
## A confusion matrix cross-tabulates the actual classes against the predicted classes
## and counts how many records fall into each combination
## scikit-learn puts the actual class on the rows and the predicted class on the columns, so for the 0/1 labels:
## actual 0, predicted 0 (correct) | actual 0, predicted 1 (wrong)
## actual 1, predicted 0 (wrong)   | actual 1, predicted 1 (correct)
print('Confusion Matrix')
confusion = metrics.confusion_matrix(y2_test, predicted)
print(confusion)
print()
### Classification Report
## Text summary of the per-class metrics for the same predictions
print('Classification Report')
report = metrics.classification_report(y2_test, predicted)
print(report)

In [None]:
######################################################################
########                  Model Validation                    ########
######################################################################

### Evaluate the model using 10-fold cross-validation
### A fresh, unfitted logistic regression is scored on accuracy once per fold over the full x2 / y2 data
scores = cross_val_score(LogisticRegression(), x2, y2, cv = 10, scoring = 'accuracy')
fold_percentages = (scores * 100).round(0)
average_percentage = (scores.mean() * 100).round(0)
print('Cross Validation Scores')
print(fold_percentages)
print()
print('Average Scores')
print('{}%'.format(average_percentage))

### When this notebook was written the model performed with 77% accuracy; compare with the average printed above