In [None]:

#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## ADVANCED CLASSIFICATION ENSEMBLE METHODS ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs



In [None]:
#=================================================-
#### Slide 20: Directory settings  ####

from pathlib import Path

# `home_dir` is the current user's home directory, as reported by `Path.home()`.
home_dir = Path.home()

# `main_dir` points at the `advanced-classification` folder; this assumes the
# folder sits on the Desktop -- adjust the path components if yours does not.
main_dir = home_dir.joinpath("Desktop", "advanced-classification")

# `data_dir` is the `data` subfolder inside `main_dir`.
data_dir = main_dir.joinpath("data")



In [None]:
#=================================================-
#### Slide 21: Working directory  ####

# `os` is imported here because the package-loading cell comes after this one;
# without this import, running the notebook top to bottom on a fresh kernel
# stops at the next statement with a NameError.
import os

# Set working directory to `data_dir` (defined in the directory settings cell).
os.chdir(data_dir)
# Check working directory.
print(os.getcwd())



In [None]:
#=================================================-
#### Slide 22: Loading packages  ####

# Helper packages.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from textwrap import wrap
import pickle
# Model set up and tuning packages from scikit-learn.
from sklearn.model_selection import train_test_split

# Scikit-learn package for data preprocessing.
from sklearn import preprocessing

# Scikit-learn packages for evaluating model performance.
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Random forests and boosting packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier



In [None]:
#=================================================-
#### Slide 23: Loading data into Python  ####

# Read the dataset; the relative file name is resolved against the current
# working directory.
credit_card = pd.read_csv(filepath_or_buffer = "credit_card_data.csv")

# Preview the first five rows.
print(credit_card.head(n = 5))



In [None]:
#=================================================-
#### Slide 24: Renaming target variable  ####

# Give the target column a shorter name; no other column is renamed.
credit_card = credit_card.rename({'default_payment_next_month' : 'default_payment'},
                                 axis = 'columns')

# Preview the first five rows with the new column name.
print(credit_card.head(n = 5))



In [None]:
#=================================================-
#### Slide 25: The data at first glance  ####

# Print the first 5 rows, followed by the data type of every column.
for overview in (credit_card.head(), credit_card.dtypes):
    print(overview)



In [None]:
#=================================================-
#### Slide 26: Frequency table of the target variable  ####

# Count how many rows fall into each value of the target variable.
target_counts = credit_card['default_payment'].value_counts()
print(target_counts)



In [None]:
#=================================================-
#### Slide 27: Data prep: check for NAs  ####

# Count the missing values in each column.
na_counts = credit_card.isna().sum()
print(na_counts)



In [None]:
#=================================================-
#### Slide 28: Filling missing values  ####

# Fill the missing values of `BILL_AMT1` with that column's own mean.
# The {column: value} mapping restricts the fill to `BILL_AMT1`; passing the
# mean as a bare scalar would also write it into the gaps of every other column.
bill_amt1_mean = credit_card['BILL_AMT1'].mean()
credit_card = credit_card.fillna({'BILL_AMT1': bill_amt1_mean})

# Check for NAs in 'BILL_AMT1'.
print(credit_card['BILL_AMT1'].isnull().sum())



In [None]:
#=================================================-
#### Slide 29: Transform categorical variables into dummies  ####

# The three categorical columns that get replaced by dummy variables.
categorical_cols = ['SEX', 'EDUCATION', 'MARRIAGE']

# Build one dummy frame per categorical column. Each frame is prefixed with the
# lower-cased column name, and the first level is dropped (`drop_first = True`).
sex_dummy, education_dummy, marriage_dummy = (
    pd.get_dummies(credit_card[col], prefix = col.lower(), drop_first = True)
    for col in categorical_cols
)

# Remove the original categorical columns from the data.
credit_card.drop(columns = categorical_cols, inplace = True)

# Append the dummy columns to the right of the remaining columns.
dummy_frames = [sex_dummy, education_dummy, marriage_dummy]
credit_card = pd.concat([credit_card] + dummy_frames, axis = 1)
print(credit_card.head())



In [None]:
#=================================================-
#### Slide 30: Data prep: ready for random forests  ####

# Data type of the target before the conversion.
print(credit_card["default_payment"].dtype)

# Recode the target as booleans: True where the value equals 1, False otherwise.
is_default = credit_card["default_payment"] == 1
credit_card["default_payment"] = np.where(is_default, True, False)

# Data type of the target after the conversion.
print(credit_card["default_payment"].dtype)

# Drop the identifier column 'ID'.
credit_card = credit_card.drop(columns = 'ID')



In [None]:
#=================================================-
#### Slide 32: Exercise 1  ####





In [None]:
#=================================================-
#### Slide 35: Split into training and test sets  ####

# Predictors: every column except the target.
X = credit_card.drop(columns = 'default_payment')

# Target, as a NumPy array.
y = np.array(credit_card.default_payment)

# Seed NumPy's global random number generator immediately before splitting.
np.random.seed(1)

# Hold out 30% of the rows for testing; the remaining 70% are for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)




In [None]:
#=================================================-
#### Slide 37: Building our model  ####

# Random forest settings: 100 trees, Gini impurity as the split criterion,
# and a fixed `random_state`.
forest_params = {'criterion': 'gini',
                 'n_estimators': 100,
                 'random_state': 1}

forest = RandomForestClassifier(**forest_params)



In [None]:
#=================================================-
#### Slide 38: Fitting our model  ####

# Train the random forest on the training predictors and training target.
forest.fit(X = X_train, y = y_train)



In [None]:
#=================================================-
#### Slide 39: Predicting with our data  ####

# Predict the target for every row of the test set.
y_predict_forest = forest.predict(X = X_test)

# Show the first five predictions.
print(y_predict_forest[:5])



In [None]:
#=================================================-
#### Slide 41: Confusion matrix and accuracy  ####

# Confusion matrix of actual vs. predicted labels on the test data.
conf_matrix_forest = confusion_matrix(y_test, y_predict_forest)
print(conf_matrix_forest)

# Accuracy score of the predictions on the test data.
accuracy_forest = accuracy_score(y_test, y_predict_forest)
print("Accuracy for random forests on test data: ", accuracy_forest)



In [None]:
#=================================================-
#### Slide 42: Accuracy of the training dataset  ####

# Score the fitted forest on the data it was trained on, for comparison with
# the test accuracy.
acc_train_forest = forest.score(X = X_train, y = y_train)

print("Train Accuracy:", acc_train_forest)



In [None]:
#=================================================-
#### Slide 43: Evaluation of random forests  ####

# Load the pickled model champion dataframe. The `with` block closes the file
# handle as soon as loading is done.
# NOTE(review): unpickling can execute arbitrary code -- only load
# `model_final.sav` from a source you trust.
with open("model_final.sav", "rb") as model_file:
    model_final = pickle.load(model_file)

# Add this model to our model champion dataframe.
# `DataFrame.append` was removed in pandas 2.0, so the new row is built as a
# one-row dataframe and concatenated instead.
# Presumably `model_final` has the columns 'metrics', 'values' and 'model' --
# verify against the pickled file.
forest_row = pd.DataFrame([{'metrics': "accuracy",
                            'values': round(accuracy_forest, 4),
                            'model': 'random_forest'}])
model_final = pd.concat([model_final, forest_row], ignore_index = True)

print(model_final)



In [None]:
#=================================================-
#### Slide 46: Subsetting our features  ####

# Feature names are the predictor columns, i.e. everything except the target.
credit_card_features = credit_card.drop(columns = 'default_payment')
features = credit_card_features.columns
importances = forest.feature_importances_

# Order the features from most to least important, keep the first 10, then
# reverse them so the bars are drawn in ascending order of importance.
indices = np.argsort(importances)[::-1]
top_indices = indices[:10][::-1]
bar_positions = range(len(top_indices))

# Wrap long feature names at 13 characters so the tick labels stay narrow.
labels = ['\n'.join(wrap(name, 13)) for name in features[top_indices]]

plt.figure(1)
plt.title('Feature Importance')
plt.barh(bar_positions, importances[top_indices], color = 'b', align = 'center')
plt.yticks(bar_positions, labels)
plt.xlabel('Relative Importance')



In [None]:
#=================================================-
#### Slide 49: Exercise 2  ####





In [None]:
#=================================================-
#### Slide 61: Boosting: build model  ####

# Gradient boosting settings: 200 estimators of depth 2, a learning rate of 1,
# and a fixed `random_state`.
gbm_params = {'n_estimators': 200,
              'learning_rate': 1,
              'max_depth': 2,
              'random_state': 1}

gbm = GradientBoostingClassifier(**gbm_params)



In [None]:
#=================================================-
#### Slide 62: Boosting: fit model  ####

# Train the gradient boosting classifier on the training predictors and target.
gbm.fit(X = X_train, y = y_train)



In [None]:
#=================================================-
#### Slide 63: Boosting: predict  ####

# Predict the target for every row of the test set and print the predictions.
predicted_values_gbm = gbm.predict(X = X_test)
print(predicted_values_gbm)



In [None]:
#=================================================-
#### Slide 64: Confusion matrix and accuracy  ####

# Confusion matrix of actual vs. predicted labels on the test data.
conf_matrix_boosting = confusion_matrix(y_test, predicted_values_gbm)
print(conf_matrix_boosting)

# Accuracy score of the boosting predictions on the test data.
accuracy_gbm = accuracy_score(y_test, predicted_values_gbm)
print('Accuracy of gbm on test data: ', accuracy_gbm)



In [None]:
#=================================================-
#### Slide 65: Accuracy of training model  ####

# Score the fitted boosting model on the data it was trained on, for
# comparison with the test accuracy.
train_accuracy_gbm = gbm.score(X = X_train, y = y_train)

print("Train Accuracy:", train_accuracy_gbm)



In [None]:
#=================================================-
#### Slide 66: Add final accuracy to the pickled dataframe  ####

# Add the boosting model to our dataframe.
# `DataFrame.append` was removed in pandas 2.0, so the new row is built as a
# one-row dataframe and concatenated onto `model_final` instead.
boosting_row = pd.DataFrame([{'metrics': "accuracy",
                              'values': round(accuracy_gbm, 4),
                              'model': 'boosting'}])
model_final = pd.concat([model_final, boosting_row], ignore_index = True)

print(model_final)



In [None]:
#=================================================-
#### Slide 67: Our top 10 features  ####

# Feature names come from the predictor columns saved when plotting the
# random forest importances.
features = credit_card_features.columns
importances = gbm.feature_importances_

# Order the features from most to least important, keep the first 10, then
# reverse them so the bars are drawn in ascending order of importance.
indices = np.argsort(importances)[::-1]
top_indices = indices[0:10][::-1]

plt.figure(1)
plt.title('Feature Importance')
plt.barh(range(len(top_indices)), importances[top_indices], color = 'b', align = 'center')

# Wrap long feature names at 13 characters and use the wrapped text as the tick
# labels, matching the random forest importance plot.
labels = ['\n'.join(wrap(name, 13)) for name in features[top_indices]]
plt.yticks(range(len(top_indices)), labels)
plt.xlabel('Relative Importance')

