In [None]:

#######################################################
#######################################################
############    COPYRIGHT - DATA SOCIETY   ############
#######################################################
#######################################################

## ADVANCED CLASSIFICATION LOGISTIC REGRESSION ##

## NOTE: To run individual pieces of code, select the line of code and
##       press ctrl + enter for PCs or command + enter for Macs



In [None]:
#=================================================-
#### Slide 10: Getting started: directory settings  ####

from pathlib import Path

# `home_dir` is the current user's home directory, as reported by `Path.home()`.
home_dir = Path.home()

# `main_dir` is the `advanced-classification` folder, expected on the Desktop.
main_dir = home_dir.joinpath("Desktop", "advanced-classification")

# `data_dir` is the `data` subfolder of `main_dir`.
data_dir = main_dir.joinpath("data")



In [None]:
#=================================================-
#### Slide 11: Getting started: working directory  ####

# `os` is imported here because this cell runs before the package-loading
# cell further down; without it a fresh kernel raises a NameError.
import os

# Set working directory to the data folder, so relative file names used later
# resolve against it.
os.chdir(data_dir)
# Check working directory.
print(os.getcwd())



In [None]:
#=================================================-
#### Slide 12: Getting started: loading packages  ####

# Helper packages.
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

# Scikit-learn package for logistic regression.
from sklearn import linear_model

# Model set up and tuning packages from scikit-learn.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Scikit-learn packages for evaluating model performance.
from sklearn import metrics

# Scikit-learn package for data preprocessing.
from sklearn import preprocessing



In [None]:
#=================================================-
#### Slide 26: Loading data into Python  ####

# Read the credit card CSV; the relative file name is resolved against the
# current working directory.
credit_card = pd.read_csv(filepath_or_buffer = "credit_card_data.csv")
# Preview the first five rows.
print(credit_card.head(n = 5))



In [None]:
#=================================================-
#### Slide 27: Renaming target variable  ####

# Give the target column a shorter name; all other columns keep their names.
new_column_names = {'default_payment_next_month' : 'default_payment'}
credit_card = credit_card.rename(new_column_names, axis = "columns")
print(credit_card.head(n = 5))



In [None]:
#=================================================-
#### Slide 28: The data at first glance  ####

# List the data type of every column.
column_types = credit_card.dtypes
print(column_types)



In [None]:
#=================================================-
#### Slide 29: Frequency table of the target variable  ####

# Count how many rows fall into each value of the target.
print(credit_card.default_payment.value_counts())



In [None]:
#=================================================-
#### Slide 30: Data prep: check for NAs  ####

# Count the missing values in each column.
print(credit_card.isna().sum())



In [None]:
#=================================================-
#### Slide 31: Filling missing values  ####

# Fill missing values in 'BILL_AMT1' with that column's own mean.
# The dict form restricts the fill to 'BILL_AMT1'; passing the scalar mean
# directly to `fillna` would write the 'BILL_AMT1' mean into missing cells of
# every other column as well. Computing the mean on the single column also
# avoids averaging the whole dataframe.
# NOTE(review): missing values in any other column are now left untouched -
# confirm against the NA counts printed on the previous slide.
bill_amt1_mean = credit_card['BILL_AMT1'].mean()
credit_card = credit_card.fillna({'BILL_AMT1': bill_amt1_mean})

# Check for NAs in 'BILL_AMT1'.
print(credit_card['BILL_AMT1'].isnull().sum())



In [None]:
#=================================================-
#### Slide 32: Data prep: numeric variables  ####

# Show the data types of the first five columns.
print(credit_card.dtypes.head(n = 5))



In [None]:
#=================================================-
#### Slide 33: Data prep: target  ####

# Type of the target before conversion.
print(credit_card["default_payment"].dtype)

# Convert the target to booleans: True where the value equals 1, False otherwise.
is_default = (credit_card["default_payment"] == 1).to_numpy()
credit_card["default_payment"] = is_default

# Check class again.
print(credit_card["default_payment"].dtype)



In [None]:
#=================================================-
#### Slide 34: Subsetting data  ####

# Keep five predictor columns plus the target for the first model.
glm_columns = ["LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE", "default_payment"]
credit_card_glm = credit_card.loc[:, glm_columns]
print(credit_card_glm.head(n = 5))



In [None]:
#=================================================-
#### Slide 35: Split into train and test set  ####

# Predictors: the five non-target columns of the subset.
predictor_columns = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE']
X = credit_card_glm.loc[:, predictor_columns]
# Target: the default flag as a NumPy array.
y = np.array(credit_card_glm['default_payment'])

# Seed NumPy's global random generator; no `random_state` is passed below,
# so the split takes its randomness from this global state.
np.random.seed(1)

# 70% of the rows go to training, 30% to testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)




In [None]:
#=================================================-
#### Slide 38: Logistic regression: build  ####

# Create an unfitted logistic regression estimator with default settings
# and print it.
logistic_regression_model = linear_model.LogisticRegression()
print(logistic_regression_model)



In [None]:
#=================================================-
#### Slide 40: Logistic regression: fit  ####

# Train the model on the training predictors and labels.
# The call is the cell's last expression, so its return value is displayed.
logistic_regression_model.fit(X = X_train, y = y_train)



In [None]:
#=================================================-
#### Slide 42: Logistic regression: predict  ####

# Predict class labels for the held-out test predictors.
predicted_values = logistic_regression_model.predict(X = X_test)
print(predicted_values)



In [None]:
#=================================================-
#### Slide 44: Exercise 1  ####





In [None]:
#=================================================-
#### Slide 51: Confusion matrix and accuracy  ####

# Confusion matrix of actual vs. predicted labels on the test data.
conf_matrix_test = metrics.confusion_matrix(y_true = y_test,
                                            y_pred = predicted_values)
print(conf_matrix_test)

# Accuracy of the predictions on the test data.
test_accuracy_score = metrics.accuracy_score(y_true = y_test,
                                             y_pred = predicted_values)
print("Accuracy on test data: ", test_accuracy_score)



In [None]:
#=================================================-
#### Slide 52: Classification report  ####

# Create a list of target names to interpret class assignments.
target_names = ['default_payment_0', 'default_payment_1']

# Compute and print the classification report for the test predictions.
# The original cell only defined `target_names` and never produced the report
# its title announces.
class_report = metrics.classification_report(y_test, predicted_values,
                                             target_names = target_names)
print(class_report)



In [None]:
#=================================================-
#### Slide 57: Model champion dataframe  ####

# One-row record of the logistic regression model's test accuracy,
# rounded to four decimals.
model_final_dict = dict(metrics = ["accuracy"],
                        values = [round(test_accuracy_score, 4)],
                        model = ['logistic'])

# Turn the record into a dataframe; later slides add more rows to it.
model_final = pd.DataFrame(model_final_dict)
print(model_final)



In [None]:
#=================================================-
#### Slide 58: Getting probabilities instead of class labels  ####

# Get predicted probabilities (one column per class) instead of class labels.
test_probabilities = logistic_regression_model.predict_proba(X_test)
print(test_probabilities[:5])

# Keep only the second column (index 1) of the probabilities.
test_predictions = test_probabilities[:, 1]
print(test_predictions[:5])



In [None]:
#=================================================-
#### Slide 59: Computing FPR, TPR, and threshold  ####

# False positive rates, true positive rates and the thresholds they belong to,
# computed from the test labels and the predicted probabilities.
fpr, tpr, threshold = metrics.roc_curve(y_true = y_test,
                                        y_score = test_predictions)

# Show the first five entries of each array.
print("False positive: ", fpr[:5])
print("True positive: ", tpr[:5])
print("Threshold: ", threshold[:5])



In [None]:
#=================================================-
#### Slide 60: Computing AUC  ####

# Area under the ROC curve, with FPR as the x values and TPR as the y values.
auc = metrics.auc(x = fpr, y = tpr)
print("Area under the ROC curve: ", auc)



In [None]:
#=================================================-
#### Slide 61: Putting it all together: ROC plot  ####

# ROC curve of the model in blue, labelled with its AUC.
plt.plot(fpr, tpr, color = 'b', label = 'AUC = {:.2f}'.format(auc))
plt.legend(loc = 'lower right')
# Dashed red diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], color = 'r', linestyle = '--')

# Title and axis labels.
plt.title('Receiver Operator Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()



In [None]:
#=================================================-
#### Slide 63: Exercise 2  ####





In [None]:
#=================================================-
#### Slide 65: Working with categorical variables  ####

print(credit_card['AGE'].head())

# Replace the numeric age with one of three labels:
#   age <= 30       -> "30 or Below"
#   30 < age < 60   -> 'Between 30 and 60'
#   everything else -> '60 and above'
age = credit_card['AGE']
is_30_or_below = age <= 30
is_below_60 = age < 60
over_30_labels = np.where(is_below_60, 'Between 30 and 60', '60 and above')
credit_card['AGE'] = np.where(is_30_or_below, "30 or Below", over_30_labels)



In [None]:
#=================================================-
#### Slide 66: Working with categorical variables  ####

# Number of rows in each age group (shown as the cell's output).
credit_card['AGE'].value_counts()



In [None]:
#=================================================-
#### Slide 70: Transform age variable into a dummy variable  ####

# One indicator column per age group; `drop_first` leaves out the first group.
age_dummy = pd.get_dummies(credit_card.AGE, drop_first = True)
print(age_dummy.head(n = 5))



In [None]:
#=================================================-
#### Slide 71: Drop age and replace with the dummy variable  ####

# Remove the `AGE` column and attach the `age_dummy` columns in its place,
# at the right-hand side of the dataset.
credit_card_without_age = credit_card.drop(columns = 'AGE')
credit_card = pd.concat([credit_card_without_age, age_dummy], axis = 1)
print(credit_card.head(n = 5))



In [None]:
#=================================================-
#### Slide 72: Transform and replace other categorical variables  ####

# The remaining categorical columns to encode.
categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE']

# Build dummy variables for each column, prefixed with the lower-case column
# name ('sex', 'education', 'marriage') and leaving out the first level.
sex_dummy, education_dummy, marriage_dummy = [
    pd.get_dummies(credit_card[column], prefix = column.lower(), drop_first = True)
    for column in categorical_columns
]

# Drop the original columns and attach the dummy columns to the dataset.
credit_card = pd.concat([credit_card.drop(columns = categorical_columns),
                         sex_dummy,
                         education_dummy,
                         marriage_dummy],
                        axis = 1)
print(credit_card.head(n = 5))



In [None]:
#=================================================-
#### Slide 74: Split into train and test set  ####

# Predictors: every column except the target.
# NOTE(review): this keeps whatever other columns the file contains,
# including any identifier column - confirm that is intended.
X = credit_card.drop(columns = 'default_payment')

# Target: the default flag as a NumPy array.
y = np.array(credit_card['default_payment'])

# Seed NumPy's global random generator; no `random_state` is passed below,
# so the split takes its randomness from this global state.
np.random.seed(1)

# 70% of the rows go to training, 30% to testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)



In [None]:
#=================================================-
#### Slide 75: Logistic regression: build  ####

# Create an unfitted logistic regression estimator that uses the
# 'liblinear' solver, and print it.
logistic_regression_model = linear_model.LogisticRegression(solver = 'liblinear')
print(logistic_regression_model)



In [None]:
#=================================================-
#### Slide 76: Logistic regression: fit  ####

# Train the model on the training predictors and labels.
# The call is the cell's last expression, so its return value is displayed.
logistic_regression_model.fit(X = X_train, y = y_train)



In [None]:
#=================================================-
#### Slide 77: Logistic regression: predict  ####

# Predict class labels for the held-out test predictors.
predicted_values = logistic_regression_model.predict(X = X_test)
print(predicted_values)



In [None]:
#=================================================-
#### Slide 78: Confusion matrix and accuracy  ####

# Confusion matrix of actual vs. predicted labels on the test data.
conf_matrix_test = metrics.confusion_matrix(y_true = y_test,
                                            y_pred = predicted_values)
print(conf_matrix_test)

# Accuracy of the predictions on the test data.
test_accuracy_score = metrics.accuracy_score(y_true = y_test,
                                             y_pred = predicted_values)
print("Accuracy on test data: ", test_accuracy_score)



In [None]:
#=================================================-
#### Slide 79: Add accuracy score to the final scores  ####

# Add the accuracy of the whole-dataset model as a new row of `model_final`.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0,
# so the row is built as a one-row dataframe and attached with `pd.concat`.
whole_dataset_scores = pd.DataFrame([{'metrics' : "accuracy",
                                      'values' : round(test_accuracy_score, 4),
                                      'model' : 'logistic_whole_dataset'}])
model_final = pd.concat([model_final, whole_dataset_scores],
                        ignore_index = True)
print(model_final)



In [None]:
#=================================================-
#### Slide 80: Accuracy on train vs accuracy on test  ####

# Score the fitted model on the data it was trained on, for comparison
# with the test accuracy.
trained_accuracy_score = logistic_regression_model.score(X = X_train, y = y_train)
print("Accuracy on train data: " , trained_accuracy_score)



In [None]:
#=================================================-
#### Slide 82: Exercise 3  ####





In [None]:
#=================================================-
#### Slide 94: Prepare parameters for optimization  ####

# Candidate regularization penalties.
penalty = ['l1', 'l2']
# Candidate regularization constants: 10 values spaced evenly on a log scale
# from 10**0 to 10**10.
C = np.logspace(start = 0, stop = 10, num = 10)
print("Regularization constant: ", C)

# Collect both candidate lists in one dictionary of hyperparameter options.
hyperparameters = {'C': C, 'penalty': penalty}
print(hyperparameters)



In [None]:
#=================================================-
#### Slide 95: Set up cross-validation logistic function  ####

# Grid search over `hyperparameters` with 10-fold cross-validation,
# using a 'liblinear' logistic regression as the estimator to optimize.
base_estimator = linear_model.LogisticRegression(solver = 'liblinear')
clf = GridSearchCV(estimator = base_estimator,   #<- function to optimize
                   param_grid = hyperparameters, #<- grid search parameters
                   cv = 10,                      #<- 10-fold cv
                   verbose = 0)                  #<- no messages to show

# Fit the grid search on the training data and display the result.
best_model = clf.fit(X_train, y_train)
best_model



In [None]:
#=================================================-
#### Slide 96: Check best parameters found by CV  ####

# Read the penalty and constant from the best estimator's parameters.
# Note: `penalty` previously held the list of candidate penalties and is
# replaced here by the single chosen value.
best_estimator_params = best_model.best_estimator_.get_params()
penalty = best_estimator_params['penalty']
constant = best_estimator_params['C']
print('Best penalty: ', penalty)

print('Best C: ', constant)

In [None]:
#=================================================-
#### Slide 97: Predict using the best model parameters  ####

# Predict class labels for the test data with the fitted grid search.
best_predicted_values = best_model.predict(X = X_test)
print(best_predicted_values)

# Accuracy of those predictions on the test data.
best_accuracy_score = metrics.accuracy_score(y_true = y_test,
                                             y_pred = best_predicted_values)
print("Accuracy on test data (best model): ", best_accuracy_score)

In [None]:
#=================================================-
#### Slide 97: Predict using the best model parameters (cont'd)  ####

# Confusion matrix of actual vs. predicted labels for the best model.
best_confusion_matrix = metrics.confusion_matrix(y_true = y_test,
                                                 y_pred = best_predicted_values)
print(best_confusion_matrix)

# Display names for the two target classes in the report.
target_names = ['default_payment_no', 'default_payment_yes']

# Classification report for the best model.
best_class_report = metrics.classification_report(y_true = y_test,
                                                  y_pred = best_predicted_values,
                                                  target_names = target_names)
print(best_class_report)



In [None]:
#=================================================-
#### Slide 99: Add accuracy score to the final scores  ####

# Add the accuracy of the tuned model as a new row of `model_final`.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0,
# so the row is built as a one-row dataframe and attached with `pd.concat`.
tuned_scores = pd.DataFrame([{'metrics' : "accuracy",
                              'values' : round(best_accuracy_score, 4),
                              'model' : 'logistic_tuned'}])
model_final = pd.concat([model_final, tuned_scores],
                        ignore_index = True)

print(model_final)

# Save the scores table to "model_final.sav" in the current working directory.
# The `with` block closes the file once the dump finishes; the original passed
# a bare `open(...)` to `pickle.dump` and never closed the handle.
with open("model_final.sav", "wb") as model_final_file:
    pickle.dump(model_final, model_final_file)



In [None]:
#=================================================-
#### Slide 100: Get metrics for ROC curve  ####

# Get predicted probabilities (one column per class) from the best model.
best_test_probabilities = best_model.predict_proba(X_test)
print(best_test_probabilities[:5])

# Keep only the second column (index 1) of the probabilities.
best_test_predictions = best_test_probabilities[:, 1]
print(best_test_predictions[:5])

In [None]:
#=================================================-
#### Slide 100: Get metrics for ROC curve (cont'd)  ####

# FPR, TPR and thresholds for the best model, then the area under its ROC curve.
best_fpr, best_tpr, best_threshold = metrics.roc_curve(y_true = y_test,
                                                       y_score = best_test_predictions)
best_auc = metrics.auc(x = best_fpr, y = best_tpr)
print("Area under the ROC curve: ", best_auc)

In [None]:
#=================================================-
#### Slide 102: Plot ROC curve for both models  ####

# Blue curve: `fpr`, `tpr` and `auc` computed on the earlier ROC slides.
plt.plot(fpr, tpr, color = 'blue',
         label = 'AUC = {:.2f}'.format(auc))
# Black curve: the grid-search model's ROC metrics.
plt.plot(best_fpr, best_tpr, color = 'black',
         label = 'AUC (best) = {:.2f}'.format(best_auc))
plt.legend(loc = 'lower right')
# Dashed red diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], color = 'r', linestyle = '--')

# Title and axis labels.
plt.title('Receiver Operator Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

