<a href="https://colab.research.google.com/github/cbsobral/ml-fies/blob/main/Module03_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Module 03

In this module, we perform the following steps:

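1. Load the test set created in Module00_Data.
2. Build the preprocessing pipeline.
3. Load the best models.
4. Evaluate performance (AUC and Brier scores) on the test and training sets.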

### 1 - Load Data

Here, we import the testing set created in Module00_Data.


In [None]:
import pandas as pd

# Google Drive share link for the test set; the file id is the
# second-to-last path segment of the link.
url_test = "https://drive.google.com/file/d/1v4FqKwt7NzG5RM6d9f1y7CLIdKq69jSS/view?usp=sharing"
file_id = url_test.split("/")[-2]

# Rewrite the share link into a direct-download URL that pandas can read.
path_test = f"https://drive.google.com/uc?export=download&id={file_id}"

test = pd.read_csv(path_test)
test.shape

(87751, 31)

In [None]:
# Separate the "default" label column from the feature columns.
test_target = test["default"].copy()        # labels only
test_set = test.drop(columns=["default"])   # features only

### 2 - Pipeline

The pipeline contains functions that will be used to transform the dataset. For the numeric attributes, the standardization is performed by the StandardScaler. For ordinal attributes, variables are encoded by the OrdinalEncoder, and for categorical attributes, by the OneHotEncoder.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


# Numeric columns: impute with the "median" strategy, then standardize.
num_steps = [
    ("num_imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Ordinal columns: impute with the "most_frequent" strategy, then encode
# with OrdinalEncoder.
ord_steps = [
    ("ord_imputer", SimpleImputer(strategy="most_frequent")),
    ("ord_encoder", OrdinalEncoder()),
]
ord_pipeline = Pipeline(steps=ord_steps)

In [None]:
# Column groups consumed by the preprocessing pipeline.
# The order inside each list is kept as-is because it determines the column
# order handed to each transformer.

# Ordinal attributes (2)
ord_attribs = [
    "igc",
    "date_contract",
]

# Numeric attributes (17)
num_attribs = [
    "family_income", "personal_income", "high_school_endyear",
    "n_sem_course", "n_completed_sem", "sem_funded",
    "fam_size", "income_pc", "tuition_current",
    "inc_prop", "perc_requested", "loan_value_sem",
    "student_resource", "loan_value", "loan_limit",
    "total_debt", "age",
]

# Categorical attributes (9)
cat_attribs = [
    "semester_enroll", "gender", "occupation",
    "marital_status", "ethnicity", "public_hs",
    "state_course", "degree", "contract_phase",
]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Route each group of columns to its own transformer:
#   "num" -> num_pipeline on the numeric attributes
#   "cat" -> OneHotEncoder on the categorical attributes
#   "ord" -> ord_pipeline on the ordinal attributes
column_transformers = [
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
    ("ord", ord_pipeline, ord_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_transformers)

In [None]:
# Fit the preprocessing pipeline on the test features and transform them in
# the same step.
# NOTE(review): this learns the imputation values, scaling statistics and
# encoder categories from the test set itself. If the models loaded later in
# this notebook were trained on features produced by a pipeline fitted on the
# training set, the columns and scaling here may not match — confirm, and if
# so reuse the fitted training pipeline and call only `transform` here.
test_prepared = full_pipeline.fit_transform(test_set)
test_prepared[:1]  # show the first transformed row

<1x94 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

### 3 - Load Best Models

In [None]:
import joblib

# NOTE(review): every path below is the placeholder string 'url'; replace each
# with the real location of the saved model before running this cell.
# joblib.load unpickles the file, which can execute arbitrary code — only load
# model files that come from a trusted source.

# ANN
mod_1 = joblib.load('url')

# Random Forest
mod_2 = joblib.load('url')

# Logistic Regression
mod_3 = joblib.load('url')

### 4 - Performance Evaluation

Here, we calculate AUC and Brier scores for the different samples.

#### Test Set

In [None]:

from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss


# NOTE(review): `mod_avgw` is not defined in any cell visible in this notebook;
# it has to be created or loaded before this cell can run — confirm its source.

# predict probabilities for test set, one array per model
pred_avgw, pred_1, pred_2, pred_3 = [
    model.predict_proba(test_prepared)
    for model in (mod_avgw, mod_1, mod_2, mod_3)
]

# column 1 of each probability array is the score handed to both metrics
_pos_scores = [pred[:, 1] for pred in (pred_avgw, pred_1, pred_2, pred_3)]

# AUC score
auc_avgw, auc_1, auc_2, auc_3 = [
    roc_auc_score(test_target, score) for score in _pos_scores
]

# Brier score
bs_avgw, bs_1, bs_2, bs_3 = [
    brier_score_loss(test_target, score) for score in _pos_scores
]

In [None]:
# Display names, in the same order as the score lists below
names_list = ['AvgW', 'ANN', 'Random Forest', 'Logistic Regression']

# Test-set AUC and Brier score for each model
auc_list = [auc_avgw, auc_1, auc_2, auc_3]
bs_list = [bs_avgw, bs_1, bs_2, bs_3]

# One row per model, displayed from highest to lowest AUC
score_columns = {"Model": names_list, "AUC": auc_list, "BS": bs_list}
auc_df = pd.DataFrame(score_columns)
auc_df.sort_values("AUC", ascending=False)

The AUC results can be plotted, as shown below.

In [None]:
from sklearn.metrics import roc_curve

# ROC curve (false positive rate, true positive rate, thresholds) per model,
# computed from column 1 of each probability array
fpr_avgw, tpr_avgw, thresh_avgw = roc_curve(
    test_target, pred_avgw[:, 1], pos_label=1
)
fpr_1, tpr_1, thresh_1 = roc_curve(
    test_target, pred_1[:, 1], pos_label=1
)
fpr_2, tpr_2, thresh_2 = roc_curve(
    test_target, pred_2[:, 1], pos_label=1
)
fpr_3, tpr_3, thresh_3 = roc_curve(
    test_target, pred_3[:, 1], pos_label=1
)

# reference curve for tpr = fpr: every observation gets the same score of 0
random_probs = [0] * len(test_target)
p_fpr, p_tpr, _ = roc_curve(test_target, random_probs, pos_label=1)

In [None]:
import matplotlib.pyplot as plt
# The "seaborn" style sheet was renamed to "seaborn-v0_8" in newer Matplotlib
# releases; use the new name when it is available and fall back to the old
# one on older installations.
roc_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(roc_style)

# plot roc curves, one dashed line per model
plt.plot(fpr_avgw, tpr_avgw, linestyle="--", color="purple", label="AvgW")
plt.plot(fpr_1, tpr_1, linestyle="--", color="pink", label="ANN")
plt.plot(fpr_2, tpr_2, linestyle="--", color="orange", label="RF")
# "gree" is not a valid Matplotlib color name and raises ValueError; use "green"
plt.plot(fpr_3, tpr_3, linestyle="--", color="green", label="LR")
# solid reference line built from the constant-score ROC curve
plt.plot(p_fpr, p_tpr, linestyle="-", color="black")

# x label
plt.xlabel("False Positive Rate")
# y label
plt.ylabel("True Positive Rate")

plt.legend(loc="best")
# write the figure to disk before show(), which may clear the current figure
plt.savefig("ROC", dpi=300)
plt.show()

#### Training Set

In [None]:
from sklearn.model_selection import cross_val_score

# NOTE(review): `train_prepared`, `train_target` and `mod_avgw` are not defined
# in any cell visible in this notebook — confirm where they come from before
# running this cell.

# Mean ROC AUC over a 2-fold cross-validation on the training data, per model
auc_train_avgw, auc_train_1, auc_train_2, auc_train_3 = [
    cross_val_score(
        model, train_prepared, train_target, cv=2, scoring="roc_auc"
    ).mean()
    for model in (mod_avgw, mod_1, mod_2, mod_3)
]

In [None]:
# Model names, in the same order as the cross-validated AUC scores below
auc_names_train = ['AvgW', 'ANN', 'Random Forest', 'Logistic Regression']

# List with AUC scores for training set
auc_models_train = [auc_train_avgw, auc_train_1, auc_train_2, auc_train_3]

# Dataframe: the first column holds model names, so it is labelled "Model"
# (it was "Sample"), matching the column name used in the test-set table.
auc_train_df = pd.DataFrame({"Model": auc_names_train, "AUC": auc_models_train})
auc_train_df.sort_values(by="AUC", ascending=False)