<a href="https://colab.research.google.com/github/cbsobral/ml-fies/blob/main/Module01_Classifiers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Module 01 - Models

In this module, we perform the following steps:

1. Load the data from Mod_00 and create sets and targets for train and test datasets;
2. Standardize and encode observations;
3. Run preliminary model;
4. Provide performance measures and visualization. 

### 1 - Load Data

Here, we import the training and testing sets created in Module00_Data. 


In [1]:
import pandas as pd

# Turn the Google Drive share link of the training set into a direct-download
# URL and read the CSV from it.
url_train = "https://drive.google.com/file/d/1IP7jyXkLgD_Ouy5cL6fJk4VUA5qRB2PK/view?usp=sharing"
train_file_id = url_train.split("/")[-2]  # the file id is the second-to-last URL segment
path_train = "https://drive.google.com/uc?export=download&id=" + train_file_id
train = pd.read_csv(path_train)
train.shape

(351001, 31)

In [2]:
# Turn the Google Drive share link of the test set into a direct-download URL
# and read the CSV from it.
url_test = "https://drive.google.com/file/d/1v4FqKwt7NzG5RM6d9f1y7CLIdKq69jSS/view?usp=sharing"
test_file_id = url_test.split("/")[-2]  # the file id is the second-to-last URL segment
path_test = "https://drive.google.com/uc?export=download&id=" + test_file_id
test = pd.read_csv(path_test)
test.shape

(87751, 31)

In [3]:
# Separate the training data into the target column ("default") and the
# remaining feature columns.
train_target = train["default"].copy()
train_set = train.drop(columns="default")

In [4]:
# Separate the test data into the target column ("default") and the remaining
# feature columns.
test_target = test["default"].copy()
test_set = test.drop(columns="default")

### 2 - Pipeline

The pipeline contains functions that will be used to transform the dataset. For the numeric attributes, the standardization is performed by the StandardScaler. For ordinal attributes, variables are encoded by the OrdinalEncoder, and for categorical attributes, by the OneHotEncoder. 

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


# Numeric branch: fill missing values with the column median, then standardize.
num_steps = [
    ("num_imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Ordinal branch: fill missing values with the most frequent value, then map
# each category to an integer code.
ord_steps = [
    ("ord_imputer", SimpleImputer(strategy="most_frequent")),
    ("ord_encoder", OrdinalEncoder()),
]
ord_pipeline = Pipeline(steps=ord_steps)

In [7]:
# Column groups; each group is sent to its own preprocessing branch.

# Ordinal columns (2)
ord_attribs = [
    "igc",
    "date_contract",
]

# Numeric columns (17)
num_attribs = [
    "family_income", "personal_income", "high_school_endyear",
    "n_sem_course", "n_completed_sem", "sem_funded",
    "fam_size", "income_pc", "tuition_current",
    "inc_prop", "perc_requested", "loan_value_sem",
    "student_resource", "loan_value", "loan_limit",
    "total_debt", "age",
]

# Nominal categorical columns (9)
cat_attribs = [
    "semester_enroll", "gender", "occupation",
    "marital_status", "ethnicity", "public_hs",
    "state_course", "degree", "contract_phase",
]

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Combine the three preprocessing branches; the output columns follow the order
# of the transformers below (numeric, one-hot categorical, ordinal).
# handle_unknown="ignore" lets a pipeline fitted on the training data transform
# other data containing a category it has not seen (encoded as all zeros)
# instead of raising an error.
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
        ("ord", ord_pipeline, ord_attribs)
        ])

In [10]:
# Fit the preprocessing on the training features (imputation values, scaling
# statistics, category encodings) and transform them in the same call.
train_prepared = full_pipeline.fit_transform(train_set)
train_prepared[0:1]  # show the first transformed row

<1x94 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [11]:
# Apply the preprocessing that was fitted on the training set. Calling
# fit_transform here would re-estimate the imputation values, scaling statistics
# and category encodings from the test data, which leaks test information and
# can produce columns that do not line up with those the models were trained on.
test_prepared = full_pipeline.transform(test_set)
test_prepared[:1]

<1x94 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

### 3 - Classifiers

For the initial runs, we employ 5 methods: logistic regression, decision tree, random forest, linear support vector classification (SVC), and artificial neural networks (ANN). 


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Logistic regression baseline.
# NOTE: the recorded output of this cell reports that the solver stopped at the
# iteration limit (max_iter=1000) before converging.
logr = LogisticRegression(random_state=42, max_iter=1000)
logr.fit(X=train_prepared, y=train_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
from sklearn.tree import DecisionTreeClassifier 

# Decision tree baseline. The seed is fixed, as for the other classifiers in
# this notebook, so that re-running the cell gives the same tree.
dtc = DecisionTreeClassifier(random_state=42)
dtc = dtc.fit(train_prepared, train_target)

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline with a fixed seed.
rf = RandomForestClassifier(random_state=42)
rf.fit(X=train_prepared, y=train_target)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [16]:
from sklearn.svm import LinearSVC

# Linear support vector classifier with hinge loss. The seed is fixed, as for
# the other classifiers in this notebook, so that re-running gives the same fit.
svm = LinearSVC(C=1, loss="hinge", max_iter=1000, random_state=42)
svm.fit(train_prepared, train_target)



LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
          penalty='l2', random_state=None, tol=0.0001, verbose=0)

In [None]:
from sklearn.neural_network import MLPClassifier

# Neural network with three hidden layers of 10 units each. The seed is fixed
# so that weight initialization and shuffling are reproducible across runs.
ann = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=1000, random_state=42)
ann.fit(train_prepared, train_target.values.ravel())

### 4 - Performance Evaluation

To assess the preliminary results of the classifiers on the test set, we use the AUC. Using cross-validation for the training set, the best result was achieved with the ANN -- AUC of XX%. The best performing models on the test set, in line with the cross-validation scores, were XX and XX. AUC of XX% and XX%, respectively. 

#### Cross-Validation

In [None]:
def _mean_cv_auc(model):
    """Return the mean ROC AUC of `model` over 2-fold cross-validation on the training data."""
    fold_scores = cross_val_score(model, train_prepared, train_target, cv=2, scoring="roc_auc")
    return fold_scores.mean()


# Mean cross-validated AUC for each classifier
cross_logr = _mean_cv_auc(logr)
cross_dtc = _mean_cv_auc(dtc)
cross_rf = _mean_cv_auc(rf)
cross_svm = _mean_cv_auc(svm)
cross_ann = _mean_cv_auc(ann)

In [None]:
# Cross-validated AUC scores for the training set, in model order
cross_list = [cross_logr, cross_dtc, cross_rf, cross_svm, cross_ann]

# Label each row with its model so the scores stay identifiable after sorting
# (the labels are the ones used in the ROC plot legend).
cross_df = pd.DataFrame({"AUC": cross_list}, index=["LR", "DT", "RF", "SVM", "ANN"])
cross_df.sort_values(by="AUC", ascending=False)

#### Test Set

In [None]:
import numpy as np
from sklearn.metrics import roc_auc_score

# Predict class probabilities for the test set; column 1 of each array is the
# score used downstream.
pred_logr = logr.predict_proba(test_prepared)
pred_dtc = dtc.predict_proba(test_prepared)
pred_rf = rf.predict_proba(test_prepared)
pred_ann = ann.predict_proba(test_prepared)

# LinearSVC does not implement predict_proba. Use its decision_function scores
# instead: AUC and ROC curves depend only on how the scores rank the
# observations, so they do not need to be probabilities. The scores are stored
# in a two-column array (negated score, score) so that `pred_svm[:, 1]` selects
# them the same way as for the other models.
svm_scores = svm.decision_function(test_prepared)
pred_svm = np.column_stack((-svm_scores, svm_scores))

In [None]:
from sklearn.metrics import roc_auc_score

def _test_auc(pred):
    """Return the ROC AUC of the column-1 scores in `pred` against the test targets."""
    return roc_auc_score(test_target, pred[:, 1])


# AUC score of each model on the test set
auc_logr = _test_auc(pred_logr)
auc_dtc = _test_auc(pred_dtc)
auc_rf = _test_auc(pred_rf)
auc_svm = _test_auc(pred_svm)
auc_ann = _test_auc(pred_ann)

# List with AUC scores, in model order
auc_list = [auc_logr, auc_dtc, auc_rf, auc_svm, auc_ann]

# Label each row with its model so the scores stay identifiable after sorting
# (the labels are the ones used in the ROC plot legend).
auc_df = pd.DataFrame({"AUC": auc_list}, index=["LR", "DT", "RF", "SVM", "ANN"])
auc_df.sort_values(by="AUC", ascending=False)

In [None]:
from sklearn.metrics import roc_curve

def _roc_points(scores):
    """Return (fpr, tpr, thresholds) for `scores` against the test targets, with 1 as the positive label."""
    return roc_curve(test_target, scores, pos_label=1)


# ROC curve of each model, computed from column 1 of its predictions
fpr_logr, tpr_logr, thresh_logr = _roc_points(pred_logr[:, 1])
fpr_dtc, tpr_dtc, thresh_dtc = _roc_points(pred_dtc[:, 1])
fpr_rf, tpr_rf, thresh_rf = _roc_points(pred_rf[:, 1])
fpr_svm, tpr_svm, thresh_svm = _roc_points(pred_svm[:, 1])
fpr_ann, tpr_ann, thresh_ann = _roc_points(pred_ann[:, 1])

# Reference curve (tpr = fpr), obtained by giving every observation the same score
random_probs = [0] * len(test_target)
p_fpr, p_tpr, _ = _roc_points(random_probs)

In [None]:
import matplotlib.pyplot as plt
# The "seaborn" style sheet was renamed "seaborn-v0_8" in newer matplotlib
# releases and the old name was later removed; use whichever one is available.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

# One entry per model: (false positive rates, true positive rates, color, label).
# Each curve is paired with its own label so the legend matches the data.
roc_curves = [
    (fpr_logr, tpr_logr, "purple", "LR"),
    (fpr_dtc, tpr_dtc, "green", "DT"),
    (fpr_rf, tpr_rf, "blue", "RF"),
    (fpr_svm, tpr_svm, "orange", "SVM"),
    (fpr_ann, tpr_ann, "red", "ANN"),
]

# plot roc curves
for fpr, tpr, color, label in roc_curves:
    plt.plot(fpr, tpr, linestyle="--", color=color, label=label)

# reference line (tpr = fpr) computed in the previous cell
plt.plot(p_fpr, p_tpr, linestyle="--", color="black", label="Chance")

# title
plt.title("ROC Curve")
# x label
plt.xlabel("False Positive Rate")
# y label
plt.ylabel("True Positive Rate")

plt.legend(loc="best")
plt.savefig("ROC", dpi=300)
plt.show();

### 5 - Feature Importance 

In [None]:
# Names of the raw (untransformed) feature columns of the training set
feat_names = train_set.columns.tolist()

In [None]:
# The random forest was trained on the transformed matrix, so it has one
# importance per transformed column (each categorical attribute is expanded into
# one column per category), not one per raw column. Rebuild the transformed
# column names in the ColumnTransformer's output order: numeric, one-hot
# categorical, ordinal.
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_feat_names = [
    f"{attrib}_{level}"
    for attrib, levels in zip(cat_attribs, cat_encoder.categories_)
    for level in levels
]
prepared_feat_names = num_attribs + cat_feat_names + ord_attribs

# Building the Series raises if the number of names and importances differ.
importances = pd.Series(rf.feature_importances_, index=prepared_feat_names).sort_values()

# Show only the 20 most important features to keep the chart readable.
top_importances = importances.tail(20)
plt.barh(top_importances.index, top_importances.values)
plt.xlabel("Feature importance")
plt.title("Random forest: 20 most important features");

### 6 - Save Models

In [None]:
import joblib

# Persist the fitted models. The file name must be a string: the unquoted form
# `logr.sav` is an attribute lookup on the estimator and raises AttributeError.
# NOTE(review): the random forest (rf) is not saved here; confirm whether that is intended.
joblib.dump(logr, "logr.sav")
joblib.dump(dtc, "dtc.sav")
joblib.dump(svm, "svm.sav")
joblib.dump(ann, "ann.sav")

In [None]:
import joblib

# Reload the saved logistic regression model. The file name must be a string.
# joblib files are pickle-based, so only load files from a trusted source.
logr = joblib.load("logr.sav")