<a href="https://colab.research.google.com/github/cbsobral/ml-fies/blob/main/Module_2b_Logistic_Tuning_and_Sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Module 2b - Fine Tuning and Sampling

In this module, we perform the following steps:

1. Load the data from Mod_00 and create sets and targets for train and test datasets;
2. Standardize and encode observations;
3. Run preliminary models;
4. Provide performance measures and visualization. 

In [None]:
pd.DataFrame(train_prepared).to_csv("/content/drive/MyDrive/M/Machine Learning/Data.Base/train_prepared.csv")

##### 1 - Load Data

Here, we import the training and testing sets created in Module00_Data. 


In [2]:
import pandas as pd

url_train = "https://drive.google.com/file/d/1IP7jyXkLgD_Ouy5cL6fJk4VUA5qRB2PK/view?usp=sharing"
path_train = "https://drive.google.com/uc?export=download&id="+url_train.split("/")[-2]
train = pd.read_csv(path_train)
train.shape

(351001, 31)

In [3]:
url_test = "https://drive.google.com/file/d/1v4FqKwt7NzG5RM6d9f1y7CLIdKq69jSS/view?usp=sharing"
path_test = "https://drive.google.com/uc?export=download&id="+url_test.split("/")[-2]
test = pd.read_csv(path_test)
test.shape

(87751, 31)

In [4]:
train_set = train.drop("default", axis=1) # drop targets for training set
train_target = train["default"].copy()

In [5]:
test_set = test.drop("default", axis=1) # drop targets for test set
test_target = test["default"].copy()

#### 2 - Pipeline

The pipeline contains functions that will be used to transform the dataset. For the numeric attributes, the stardardization is performed by the StandardScaler. For ordinal attributes, variables are encoded by the OrdinalEncoder, and for categorical, theOneHotEncoder. 

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


num_pipeline = Pipeline([
        ("num_imputer", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ])

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

ord_pipeline = Pipeline([
        ("ord_imputer", SimpleImputer(strategy="most_frequent")),
        ("ord_encoder", OrdinalEncoder()),
    ])

In [8]:
from sklearn.preprocessing import OneHotEncoder
cat_pipeline = Pipeline(steps=[('one_hot', OneHotEncoder())])

In [9]:
ord_attribs = ["igc","date_contract"] # 2 attributes

num_attribs = ["family_income",   #17
               "personal_income",
               "high_school_endyear",
               "n_sem_course",
               "n_completed_sem",
               "sem_funded",
               "fam_size",
               "income_pc",
               "tuition_current",
               "inc_prop",
               "perc_requested",
               "loan_value_sem",
               "student_resource",
               "loan_value",
               "loan_limit",
               "total_debt",
               "age"]
  

cat_attribs = ["semester_enroll",  #9
               "gender",
               "occupation", 
               "marital_status",
               "ethnicity", 
               "public_hs", 
               "state_course", 
               "degree", 
               "contract_phase"]

In [10]:
from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
        ("ord", ord_pipeline, ord_attribs)
        ])

In [14]:
train_prepared = full_pipeline.fit_transform(train_set)
train_prepared[:1]

<1x94 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [15]:
test_prepared = full_pipeline.fit_transform(test_set)
test_prepared[:1]

<1x94 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

#### 3 - Sampling 

In [31]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)

train_over, target_over = ros.fit_resample(train_prepared, train_target)



In [26]:
train_over[:1]

<1x94 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [17]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
train_under, target_under = rus.fit_resample(train_prepared, train_target)



In [27]:
train_under[:1]

<1x94 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>



```
# This is formatted as code
```

#### 4 - Fine Tuning - Logistic Classifiers

Based on the sampled and normal datasets, we deploy a ra


In [18]:
#Randomsearch normal_model
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()

LRparam_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'max_iter': list(range(100,1100,100)),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

In [None]:
LR_search = RandomizedSearchCV(LR,param_distributions=LRparam_grid,refit = True,scoring="roc_auc",verbose = 3, cv=5, random_state=42 )
# fitting the model for grid search 
LR_search.fit(train_prepared ,train_target)
LR_search.best_params_


In [20]:
# summarize
print('Mean Accuracy: %.3f' % LR_search.best_score_)
print('Config: %s' % LR_search.best_params_)

Mean Accuracy: 0.724
Config: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 200, 'C': 0.1}


In [None]:
LR_search_under = RandomizedSearchCV(LR,param_distributions=LRparam_grid,refit = True,scoring="roc_auc",verbose = 3, cv=5, random_state=42 )
# fitting the model for grid search 
LR_search_under.fit(train_under ,target_under)
LR_search_under.best_params_

In [24]:
print('Mean Accuracy: %.3f' % LR_search_under.best_score_)
print('Config: %s' % LR_search_under.best_params_)

Mean Accuracy: 0.724
Config: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 200, 'C': 0.1}


In [None]:

LR_search_over = RandomizedSearchCV(LR,param_distributions=LRparam_grid,refit = True,scoring="roc_auc",verbose = 3, cv=5, random_state=42 )
# fitting the model for grid search 
LR_search_over.fit(train_over ,target_over)
LR_search_over.best_params_



In [33]:
# summarize
print('Mean Accuracy: %.3f' % LR_search_over.best_score_)
print('Config: %s' % LR_search_over.best_params_)

Mean Accuracy: 0.725
Config: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 200, 'C': 0.1}


#### 5 - Save the models

Save Optimized models

In [41]:

logr_optimal = LogisticRegression(max_iter=200, random_state=42,solver= 'liblinear', penalty= 'l1', C =0.1)
logr_optimal.fit(train_prepared,train_target)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
logr_under = LogisticRegression(solver='liblinear',penalty= 'l1', max_iter= 200, C= 0.1,random_state=42)
logr_under.fit(train_under, target_under)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [38]:
logr_over = LogisticRegression(solver='liblinear',penalty= 'l1', max_iter= 200, C= 0.1,random_state=42)
logr_over.fit(train_over, target_over)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
import joblib

file_name = '/content/drive/MyDrive/M/Machine Learning/Data.Base/logistitic_model_fies.save'
file_name_under = '/content/drive/MyDrive/M/Machine Learning/Data.Base/logistitic_model_fies_under.save'
file_name_over = '/content/drive/MyDrive/M/Machine Learning/Data.Base/logistitic_model_fies_over.save'


joblib.dump(logr_optimal, file_name)
joblib.dump(logr_under, file_name_under)
joblib.dump(logr_over, file_name_over)


['/content/drive/MyDrive/M/Machine Learning/Data.Base/logistitic_model_fies_over.save']

#### 6 - Test Set - Scores

In this section, AUC and Brier scores were not calculated for the linear SVC model because it does not have a *predict_proba* function.


In [43]:
from sklearn.metrics import roc_auc_score

# Predict probabilities for test set
pred_logr = logr_optimal.predict_proba(test_prepared)
pred_logr_under= logr_under.predict_proba(test_prepared)
pred_logr_over= logr_over.predict_proba(test_prepared)



In [None]:
from sklearn.metrics import brier_score_loss

loss_logr = brier_score_loss(test_target, pred_logr[:,1])
loss_logr_smote = brier_score_loss(test_target, pred_logr_smote[:,1])


In [44]:
from sklearn.metrics import roc_auc_score

# AUC score
auc_logr = roc_auc_score(test_target, pred_logr[:,1])
auc_logr_under = roc_auc_score(test_target, pred_logr_under[:,1])
auc_logr_over = roc_auc_score(test_target, pred_logr_over[:,1])

In [46]:
# List with AUC scores
auc_list = [auc_logr, auc_logr_under,auc_logr_over]

# List with Brier Scores
loss_list = [auc_logr, auc_logr_under,auc_logr_over]

# List with model names
m2_list = ['Logistic Regression', 'Undersampled','Oversampled']

# Dataframe 
auc_df= pd.DataFrame({"Model": m2_list, "AUC": auc_list, "BS": loss_list})
auc_df.sort_values(by = "AUC", ascending=False)

Unnamed: 0,Model,AUC,BS
2,Oversampled,0.724579,0.724579
0,Logistic Regression,0.724568,0.724568
1,Undersampled,0.724486,0.724486


In [None]:
# Save Model Using joblib
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
import joblib


loaded_model = joblib.load('/content/drive/MyDrive/M/Machine Learning/Data.Base/logistitic_model_fies.save')



#### 7 - Feature Importance

This routine is necessary to get the feature importance and feature names

In [21]:
clf = Pipeline(steps=[('preprocessor', full_pipeline),
                      ('classifier', logr)])
clf.fit(train_set,train_target)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('num_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                         

In [27]:
onehot_columns = list(clf.named_steps['preprocessor'].named_transformers_['cat'].named_steps['one_hot'].get_feature_names(input_features=cat_attribs))


In [28]:
numeric_features_list = list(num_attribs)
numeric_features_list.extend(onehot_columns)
numeric_features_list.extend(ord_attribs)

In [29]:
!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/d1/54/04cab6e1c0ae535bec93f795d8403fdf6caf66fa5a6512263202dbb14ea6/eli5-0.11.0-py2.py3-none-any.whl (106kB)
[K     |███                             | 10kB 14.5MB/s eta 0:00:01[K     |██████▏                         | 20kB 13.2MB/s eta 0:00:01[K     |█████████▎                      | 30kB 9.2MB/s eta 0:00:01[K     |████████████▍                   | 40kB 7.8MB/s eta 0:00:01[K     |███████████████▌                | 51kB 4.4MB/s eta 0:00:01[K     |██████████████████▌             | 61kB 5.0MB/s eta 0:00:01[K     |█████████████████████▋          | 71kB 5.3MB/s eta 0:00:01[K     |████████████████████████▊       | 81kB 5.4MB/s eta 0:00:01[K     |███████████████████████████▉    | 92kB 5.2MB/s eta 0:00:01[K     |███████████████████████████████ | 102kB 5.7MB/s eta 0:00:01[K     |████████████████████████████████| 112kB 5.7MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.11.0


In [32]:
import eli5
eli5.explain_weights(clf.named_steps['classifier'], top=100, feature_names=numeric_features_list, feature_filter=lambda x: x != '<BIAS>')

Weight?,Feature
1.582,semester_enroll_before 2012
1.045,semester_enroll_12012
0.782,state_course_AM
0.63,state_course_AP
0.482,semester_enroll_22012
0.276,state_course_GO
0.224,loan_value_sem
0.2,degree_SERVIÇO SOCIAL
0.199,state_course_RJ
0.198,degree_EDUCAÇÃO FÍSICA
