In [1]:
import numpy as np
import pandas as pd
# Use the fully qualified option key. The bare pattern 'max_columns' is matched as a
# regular expression and, in pandas >= 1.4, hits both 'display.max_columns' and
# 'styler.render.max_columns', which raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 50)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the LendingClub data and build the train/test split

df = pd.read_csv('LendingClub.csv')

# One-hot encode the categorical column "purpose", then remove the dummy of the
# most frequent level (debt_consolidation)
df = pd.get_dummies(df, columns=['purpose'])
df = df.drop(columns=['purpose_debt_consolidation'])

# Target is 'not_fully_paid'; every remaining column is a feature
y = df['not_fully_paid']
X = df.drop(columns=['not_fully_paid'])

# Hold out 20% of the records for testing (fixed seed for reproducibility),
# then continue with independent copies of the feature frames
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=365
)
X_train, X_test = X_train.copy(), X_test.copy()

## Dealing with severely unbalanced data

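#### `class_weight` assigns a weight to each class of the target. Ask whether mistakes on class 0 and on class 1 are equally costly: e.g. `class_weight={0:1,1:2}` makes a mistake on class 1 count twice as much as a mistake on class 0.
#### In other words, `class_weight` expresses which type of classification mistake is more severe.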

In [7]:
# Logistic regression baseline.
# Note: besides 'l1' and 'l2', the `penalty` parameter also has an 'elasticnet' option!

from copy import deepcopy
from sklearn.linear_model import LogisticRegression

# C=np.inf switches regularization off. It replaces the original penalty='none',
# a spelling that scikit-learn deprecated in 1.2 and removed in 1.4.
# class_weight={0:1,1:1} gives both classes the same weight.
clf = LogisticRegression(C=np.inf, max_iter=1000, class_weight={0:1,1:1})
clf.fit(X_train,y_train)

y_predict = clf.predict(X_test)

# Built-in round() works whether accuracy_score returns a NumPy scalar or a plain
# Python float (a plain float has no .round() method)
accuracy = round(accuracy_score(y_test, y_predict), 4)
print(f"The accuracy is: {accuracy:.2%}")
print("The confusion matrix is:")
cm = confusion_matrix(y_test, y_predict)
print(cm)

# save the results for later comparison
# deepcopy takes a snapshot: `clf` is refitted on other data further down, and a plain
# alias (clf_lr = clf) would silently change together with it
clf_lr = deepcopy(clf)
accuracy_lr = accuracy
cm_lr = cm

The accuracy is: 84.39%
The confusion matrix is:
[[1616    0]
 [ 299    1]]


The accuracy above, while pretty high, is misleading because we actually got an extremely biased trained model: this trained model almost always predicts that borrowers will not default, as evident from the confusion matrix. 

This extremely biased model is caused by the severely unbalanced dataset, as the class proportions below show:

In [None]:
# Share of each class among the test labels
# (normalize=True returns proportions instead of raw counts)
y_test.value_counts(normalize=True)

### Options for dealing with severely unbalanced data

+ **Option 1. Re-sampling the data to make it balanced.** This can be done in two ways:
  + **undersampling** the majority class
    + this is the usual choice when we have large enough data
  + **oversampling** the minority class
    + it may cause the [data leakage](https://towardsdatascience.com/data-leakage-in-machine-learning-10bdd3eec742) problem, thus should be avoided unless the data size is too small
+ **Option 2. Do not use "accuracy" as the performance metric.** Instead, 
  + use alternative metrics that can give different weight to different classes of the target, e.g., counts '1' more heavily than '0' in the target of the LendingClub dataset (to be discussed in the next lecture)

### Undersampling the majority class

The function for this is `sklearn.utils.resample()`.

In [8]:
# Split the records by target value; 'not_fully_paid==0' is already known to be the majority class
df_0 = df[df['not_fully_paid'] == 0]
df_1 = df[df['not_fully_paid'] == 1]

# Keep the number of records in each class for the resampling steps
n_majority_class = len(df_0)
n_minority_class = len(df_1)
print(f"The majority class contains {n_majority_class} records. \nThe minority class contains {n_minority_class} records. ")

The majority class contains 8045 records. 
The minority class contains 1533 records. 


In [9]:
from sklearn.utils import resample

# Undersample the majority class: draw as many rows as the minority class contains.
# replace=False samples WITHOUT replacement, so no majority record is drawn twice.
df_0_undersampled = resample(
    df_0,
    n_samples=n_minority_class,
    replace=False,
    random_state=1234,
)
df_0_undersampled.shape

(1533, 19)

In [12]:
from sklearn.utils import resample

# Oversample the minority class up to the size of the majority class.
# replace=True samples WITH replacement: every drawn row is put back before the next draw.
# This is called bootstrapping: records are drawn from the same dataset over and over again.
df_1_oversampled = resample(
    df_1,
    n_samples=n_majority_class,
    replace=True,
    random_state=1234,
)
df_1_oversampled.shape

(8045, 19)

### Combining the two classes into a single (resampled) dataset

In [None]:
# Stack the undersampled majority rows and all minority rows into one frame
balanced_parts = [df_0_undersampled, df_1]
df_balanced = pd.concat(balanced_parts)

# Count the records per class in the combined data
df_balanced['not_fully_paid'].value_counts()

In [None]:
# Save the balanced data for future use
# (index=False leaves the DataFrame index out of the CSV file)
df_balanced.to_csv('LendingClub_balanced.csv', index=False)

### Comments on oversampling

The reason it should be avoided when possible: the [data leakage](https://towardsdatascience.com/data-leakage-in-machine-learning-10bdd3eec742) problem.

However, if you have to use it because the size of the minority class is too small, here are a few hints:
+ make sure you do `train_test_split()` *before* oversampling (why?)
+ ways to oversample:
  + Use [bootstrapping](https://en.wikipedia.org/wiki/Bootstrapping_(statistics)), a.k.a. `resample()` with the option `replace=True`.
  + Use [`imblearn.over_sampling.SMOTE`](https://imbalanced-learn.org/stable/over_sampling.html) -- a k-NN inspired method to create synthetic records
    + [A nice tutorial on SMOTE](https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/)

### Splitting this balanced data into train and test

In [None]:
# Target is 'not_fully_paid'; all other columns of the balanced data are features
y = df_balanced['not_fully_paid']
X = df_balanced.drop(columns=['not_fully_paid'])

# 80/20 train/test split with a fixed seed, followed by independent copies
# of the feature frames
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=365
)
X_train, X_test = X_train.copy(), X_test.copy()

### Training the logistic regression model over this balanced data

In [None]:
from copy import deepcopy

# Refit the logistic regression estimator, now on the balanced training data
clf.fit(X_train,y_train)

y_predict = clf.predict(X_test)

# Built-in round() works whether accuracy_score returns a NumPy scalar or a plain
# Python float (a plain float has no .round() method)
accuracy = round(accuracy_score(y_test, y_predict), 4)
print(f"The accuracy is: {accuracy:.2%}")
print("The confusion matrix is:")
cm = confusion_matrix(y_test, y_predict)
print(cm)

# save the results for later comparison
# deepcopy takes a snapshot: `clf` is refitted on transformed data further down, and a
# plain alias (clf_lr = clf) would silently change together with it
clf_lr = deepcopy(clf)
accuracy_lr = accuracy
cm_lr = cm

As shown above, the predictions are no longer extremely biased.

## Normalize/standardize the data

Recall that: "normalize" --> [0,1], and "standardize" --> mean 0 and std 1.

The LendingClub dataset consists of columns of varying scales. In addition, some columns are significantly skewed.

In [None]:
# Per-column mean, standard deviation and skewness of the training features
X_train.agg(['mean','std','skew'])

Variables of varying scales, and skewed variables, are commonly seen in business datasets. 
+ E.g., salary is in the tens of thousands, while age is usually in two digits
+ E.g., monetary variables (salary, spending, ...) are often right skewed 

### *Do we need to normalize/standardize the data?*

Nowadays, almost always **yes** because:
+ Many learning algorithms are sensitive to varying data scales (e.g., kNN, SVM) or varying data distribution shapes (e.g., regression)
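+ **Regularization** is heavily used in modern machine learning. And regularization does NOT work properly without data normalization/standardization, because the penalty treats all coefficients alike regardless of the scale of their columns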
    + See these two brief posts on the concept of regularization: [Over-fitting and Regularization](https://towardsdatascience.com/over-fitting-and-regularization-64d16100f45c), [L1 and L2 Regularization Methods](https://towardsdatascience.com/l1-and-l2-regularization-methods-ce25e7fc831c)

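Tree-based classifiers are an exception: each split compares the values of a single column against a threshold, so values are never compared *across* columns and the scale of a column does not affect the splits.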


### Manually performing data normalization/standardization

In [13]:
# Work on copies so that X_train / X_test stay untouched;
# they are needed again for the automatic standardization later on
X_train_std_manual, X_test_std_manual = X_train.copy(), X_test.copy()

In [None]:
# Below we normalize/standardize some input columns
# Remember we need to work on both train and test datasets
# In practice, remember to update your data description file afterwards!

def rescale_manually(features):
    """Return a rescaled copy of `features`; the input frame is left unchanged.

    Column changes:
      installment       -> installment1000       (divided by 1000)
      fico              -> fico_ratio            (divided by 850)
      days_with_cr_line -> decades_with_cr_line  (divided by 3650)
      revol_bal         -> log_revol_bal         (log(1 + revol_bal))
      revol_util        -> revol_util / 100      (kept under the same name)

    The four new columns are appended at the end and the four source columns
    they replace are removed.
    """
    rescaled = features.assign(
        installment1000=features['installment'] / 1000,
        fico_ratio=features['fico'] / 850,
        decades_with_cr_line=features['days_with_cr_line'] / 3650,
        # log1p(x) computes log(1 + x)
        log_revol_bal=np.log1p(features['revol_bal']),
        revol_util=features['revol_util'] / 100,
    )
    return rescaled.drop(
        columns=['installment', 'fico', 'days_with_cr_line', 'revol_bal']
    )

# Build the transformed frames from the untouched X_train / X_test. Unlike in-place
# column drops, this lets the cell be re-run without failing on missing columns.
X_train_std_manual = rescale_manually(X_train)
X_test_std_manual = rescale_manually(X_test)

In [None]:
# Check the summary statistics of the transformed data
# (mean, standard deviation and skewness of every column)
X_train_std_manual.agg(['mean','std','skew'])

In [None]:
# Now let's run the logistic regression again with this transformed data
# (this refits the existing `clf` estimator in place)
clf.fit(X_train_std_manual,y_train)

y_predict = clf.predict(X_test_std_manual)

# Built-in round() works whether accuracy_score returns a NumPy scalar or a plain
# Python float (a plain float has no .round() method)
accuracy = round(accuracy_score(y_test, y_predict), 4)
print(f"The accuracy is: {accuracy:.2%}")
print("The confusion matrix is:")
cm = confusion_matrix(y_test, y_predict)
print(cm)

### Automatically performing data normalization/standardization

We can automatically standardize data using `sklearn.preprocessing.StandardScaler`.

In [None]:
# Start again from copies of the untransformed X_train / X_test,
# so the scaler-based standardization does not modify them
X_train_std_auto, X_test_std_auto = X_train.copy(), X_test.copy()

In [None]:
from sklearn.preprocessing import StandardScaler
# Create the scaler with default settings; it is not fitted to any data yet
scaler = StandardScaler()

In [None]:
# Warning: categorical columns must NOT be standardized!
# List the numerical columns explicitly so that only they are passed to the scaler.
num_columns = [
    'int_rate',
    'installment',
    'log_annual_inc',
    'dti',
    'fico',
    'days_with_cr_line',
    'revol_bal',
    'revol_util',
    'inq_last_6mths',
    'delinq_2yrs',
    'pub_rec',
]

In [None]:
# Fit the scaler on the numerical columns of the TRAINING data only
scaler.fit(X_train_std_auto[num_columns])

In [None]:
# Per-column means learned by fit(), in the same order as num_columns
print(scaler.mean_)

In [None]:
# Rescale the numerical columns of both splits with the already fitted scaler
for split in (X_train_std_auto, X_test_std_auto):
    split[num_columns] = scaler.transform(split[num_columns])

In [None]:
# Verify that standardization is done
# (the columns listed in num_columns should now show mean ~0 and std ~1)
X_train_std_auto.agg(['mean','std','skew'])

In [None]:
## The above scaler.fit() and scaler.transform() steps can be combined into one:
# X_train_std_auto[num_columns] = scaler.fit_transform(X_train_std_auto[num_columns])
## Note: this shortcut is for the training data; the test data still goes through
## scaler.transform() only, so that it is rescaled with the training statistics.

In [None]:
# Now let's run the logistic regression again with this standardized data
# (this refits the existing `clf` estimator in place)
clf.fit(X_train_std_auto,y_train)

y_predict = clf.predict(X_test_std_auto)

# Built-in round() works whether accuracy_score returns a NumPy scalar or a plain
# Python float (a plain float has no .round() method)
accuracy = round(accuracy_score(y_test, y_predict), 4)
print(f"The accuracy is: {accuracy:.2%}")
print("The confusion matrix is:")
cm = confusion_matrix(y_test, y_predict)
print(cm)