In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, plot_precision_recall_curve, plot_roc_curve
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

  from pandas import MultiIndex, Int64Index


In [2]:
# Load the raw dataset from a CSV file located relative to the working directory.
data = pd.read_csv(filepath_or_buffer='Lending_Club_Data.csv')

In [3]:
# Data cleaning
# 1. Remove the columns that are not used further on.
#    NOTE(review): presumably free-text or post-outcome payment fields — confirm the reason for each.
data = data.drop(
    columns=['emp_title', 'dti_joint.1', 'int_rate', 'recoveries', 'total_pymnt', 'total_rec_int',
             'total_rec_late_fee', 'total_rec_prncp'])
# 2. Replace every missing value with 0.
data = data.fillna(0)
# 3. fillna leaves the integer 0 inside object columns; cast those columns to str
#    so each of them holds strings only (0 becomes '0').
data = data.astype({name: 'str' for name in data.columns if data[name].dtypes == 'object'})

In [4]:
# Numerating non-numerical data
# Only object-dtype (string) columns are ordinal-encoded. Fitting the encoder on the
# whole frame would treat every numeric column as categorical and replace its values
# with category indices, discarding the original magnitudes.
categorical_cols = [name for name in data.columns if data[name].dtypes == 'object']
encoder = preprocessing.OrdinalEncoder()
df = data.copy()
if categorical_cols:  # an empty selection cannot be fitted, so skip encoding entirely
    df[categorical_cols] = pd.DataFrame(
        encoder.fit_transform(data[categorical_cols]),
        columns=categorical_cols,
        index=data.index,  # keep rows aligned with the source frame
    )

In [5]:
# Train-test split
# Hold out 30% of the rows for testing; 'loan_status' is the target and every
# other column is a feature. The seed is reused by the later cells.
random_state = 809
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['loan_status']),
    df['loan_status'],
    random_state=random_state,
    test_size=0.3,
)

In [6]:
# Baseline: random forest with default hyperparameters (only the seed and the
# degree of parallelism are set). The fit is timed with the wall clock.
rfc = RandomForestClassifier(n_jobs=-1, random_state=random_state)
time_start = time.time()
rfc.fit(X_train, y_train)
time_end = time.time()
elapsed = time_end - time_start
print('time cost', elapsed, 's')

time cost 24.820993900299072 s


In [10]:
# Oversample the training split with SMOTE and report the wall-clock time.
# Only the training data is resampled; the test split is left as it is.
time_start = time.time()
oversample = SMOTE(random_state=random_state, n_jobs=10)
# fit_resample is the supported API; the deprecated fit_sample alias has been
# removed from imbalanced-learn and raises AttributeError on current releases.
oversampled_X_train, oversampled_y_train = oversample.fit_resample(X_train, y_train)
time_end = time.time()
print('time cost', time_end - time_start, 's')

KeyboardInterrupt: 

In [8]:
# Second forest, same settings as the baseline, trained on the oversampled training split.
rfc1 = RandomForestClassifier(n_jobs=-1, random_state=random_state)
rfc1.fit(X=oversampled_X_train, y=oversampled_y_train)

KeyboardInterrupt: 

In [None]:
# Confusion matrices on the held-out test split for both models.
# The results are stored under their own names: assigning to `confusion_matrix`
# would replace the imported function with an array and make the next call fail
# with "TypeError: 'numpy.ndarray' object is not callable".
confusion_matrix_rfc = confusion_matrix(y_test, rfc.predict(X_test))    # baseline forest
confusion_matrix_rfc1 = confusion_matrix(y_test, rfc1.predict(X_test))  # forest trained on oversampled data

In [None]:
# Evaluation of the baseline forest on the held-out test split.
print(rfc.score(X_test, y_test))  # mean accuracy
# Keep the result under its own name so the imported confusion_matrix function
# is not overwritten (which would break every later or repeated call).
confusion_matrix_rfc = confusion_matrix(y_test, rfc.predict(X_test))
print(confusion_matrix_rfc)

# ROC curve on the left panel, precision-recall curve on the right panel.
fig, (ax, ax1) = plt.subplots(ncols=2, figsize=(15, 10))
plot_roc_curve(rfc, X_test, y_test, ax=ax)
plot_precision_recall_curve(rfc, X_test, y_test, ax=ax1)
ax.set_title('ROC', fontsize=18)
ax1.set_title('Precision Recall', fontsize=18)
ax.legend(fontsize=15)
ax1.legend(fontsize=15)
plt.suptitle('LendingClub', fontsize=20)
plt.show()

In [6]:
# Parameter optimisation: randomised search over random-forest hyperparameters.
n_estimators = list(range(200, 2001, 200))  # 200, 400, ..., 2000
# 'auto' is omitted: for classifiers it is the same setting as 'sqrt' (so it only
# duplicated a candidate) and it is deprecated/removed in newer scikit-learn.
max_features = [0.5, 'sqrt']
max_depth = list(range(10, 101, 10))  # 10, 20, ..., 100
max_depth.append(None)  # None = grow trees without a depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf}

# Seed the estimator as well: the search's random_state only fixes which parameter
# settings are sampled, not the randomness inside each forest.
rfc = RandomForestClassifier(random_state=random_state)
n_iter = 20  # Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.
cv = 3  # Cross-validation folds
n_jobs = -1  # Uses all cores available
verbose = 5  # Controls how much progress text the search prints
rf_random = RandomizedSearchCV(estimator=rfc, param_distributions=param_grid, n_iter=n_iter, cv=cv,
                               verbose=verbose, random_state=random_state, n_jobs=n_jobs)
time_start = time.time()
rf_random.fit(X_train, y_train)
time_end = time.time()
print('time cost', time_end - time_start, 's')

# Report the best parameter setting found by the search.
best_parameters = rf_random.best_params_
print('The best Parameters are:')
for key, item in best_parameters.items():
    print(f'- {key}: {item}')

Fitting 3 folds for each of 20 candidates, totalling 60 fits


KeyboardInterrupt: 