In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from scipy import sparse

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
os.chdir("/content/drive/MyDrive/lendingClubLoanData/notebooks")

In [4]:
! ls

Exploration.ipynb  Preprocessing.ipynb	Train_Evaluate.ipynb


In [5]:
chemin = "../../dataset/accepted_2007_to_2018Q4.csv"
data = pd.read_csv(chemin, low_memory = False)

In [6]:
data.head(4)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,


In [7]:
data["issue_d"][0]

'Dec-2015'

In [8]:
data.shape

(2260701, 151)

In [9]:
data["issue_d"] = pd.to_datetime(data["issue_d"], format="%b-%Y")

In [10]:
data = data[data["issue_d"].dt.year == 2018]
data = data[data["issue_d"].dt.month > 5]

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 298139 entries, 749522 to 1546807
Columns: 151 entries, id to settlement_term
dtypes: datetime64[ns](1), float64(113), object(37)
memory usage: 345.7+ MB


In [12]:
data.shape

(298139, 151)

In [13]:
data.describe()

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,issue_d,dti,delinq_2yrs,...,deferral_term,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,settlement_amount,settlement_percentage,settlement_term
count,0.0,298139.0,298139.0,298139.0,298139.0,298139.0,298139.0,298139,297489.0,298139.0,...,32.0,32.0,32.0,32.0,30.0,32.0,32.0,64.0,64.0,64.0
mean,,16017.506599,16017.506599,16014.380076,12.890043,466.579288,81112.51,2018-08-30 22:17:45.851834112,19.753515,0.228685,...,3.0,157.349063,3.0,7.65625,500.653,14551.069375,286.014062,7778.237969,59.566562,17.1875
min,,1000.0,1000.0,725.0,5.31,30.12,0.0,2018-06-01 00:00:00,0.0,0.0,...,3.0,5.95,3.0,0.0,36.78,424.11,0.07,655.13,45.0,1.0
25%,,8000.0,8000.0,8000.0,8.46,256.36,46800.0,2018-07-01 00:00:00,11.57,0.0,...,3.0,71.99,3.0,0.0,295.2,8621.935,0.9525,3948.0,53.75,18.0
50%,,14000.0,14000.0,14000.0,11.8,387.66,67000.0,2018-09-01 00:00:00,17.79,0.0,...,3.0,158.915,3.0,0.0,496.47,12596.73,181.97,5778.0,64.995,18.0
75%,,22000.0,22000.0,21950.0,16.14,627.47,97000.0,2018-11-01 00:00:00,25.07,0.0,...,3.0,216.3075,3.0,18.5,652.5675,19682.3625,366.1725,11452.75,65.0,18.0
max,,40000.0,40000.0,40000.0,30.99,1670.15,9757200.0,2018-12-01 00:00:00,999.0,58.0,...,3.0,378.9,3.0,29.0,1136.7,40149.35,1159.62,23506.0,65.12,24.0
std,,10125.819789,10125.819789,10126.241894,5.213854,286.209065,95643.45,,20.064695,0.745392,...,0.0,99.344628,0.0,10.891279,285.240894,9637.064333,337.379187,5461.724828,7.744592,4.917978


In [14]:
target = "loan_status"

In [15]:
data = data[(data[target] == "Fully Paid") | (data[target] == "Charged Off")]

In [16]:
not_defined = (data.isna().sum().sort_values(ascending = False) / data.shape[0]) * 100

In [17]:
eliminated = not_defined[not_defined > 60]
eliminated.shape

(43,)

In [18]:
index = eliminated.index
data = data.drop(index, axis = 1)
data.shape

(20230, 108)

In [19]:
(data.isna().sum() / data.shape[0]).sort_values(ascending = False)

Unnamed: 0,0
mths_since_last_delinq,0.550420
il_util,0.161443
emp_title,0.104795
emp_length,0.085171
mths_since_recent_inq,0.082600
...,...
total_bc_limit,0.000000
total_il_high_credit_limit,0.000000
hardship_flag,0.000000
disbursement_method,0.000000


In [20]:
#
numeric_variable = []
categorical_variable = []
for elt in data.select_dtypes(["float", "int"]):
    numeric_variable.append(elt)

for elt in data.select_dtypes("object"):
    categorical_variable.append(elt)

In [21]:
len(categorical_variable) + len(numeric_variable) == data.shape[1]

False

In [22]:
#suppression de "loan status" dans les variables categorielles
i = 0
for elt in categorical_variable:
    if (elt == "loan_status") | (elt == "id"):
        del categorical_variable[i]

    i +=1

In [23]:
dictionnaire = {
    'Fully Paid' : "1",
    'Charged Off':"0"
}
data["loan_status"] = data["loan_status"].replace(dictionnaire).astype("int")

In [24]:
##pipeline de preprocessing pour les variables numeriques

numeric_pipe = Pipeline(steps = [
    ("Imputation Numerique", SimpleImputer(strategy = "mean")),
    ("Normalisation", StandardScaler())
])


In [25]:
#pipeline de preprocessing pour les variables categorielles
categorical_pipe = Pipeline(steps = [
    ("Imputation categorielle", SimpleImputer(strategy = "most_frequent")),
    ("Encodage des variables", OneHotEncoder(handle_unknown="ignore")),
])

In [26]:
preprocessor = ColumnTransformer(transformers = [
    ("numerique", numeric_pipe, numeric_variable),
    ("categorielle", categorical_pipe, categorical_variable)
])

In [27]:
#pipeline de preprocessing

preprocessed_pipeline = Pipeline(steps = [
    ("preprocessing", preprocessor)
])

In [28]:
# division de la base de donnees en entrainement, validation et test
X = data.drop(["loan_status", "id"], axis = 1)
Y = data["loan_status"]

Xtrain, Xval, Ytrain, Yval = train_test_split(X, Y, test_size = 0.4, random_state = 42)
Xval, Xtest, Yval, Ytest = train_test_split(Xval, Yval, test_size = 0.3, random_state = 42)

In [29]:
Xtrain.shape

(12138, 106)

In [30]:
Xval.shape

(5664, 106)

In [31]:
Xtest.shape

(2428, 106)

In [32]:
data["loan_status"].unique()

array([1, 0])

In [33]:
Xtrain_preprocessed = preprocessed_pipeline.fit_transform(Xtrain)
Xtest_preprocessed = preprocessed_pipeline.transform(Xtest)
Xval_preprocessed = preprocessed_pipeline.transform(Xval)

In [34]:
Xtrain_preprocessed.shape

(12138, 20163)

In [35]:
sparse.save_npz("../../dataset/preprocessed_data/Xtrain_preprocessed.npz", Xtrain_preprocessed)
sparse.save_npz("../../dataset/preprocessed_data/Xtest_preprocessed.npz", Xtest_preprocessed)
sparse.save_npz("../../dataset/preprocessed_data/Xval_preprocessed.npz", Xval_preprocessed)

In [36]:
Ytrain.to_csv("../../dataset/preprocessed_data/Ytrain.csv", index = False)
Ytest.to_csv("../../dataset/preprocessed_data/Ytest.csv", index = False)
Yval.to_csv("../../dataset/preprocessed_data/Yval.csv", index = False)