In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from scipy import sparse

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization

In [2]:
# Colab-only step: mount Google Drive at /content/drive so the project files
# stored there become available on the local filesystem.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Work from the notebooks folder: the relative "../../dataset/..." paths used
# further down are resolved against this directory.
os.chdir("/content/drive/MyDrive/lendingClubLoanData/notebooks")

In [4]:
! ls

Exploration.ipynb  Preprocessing.ipynb	Train_Evaluate.ipynb


In [5]:
# Location of the raw LendingClub CSV, relative to the current working directory.
chemin = "../../dataset/accepted_2007_to_2018Q4.csv"

# low_memory=False asks pandas not to process the file in internal chunks,
# which avoids mixed-dtype inference within a column.
data = pd.read_csv(filepath_or_buffer=chemin, low_memory=False)

In [6]:
data.head(4)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,


In [7]:
# Peek at one raw `issue_d` value to see how the date is formatted.
# Positional access (.iloc[0]) is used instead of the label 0 so the cell keeps
# working even when the row labelled 0 is no longer part of `data`
# (e.g. when the cell is re-run after rows have been filtered out).
data["issue_d"].iloc[0]

'Dec-2015'

In [8]:
# Parse `issue_d` from its "Mon-YYYY" text form (e.g. 'Dec-2015') into datetimes
# so the year can be extracted with the .dt accessor.
data["issue_d"] = pd.to_datetime(data["issue_d"], format="%b-%Y")

In [9]:
# Keep only the loans issued during 2018.
issued_in_2018 = data["issue_d"].dt.year.eq(2018)
data = data.loc[issued_in_2018]

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 495242 entries, 421097 to 1611876
Columns: 151 entries, id to settlement_term
dtypes: datetime64[ns](1), float64(113), object(37)
memory usage: 574.3+ MB


In [11]:
data.shape

(495242, 151)

In [12]:
data.describe()

Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,issue_d,dti,delinq_2yrs,...,deferral_term,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,settlement_amount,settlement_percentage,settlement_term
count,0.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242.0,495242,494110.0,495242.0,...,368.0,368.0,368.0,368.0,321.0,368.0,368.0,480.0,480.0,480.0
mean,,16025.020394,16025.020394,16021.669277,12.729072,466.612858,80093.99,2018-06-21 08:01:20.833208832,19.668887,0.229252,...,3.0,196.141902,3.0,12.76087,595.37785,15633.306304,215.650815,7145.746083,54.567729,18.022917
min,,1000.0,1000.0,725.0,5.31,29.76,0.0,2018-01-01 00:00:00,0.0,0.0,...,3.0,5.95,3.0,0.0,31.47,424.11,0.07,413.93,29.95,1.0
25%,,8000.0,8000.0,8000.0,8.46,254.56,46000.0,2018-04-01 00:00:00,11.43,0.0,...,3.0,87.3975,3.0,5.0,258.72,7966.5175,50.87,3367.1275,45.0,17.0
50%,,14000.0,14000.0,14000.0,11.8,386.82,66000.0,2018-07-01 00:00:00,17.71,0.0,...,3.0,160.105,3.0,13.0,479.82,13323.565,153.205,5626.135,55.0,18.0
75%,,22000.0,22000.0,22000.0,16.01,629.04,96000.0,2018-10-01 00:00:00,25.03,0.0,...,3.0,271.905,3.0,20.0,819.9,22590.2,327.3825,9761.5,65.0,24.0
max,,40000.0,40000.0,40000.0,30.99,1670.15,9930475.0,2018-12-01 00:00:00,999.0,58.0,...,3.0,845.22,3.0,29.0,2535.66,40149.35,1159.62,28503.0,80.0,24.0
std,,10138.075023,10138.075023,10137.900298,5.150204,286.909624,88871.61,,20.458244,0.743665,...,0.0,147.686771,0.0,8.979989,451.388039,9434.677079,215.056762,5037.838506,9.347597,6.361213


In [13]:
# Name of the column that holds the label to predict.
target = "loan_status"

In [14]:
# Restrict the data to the two statuses that are later encoded as the binary
# target ("Fully Paid" / "Charged Off"); rows with any other status are dropped.
kept_statuses = ["Fully Paid", "Charged Off"]
data = data.loc[data[target].isin(kept_statuses)]

In [15]:
# Percentage of missing values per column, ordered from most to least missing.
missing_counts = data.isna().sum().sort_values(ascending=False)
not_defined = (missing_counts / len(data)) * 100

In [16]:
# Columns with more than 60 % missing values are earmarked for removal.
too_sparse = not_defined.gt(60)
eliminated = not_defined.loc[too_sparse]
eliminated.shape

(43,)

In [17]:
# Remove the columns flagged in `eliminated`, then show the resulting frame size.
index = eliminated.index
data = data.drop(columns=index)
data.shape

(56311, 108)

In [18]:
# Fraction (between 0 and 1) of missing values in each remaining column,
# displayed from the most to the least incomplete column.
(data.isna().sum() / data.shape[0]).sort_values(ascending = False)

Unnamed: 0,0
mths_since_last_delinq,0.552485
il_util,0.166309
emp_title,0.095665
emp_length,0.087621
mths_since_recent_inq,0.083483
...,...
total_bc_limit,0.000000
total_il_high_credit_limit,0.000000
hardship_flag,0.000000
disbursement_method,0.000000


In [19]:
# Split the column names by dtype: float/int columns are treated as numeric
# features, object columns as categorical ones. Columns of any other dtype
# (e.g. the datetime `issue_d`) end up in neither list.
numeric_variable = data.select_dtypes(["float", "int"]).columns.tolist()
categorical_variable = data.select_dtypes("object").columns.tolist()

In [20]:
len(categorical_variable) + len(numeric_variable) == data.shape[1]

False

In [21]:
# Remove the target ("loan_status") and the row identifier ("id") from the list
# of categorical feature names.
# The list is rebuilt in one pass rather than deleting entries while looping
# over it: deleting during iteration makes the loop skip the element that
# follows each removed entry, so two adjacent names to exclude would not both
# be removed. Slice assignment keeps the same list object, updated in place.
excluded_columns = {"loan_status", "id"}
categorical_variable[:] = [
    column for column in categorical_variable if column not in excluded_columns
]

In [22]:
# Encode the target as an integer: "Fully Paid" -> 1, "Charged Off" -> 0.
# The labels are first replaced by the strings "1"/"0", then cast to int.
dictionnaire = {"Fully Paid": "1", "Charged Off": "0"}
encoded_status = data["loan_status"].replace(dictionnaire)
data["loan_status"] = encoded_status.astype("int")

In [23]:
# Preprocessing pipeline for the numeric variables:
# fill missing values with the column mean, then standardize.
numeric_steps = [
    ("Imputation Numerique", SimpleImputer(strategy="mean")),
    ("Normalisation", StandardScaler()),
]
numeric_pipe = Pipeline(steps=numeric_steps)


In [24]:
# Preprocessing pipeline for the categorical variables:
# fill missing values with the most frequent category, then one-hot encode.
# handle_unknown="ignore" lets transform() accept categories not seen in fit().
categorical_steps = [
    ("Imputation categorielle", SimpleImputer(strategy="most_frequent")),
    ("Encodage des variables", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipe = Pipeline(steps=categorical_steps)

In [25]:
# Route each group of columns to its own pipeline. No `remainder` is given, so
# columns listed in neither group follow ColumnTransformer's default handling.
column_transformers = [
    ("numerique", numeric_pipe, numeric_variable),
    ("categorielle", categorical_pipe, categorical_variable),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [26]:
# Full preprocessing pipeline: a single step wrapping the column transformer.
pipeline_steps = [("preprocessing", preprocessor)]
preprocessed_pipeline = Pipeline(steps=pipeline_steps)

In [27]:
# Split the dataset into training, validation and test sets.
# First cut: 60 % training / 40 % held out. Second cut: the held-out part is
# divided into 70 % validation / 30 % test. Both cuts use random_state=42.
# NOTE(review): X keeps every remaining column except the target and `id`;
# some of them (e.g. `debt_settlement_flag`) look like they may be recorded
# after the loan outcome is known — verify there is no target leakage.
X = data.drop(columns=["loan_status", "id"])
Y = data["loan_status"]

Xtrain, X_holdout, Ytrain, Y_holdout = train_test_split(
    X, Y, test_size=0.4, random_state=42
)
Xval, Xtest, Yval, Ytest = train_test_split(
    X_holdout, Y_holdout, test_size=0.3, random_state=42
)

In [28]:
Xtrain.shape

(33786, 106)

In [29]:
Xval.shape

(15767, 106)

In [30]:
Xtest.shape

(6758, 106)

In [31]:
data["loan_status"].unique()

array([1, 0])

In [32]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformation to the validation and test splits.
Xtrain_preprocessed = preprocessed_pipeline.fit_transform(Xtrain)
Xval_preprocessed, Xtest_preprocessed = (
    preprocessed_pipeline.transform(split) for split in (Xval, Xtest)
)

In [33]:
Xtrain_preprocessed.shape

(33786, 50256)

In [34]:
# Persist the preprocessed feature matrices as .npz files.
# The output folder is created first (no-op when it already exists) so the
# first save does not fail with FileNotFoundError on a fresh environment.
# NOTE(review): sparse.save_npz expects SciPy sparse input — this relies on the
# preprocessing returning sparse matrices; confirm if the feature set changes.
output_dir = "../../dataset/preprocessed_data"
os.makedirs(output_dir, exist_ok=True)

feature_matrices = {
    "Xtrain_preprocessed": Xtrain_preprocessed,
    "Xtest_preprocessed": Xtest_preprocessed,
    "Xval_preprocessed": Xval_preprocessed,
}
for split_name, matrix in feature_matrices.items():
    sparse.save_npz(f"{output_dir}/{split_name}.npz", matrix)

In [35]:
# Write the label series of each split to its own CSV file, without the index.
label_files = {
    "../../dataset/preprocessed_data/Ytrain.csv": Ytrain,
    "../../dataset/preprocessed_data/Ytest.csv": Ytest,
    "../../dataset/preprocessed_data/Yval.csv": Yval,
}
for csv_path, labels in label_files.items():
    labels.to_csv(csv_path, index=False)