In [470]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, LabelEncoder, KBinsDiscretizer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

from feature_engine import outlier_removers 
from feature_engine.categorical_encoders import OneHotCategoricalEncoder, RareLabelCategoricalEncoder
import dtale

from imblearn.over_sampling import SMOTE
# Display options

pd.options.mode.chained_assignment = None # silence SettingWithCopyWarning (note: this also hides genuine chained-assignment mistakes)
pd.options.display.float_format = '{:.4f}'.format # show floats in fixed-point with 4 decimals instead of scientific notation such as 4.225108e+11
pd.set_option('display.max_columns', 100) # display up to 100 columns
pd.set_option('display.max_rows', 100) # display up to 100 rows
np.set_printoptions(suppress=True,formatter={'float_kind':'{:f}'.format}) # numpy: print floats in fixed-point, never in scientific notation


# Loading Data

In [471]:
# Load the data dictionary: one row per feature, holding the action to take
# (use / don't use), the feature name and its description.
dictionary = pd.read_csv('Data/LCDataDictionary.csv', names=['action', 'feature', 'description'])

# One feature name in the file carries trailing whitespace (a space followed by
# a non-breaking space). The cleaned column is assigned back explicitly:
# calling replace(..., inplace=True) on dictionary['feature'] mutates a
# temporary Series and is not guaranteed to reach the DataFrame.
dictionary['feature'] = dictionary['feature'].replace('total_rev_hi_lim \xa0', 'total_rev_hi_lim')

cols_to_use = list(dictionary['feature'].values)

In [472]:
df = pd.read_csv('Data/loan.csv', low_memory=False)
print('df is loaded')

# Keep only the features listed in the data dictionary (in dictionary order),
# then drop every column that has fewer than 35% non-null values.
# `how` is intentionally not passed: pandas >= 2.0 raises a TypeError when both
# `how` and `thresh` are given, and older versions ignored `how` whenever
# `thresh` was set, so the result is the same.
min_non_null = int(0.35 * len(df))
df = df[cols_to_use].dropna(axis=1, thresh=min_non_null)
df.shape

df is loaded


(2260668, 76)

# Label Manipulations
* Specific to the dataset

In [473]:
# Loans whose final outcome is not known yet are removed from the dataset.
labels_to_drop = ['Current','Late (31-120 days)','Late (16-30 days)','In Grace Period','Default']
df = df[~df.loan_status.isin(labels_to_drop)]

# Fold the "Does not meet the credit policy" variants into the two target classes.
# A dedicated name is used so the `dictionary` DataFrame loaded earlier is not
# silently rebound to a plain dict.
status_aliases = {'Does not meet the credit policy. Status:Fully Paid':'Fully Paid',
                  'Does not meet the credit policy. Status:Charged Off':'Charged Off'}

# Assign the result back instead of replace(..., inplace=True) on a column
# selection: the in-place form acts on a temporary Series and is not
# guaranteed to update `df`.
df['loan_status'] = df['loan_status'].replace(status_aliases)
df['loan_status'].value_counts(normalize=True)

Fully Paid    0.7991
Charged Off   0.2009
Name: loan_status, dtype: float64

In [474]:
# Columns holding one single distinct value (NaN counts as a value) are removed.
# On this dataset the columns dropped by this cell are:
#   pymnt_plan
#   policy_code
constant_cols = [name for name in df.columns if df[name].nunique(dropna=False) == 1]

for name in constant_cols:
    print("'{}' is dropped".format(name))

df.drop(columns=constant_cols, inplace=True)
print(df.shape)

'policy_code' is dropped
'pymnt_plan' is dropped
(1306356, 74)


# Splitting features and labels

In [475]:
# every column except the target is a feature
columns_to_use = df.columns.drop('loan_status')

# labels as a flat numpy array, features as a DataFrame
y = df['loan_status'].to_numpy().ravel()
X = df[columns_to_use]

In [476]:
# 75% / 25% split with a fixed seed; stratify=None means the split is NOT stratified on the labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=None,
)

In [479]:
# The first 59 columns are treated as numerical, everything after as categorical.
# NOTE(review): this is purely positional, so it presumably relies on the column
# order given by the data dictionary — confirm whenever the feature list changes.
n_numerical = 59
numerical_columns = X_train.columns[:n_numerical]
categorical_columns = X_train.columns[n_numerical:]

# Dividing data into categorical and numerical parts

In [480]:
# split both the training and the testing features into their numerical and categorical parts
nmrcl_X_train, nmrcl_X_test = X_train[numerical_columns], X_test[numerical_columns]
ctgrcl_X_train, ctgrcl_X_test = X_train[categorical_columns], X_test[categorical_columns]

# report the shapes: train first, then test
print('Numerical part:')
for part in (nmrcl_X_train, nmrcl_X_test):
    print(part.shape)
print('Categorical part:')
for part in (ctgrcl_X_train, ctgrcl_X_test):
    print(part.shape)

Numerical part:
(979767, 59)
(326589, 59)
Categorical part:
(979767, 14)
(326589, 14)


# Treating numerical data
* starting with pd.fillna

In [481]:
""" 
Training df medians have to be saved as a pd.Series object othervise replace() 
method does not work when replacing NaN in testing df.
"""

training_medians = pd.Series(nmrcl_X_train.median()) # get the training medians 

nmrcl_X_train = nmrcl_X_train.fillna(training_medians) # fillna first
nmrcl_X_test = nmrcl_X_test.fillna(training_medians)

# Pipelining numerical features treatment
* I am not sure this is the best approach, but chaining the steps in a single pipeline reduces the number of points of failure.

In [482]:
# Two-step pipeline for the numerical features:
#   1. outlier capping on both tails (Winsorizer)
#   2. scaling (StandardScaler)
capper = outlier_removers.Winsorizer(distribution='skewed', tail='both', fold=1.5)
scaler = StandardScaler()
numerical_pipeline = Pipeline(steps=[('capper', capper), ('scaler', scaler)])

# fit on the training data only; the test data reuses the fitted parameters
nmrcl_X_train = numerical_pipeline.fit_transform(nmrcl_X_train)
nmrcl_X_test = numerical_pipeline.transform(nmrcl_X_test)

# feature names as recorded by the capper step (the pipeline holds this same object)
nmrc_feature_cols = capper.variables

# Cast produced np.arrays back to pd.DataFrame

In [483]:
# the pipeline returns numpy arrays; wrap both back into DataFrames with the feature names
nmrcl_X_train, nmrcl_X_test = (
    pd.DataFrame(values, columns=nmrc_feature_cols)
    for values in (nmrcl_X_train, nmrcl_X_test)
)

# Pipelining categorical features treatment

In [487]:
# missing categories become their own 'other' label
ctgrcl_X_train = ctgrcl_X_train.fillna('other')
ctgrcl_X_test = ctgrcl_X_test.fillna('other')

# Two-step pipeline for the categorical features:
#   1. rare labels (frequency below 1%) are grouped together
#   2. n-1 one-hot encoding
encoder = RareLabelCategoricalEncoder(tol=0.01)
ohe_enc = OneHotCategoricalEncoder(top_categories=None, drop_last=True)
categorical_pipeline = Pipeline(steps=[('rare_label', encoder), ('onehot', ohe_enc)])

# fit on train, reuse on test; the index is reset so that all the frames can be aligned later
ctgrcl_X_train = categorical_pipeline.fit_transform(ctgrcl_X_train).reset_index(drop=True)
ctgrcl_X_test = categorical_pipeline.transform(ctgrcl_X_test).reset_index(drop=True)

# Label Encoding 

In [488]:
# learn the class -> integer mapping on the training labels, then apply it to both splits
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Cast encoded labels back to a dataframe object

In [489]:
"""
LabelEncoder() output is a numpy array, it's missing the index which is later used for
concatanation of categorical, numercial and label data together. The following is a 
primitive solution but it works and there is no missalignment in the final df.

"""
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

y_train.columns = ['labels']
y_test.columns = ['labels']

In [490]:
# preview the first ten encoded training labels
y_train.head(10)

Unnamed: 0,labels
0,1
1,1
2,1
3,1
4,0
5,1
6,1
7,1
8,1
9,1


In [491]:
# sanity check: decode the first ten encoded labels back to the original class names
first_encoded = y_train['labels'].head(10).to_numpy()
pd.Series(le.inverse_transform(first_encoded))

0     Fully Paid
1     Fully Paid
2     Fully Paid
3     Fully Paid
4    Charged Off
5     Fully Paid
6     Fully Paid
7     Fully Paid
8     Fully Paid
9     Fully Paid
dtype: object

# Stacking all the dataframes together

In [495]:
# Stack numerical features, categorical features and labels side by side.
train_parts = [nmrcl_X_train, ctgrcl_X_train, y_train]
test_parts = [nmrcl_X_test, ctgrcl_X_test, y_test]

final_train = pd.concat(train_parts, axis=1)
final_test = pd.concat(test_parts, axis=1)

# concat(axis=1) aligns on the index; if the parts do not share the same index
# the result silently gains extra rows padded with NaN, so guard the row count.
assert len(final_train) == len(y_train), 'training parts are misaligned'
assert len(final_test) == len(y_test), 'testing parts are misaligned'

print(final_train.shape)
print(final_test.shape)

(979767, 160)
(326589, 160)


In [496]:
# Temporary measure, as there are no resources for sound binning:
# remove every training column that holds one single distinct value.
constant_cols = [
    name for name in final_train.columns
    if final_train[name].nunique(dropna=False) == 1
]
final_train.drop(columns=constant_cols, inplace=True)
print(final_train.shape)

(979767, 141)


In [497]:
# keep in the test frame exactly the columns (and column order) of the training frame
columns = final_train.columns
final_train, final_test = final_train[columns], final_test[columns]

In [498]:
# persist the processed splits, without writing the index as a column
final_train.to_csv(path_or_buf='Data/train.csv', index=False)
final_test.to_csv(path_or_buf='Data/test.csv', index=False)