In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dtale
#from caimcaim import CAIMD # https://github.com/airysen/caimcaim - not working for me.
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import StratifiedShuffleSplit
from feature_engine.categorical_encoders import OneHotCategoricalEncoder
from feature_engine import outlier_removers as outr
from feature_engine import categorical_encoders as ce

# Display options

%matplotlib
%matplotlib inline
#pd.options.mode.chained_assignment = None #set it to None to remove SettingWithCopyWarning
pd.options.display.float_format = '{:.4f}'.format #set it to convert scientific noations such as 4.225108e+11 to 422510842796.00
pd.set_option('display.max_columns', 100) #  display all the columns
#pd.set_option('display.max_rows', 100) # display all the rows
np.set_printoptions(suppress=True,formatter={'float_kind':'{:f}'.format})

In [1128]:
# loading data
df = pd.read_csv('Data/loan.csv')
print('df is loaded')
df.drop(['issue_d','earliest_cr_line','last_pymnt_d',
        'last_credit_pull_d','id','member_id','settlement_date',
        'next_pymnt_d','zip_code'],axis='columns',inplace=True)



Columns (19,47,55,112,123,124,125,128,129,130,133,139,140,141) have mixed types. Specify dtype option on import or set low_memory=False.



df is loaded


In [1129]:
df = df.infer_objects()
print(df.shape)


(2260668, 136)


In [1130]:
df.dropna(axis=1,how='any',thresh=int(0.3*len(df)),inplace=True)
print(df.shape)

(2260668, 98)


In [1131]:
def remove_single_unique_values(dataframe):
    
    """
    Drop all the columns that only contain one unique value.
    not optimized for categorical features yet.
    
    """    
    cols_to_drop = dataframe.nunique()
    cols_to_drop = cols_to_drop.loc[cols_to_drop.values==1].index
    dataframe = dataframe.drop(cols_to_drop,axis=1)
    return dataframe


# Train test split

In [1167]:
# loading data, dropping features with single unique values

y = df[['loan_status']]
X = df.drop('loan_status',axis='columns')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, 
                                            random_state=42, stratify=None)


# Dividing data into categorical and numerical parts

In [1168]:
# dividing training and testing data into categorical and numerical parts
ctgrcl_X_train = X_train.select_dtypes(include=['object'])
nmrcl_X_train = X_train.select_dtypes(exclude=['object'])

ctgrcl_X_test = X_test.select_dtypes(include=['object'])
nmrcl_X_test = X_test.select_dtypes(exclude=['object'])

In [1169]:
print(nmrcl_X_train.shape)
print(nmrcl_X_test.shape)
print('********')
print(ctgrcl_X_train.shape)
print(ctgrcl_X_test.shape)

(1695501, 81)
(565167, 81)
********
(1695501, 16)
(565167, 16)


# Outlier Treatment (Training part)

In [1170]:
# cleaning
nmrcl_X_train = nmrcl_X_train.fillna(nmrcl_X_train.median())
nmrcl_X_train_medians = pd.Series(nmrcl_X_train.median())
nmrcl_X_train = remove_single_unique_values(nmrcl_X_train)
print(nmrcl_X_train.shape)


(1695501, 80)


In [1171]:
# outlier treatment
capper = outr.Winsorizer(distribution='skewed', tail='both', fold=1.5)
capper.fit(nmrcl_X_train)
nmrcl_X_train_columns = nmrcl_X_train.columns
nmrcl_X_train = capper.transform(nmrcl_X_train)
print(nmrcl_X_train.shape)


(1695501, 80)


# Outlier Treatment Testing part

In [1172]:
nmrcl_X_test = nmrcl_X_test[nmrcl_X_train_columns]
nmrcl_X_test.fillna(nmrcl_X_train_medians,inplace=True)
nmrcl_X_test = capper.transform(nmrcl_X_test)


In [1178]:
# saving columns and indexes before discretization and rescaling

num_train_cols = nmrcl_X_train.columns
num_train_index = nmrcl_X_train.index

num_train_cols = nmrcl_X_test.columns
num_train_index = nmrcl_X_test.index

# Discretizing and rescaling

In [None]:
# Discretization
discretizer = KBinsDiscretizer(n_bins=12, encode='ordinal', strategy='kmeans')
discretizer.fit(nmrcl_X_train)
nmrcl_X_train_discr = discretizer.transform(nmrcl_X_train)

print('train discretization - done')

# Min_max
scaler = preprocessing.MinMaxScaler()
scaler.fit(nmrcl_X_train_discr)
5 = scaler.transform(nmrcl_X_train_discr)
print('train rescaling - done')

nmrcl_X_test_discr = discretizer.transform(nmrcl_X_test)
print('test discretization  - done')
nmrcl_X_test_discr_rescl = scaler.transform(nmrcl_X_test_discr)
print('test rescaling - done')


In [1186]:
print(nmrcl_X_train_discr_rescl.shape)
print(nmrcl_X_test_discr_rescl.shape)

(1695501, 80)
(565167, 80)


# Treating categorical data

In [1187]:
ctgrcl_X_train.fillna('other',inplace=True)
ctgrcl_X_test.fillna('other',inplace=True)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [1190]:
encoder = ce.RareLabelCategoricalEncoder(tol=0.01)
encoder.fit(ctgrcl_X_train)

"""
The default behaviour of the function is such that it replaces infrequent categories with
the word "Rare" which would be fine if the dataframe didn't have category 'other',
that's why 'replace' method is used
"""

ctgrcl_X_train = encoder.transform(ctgrcl_X_train)
ctgrcl_X_train.replace('Rare','other',inplace=True)
ctgrcl_X_test = encoder.transform(ctgrcl_X_test)
ctgrcl_X_test.replace('Rare','other',inplace=True)


In [1195]:
ctgrcl_X_test.nunique() == ctgrcl_X_train.nunique()


term                    True
grade                   True
sub_grade               True
emp_title               True
emp_length              True
home_ownership          True
verification_status     True
pymnt_plan              True
purpose                 True
title                   True
addr_state              True
initial_list_status     True
application_type        True
hardship_flag           True
disbursement_method     True
debt_settlement_flag    True
dtype: bool

# OneHotEncoding and stacking all the frames together

In [1196]:
# OneHotEncoding for categorical variables
ohe_enc = OneHotCategoricalEncoder(
    top_categories=None,
    drop_last=True)

ohe_enc.fit(ctgrcl_X_train)

ctgrcl_X_train = ohe_enc.transform(ctgrcl_X_train)
ctgrcl_X_test = ohe_enc.transform(ctgrcl_X_test)


In [1197]:
print(ctgrcl_X_train.shape)
print(ctgrcl_X_test.shape)


(1695501, 101)
(565167, 101)


In [1212]:
# After rescaling a 2D numpy is returned, the following procedure
# turns it into a pandas df and stacks it together with the
# categorical dummies.

nmrcl_X_train_discr_rescl = pd.DataFrame(nmrcl_X_train_discr_rescl)
nmrcl_X_test_discr_rescl= pd.DataFrame(nmrcl_X_test_discr_rescl) 

nmrcl_X_train_discr_rescl.index = ctgrcl_X_train.index
nmrcl_X_test_discr_rescl.index = ctgrcl_X_test.index

nmrcl_X_train_discr_rescl.columns = nmrcl_X_train_columns
nmrcl_X_test_discr_rescl.columns = nmrcl_X_train_columns


In [1215]:
final_train = pd.concat([nmrcl_X_train_discr_rescl,ctgrcl_X_train,y_train],axis=1)
final_test = pd.concat([nmrcl_X_test_discr_rescl,ctgrcl_X_test,y_test],axis=1)


In [None]:
final_train.to_csv('Data/loan_club_train.csv')
final_test.to_csv('Data/loan_club_test.csv')

In [1216]:
print(final_train.shape)
print(final_test.shape)

(1695501, 182)
(565167, 182)
