In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dtale
from caimcaim import CAIMD # https://github.com/airysen/caimcaim - not working for me.
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import StratifiedShuffleSplit
from feature_engine.categorical_encoders import OneHotCategoricalEncoder
from feature_engine import outlier_removers as outr
from feature_engine import categorical_encoders as ce

# Display options

%matplotlib
%matplotlib inline
#pd.options.mode.chained_assignment = None #set it to None to remove SettingWithCopyWarning
pd.options.display.float_format = '{:.4f}'.format #set it to convert scientific noations such as 4.225108e+11 to 422510842796.00
pd.set_option('display.max_columns', 100) #  display all the columns
#pd.set_option('display.max_rows', 100) # display all the rows
np.set_printoptions(suppress=True,formatter={'float_kind':'{:f}'.format})

Using matplotlib backend: MacOSX


In [None]:
def remove_single_unique_values(dataframe):
    
    """
    Drop all the columns that only contain one unique value.
    not optimized for categorical features yet.
    
    """    
    cols_to_drop = dataframe.nunique()
    cols_to_drop = cols_to_drop.loc[cols_to_drop.values==1].index
    dataframe = dataframe.drop(cols_to_drop,axis=1)
    return dataframe

# Loading Data

In [100]:
df = pd.read_csv('Data/loan.csv')
print('df is loaded')
df.drop(['issue_d','earliest_cr_line','last_pymnt_d',
        'last_credit_pull_d','id','member_id','settlement_date',
        'next_pymnt_d','zip_code'],axis='columns',inplace=True)

df = df.infer_objects()
df.dropna(axis=1,how='any',thresh=int(0.3*len(df)),inplace=True)
print(df.shape)

df is loaded
(2260668, 98)


# Train test split

In [91]:
# loading data, dropping features with single unique values

y = df[['loan_status']]
X = df.drop('loan_status',axis='columns')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, 
                                            random_state=42, stratify=None)

# Dividing data into categorical and numerical parts

In [92]:
# dividing training and testing data into categorical and numerical parts
ctgrcl_X_train = X_train.select_dtypes(include=['object'])
nmrcl_X_train = X_train.select_dtypes(exclude=['object'])

ctgrcl_X_test = X_test.select_dtypes(include=['object'])
nmrcl_X_test = X_test.select_dtypes(exclude=['object'])

print(nmrcl_X_train.shape)
print(nmrcl_X_test.shape)
print('********')
print(ctgrcl_X_train.shape)
print(ctgrcl_X_test.shape)

# Initial cleaning

In [94]:
# cleaning
nmrcl_X_train = nmrcl_X_train.fillna(nmrcl_X_train.median())

# train medians have to be saved as a pd.Series object othervise replace() 
# method does not work when replacing NaN in testing df.
nmrcl_X_train_medians = pd.Series(nmrcl_X_train.median())
nmrcl_X_train = remove_single_unique_values(nmrcl_X_train)
print(nmrcl_X_train.shape)


(1695501, 80)


# Outlier Treatment

In [96]:
# outlier treatment
capper = outr.Winsorizer(distribution='skewed', tail='both', fold=1.5)
capper.fit(nmrcl_X_train)
nmrcl_X_train_columns = nmrcl_X_train.columns
nmrcl_X_train = capper.transform(nmrcl_X_train)
print(nmrcl_X_train.shape)

nmrcl_X_test = nmrcl_X_test[nmrcl_X_train_columns]
nmrcl_X_test.fillna(nmrcl_X_train_medians,inplace=True)
nmrcl_X_test = capper.transform(nmrcl_X_test)

(1695501, 80)


In [98]:
# saving columns and indexes before discretization and rescaling
num_train_cols = nmrcl_X_train.columns
num_train_index = nmrcl_X_train.index

num_train_cols = nmrcl_X_test.columns
num_train_index = nmrcl_X_test.index

# Discretizing

In [None]:
# It does not matter if I use KBinsDiscretizer or CAIMD, I end up having columns that 
# only have 1 unique value useless for any machine learning.
caim = CAIMD()
caim.fit(nmrcl_X_train,y_train)
nmrcl_X_train_discr = caim.transform(nmrcl_X_train)
nmrcl_X_test_discr = caim.transform(nmrcl_X_train)

# Rescaling

In [74]:
scaler = preprocessing.MinMaxScaler()
scaler.fit(nmrcl_X_train_discr)
nmrcl_X_train_discr_rscld = scaler.transform(nmrcl_X_train_discr)
nmrcl_X_test_discr_rscld = scaler.transform(nmrcl_X_test_discr)

In [78]:
# a quick check to make sure df shapes are the same.
print(nmrcl_X_test_discr_rscld.shape == nmrcl_X_train_discr_rscld.shape)
print(nmrcl_X_train_discr_rscld.shape)
print(nmrcl_X_test_discr_rscld.shape)

True

# Treating categorical data

In [81]:
ctgrcl_X_train.fillna('other',inplace=True)
ctgrcl_X_test.fillna('other',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  **kwargs


In [83]:
encoder = ce.RareLabelCategoricalEncoder(tol=0.01)
encoder.fit(ctgrcl_X_train)

"""
The default behaviour of the function is such that it replaces infrequent categories with
the word "Rare" which would be fine if the dataframe didn't have category 'other',
that's why 'replace' method is used
"""

ctgrcl_X_train = encoder.transform(ctgrcl_X_train)
ctgrcl_X_train.replace('Rare','other',inplace=True)
ctgrcl_X_test = encoder.transform(ctgrcl_X_test)
ctgrcl_X_test.replace('Rare','other',inplace=True)

# quick check to make sure that the categories are the same.
ctgrcl_X_test.nunique() == ctgrcl_X_train.nunique()

term                    True
grade                   True
sub_grade               True
emp_title               True
emp_length              True
home_ownership          True
verification_status     True
pymnt_plan              True
purpose                 True
title                   True
addr_state              True
initial_list_status     True
application_type        True
hardship_flag           True
disbursement_method     True
debt_settlement_flag    True
dtype: bool

# OneHotEncoding and stacking all the frames together

In [85]:
# OneHotEncoding for categorical variables
ohe_enc = OneHotCategoricalEncoder(
    top_categories=None,
    drop_last=True)

ohe_enc.fit(ctgrcl_X_train)

ctgrcl_X_train = ohe_enc.transform(ctgrcl_X_train)
ctgrcl_X_test = ohe_enc.transform(ctgrcl_X_test)

In [1197]:
print(ctgrcl_X_train.shape)
print(ctgrcl_X_test.shape)


(1695501, 101)
(565167, 101)


In [1212]:
# After rescaling a 2D numpy is returned, the following procedure
# turns it into a pandas df and stacks it together with the
# categorical dummies.

nmrcl_X_train_discr_rescl = pd.DataFrame(nmrcl_X_train_discr_rescl)
nmrcl_X_test_discr_rescl= pd.DataFrame(nmrcl_X_test_discr_rescl) 

nmrcl_X_train_discr_rescl.index = ctgrcl_X_train.index
nmrcl_X_test_discr_rescl.index = ctgrcl_X_test.index

nmrcl_X_train_discr_rescl.columns = nmrcl_X_train_columns
nmrcl_X_test_discr_rescl.columns = nmrcl_X_train_columns


In [1215]:
final_train = pd.concat([nmrcl_X_train_discr_rescl,ctgrcl_X_train,y_train],axis=1)
final_test = pd.concat([nmrcl_X_test_discr_rescl,ctgrcl_X_test,y_test],axis=1)


In [None]:
final_train.to_csv('Data/loan_club_train.csv')
final_test.to_csv('Data/loan_club_test.csv')

In [1216]:
print(final_train.shape)
print(final_test.shape)

(1695501, 182)
(565167, 182)
