In [151]:
import pandas as pd
import numpy as np
import dtale
#from caimcaim import CAIMD
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from feature_engine.categorical_encoders import OneHotCategoricalEncoder
from feature_engine import outlier_removers as outr
from feature_engine import categorical_encoders as ce

# Display options: plotting backend plus pandas / numpy print formatting.

# The bare magic activates a GUI backend (the recorded run reports "MacOSX");
# the following magic then selects the inline backend.
# NOTE(review): the first magic looks redundant given the second - confirm before removing.
%matplotlib
%matplotlib inline
#pd.options.mode.chained_assignment = None #set it to None to remove SettingWithCopyWarning
pd.options.display.float_format = '{:.4f}'.format # fixed-point with 4 decimals, e.g. 4.225108e+11 is shown as 422510800000.0000
pd.set_option('display.max_columns', 100) # display up to 100 columns
#pd.set_option('display.max_rows', 100) # display up to 100 rows
# numpy: suppress scientific notation and print floats with the '{:f}' format.
np.set_printoptions(suppress=True,formatter={'float_kind':'{:f}'.format})


def remove_single_unique_values(dataframe):
    """
    Return a copy of ``dataframe`` without its constant columns.

    A column is considered constant when ``DataFrame.nunique()`` reports
    exactly one distinct value for it. The input frame is not modified.
    Not optimized for categorical features yet.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the columns whose distinct-value count is not 1.
    """
    distinct_counts = dataframe.nunique()
    constant_columns = distinct_counts.index[distinct_counts.values == 1]
    return dataframe.drop(columns=constant_columns)

Using matplotlib backend: MacOSX


# Loading Data

In [152]:
# Columns excluded from the analysis right after loading.
COLUMNS_TO_DROP = ['issue_d', 'earliest_cr_line', 'last_pymnt_d',
                   'last_credit_pull_d', 'id', 'member_id', 'settlement_date',
                   'next_pymnt_d', 'zip_code']

df = pd.read_csv('Data/loan.csv')
print('df is loaded')
df = df.drop(columns=COLUMNS_TO_DROP)

df = df.infer_objects()
# Keep only the columns that have at least 30% non-missing values.
# `how` and `thresh` are mutually exclusive in DataFrame.dropna (recent pandas
# raises TypeError when both are passed), so only `thresh` is given here.
df = df.dropna(axis=1, thresh=int(0.3 * len(df)))
print(df.shape)

  interactivity=interactivity, compiler=compiler, result=result)


df is loaded
(2260668, 98)


In [153]:
# Share of every loan status, expressed as a percentage of all loans.
df['loan_status'].value_counts(normalize=True).mul(100).to_frame()

Unnamed: 0,loan_status
Fully Paid,46.0904
Current,40.6824
Charged Off,11.5742
Late (31-120 days),0.9686
In Grace Period,0.396
Late (16-30 days),0.1653
Does not meet the credit policy. Status:Fully Paid,0.0879
Does not meet the credit policy. Status:Charged Off,0.0337
Default,0.0014


# Label-specific manipulations

Some of the labels have no ML value as they do not provide the terminal status of a loan.

* 40.68% of the loans are labeled 'Current'.
* 1.13% of the loans are labeled 'Late' ('Late (31-120 days)' and 'Late (16-30 days)' combined).
* 0.4% of the loans are labeled 'In Grace Period'.
* 0.0014% of the loans are labeled 'Default'.

There is no way to know what the actual outcome of those loans is going to be,
so they are dropped.

* 'Does not meet the credit policy. Status:Fully Paid' 
* 'Does not meet the credit policy. Status:Charged Off'

These labels differ from 'Fully Paid' and 'Charged Off'; however, as their status suggests, they can be mapped to 'Fully Paid' and 'Charged Off' respectively.

In [154]:
# Statuses that do not describe a finished loan; rows carrying them are removed.
NON_TERMINAL_STATUSES = ['Current', 'Late (31-120 days)', 'Late (16-30 days)',
                         'In Grace Period', 'Default']

# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignment below does not raise SettingWithCopyWarning.
df = df.loc[~df['loan_status'].isin(NON_TERMINAL_STATUSES)].copy()

# "Does not meet the credit policy" variants are mapped onto the plain status
# they carry.
dictionary = {'Does not meet the credit policy. Status:Fully Paid':'Fully Paid',
              'Does not meet the credit policy. Status:Charged Off':'Charged Off'}

# Assign the result back instead of calling replace(inplace=True) on the
# selected column: the in-place form operates on an intermediate object and is
# not guaranteed to update `df` (it does not under pandas Copy-on-Write).
df['loan_status'] = df['loan_status'].replace(dictionary)
df['loan_status'].value_counts(normalize=True)

Fully Paid    0.7991
Charged Off   0.2009
Name: loan_status, dtype: float64

# Train test split

In [168]:
# Separate the target column (kept as a one-column DataFrame) from the features.
X = df.drop(columns='loan_status')
y = df[['loan_status']]

# 75% / 25% split with a fixed seed; no stratification is requested.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=None
)

# Label Encoding 

In [169]:
# LabelEncoder expects a 1-D array of labels. Handing it the one-column
# DataFrame makes scikit-learn emit a DataConversionWarning, so the label
# column is selected explicitly. The encoder is fitted on the training labels
# only and then applied to both splits; both results are 1-D numpy arrays.
le = LabelEncoder()
le.fit(y_train['loan_status'])
y_train = le.transform(y_train['loan_status'])
y_test = le.transform(y_test['loan_status'])


In [170]:
"""
LabelEncoder() output is a numpy array, it's missing the index which is later used for
concatanation of categorical, numercial and label data together. The following is a 
primitive solution but it works and there is no missalignment in the final df.

"""
# Wrap the encoded labels in one-column frames that reuse the row index of the
# matching feature split, so a later column-wise concat lines up row by row.
y_train = pd.DataFrame(y_train, index=X_train.index, columns=['training labels'])
y_test = pd.DataFrame(y_test, index=X_test.index, columns=['testing labels'])

# Dividing data into categorical and numerical parts

In [171]:
# Split both feature sets by dtype: 'object' columns form the categorical
# part, every other dtype forms the numerical part.
nmrcl_X_train = X_train.select_dtypes(exclude=['object'])
ctgrcl_X_train = X_train.select_dtypes(include=['object'])

nmrcl_X_test = X_test.select_dtypes(exclude=['object'])
ctgrcl_X_test = X_test.select_dtypes(include=['object'])

# Report the shapes, train first and test second, one per line.
print('Numerical part:')
print(nmrcl_X_train.shape, nmrcl_X_test.shape, sep='\n')
print('Categorical part:')
print(ctgrcl_X_train.shape, ctgrcl_X_test.shape, sep='\n')

Numerical part:
(979767, 81)
(326589, 81)
Categorical part:
(979767, 16)
(326589, 16)


# Initial cleaning (numerical data)

In [172]:
# cleaning
# Column medians are computed once, from the raw training data, and kept so
# the test split can be imputed with the very same values later on.
# DataFrame.median() already returns a pd.Series indexed by column name,
# which is the form fillna() accepts, so no extra wrapping is needed.
nmrcl_X_train_medians = nmrcl_X_train.median()
nmrcl_X_train = nmrcl_X_train.fillna(nmrcl_X_train_medians)

# Drop the columns left with a single distinct value.
nmrcl_X_train = remove_single_unique_values(nmrcl_X_train)
print(nmrcl_X_train.shape)


(979767, 78)


# Outlier Treatment

In [173]:
# outlier treatment
# The Winsorizer is fitted on the training split only and then applied to both
# splits.
nmrcl_X_train_columns = nmrcl_X_train.columns
capper = outr.Winsorizer(distribution='skewed', tail='both', fold=1.5)
capper.fit(nmrcl_X_train)
nmrcl_X_train = capper.transform(nmrcl_X_train)

# Bring the test split to the training column set, impute it with the
# training medians, then apply the fitted capper.
nmrcl_X_test = nmrcl_X_test[nmrcl_X_train_columns].fillna(nmrcl_X_train_medians)
nmrcl_X_test = capper.transform(nmrcl_X_test)

print(nmrcl_X_train.shape, nmrcl_X_test.shape, sep='\n')

(979767, 78)
(326589, 78)


# Saving indices and column names of the dfs.

In [174]:
# Remember the column labels and row indexes of both numerical frames before
# discretization and rescaling.
num_train_cols, num_train_index = nmrcl_X_train.columns, nmrcl_X_train.index
num_test_cols, num_test_index = nmrcl_X_test.columns, nmrcl_X_test.index

# Discretization using Kbins method

In [175]:
# Discretize every numerical feature into at most 12 ordinal bins using the
# 'kmeans' strategy. Bin edges are learned from the training split and reused
# unchanged for the test split.
discretizer = KBinsDiscretizer(n_bins=12, encode='ordinal', strategy='kmeans')
nmrcl_X_train_discr = discretizer.fit_transform(nmrcl_X_train)
nmrcl_X_test_discr = discretizer.transform(nmrcl_X_test)

  "replaced with 0." % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  "replaced with 0." % jj)
  "replaced with 0." % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  "replaced with 0.

# Rescaling

In [176]:
# The original call went through `preprocessing.MinMaxScaler`, but no name
# `preprocessing` is imported in this notebook, so the cell raised NameError on
# a fresh kernel. The class is referenced directly instead.
scaler = MinMaxScaler()

# The scaling range is learned from the training split only and applied to both.
scaler.fit(nmrcl_X_train_discr)
nmrcl_X_train_discr_rscld = scaler.transform(nmrcl_X_train_discr)
nmrcl_X_test_discr_rscld = scaler.transform(nmrcl_X_test_discr)

print(nmrcl_X_train_discr_rscld.shape)
print(nmrcl_X_test_discr_rscld.shape)

(979767, 78)
(326589, 78)


# Treating categorical data

In [177]:
# Missing categorical values become the explicit level 'other'. The result is
# assigned back rather than filled in place: these frames are derived from
# another frame, and mutating them in place triggers SettingWithCopyWarning.
ctgrcl_X_train = ctgrcl_X_train.fillna('other')
ctgrcl_X_test = ctgrcl_X_test.fillna('other')

# Infrequent categories are learned from the training split only (tol=0.01).
encoder = ce.RareLabelCategoricalEncoder(tol=0.01)
encoder.fit(ctgrcl_X_train)

# The encoder replaces infrequent categories with the word "Rare". The frames
# already use 'other' as their catch-all level, so "Rare" is folded into it.
ctgrcl_X_train = encoder.transform(ctgrcl_X_train).replace('Rare', 'other')
ctgrcl_X_test = encoder.transform(ctgrcl_X_test).replace('Rare', 'other')

# quick check: does every column have the same number of categories in both splits?
ctgrcl_X_test.nunique() == ctgrcl_X_train.nunique()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  **kwargs


term                     True
grade                    True
sub_grade                True
emp_title                True
emp_length               True
home_ownership           True
verification_status      True
pymnt_plan               True
purpose                  True
title                    True
addr_state               True
initial_list_status      True
application_type         True
hardship_flag           False
disbursement_method      True
debt_settlement_flag     True
dtype: bool

# OneHotEncoding

In [178]:
# One-hot encode the categorical frames. The encoder is fitted on the training
# split and the fitted encoder is then applied to both splits.
ohe_enc = OneHotCategoricalEncoder(top_categories=None, drop_last=True)
ohe_enc.fit(ctgrcl_X_train)

ctgrcl_X_train = ohe_enc.transform(ctgrcl_X_train)
ctgrcl_X_test = ohe_enc.transform(ctgrcl_X_test)

print(ctgrcl_X_train.shape, ctgrcl_X_test.shape, sep='\n')

(979767, 98)
(326589, 98)


# Stacking all the dataframes together

In [179]:
""" 
After rescaling numerical data 2D numpy arrays are returned, the following procedure
turns them into pandas dfs and stacks them with categorical dummies and corresponding 
labels. 
"""

# Rebuild labelled frames from the rescaled arrays: rows take the index of the
# matching categorical frame, columns take the saved numerical column names.
nmrcl_X_train_discr_rescl = pd.DataFrame(
    nmrcl_X_train_discr_rscld,
    index=ctgrcl_X_train.index,
    columns=nmrcl_X_train_columns,
)
nmrcl_X_test_discr_rescl = pd.DataFrame(
    nmrcl_X_test_discr_rscld,
    index=ctgrcl_X_test.index,
    columns=nmrcl_X_train_columns,
)

# Column-wise concatenation: numerical features, categorical dummies, labels.
train_parts = [nmrcl_X_train_discr_rescl, ctgrcl_X_train, y_train]
test_parts = [nmrcl_X_test_discr_rescl, ctgrcl_X_test, y_test]
final_train = pd.concat(train_parts, axis=1)
final_test = pd.concat(test_parts, axis=1)

print(final_train.shape, final_test.shape, sep='\n')

(979767, 177)
(326589, 177)


# Saving dfs

In [None]:
# Persist the assembled train and test frames as CSV files (index included).
train_csv_path = 'Data/Binary_label_K_bins_train.csv'
test_csv_path = 'Data/Binary_label_K_bins_test.csv'
final_train.to_csv(train_csv_path)
final_test.to_csv(test_csv_path)