In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dtale
#from caimcaim import CAIMD # https://github.com/airysen/caimcaim 
# not working for me.
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import StratifiedShuffleSplit
from feature_engine.categorical_encoders import OneHotCategoricalEncoder


# Display options

%matplotlib
%matplotlib inline
#pd.options.mode.chained_assignment = None #set it to None to remove SettingWithCopyWarning
pd.options.display.float_format = '{:.4f}'.format #set it to convert scientific noations such as 4.225108e+11 to 422510842796.00
pd.set_option('display.max_columns', 100) #  display all the columns
pd.set_option('display.max_rows', 100) # display all the rows
np.set_printoptions(suppress=True,formatter={'float_kind':'{:f}'.format})

Using matplotlib backend: MacOSX


# Additional functions

In [2]:
# functions to be used later in the code.


def less_than_1pct_cat_replace(df,pct=0.01):
    """
    Replace all the categorical values who's proportion
    is less than pct with 'other'.
    """

    for column in df.columns:
        selection = df[column].value_counts(normalize=True)<pct
        list_to_replace = selection[selection==True].index
        mask = df[column].isin(list_to_replace)
        df[column][mask] = 'other'
    
    return df



def outlier_replacer(df):
    """
    Replace all the outliers in a numerical df using 
    IQR methodology.
    
    """
    for column in df.columns:
        quartile_1, quartile_3 = np.percentile(df[column], [25, 75])
        iqr = quartile_3 - quartile_1
        lower_bound = quartile_1 - (float(iqr) * 1.5)
        upper_bound = quartile_3 + (float(iqr) * 1.5)
        df.loc[(df[column]<lower_bound) | (df[column] > upper_bound),[column]] = df[column].median()
        
    return df


def remove_single_unique_values(dataframe):
    
    """
    Drop all the columns that only contain one unique value.
    not optimized for categorical features yet.
    
    """
    
    cols_to_drop = dataframe.nunique()
    cols_to_drop = cols_to_drop.loc[cols_to_drop.values==1].index
    dataframe = dataframe.drop(cols_to_drop,axis=1)
    return dataframe

In [3]:
# loading data
df = pd.read_csv('Data/2nd_clean.csv')
print('df is loaded')
df.drop(['Unnamed: 0','issue_d','earliest_cr_line','last_pymnt_d',
        'last_credit_pull_d'],axis='columns',inplace=True)

df is loaded


* Any observation with the loan status 'current' has no value for the analysis as there is no way of knowing for sure the outcome of a loan. Similarlry other cases of status such as 'Does not meet the credit policy. Status:Fully Paid', 'Does not meet the credit policy. Status:Charged Off', 'Default' have been classified accordingly to their outocme, to reduce data sparcity. So those observations are dropped.

In [5]:
df = df[df.loan_status!='Current']
df = df[df.loan_status!='Late (31-120 days)']
df = df[df.loan_status!='Late (16-30 days)']
df.loan_status.value_counts()

Fully Paid                                             1041952
Charged Off                                             261655
In Grace Period                                           8952
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                     31
Name: loan_status, dtype: int64

In [6]:
df['loan_status'].replace(
    {'Does not meet the credit policy. Status:Fully Paid':'Fully Paid',
    'Does not meet the credit policy. Status:Charged Off':'Charged Off',
    'Default':'Charged Off'},inplace=True)

In [7]:
df.loan_status.value_counts()

Fully Paid         1043940
Charged Off         262447
In Grace Period       8952
Name: loan_status, dtype: int64

# Treating categorical part of the df

In [9]:
# splitting the df into categorical and numerical parts
ctgrcl_df = df.select_dtypes(include=['object'])
nmrcl_df = df.select_dtypes(exclude=['object'])

# have to drop the y feature
ctgrcl_df.drop(['loan_status'],axis='columns',inplace=True)

ctgrcl_df.fillna('other',inplace=True)

# '<,>,+' have to be replaced for correct working of get_dummies function
emp_lengthdict = {'10+ years':'ten years or more', 
 '2 years':'two years', 
 '< 1 year':'less than a year', 
 '3 years':'three years', 
 'other':'other', 
 '1 year':'one year',
 '5 years':'five years', 
 '4 years':'four years', 
 '6 years':'six years', 
 '7 years':'seven years', 
 '8 years':'eight years', 
 '9 years':'nine years'}

ctgrcl_df.emp_length.replace(emp_lengthdict,inplace=True)
ctgrcl_df = less_than_1pct_cat_replace(ctgrcl_df,pct=0.01)

# now the categorical part of the df is ready for train_test_split
# stiching categorical and numerical data together
df = pd.concat([nmrcl_df,ctgrcl_df,df[['loan_status']]],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  **kwargs
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Train test split

In [60]:
# loading data, dropping features with single uniqu values

y = df[['loan_status']]
X = df.drop('loan_status',axis='columns')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

# Dividing data into categorical and numerical parts

In [61]:
# dividing training and testing data into categorical and numerical parts
ctgrcl_X_train = X_train.select_dtypes(include=['object'])
nmrcl_X_train = X_train.select_dtypes(exclude=['object'])

ctgrcl_X_test = X_test.select_dtypes(include=['object'])
nmrcl_X_test = X_test.select_dtypes(exclude=['object'])

In [62]:
print(nmrcl_X_train.shape)
print(nmrcl_X_test.shape)

(986504, 64)
(328835, 64)


# Treating numerical X_train

In [18]:
# imputation and outlier treatment (median, IQR)
nmrcl_X_train = nmrcl_X_train.fillna(nmrcl_X_train.median())

nmrcl_X_train = outlier_replacer(nmrcl_X_train)
nmrcl_X_train = remove_single_unique_values(nmrcl_X_train)
nmrcl_X_train_columns = nmrcl_X_train.columns
X_train_median_values = nmrcl_X_train.median()

print('imputation and outlier treatment - done')

# This is a sub-optimal, temporary measure, as I haven't figured out 
# how to do supervised discretization.
discretizer = KBinsDiscretizer(n_bins=12, encode='ordinal', strategy='kmeans')
discretized_X_train = discretizer.fit_transform(nmrcl_X_train)

print('discretization - done')

# Min_max
scaler = preprocessing.MinMaxScaler()
rescaled_discretized_X_train = scaler.fit_transform(np.array(discretized_X_train))

print('rescaling - done')
print('numerical data is ready')

imputation and outlier treatment - done


  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)


discretization - done
rescaling - done
numerical data is ready


# Treating numerical X_test

In [19]:
# imputation and outlier treatment (median, IQR)
# nmrcl_X_test[nmrcl_X_train_columns].shape

nmrcl_X_test = nmrcl_X_test[nmrcl_X_train_columns]

nmrcl_X_test = nmrcl_X_test.fillna(nmrcl_X_train.median())

nmrcl_X_test = outlier_replacer(nmrcl_X_test)

nmrcl_X_test_columns = nmrcl_X_test.columns


print('imputation and outlier treatment - done')

# This is a sub-optimal, temporary measure, as I haven't figured out 
# how to do supervised discretization.
## fit_transform fot training and transform for testing
#discretizer = KBinsDiscretizer(n_bins=12, encode='ordinal', strategy='kmeans')
discretized_nmrcl_X_test = discretizer.transform(nmrcl_X_test)

print('discretization - done')

# Min_max
#scaler = preprocessing.MinMaxScaler()
rescaled_discretized_X_test = scaler.transform(np.array(discretized_nmrcl_X_test))

print('rescaling - done')
print('numerical data is ready')

imputation and outlier treatment - done
discretization - done
rescaling - done
numerical data is ready


In [21]:
print(rescaled_discretized_X_test.shape)
print(rescaled_discretized_X_train.shape)

(328835, 46)


# OneHotEncoding and stacking all the frames together

In [63]:
# OneHotEncoding for categorical variables

# using feature-engine open-source librabry
# https://feature-engine.readthedocs.io/en/latest/index.html 

ohe_enc = OneHotCategoricalEncoder(
    top_categories=None,
    drop_last=True)

ohe_enc.fit(ctgrcl_X_train)

ctgrcl_X_train = ohe_enc.transform(ctgrcl_X_train)
ctgrcl_X_test = ohe_enc.transform(ctgrcl_X_test)

In [65]:
print(ctgrcl_X_train.shape)
print(ctgrcl_X_test.shape)

(986504, 98)
(328835, 98)


In [68]:
# After rescaling a 2D numpy is returned, the following procedure
# turns it into a pandas df and stacks it together with the
# categorical dummies.

rescaled_discretized_X_train = pd.DataFrame(rescaled_discretized_X_train)
rescaled_discretized_X_test = pd.DataFrame(rescaled_discretized_X_test) 

rescaled_discretized_X_train.index = ctgrcl_X_train.index
rescaled_discretized_X_test.index = ctgrcl_X_test.index

rescaled_discretized_X_train.columns = nmrcl_X_train_columns
rescaled_discretized_X_test.columns = nmrcl_X_test_columns


In [76]:
final_X_train = pd.concat([rescaled_discretized_X_train,ctgrcl_X_train],axis=1)
final_X_test = pd.concat([rescaled_discretized_X_test,ctgrcl_X_test],axis=1)

In [77]:
final_X_train = pd.concat([final_X_train,y_train],axis=1)
final_X_test = pd.concat([final_X_test,y_test],axis=1)

In [83]:
final_X_train.to_csv('Data/loan_club_train.csv')
final_X_test.to_csv('Data/loan_club_test.csv')

In [86]:
print(final_X_train.shape)
print(final_X_test.shape)

(986504, 145)
(328835, 145)
