In [239]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dtale
#from caimcaim import CAIMD # https://github.com/airysen/caimcaim 
# not working for me.
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import StratifiedShuffleSplit
from feature_engine.categorical_encoders import OneHotCategoricalEncoder

# Display options

%matplotlib
%matplotlib inline
#pd.options.mode.chained_assignment = None #set it to None to remove SettingWithCopyWarning
pd.options.display.float_format = '{:.4f}'.format #set it to convert scientific noations such as 4.225108e+11 to 422510842796.00
pd.set_option('display.max_columns', 100) #  display all the columns
pd.set_option('display.max_rows', 100) # display all the rows
np.set_printoptions(suppress=True,formatter={'float_kind':'{:f}'.format})

Using matplotlib backend: MacOSX


2020-02-29 11:51:58,998 - ERROR    - Exception in callback functools.partial(<function Kernel.enter_eventloop.<locals>.advance_eventloop at 0x1a6a2ddf80>)
Traceback (most recent call last):
  File "/Users/babyhandzzz/anaconda3/lib/python3.7/site-packages/tornado/ioloop.py", line 743, in _run_callback
    ret = callback()
  File "/Users/babyhandzzz/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 310, in advance_eventloop
    eventloop(self)
TypeError: 'NoneType' object is not callable


# Additional functions

In [514]:
# functions to be used later in the code.


def less_than_1pct_cat_replace(df,pct=0.01):
    """
    Replace every value whose relative frequency within its column is
    below ``pct`` with the string 'other'.

    NOTE: a later cell of this notebook defines a function with the same
    name but a different signature (df, test_df, pct); once that cell has
    run it shadows this definition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; every column is processed. It is modified in
        place and also returned.
    pct : float, default 0.01
        Proportion threshold; values with a share strictly below it are
        replaced.

    Returns
    -------
    (df, list_of_stuff_to_replace)
        The same frame object after replacement, and a flat list of every
        value that was replaced (collected across all columns).
    """
    list_of_stuff_to_replace = []

    for column in df.columns:
        shares = df[column].value_counts(normalize=True)
        rare_values = shares[shares < pct].index
        list_of_stuff_to_replace.extend(rare_values)

        # A single .loc assignment instead of chained indexing
        # (df[column][mask] = ...), which may write to a temporary copy and
        # leave df unchanged.
        mask = df[column].isin(rare_values)
        df.loc[mask, column] = 'other'

    return  df,list_of_stuff_to_replace



def outlier_replacer(df,test_df):
    """
    Replace outliers in a numerical training frame, and in the matching
    test frame, using the IQR rule with bounds learned from the training
    data only.

    Steps:
      1. Drop single-valued columns from ``df``, fill its missing values
         with the column medians, and drop single-valued columns again.
      2. Restrict ``test_df`` to the remaining training columns and fill its
         missing values with the training medians.
      3. Per column, compute [Q1 - 1.5*IQR, Q3 + 1.5*IQR] on the training
         data and overwrite values outside that range with the training
         median, in both frames.
      4. Drop training columns that ended up single-valued and keep exactly
         the same columns in the test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Numerical training data.
    test_df : pandas.DataFrame
        Numerical test data; must contain every column kept from ``df``.

    Returns
    -------
    (df, test_df, iqr_bounds)
        The processed training and test frames, and a frame indexed by
        column name with 'upper bound', 'lower bound' and 'medians'
        (the training medians after outlier replacement).
    """
    # preparing the training data
    df = remove_single_unique_values(df)
    df = df.fillna(df.median())
    df = remove_single_unique_values(df)
    nmrcl_X_train_columns = df.columns

    # preparing testing data: same columns, gaps filled with training medians
    test_df = test_df[nmrcl_X_train_columns]
    test_df = test_df.fillna(df.median())

    upper_bound_dict = {}
    lower_bound_dict = {}

    # looping over every column
    for column in df.columns:
        quartile_1, quartile_3 = np.percentile(df[column], [25, 75])
        iqr = quartile_3 - quartile_1
        lower_bound = quartile_1 - (float(iqr) * 1.5)
        upper_bound = quartile_3 + (float(iqr) * 1.5)

        # populating dictionary with boundary values
        upper_bound_dict[column] = upper_bound
        lower_bound_dict[column] = lower_bound

        train_outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
        df.loc[train_outliers, [column]] = df[column].median()

        # the test frame is judged against the training bounds and filled with
        # the training median as it stands after the replacement above
        test_outliers = (test_df[column] < lower_bound) | (test_df[column] > upper_bound)
        test_df.loc[test_outliers, [column]] = df[column].median()

    # Assemble the bounds table once, after the loop (it was previously rebuilt
    # on every iteration and was undefined when df had no columns).
    iqr_bounds = pd.DataFrame({
        'upper bound': pd.Series(upper_bound_dict, dtype=float),
        'lower bound': pd.Series(lower_bound_dict, dtype=float),
    })
    iqr_bounds['medians'] = df.median()

    df = remove_single_unique_values(df)
    # Keep the test frame on exactly the training columns; dropping its
    # single-valued columns independently could leave the two frames with
    # different column sets.
    test_df = test_df[df.columns]

    return  df,test_df,iqr_bounds


def remove_single_unique_values(dataframe):
    """
    Return a copy of ``dataframe`` without the columns whose number of
    distinct values (as reported by ``DataFrame.nunique``) is exactly one.
    Not optimized for categorical features yet.
    """
    distinct_counts = dataframe.nunique()
    constant_columns = distinct_counts[distinct_counts.values == 1].index
    return dataframe.drop(columns=constant_columns)


In [406]:
# Read the raw loan data, then discard the date, identifier and zip-code columns.
df = pd.read_csv('Data/loan.csv')
print('df is loaded')
columns_to_discard = [
    'issue_d', 'earliest_cr_line', 'last_pymnt_d',
    'last_credit_pull_d', 'id', 'member_id', 'settlement_date',
    'next_pymnt_d', 'zip_code',
]
df.drop(columns=columns_to_discard, inplace=True)

df is loaded


In [407]:
# Let pandas try to infer more specific dtypes for object columns, then report the frame size.
df = df.infer_objects()
print(df.shape)

(2260668, 138)


In [408]:
# Keep only the columns with at least 30% non-missing values.
# `how` and `thresh` are mutually exclusive in DataFrame.dropna, so only
# `thresh` is passed.
min_non_missing = int(0.3*len(df))
df = df.dropna(axis=1, thresh=min_non_missing)
print(df.shape)

(2260668, 100)


# Train test split

In [530]:
# separate the target from the features, then hold out 25% of the rows for testing

X = df.drop(columns='loan_status')
y = df[['loan_status']]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=None,
)

# Dividing data into categorical and numerical parts

In [531]:
# split both partitions by dtype: object columns are treated as categorical,
# every other column as numerical
ctgrcl_X_train, nmrcl_X_train = (
    X_train.select_dtypes(include=['object']),
    X_train.select_dtypes(exclude=['object']),
)

ctgrcl_X_test, nmrcl_X_test = (
    X_test.select_dtypes(include=['object']),
    X_test.select_dtypes(exclude=['object']),
)

In [532]:
# shapes of the numerical parts, a separator, then the categorical parts
for part in (nmrcl_X_train, nmrcl_X_test):
    print(part.shape)
print('********')
for part in (ctgrcl_X_train, ctgrcl_X_test):
    print(part.shape)

(1695501, 81)
(565167, 81)
********
(1695501, 18)
(565167, 18)


# Outlier Treatment (both training and testing data) 

In [412]:
# Replace outliers in the numerical train/test frames; `boundaries` holds the
# third object returned by outlier_replacer (the per-column IQR bounds table).
nmrcl_X_train,nmrcl_X_test,boundaries = outlier_replacer(nmrcl_X_train,nmrcl_X_test)

In [420]:
# saving columns and indexes before discretization and rescaling
# (train and test are stored under separate names so that the test values
# no longer overwrite the train ones)

num_train_cols = nmrcl_X_train.columns
num_train_index = nmrcl_X_train.index

num_test_cols = nmrcl_X_test.columns
num_test_index = nmrcl_X_test.index

# Discretizing and rescaling (both train and test)

In [None]:
# Fit the k-means binning (12 ordinal bins per feature) on the training data only.
discretizer = KBinsDiscretizer(
    n_bins=12,
    encode='ordinal',
    strategy='kmeans',
)
discretized_X_train = discretizer.fit_transform(nmrcl_X_train)

print('train discretization - done')

# Min-max scaling of the bin indices, again fitted on the training data only.
scaler = preprocessing.MinMaxScaler()
train_bins = np.array(discretized_X_train)
rescaled_discretized_X_train = scaler.fit_transform(train_bins)
print('train rescaling - done')

In [424]:
# Apply the discretizer and scaler fitted on the training data to the test data
# (transform only, no refitting).
discretized_nmrcl_X_test = discretizer.transform(nmrcl_X_test)
rescaled_discretized_X_test = scaler.transform(np.array(discretized_nmrcl_X_test))

discretization - done


In [429]:
# train and test must have the same number of columns after the transformations
for rescaled in (rescaled_discretized_X_train, rescaled_discretized_X_test):
    print(rescaled.shape)

(1695501, 60)
(565167, 60)


# Treating categorical data

In [None]:
# 'next_pymnt_d' and 'zip_code' are already removed from df right after loading,
# so errors='ignore' keeps this cell from raising KeyError when they are absent.
# New frames are assigned instead of mutating in place, because these frames
# were derived from X_train / X_test.
ctgrcl_X_train = ctgrcl_X_train.drop(columns=['next_pymnt_d','zip_code'], errors='ignore')
ctgrcl_X_test = ctgrcl_X_test.drop(columns=['next_pymnt_d','zip_code'], errors='ignore')

# missing categorical values become their own 'other' category
ctgrcl_X_train = ctgrcl_X_train.fillna('other')
ctgrcl_X_test = ctgrcl_X_test.fillna('other')


# !!!!!!!!!!!!!!!!
Because the `emp_title` and `title` features are far too sparse for the general
"less than 1% replacer" logic, I have decided to use a simple keep-list replacement instead.
It is not generalizable, but it is the most practical strategy in this case.

* Even when I use a dictionary or a list (built from the train df) to replace values in the test df with "other", there are still titles that do not overlap; that is how sparse the titles are. Sure, I could use some set logic, but that seems like overkill in this project.

In [625]:
# Values that are kept as they are; everything else is mapped to 'other'.
emp_title_to_keep = [
    'Teacher',
    'Manager',
    'Owner',
]
title_to_keep = [
    'Debt consolidation',
    'Credit card refinancing',
    'Home improvement',
    'Major purchase',
    'Medical expenses',
    'Business',
]

In [622]:
def emp_title(title):
    """Return ``title`` unchanged if it is listed in ``emp_title_to_keep``, else 'other'."""
    return title if title in emp_title_to_keep else 'other'
    
    
def title(title):
    """Return ``title`` unchanged if it is listed in ``title_to_keep``, else 'other'."""
    return title if title in title_to_keep else 'other'

In [None]:
# collapse the sparse free-text columns onto their keep-lists, train first, then test
for frame in (ctgrcl_X_train, ctgrcl_X_test):
    frame['emp_title'] = frame['emp_title'].apply(emp_title)
    frame['title'] = frame['title'].apply(title)

In [594]:
def less_than_1pct_cat_replace(df,test_df,pct=0.01):
    """
    Replace rare categorical values with 'other' in a training frame and
    apply the same replacement to a test frame.

    For every column of ``df``, the values whose relative frequency in
    ``df`` is strictly below ``pct`` are collected; those values are then
    overwritten with 'other' in both ``df`` and ``test_df``. Values that
    occur only in ``test_df`` are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; defines which values are rare. Modified in place.
    test_df : pandas.DataFrame
        Test data; must contain every column of ``df``. Modified in place.
    pct : float, default 0.01
        Proportion threshold.

    Returns
    -------
    (df, test_df)
        The same two frame objects after replacement.
    """

    for column in df.columns:
        shares = df[column].value_counts(normalize=True)
        rare_values = shares[shares < pct].index

        # .loc assignments instead of chained indexing (frame[column][mask] = ...),
        # which may write to a temporary copy and leave the frame unchanged.
        df.loc[df[column].isin(rare_values), column] = 'other'
        test_df.loc[test_df[column].isin(rare_values), column] = 'other'

    return  df,test_df

# OneHotEncoding and stacking all the frames together

In [None]:
# One-hot encoding of the categorical variables

# using the feature-engine open-source library
# https://feature-engine.readthedocs.io/en/latest/index.html

ohe_enc = OneHotCategoricalEncoder(top_categories=None, drop_last=True)

# the encoder is fitted on the training data only
ohe_enc.fit(ctgrcl_X_train)

ctgrcl_X_train, ctgrcl_X_test = (
    ohe_enc.transform(part) for part in (ctgrcl_X_train, ctgrcl_X_test)
)

In [65]:
# train and test must have the same number of dummy columns
for encoded in (ctgrcl_X_train, ctgrcl_X_test):
    print(encoded.shape)

(986504, 98)
(328835, 98)


In [68]:
# After rescaling a 2D numpy array is returned; the following procedure
# turns it back into a pandas df so that it can be stacked together with
# the categorical dummies.
#
# Row labels come from the categorical frames and column labels are read
# straight from the numerical frames that were fed to the discretizer
# (the previously used *_columns names are not defined at notebook scope).

rescaled_discretized_X_train = pd.DataFrame(
    rescaled_discretized_X_train,
    index=ctgrcl_X_train.index,
    columns=nmrcl_X_train.columns,
)
rescaled_discretized_X_test = pd.DataFrame(
    rescaled_discretized_X_test,
    index=ctgrcl_X_test.index,
    columns=nmrcl_X_test.columns,
)


In [76]:
# numerical features first, categorical dummies after them, aligned on the row index
final_X_train = pd.concat(
    [rescaled_discretized_X_train, ctgrcl_X_train],
    axis='columns',
)
final_X_test = pd.concat(
    [rescaled_discretized_X_test, ctgrcl_X_test],
    axis='columns',
)

In [77]:
# append the target as the last column of each frame
final_X_train = pd.concat(
    [final_X_train, y_train],
    axis='columns',
)
final_X_test = pd.concat(
    [final_X_test, y_test],
    axis='columns',
)

In [83]:
# write the processed train and test sets to disk
for output_path, output_frame in (
    ('Data/loan_club_train.csv', final_X_train),
    ('Data/loan_club_test.csv', final_X_test),
):
    output_frame.to_csv(output_path)

In [86]:
# final sizes of the saved frames
for final_frame in (final_X_train, final_X_test):
    print(final_frame.shape)

(986504, 145)
(328835, 145)
