In [239]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from caimcaim import CAIMD # https://github.com/airysen/caimcaim 
# not working for me.
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import StratifiedShuffleSplit


# Display options

%matplotlib
%matplotlib inline
#pd.options.mode.chained_assignment = None #set it to None to remove SettingWithCopyWarning
pd.options.display.float_format = '{:.4f}'.format #set it to convert scientific noations such as 4.225108e+11 to 422510842796.00
pd.set_option('display.max_columns', 100) #  display all the columns
pd.set_option('display.max_rows', 100) # display all the rows
np.set_printoptions(suppress=True,formatter={'float_kind':'{:f}'.format})

Using matplotlib backend: MacOSX


In [None]:
# functions to be used later in the code.

def impute_numerical_data(num_df):
    """
    Impute all NaN values in numerical columns with the median of the
    corresponding column.

    Only columns that contain at least one NaN are touched. The frame is
    modified in place and also returned, so callers may use either the
    return value or the object they passed in.

    Parameters
    ----------
    num_df : pandas.DataFrame
        Frame whose NaN-containing columns support ``median()``.

    Returns
    -------
    pandas.DataFrame
        The same ``num_df`` object with NaNs replaced by column medians.
    """
    na_counts = num_df.isnull().sum()
    columns_with_na = na_counts[na_counts > 0].index

    for column in columns_with_na:
        # Assign the filled column back instead of calling
        # ``fillna(inplace=True)`` on ``num_df[column]``: that form acts on a
        # temporary Series and does not reliably update the parent frame
        # (it does nothing under pandas Copy-on-Write).
        num_df[column] = num_df[column].fillna(num_df[column].median())

    return num_df


def outlier_replacer(df, factor=1.5):
    """
    Replace the outliers of every column in a numerical frame with the
    column median, using the IQR rule.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``. Quartiles and the median are computed with pandas,
    which skips NaN values, so a column that still contains NaNs keeps
    working (the NaNs themselves are left untouched).

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numerical columns only.
    factor : float, default 1.5
        Multiplier applied to the IQR to obtain the outlier fences.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with outliers replaced.
    """
    for column in df.columns:
        values = df[column]
        quartile_1, quartile_3 = values.quantile([0.25, 0.75])
        iqr = quartile_3 - quartile_1
        lower_bound = quartile_1 - float(iqr) * factor
        upper_bound = quartile_3 + float(iqr) * factor

        # Build the mask and the replacement value before writing, so both
        # are derived from the untouched column.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        replacement = values.median()
        df.loc[is_outlier, column] = replacement

    return df


def remove_single_unique_values(dataframe):
    """
    Return ``dataframe`` without the columns for which ``nunique()`` reports
    exactly one distinct value. The input frame itself is left unchanged.

    Not optimized for categorical features yet.
    """
    distinct_counts = dataframe.nunique()
    constant_columns = distinct_counts.index[distinct_counts.values == 1]
    return dataframe.drop(columns=constant_columns)

In [357]:
# Load the cleaned data set and discard the 'Unnamed: 0' column
# (presumably the index written by an earlier to_csv call -- TODO confirm).
df = pd.read_csv('2nd_clean.csv')
print('df is loaded')
df = df.drop(columns=['Unnamed: 0'])

In [360]:
# Train/test split: 'loan_status' is the target, every other column is a feature.
X = df.drop(columns=['loan_status'])
y = df[['loan_status']]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [559]:
# Treatment of the X_train frame.

# Split the training features by dtype: 'object' columns form the categorical
# part, all remaining columns form the numerical part.
ctgrcl_X_train = X_train.select_dtypes(include='object')
nmrcl_X_train = X_train.select_dtypes(exclude='object')

#ctgrcl_X_train_columns = ctgrcl_X_train.columns
# Remember the numerical column labels.
nmrcl_X_train_columns = nmrcl_X_train.columns

# Continuous data treatment

In [563]:
# Imputation and outlier treatment (median, IQR), then removal of constant
# columns. Every step works on nmrcl_X_train itself, so the cell runs on a
# fresh kernel without relying on variables defined elsewhere.
nmrcl_X_train = impute_numerical_data(nmrcl_X_train)
nmrcl_X_train = outlier_replacer(nmrcl_X_train)
nmrcl_X_train = remove_single_unique_values(nmrcl_X_train)

# Re-capture the column labels: constant columns may just have been dropped,
# and these labels are attached to the discretized/rescaled data later on.
nmrcl_X_train_columns = nmrcl_X_train.columns
X_train_median_values = nmrcl_X_train.median()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43
0,0.0000,0.0000,0.0000,0.4545,0.0000,0.3636,0.4545,0.5000,0.3636,0.1818,0.0000,0.6364,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.9091,0.7273,0.0909,0.5455,0.0000,0.6364,0.0000,0.0000,0.0000,0.0000,0.2500,0.1818,0.3636,0.3000,0.0909,0.9091,0.3636,0.6364,0.3636,0.3636,0.6000,0.0000,0.0909,0.1818,0.5455,0.1818
1,0.8182,0.8182,0.8182,0.7273,0.9091,0.7273,0.7273,0.0000,0.5455,0.5455,0.1818,0.9091,0.0000,0.0000,0.0000,0.3636,0.8182,0.2727,0.5455,0.8182,0.5455,0.5455,0.0000,0.4545,0.2727,0.1818,0.6000,0.0909,0.5000,0.1818,0.2727,0.4000,0.4545,0.2727,0.4545,0.4545,0.2727,0.5455,1.0000,0.0000,1.0000,0.4545,0.4545,0.3636
2,0.0909,0.0909,0.0909,0.7273,0.0909,0.4545,0.1818,0.0000,0.3636,0.1818,0.0909,0.1818,0.0000,0.0000,0.0000,0.0000,0.0909,0.3636,0.5455,0.2727,0.5455,0.5455,0.0000,0.1818,0.5455,1.0000,0.4000,0.5455,0.6250,0.0000,0.1818,0.3000,0.0909,0.1818,0.3636,0.1818,0.1818,0.3636,0.0000,0.0000,0.3636,0.0909,0.4545,0.0909
3,0.0909,0.0909,0.0909,0.7273,0.0909,0.7273,0.4545,0.0000,0.4545,0.1818,0.8182,0.1818,0.0000,0.0000,0.0000,0.0000,0.0909,1.0000,0.0909,0.3636,0.5455,0.0909,0.5455,0.4545,0.5455,0.2727,0.6000,0.5455,0.6250,0.0909,0.0909,0.1000,0.0909,0.3636,0.0909,0.0000,0.0909,0.4545,0.6000,1.0000,1.0000,0.4545,0.0909,0.5455
4,0.8182,0.8182,0.8182,0.5455,0.8182,0.3636,0.7273,0.0000,0.5455,0.0000,0.0000,0.4545,0.0000,0.0000,0.0000,0.2727,0.8182,0.3636,0.4545,0.7273,0.7273,0.8182,0.0000,0.6364,0.0909,0.0909,0.4000,0.0000,0.6250,0.1818,0.0909,0.3000,0.4545,0.4545,0.3636,0.4545,0.0909,0.5455,0.9000,0.0000,0.3636,0.3636,0.2727,0.3636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2260663,0.2727,0.2727,0.2727,0.4545,0.1818,0.4545,0.5455,0.0000,0.5455,0.3636,0.3636,0.6364,0.1818,0.1818,0.0909,0.8182,0.2727,0.1818,0.5455,0.3636,0.2727,0.1818,0.5455,0.4545,0.0000,0.0000,0.4000,0.3636,0.6250,0.1818,0.2727,0.3000,0.4545,0.6364,0.6364,0.7273,0.2727,0.5455,0.6000,0.7273,0.1818,0.5455,0.1818,0.6364
2260664,0.2727,0.2727,0.2727,0.3636,0.3636,0.1818,0.4545,0.5000,0.4545,0.0909,0.0909,0.4545,0.6364,0.6364,0.5455,1.0000,0.0909,0.2727,0.3636,0.2727,0.5455,0.9091,0.0909,0.5455,0.5455,0.5455,0.8000,0.5455,0.0000,0.0909,0.2727,0.2000,0.2727,0.1818,0.3636,0.6364,0.2727,0.4545,0.2000,0.4545,1.0000,0.0909,0.3636,0.0909
2260665,0.2727,0.2727,0.2727,0.3636,0.2727,0.4545,0.2727,0.0000,0.2727,0.2727,0.7273,0.1818,0.1818,0.1818,0.1818,0.4545,0.2727,0.6364,0.1818,0.1818,0.5455,0.0000,0.8182,0.9091,0.5455,0.1818,0.6000,0.5455,0.3750,0.0909,0.1818,0.1000,0.0909,0.1818,0.1818,0.1818,0.1818,0.2727,0.2000,1.0000,0.5455,0.4545,0.0000,0.3636
2260666,0.2727,0.2727,0.2727,0.9091,0.2727,0.4545,0.8182,0.5000,0.3636,0.5455,0.2727,0.2727,0.1818,0.1818,0.0909,0.5455,0.2727,0.1818,0.8182,0.3636,0.6364,0.5455,0.0909,0.0909,0.2727,0.2727,0.0000,0.6364,0.6250,0.1818,0.1818,0.2000,0.0909,0.6364,0.1818,0.0909,0.1818,0.3636,0.6000,0.0000,0.2727,0.9091,0.7273,0.3636


In [564]:
# NOTE: k-means binning is a sub-optimal, temporary measure, used until
# supervised discretization is figured out.
discretizer = KBinsDiscretizer(
    n_bins=12,
    encode='ordinal',
    strategy='kmeans',
)
x = discretizer.fit_transform(nmrcl_X_train)

  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)


In [565]:
# Keep the discretizer output under a descriptive name.
discretized_X_train = x 

In [566]:
# Min-max rescaling of the ordinal bin codes (scaler's default feature range).
scaler = preprocessing.MinMaxScaler()
bin_codes = np.asarray(discretized_X_train)
rescaled_discretized_X_train = scaler.fit_transform(bin_codes)

In [567]:
# Display the rescaled array as the cell output.
rescaled_discretized_X_train

array([[0.000000, 0.000000, 0.000000, ..., 0.181818, 0.545455, 0.181818],
       [0.818182, 0.818182, 0.818182, ..., 0.454545, 0.454545, 0.363636],
       [0.090909, 0.090909, 0.090909, ..., 0.090909, 0.454545, 0.090909],
       ...,
       [0.272727, 0.272727, 0.272727, ..., 0.454545, 0.000000, 0.363636],
       [0.272727, 0.272727, 0.272727, ..., 0.909091, 0.727273, 0.363636],
       [0.454545, 0.454545, 0.454545, ..., 0.272727, 0.636364, 0.000000]])

# Question 

I want to use StratifiedShuffleSplit for cross-validation. Is it a good choice?

# Cleaning the categorical data

I have a few ideas on how to deal with the huge number of employee titles but there are enough questions related to the numerical observations for now. So nothing is done with the categorical data.

In [398]:
# Preview the first two rows of the categorical training features.
ctgrcl_X_train.head(2)

Unnamed: 0,term,grade,sub_grade,emp_title,emp_length,home_ownership,verification_status,issue_d,pymnt_plan,purpose,title,addr_state,earliest_cr_line,initial_list_status,last_pymnt_d,last_credit_pull_d,application_type,hardship_flag,disbursement_method,debt_settlement_flag
1404520,36 months,C,C5,Asst. Professor,3 years,RENT,Not Verified,Feb-2017,n,credit_card,Credit card refinancing,AL,Jan-2003,w,Dec-2017,Feb-2019,Individual,N,Cash,N
1686362,60 months,D,D1,"Manager, Channel Sales",10+ years,MORTGAGE,Verified,Dec-2013,n,debt_consolidation,Payoff Debts,CA,Nov-1999,f,Jan-2019,Jan-2019,Individual,N,Cash,N


In [248]:
def check_nan_and_categories(df_column):
    """
    Print a summary of a categorical feature: first its ``value_counts()``,
    then a separator line, then the ``value_counts()`` of its ``isna()`` mask.
    """
    category_counts = df_column.value_counts()
    missing_counts = df_column.isna().value_counts()
    print(
        category_counts,
        '*************************',
        missing_counts,
        sep='\n',
    )

In [399]:
# Replace missing values in selected categorical columns with the placeholder
# category 'other'.
# (You probably don't need the per-column list; a fillna on the entire
# categorical frame would do.)
columns_to_fill = [
    'emp_title',
    'emp_length',
    'title',
    'last_pymnt_d',
    'last_credit_pull_d',
]

# Fill on the frame and bind the result, rather than calling
# fillna(inplace=True) on an attribute-accessed column: that pattern acts on a
# temporary Series, can raise SettingWithCopyWarning, and leaves the frame
# unchanged under pandas Copy-on-Write.
ctgrcl_X_train = ctgrcl_X_train.fillna(dict.fromkeys(columns_to_fill, 'other'))

In [400]:
# Map the raw emp_length labels to spelled-out category names.
emp_lengthdict = {'10+ years':'ten years or more', 
 '2 years':'two years', 
 '< 1 year':'less than a year', 
 '3 years':'three years', 
 'other':'other', 
 '1 year':'one year',
 '5 years':'five years', 
 '4 years':'four years', 
 '6 years':'six years', 
 '7 years':'seven years', 
 '8 years':'eight years', 
 '9 years':'nine years'}

# Bind the recoded column back through assign() instead of calling
# replace(inplace=True) on an attribute-accessed column, which acts on a
# temporary Series and does not update the frame under pandas Copy-on-Write.
ctgrcl_X_train = ctgrcl_X_train.assign(
    emp_length=ctgrcl_X_train['emp_length'].replace(emp_lengthdict)
)

In [443]:
# Remove the date-like string columns from the categorical features.
date_columns = [
    'issue_d',
    'earliest_cr_line',
    'last_pymnt_d',
    'last_credit_pull_d',
]
ctgrcl_X_train = ctgrcl_X_train.drop(columns=date_columns)

# Getting dummies

I tried a few different methods, but I haven't been able to 
replace the employee titles that occur in less than 1% of the rows with 'other'. 
Can you please suggest something?

In [540]:
# Remove the free-text title columns before one-hot encoding.
ctgrcl_X_train = ctgrcl_X_train.drop(columns=['emp_title', 'title'])

In [551]:
# One-hot encode the remaining categorical columns; drop_first=True omits the
# first level of each feature.
ctgrcl_dummies = pd.get_dummies(ctgrcl_X_train,drop_first=True)

In [None]:
##########

# Stacking categorical and numerical dfs

In [568]:
# Wrap the rescaled array in a DataFrame that keeps the row index and column
# labels of the frame it was computed from. A default RangeIndex would not
# line up with the shuffled train-split index carried by the categorical
# dummies when the two parts are stacked side by side.
nmrcl_X_train = pd.DataFrame(
    rescaled_discretized_X_train,
    index=nmrcl_X_train.index,
    columns=nmrcl_X_train.columns,
)

In [None]:
# Attach the numerical column labels. nmrcl_X_train_columns must list exactly
# the columns that were fed to the discretizer, in the same order; otherwise
# this assignment raises a length-mismatch error or mislabels the columns.
nmrcl_X_train.columns=nmrcl_X_train_columns