In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from caimcaim import CAIMD # https://github.com/airysen/caimcaim 
# not working for me.
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import StratifiedShuffleSplit


# Display options

%matplotlib
%matplotlib inline
#pd.options.mode.chained_assignment = None #set it to None to remove SettingWithCopyWarning
pd.options.display.float_format = '{:.4f}'.format #set it to convert scientific noations such as 4.225108e+11 to 422510842796.00
pd.set_option('display.max_columns', 100) #  display all the columns
pd.set_option('display.max_rows', 100) # display all the rows
np.set_printoptions(suppress=True,formatter={'float_kind':'{:f}'.format})

Using matplotlib backend: MacOSX


In [2]:
# functions to be used later in the code.

"""
df[NA_cols].apply(lambda x: x.fillna(x.median()),axis=1)
"""


def impute_numerical_data(num_df):
    """
    Impute all nan values in a numerical
    columns with the median of a correspondingx
    column.
    
    """
    NA_col = num_df.isnull().sum()
    NA_col = NA_col[NA_col>0].index
    
    for column in num_df[NA_col].columns:
        num_df[column].fillna(num_df[column].median(),inplace=True)

    return num_df


def outlier_replacer(df):
    """
    Replace all the outliers in a numerical df using 
    IQR methodology.
    
    """
    for column in df.columns:
        quartile_1, quartile_3 = np.percentile(df[column], [25, 75])
        iqr = quartile_3 - quartile_1
        lower_bound = quartile_1 - (float(iqr) * 1.5)
        upper_bound = quartile_3 + (float(iqr) * 1.5)
        df.loc[(df[column]<lower_bound) | (df[column] > upper_bound),[column]] = df[column].median()
        
    return df


def remove_single_unique_values(dataframe):
    
    """
    Drop all the columns that only contain one unique value.
    not optimized for categorical features yet.
    
    """
    
    cols_to_drop = dataframe.nunique()
    cols_to_drop = cols_to_drop.loc[cols_to_drop.values==1].index
    dataframe = dataframe.drop(cols_to_drop,axis=1)
    return dataframe

In [3]:
# loading data, dropping features with single uniqu values
df = pd.read_csv('2nd_clean.csv')
print('df is loaded')
df.drop('Unnamed: 0',axis='columns',inplace=True)

df is loaded


In [7]:
#train test split
y = df[['loan_status']]
X = df.drop('loan_status',axis='columns')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

In [8]:
# dividing data into categorical and numerical parts
ctgrcl_X_train = X_train.select_dtypes(include=['object'])
nmrcl_X_train = X_train.select_dtypes(exclude=['object'])

# Continuous data treatment

In [10]:
# imputation and outlier treatment (median, IQR)
nmrcl_X_train = impute_numerical_data(nmrcl_X_train)
nmrcl_X_train = outlier_replacer(nmrcl_X_train)
nmrcl_X_train = remove_single_unique_values(nmrcl_X_train)
nmrcl_X_train_columns = nmrcl_X_train.columns
X_train_median_values = nmrcl_X_train.median()

print('imputation and outlier treatment - done')

# This is a sub-optimal, temporary measure, as I haven't figured out 
# how to do supervised discretization.
discretizer = KBinsDiscretizer(n_bins=12, encode='ordinal', strategy='kmeans')
discretized_X_train = discretizer.fit_transform(nmrcl_X_train)

print('discretization - done')

# Min_max
scaler = preprocessing.MinMaxScaler()
rescaled_discretized_X_train = scaler.fit_transform(np.array(discretized_X_train))

print('rescaling - done')
print('numerical data is ready')

imputation and outlier treatment - done


  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)


discretization - done
rescaling - done
numerical data is ready


# Categorical data treatment

In [None]:
ctgrcl_X_train.drop(['issue_d',
                     'earliest_cr_line',
                     'last_pymnt_d',
                     'last_credit_pull_d'],axis='columns',
                   inplace=True)

In [32]:
ctgrcl_X_train.columns

Index(['term', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       'home_ownership', 'verification_status', 'pymnt_plan', 'purpose',
       'title', 'addr_state', 'initial_list_status', 'application_type',
       'hardship_flag', 'disbursement_method', 'debt_settlement_flag'],
      dtype='object')

In [248]:
def check_nan_and_categories(df_column):
    """
    Print value_counts and count of nan
    of a categotical feature.
    
    """
    value_counts = df_column.value_counts()
    nans = df_column.isna().value_counts()
    print(value_counts)
    print('*************************')
    print(nans) 

# Less than 1% other replacer:

In [11]:
ctgrcl_X_train.fillna('other')


emp_title_list = ['Teacher','Manager','Owner']
mask = ctgrcl_X_train['emp_title'].isin(emp_title_list)==False
ctgrcl_X_train['emp_title'][mask] = 'other'

title_list = ['Debt consolidation',        
    'Credit card refinancing',   
    'Home improvement',          
    'Other',                     
    'Major purchase',            
    'Medical expenses'] 

mask = ctgrcl_X_train['title'].isin(title_list)==False
ctgrcl_X_train['title'][mask] = 'other'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [15]:
emp_lengthdict = {'10+ years':'ten years or more', 
 '2 years':'two years', 
 '< 1 year':'less than a year', 
 '3 years':'three years', 
 'other':'other', 
 '1 year':'one year',
 '5 years':'five years', 
 '4 years':'four years', 
 '6 years':'six years', 
 '7 years':'seven years', 
 '8 years':'eight years', 
 '9 years':'nine years'}

ctgrcl_X_train.emp_length.replace(emp_lengthdict,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


# Getting dummies

I tried a few different methods but I haven't been able to 
replace employee titles that are less than 1% with 'other', 
can you please suggest something?

In [18]:
ctgrcl_dummies = pd.get_dummies(ctgrcl_X_train,drop_first=True)

# Stacking categorical and numerical dfs

In [19]:
final_nmrcl_X_train = pd.DataFrame(rescaled_discretized_X_train)
final_nmrcl_X_train.columns = nmrcl_X_train_columns

In [20]:
final_nmrcl_X_train.index = ctgrcl_dummies.index

In [21]:
final_X_train = pd.concat([final_nmrcl_X_train,ctgrcl_dummies], axis=1)

In [30]:
final_X_train.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,inq_last_6mths,open_acc,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,last_pymnt_amnt,tot_cur_bal,total_rev_hi_lim,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,term_ 60 months,grade_B,...,addr_state_CT,addr_state_DC,addr_state_DE,addr_state_FL,addr_state_GA,addr_state_HI,addr_state_IA,addr_state_ID,addr_state_IL,addr_state_IN,addr_state_KS,addr_state_KY,addr_state_LA,addr_state_MA,addr_state_MD,addr_state_ME,addr_state_MI,addr_state_MN,addr_state_MO,addr_state_MS,addr_state_MT,addr_state_NC,addr_state_ND,addr_state_NE,addr_state_NH,addr_state_NJ,addr_state_NM,addr_state_NV,addr_state_NY,addr_state_OH,addr_state_OK,addr_state_OR,addr_state_PA,addr_state_RI,addr_state_SC,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,initial_list_status_w,application_type_Joint App,hardship_flag_Y,disbursement_method_DirectPay,debt_settlement_flag_Y
1404520,0.0909,0.0909,0.0909,0.5455,0.0909,0.3636,0.7273,0.5,0.7273,0.1818,1.0,0.4545,0.0,0.0,0.1818,0.1818,0.1818,0.0909,0.3636,0.3636,0.0909,0.6364,0.2727,0.0,0.8182,0.8182,0.1818,0.7273,0.2727,0.0,0.4545,0.2727,0.1818,0.2727,0.2,0.0909,0.9091,0.2727,0.2727,0.2727,0.7273,0.1429,1.0,0.4545,0.2727,0.3636,0.0909,0.2727,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1686362,0.9091,0.9091,0.9091,0.6364,0.8182,0.7273,0.2727,0.5,0.5455,0.2727,0.9091,0.3636,0.0,0.0,0.2727,0.2727,0.2727,0.2727,0.0,0.8182,0.1818,0.0,0.8182,0.0,0.8182,0.6364,0.4545,0.2727,0.2727,1.0,0.2727,0.0909,0.3636,0.5455,0.3,0.1818,0.3636,0.3636,0.2727,0.5455,0.5455,0.0,1.0,0.7273,0.8182,0.3636,0.0909,0.1818,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1919269,0.6364,0.6364,0.6364,0.0909,0.8182,0.9091,0.1818,0.0,0.4545,0.2727,0.6364,0.3636,0.0,0.0,0.9091,0.9091,1.0,0.5455,0.6364,0.0909,1.0,0.0,0.0909,0.9091,0.5455,0.5455,0.6364,0.6364,0.9091,0.0,0.8182,0.2727,0.5455,0.4545,0.7,0.9091,0.0,0.5455,0.6364,0.4545,0.4545,0.0,1.0,0.2727,0.0909,0.4545,0.2727,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
749673,0.2727,0.2727,0.2727,0.4545,0.1818,0.4545,0.1818,0.0,0.3636,0.3636,0.3636,0.2727,0.0,0.0,0.3636,0.3636,0.3636,0.0909,0.3636,0.0,0.4545,0.1818,0.0,0.7273,0.1818,0.5455,0.4545,0.6364,0.4545,0.0,0.3636,0.9091,0.3636,0.4545,0.4,0.3636,0.0,0.3636,0.4545,0.4545,0.3636,0.1429,1.0,0.1818,0.0,0.1818,0.4545,0.0909,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
869079,0.2727,0.2727,0.2727,0.0909,0.2727,0.5455,0.2727,0.5,0.3636,0.2727,0.9091,0.4545,0.0,0.0,0.3636,0.3636,0.3636,0.0909,0.3636,0.0909,0.0909,0.1818,0.0909,0.0909,0.8182,0.5455,0.5455,0.0,0.0,0.4,0.0,0.0,0.8182,0.4545,0.4,0.6364,0.2727,0.3636,0.4545,0.4545,0.3636,0.1429,1.0,0.8182,0.0909,0.4545,0.1818,0.3636,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


Unnamed: 0,loan_status
0,Current
1,Current
2,Current
3,Current
4,Current


# !!!
There are probably more but the 1 thing that I haven't done yet 
 is the proper discretizer.

In [None]:
############

In [26]:
sample_df = final_X_train.sample(1000)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,inq_last_6mths,open_acc,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,last_pymnt_amnt,tot_cur_bal,total_rev_hi_lim,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,term_ 60 months,grade_B,...,addr_state_CT,addr_state_DC,addr_state_DE,addr_state_FL,addr_state_GA,addr_state_HI,addr_state_IA,addr_state_ID,addr_state_IL,addr_state_IN,addr_state_KS,addr_state_KY,addr_state_LA,addr_state_MA,addr_state_MD,addr_state_ME,addr_state_MI,addr_state_MN,addr_state_MO,addr_state_MS,addr_state_MT,addr_state_NC,addr_state_ND,addr_state_NE,addr_state_NH,addr_state_NJ,addr_state_NM,addr_state_NV,addr_state_NY,addr_state_OH,addr_state_OK,addr_state_OR,addr_state_PA,addr_state_RI,addr_state_SC,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,initial_list_status_w,application_type_Joint App,hardship_flag_Y,disbursement_method_DirectPay,debt_settlement_flag_Y
2200497,0.6364,0.6364,0.6364,0.9091,0.7273,0.3636,0.8182,0.5,0.3636,0.0,0.0909,0.2727,0.0,0.0,0.1818,0.1818,0.0,0.3636,0.4545,0.0909,0.2727,0.2727,0.0909,0.6364,0.0909,0.2727,0.8182,0.1818,0.2727,0.0,0.0909,0.0,0.5455,0.3636,0.6,0.4545,0.2727,0.3636,0.2727,0.3636,0.3636,0.7143,0.0909,0.0,0.1818,0.4545,0.2727,0.8182,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1386479,0.2727,0.2727,0.2727,1.0,0.2727,0.1818,0.6364,0.0,0.4545,0.1818,0.9091,0.3636,0.0,0.0,0.3636,0.3636,0.3636,0.0909,0.3636,0.0909,0.0909,0.3636,0.0909,0.0,0.8182,0.0909,0.0909,0.0,0.0909,0.0,0.1818,0.2727,0.3636,0.7273,0.3,0.0909,0.5455,0.5455,0.2727,0.7273,0.4545,0.2857,0.7273,1.0,0.0909,0.5455,0.0,0.4545,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
273443,0.2727,0.2727,0.2727,0.0,0.2727,0.1818,0.0,0.0,0.1818,0.0909,0.0909,0.0909,1.0,1.0,0.0,0.0,0.0,0.0,0.1818,0.0,0.3636,0.2727,0.0,0.2727,0.0909,0.5455,0.0909,0.4545,0.6364,0.0,0.2727,0.6364,0.1818,0.0909,0.4,0.2727,0.0,0.2727,0.1818,0.0909,0.1818,0.4286,1.0,0.0,0.0,0.0,0.5455,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
189293,0.2727,0.2727,0.2727,0.0,0.3636,0.9091,0.5455,0.0,0.5455,0.5455,0.2727,0.5455,0.0,0.0,0.0,0.0,0.0,0.0,0.1818,0.0909,0.8182,0.0,0.0909,0.2727,0.1818,0.5455,0.3636,0.2727,0.2727,0.2,0.9091,0.2727,0.4545,0.4545,0.5,0.2727,0.5455,0.5455,0.5455,0.4545,0.5455,0.0,0.8182,0.0,0.1818,0.4545,1.0,0.7273,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1495329,0.2727,0.2727,0.2727,0.3636,0.2727,0.1818,0.9091,0.0,0.1818,0.2727,0.8182,0.0909,0.0,0.0,0.1818,0.1818,0.0,0.8182,0.1818,0.0909,0.7273,0.0909,0.1818,0.4545,0.6364,0.5455,0.5455,0.1818,0.2727,0.0,0.0909,0.3636,0.5455,0.3636,0.6,0.2727,0.0,0.2727,0.1818,0.3636,0.1818,0.2857,1.0,0.8182,0.0,0.3636,0.9091,0.0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
