In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#from caimcaim import CAIMD # https://github.com/airysen/caimcaim 
# not working for me.
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import StratifiedShuffleSplit


# Display options

%matplotlib
%matplotlib inline
#pd.options.mode.chained_assignment = None #set it to None to remove SettingWithCopyWarning
pd.options.display.float_format = '{:.4f}'.format #set it to convert scientific noations such as 4.225108e+11 to 422510842796.00
pd.set_option('display.max_columns', 100) #  display all the columns
pd.set_option('display.max_rows', 100) # display all the rows
np.set_printoptions(suppress=True,formatter={'float_kind':'{:f}'.format})

Using matplotlib backend: MacOSX


# Additional functions

In [2]:
# functions to be used later in the code.


def less_than_1pct_cat_replace(df,pct=0.01):
    """
    Replace all the categorical values who's proportion
    is less than pct with 'other'.
    """

    for column in df.columns:
        selection = df[column].value_counts(normalize=True)<pct
        list_to_replace = selection[selection==True].index
        mask = df[column].isin(list_to_replace)
        df[column][mask] = 'other'
    
    return df



def outlier_replacer(df):
    """
    Replace all the outliers in a numerical df using 
    IQR methodology.
    
    """
    for column in df.columns:
        quartile_1, quartile_3 = np.percentile(df[column], [25, 75])
        iqr = quartile_3 - quartile_1
        lower_bound = quartile_1 - (float(iqr) * 1.5)
        upper_bound = quartile_3 + (float(iqr) * 1.5)
        df.loc[(df[column]<lower_bound) | (df[column] > upper_bound),[column]] = df[column].median()
        
    return df


def remove_single_unique_values(dataframe):
    
    """
    Drop all the columns that only contain one unique value.
    not optimized for categorical features yet.
    
    """
    
    cols_to_drop = dataframe.nunique()
    cols_to_drop = cols_to_drop.loc[cols_to_drop.values==1].index
    dataframe = dataframe.drop(cols_to_drop,axis=1)
    return dataframe

In [60]:
# loading data
df = pd.read_csv('2nd_clean.csv')
print('df is loaded')
df.drop(['Unnamed: 0','issue_d','earliest_cr_line','last_pymnt_d',
        'last_credit_pull_d'],axis='columns',inplace=True)

df is loaded


# Treating categorical part of the df

In [92]:
# splitting the df into categorical and numerical parts
ctgrcl_df = df.select_dtypes(include=['object'])
nmrcl_df = df.select_dtypes(exclude=['object'])

# have to drop the y feature
ctgrcl_df.drop(['loan_status'],axis='columns',inplace=True)

In [93]:
ctgrcl_df.fillna('other',inplace=True)

In [94]:
# '<,>,+' have to be replaced for correct working of get_dummies function
emp_lengthdict = {'10+ years':'ten years or more', 
 '2 years':'two years', 
 '< 1 year':'less than a year', 
 '3 years':'three years', 
 'other':'other', 
 '1 year':'one year',
 '5 years':'five years', 
 '4 years':'four years', 
 '6 years':'six years', 
 '7 years':'seven years', 
 '8 years':'eight years', 
 '9 years':'nine years'}

ctgrcl_df.emp_length.replace(emp_lengthdict,inplace=True)

In [95]:
ctgrcl_df = less_than_1pct_cat_replace(ctgrcl_df,pct=0.01)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [98]:
# now the categorical part of the df is ready for train_test_split
df = pd.concat([nmrcl_df,ctgrcl_df,df[['loan_status']]],axis=1)

# Train test split

In [99]:
# loading data, dropping features with single uniqu values

y = df[['loan_status']]
X = df.drop('loan_status',axis='columns')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

# Dividing data into categorical and numerical parts

In [156]:
# dividing training and testing data into categorical and numerical parts
ctgrcl_X_train = X_train.select_dtypes(include=['object'])
nmrcl_X_train = X_train.select_dtypes(exclude=['object'])

ctgrcl_X_test = X_test.select_dtypes(include=['object'])
nmrcl_X_test = X_test.select_dtypes(exclude=['object'])

In [101]:
print(nmrcl_X_train.shape)
print(nmrcl_X_test.shape)

(1695501, 64)
(565167, 64)


# Treating numerical X_train

In [102]:
# imputation and outlier treatment (median, IQR)
nmrcl_X_train = nmrcl_X_train.fillna(nmrcl_X_train.median())


nmrcl_X_train = outlier_replacer(nmrcl_X_train)
nmrcl_X_train = remove_single_unique_values(nmrcl_X_train)
nmrcl_X_train_columns = nmrcl_X_train.columns
X_train_median_values = nmrcl_X_train.median()

print('imputation and outlier treatment - done')

# This is a sub-optimal, temporary measure, as I haven't figured out 
# how to do supervised discretization.
discretizer = KBinsDiscretizer(n_bins=12, encode='ordinal', strategy='kmeans')
discretized_X_train = discretizer.fit_transform(nmrcl_X_train)

print('discretization - done')

# Min_max
scaler = preprocessing.MinMaxScaler()
rescaled_discretized_X_train = scaler.fit_transform(np.array(discretized_X_train))

print('rescaling - done')
print('numerical data is ready')

imputation and outlier treatment - done


  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)


discretization - done
rescaling - done
numerical data is ready


# Treating numerical X_test

In [160]:
# imputation and outlier treatment (median, IQR)
#nmrcl_X_test[nmrcl_X_train_columns].shape

nmrcl_X_test = nmrcl_X_test[nmrcl_X_train_columns]

nmrcl_X_test = nmrcl_X_test.fillna(nmrcl_X_train.median())

nmrcl_X_test = outlier_replacer(nmrcl_X_test)

nmrcl_X_test_columns = nmrcl_X_test.columns


print('imputation and outlier treatment - done')

# This is a sub-optimal, temporary measure, as I haven't figured out 
# how to do supervised discretization.
discretizer = KBinsDiscretizer(n_bins=12, encode='ordinal', strategy='kmeans')
discretized_nmrcl_X_test = discretizer.fit_transform(nmrcl_X_test)

print('discretization - done')

# Min_max
scaler = preprocessing.MinMaxScaler()
rescaled_discretized_X_test = scaler.fit_transform(np.array(discretized_nmrcl_X_test))

print('rescaling - done')
print('numerical data is ready')

imputation and outlier treatment - done


  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)
  return_n_iter=True)
  'decreasing the number of bins.' % jj)


discretization - done
rescaling - done
numerical data is ready


# Getting dummies and stacking all the frames together

In [164]:
# get dummies for the categorical variables
ctgrcl_X_train = pd.get_dummies(ctgrcl_X_train,drop_first=True)
ctgrcl_X_test = pd.get_dummies(ctgrcl_X_test,drop_first=True)

In [165]:
print(ctgrcl_X_train.shape)
print(ctgrcl_X_test.shape)

(1695501, 97)
(565167, 97)


In [175]:
# After rescaling a 2D numpy is returned, the following procedure
# turns it into a pandas df and stacks it together with the
# categorical dummies.

rescaled_discretized_X_train = pd.DataFrame(rescaled_discretized_X_train)
rescaled_discretized_X_test = pd.DataFrame(rescaled_discretized_X_test) 

rescaled_discretized_X_train.index = ctgrcl_X_train.index
rescaled_discretized_X_test.index = ctgrcl_X_test.index

rescaled_discretized_X_train.columns = nmrcl_X_train_columns
rescaled_discretized_X_test.columns = nmrcl_X_test_columns


In [177]:
final_X_train = pd.concat([rescaled_discretized_X_train,ctgrcl_X_train],axis=1)
final_X_test = pd.concat([rescaled_discretized_X_test,ctgrcl_X_test],axis=1)

In [180]:
final_X_train.shape

(1695501, 145)

In [181]:
final_X_test.shape

(565167, 145)