# Data pre-processing

In [1]:

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn.model_selection import StratifiedKFold
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [2]:
loan_data = pd.read_csv("loan_update.csv",index_col=0)

In [3]:
#imputing missing values
#import missingno as msno

#msno.matrix(loan_data)

In [4]:
from sklearn.impute import SimpleImputer

In [5]:
def median_impute(df,feature) :
    median_imputer = SimpleImputer(strategy='median')
    df[feature] = median_imputer.fit_transform(df[feature].values.reshape(-1,1))

In [6]:
median_impute(loan_data,["pub_rec_bankruptcies"])
median_impute(loan_data,['revol_util'])
median_impute(loan_data,['dti'])
median_impute(loan_data,['mort_acc'])
median_impute(loan_data,['emp_length'])

In [7]:
loan_data.isnull().sum()

loan_amnt                   0
funded_amnt_inv             0
funded_amnt                 0
term                        0
int_rate                    0
installment                 0
grade                       0
sub_grade                   0
emp_title               85791
emp_length                  0
fico_range_high             0
fico_range_low              0
home_ownership              0
annual_inc                  0
verification_status         0
issue_d                     0
loan_status                 0
purpose                     0
title                   16659
zip_code                    1
addr_state                  0
dti                         0
earliest_cr_line            0
open_acc                    0
revol_bal                   0
revol_util                  0
total_acc                   0
initial_list_status         0
application_type            0
mort_acc                    0
pub_rec_bankruptcies        0
Fico_average                0
log_annual_inc              0
region    

In [8]:
#X_train.columns.to_list()

In [9]:
class Outlier_Drop_and_Skewness_handler(BaseEstimator, TransformerMixin):
    def __init__(self,feature = ['loan_amnt','int_rate','installment','annual_inc','dti',
                                'open_acc','revol_bal','revol_util','total_acc','Fico_average','mort_acc']):
        self.feature = feature
    def fit(self,df):
        return self
    def transform(self,df):
        if (set(self.feature).issubset(df.columns)):
            df[self.feature] = np.cbrt(df[self.feature])
            # 25% quantile
            Q1 = df[self.feature].quantile(.25)
            # 75% quantile
            Q3 = df[self.feature].quantile(.75)
            IQR = Q3 - Q1
            # keep the data within 3 IQR
            df = df[~((df[self.feature] < (Q1 - 3 * IQR)) |(df[self.feature] > (Q3 + 3 * IQR))).any(axis=1)]
            return df
        else:
            print("One or more features are not in the dataframe")
            return df

In [10]:
class features_to_drop(BaseEstimator,TransformerMixin):
    def __init__(self,feature= ['funded_amnt_inv','funded_amnt','grade','emp_title',
                               'fico_range_high','fico_range_low','issue_d','loan_status',
                               'title','addr_state','zip_code','log_annual_inc','earliest_cr_line']):
        self.feature = feature
    def fit(self,df):
        return self
    def transform(self,df):
        if (set(self.feature).issubset(df.columns)):
            df.drop(self.feature,axis=1,inplace=True)
            return df
        else:
            print("One or more features are not in the dataframe")
            return df

In [11]:
class one_hot_encoding(BaseEstimator, TransformerMixin):
    def __init__(self,feature = ['term','home_ownership','verification_status',
                                 'purpose','initial_list_status',
                                 'application_type','region']):
        self.feature = feature                         
    def fit(self,df):
        return self
    def transform(self,df):
        if (set(self.feature).issubset(df.columns)):
            df = pd.get_dummies(df, columns = self.feature)
            return df
        else:
            print("One or more features are not in the dataframe")
            return df

In [12]:
class OrdinalFeatNames(BaseEstimator,TransformerMixin):
    def __init__(self,feature = ['sub_grade']):
        self.feature = feature
    def fit(self,df):
        return self
    def transform(self,df):
        if (set(self.feature).issubset(df.columns)):
            ordinal_enc = OrdinalEncoder()
            df[self.feature] = ordinal_enc.fit_transform(df[self.feature])
            return df
        else:
            print("One or more features are not in the dataframe")
            return df

In [13]:
class MinMaxWithFeatNames(BaseEstimator,TransformerMixin):
    def __init__(self,feature = ['loan_amnt','int_rate','installment','emp_length','annual_inc','dti',
                                 'open_acc','revol_bal','revol_util','total_acc', 'mort_acc',
                                 'pub_rec_bankruptcies','Fico_average','year']):
        self.feature = feature
    def fit(self,df):
        return self
    def transform(self,df):
        if (set(self.feature).issubset(df.columns)):
            min_max_enc = MinMaxScaler()
            df[self.feature] = min_max_enc.fit_transform(df[self.feature])
            return df
        else:
            print("One or more features are not in the dataframe")
            return df

In [14]:

class Oversample(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self,df):
        return self
    def transform(self,df):
        if 'status_int' in df.columns:
            # smote function to oversample the minority class to fix the imbalance data
            oversample = SMOTE(sampling_strategy='minority')
            X_bal, y_bal = oversample.fit_resample(df.loc[:, df.columns != 'status_int'],df['status_int'])
            df_bal = pd.concat([pd.DataFrame(X_bal),pd.DataFrame(y_bal)],axis=1)
            return df_bal
        else:
            print("status_int is not in the dataframe")
            return df


In [16]:
#train test split loan_data
from sklearn.model_selection import train_test_split
Train_df,Test_df = train_test_split(loan_data,stratify=loan_data["status_int"], test_size=0.20)

In [17]:
    # Create the pipeline that will call all the class from OutlierRemoval to OversampleSMOTE in one go
pipeline1 = Pipeline([
        ('Outlier_Drop_and_Skewness_handler',Outlier_Drop_and_Skewness_handler()),
        ('drop feature',features_to_drop()),
        ('one_hot encoding',one_hot_encoding()),
        ('ordinal encoding',OrdinalFeatNames()),
        ('standardize',MinMaxWithFeatNames()),
        ('class inblance',Oversample()),
    ])

In [18]:
Train_df= pipeline1.fit_transform(Train_df)
Train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(self.feature,axis=1,inplace=True)


Unnamed: 0,loan_amnt,int_rate,installment,sub_grade,emp_length,annual_inc,dti,open_acc,revol_bal,revol_util,total_acc,mort_acc,pub_rec_bankruptcies,Fico_average,year,month,term_ 36 months,term_ 60 months,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,region_MidWest,region_NorthEast,region_SouthEast,region_SouthWest,region_West,status_int
0,0.943309,0.000784,0.824485,0.0,0.5,0.65115,0.461218,0.65082,0.283746,0.1655,0.53654,0.388911,0.0,0.890449,1.0,2,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1
1,0.320545,0.52883,0.353991,12.0,1.0,0.370985,0.307102,0.310192,0.380145,0.670453,0.253308,0.0,0.0,0.231229,0.454545,7,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,1
2,0.3564,0.476184,0.380493,13.0,1.0,0.629338,0.658399,0.535313,0.682009,0.747723,0.466822,0.515833,0.0,0.476345,0.727273,4,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,1
3,0.876306,0.423181,0.805753,10.0,0.6,0.39819,0.524941,0.596484,0.530933,0.538957,0.494357,0.339745,0.0,0.231229,0.818182,12,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,1
4,0.657264,0.502903,0.632422,14.0,0.2,0.541157,0.561568,0.346563,0.436678,0.489743,0.215499,0.0,0.0,0.231229,0.727273,6,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0


In [19]:
pipeline2 = Pipeline([
        ('Outlier_Drop_and_Skewness_handler',Outlier_Drop_and_Skewness_handler()),
        ('drop feature',features_to_drop()),
        ('one_hot encoding',one_hot_encoding()),
        ('ordinal encoding',OrdinalFeatNames()),
        ('standardize',MinMaxWithFeatNames()),
        #('class inblance',Oversample()),
    ])

In [20]:
Test_df = pipeline2.fit_transform(Test_df)
Test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(self.feature,axis=1,inplace=True)


Unnamed: 0_level_0,loan_amnt,int_rate,installment,sub_grade,emp_length,annual_inc,dti,open_acc,revol_bal,revol_util,total_acc,mort_acc,pub_rec_bankruptcies,Fico_average,status_int,year,month,term_ 36 months,term_ 60 months,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,region_MidWest,region_NorthEast,region_SouthEast,region_SouthWest,region_West
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1
23142780,0.7483,0.43405,0.564927,10.0,1.0,0.538868,0.406411,0.535313,0.478438,0.398067,0.445707,0.517872,0.0,0.562114,1,0.636364,8,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0
96709908,0.731353,0.366587,0.542698,9.0,0.0,0.571668,0.62318,0.556598,0.536306,0.680705,0.534332,0.43679,0.0,0.092181,0,0.909091,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1
80728911,0.389689,0.086426,0.342274,1.0,0.7,0.499542,0.516741,0.512988,0.349562,0.50175,0.477395,0.43679,0.111111,0.331519,1,0.818182,6,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0
117207565,0.636844,0.479876,0.484229,12.0,0.5,0.499542,0.423415,0.379616,0.480579,0.656481,0.330949,0.302853,0.0,0.212995,1,0.909091,8,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1
17804840,0.544609,0.4965,0.413948,13.0,0.6,0.428129,0.479049,0.310192,0.385677,0.740446,0.506814,0.579338,0.0,0.360805,1,0.636364,6,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0


In [21]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import Perceptron,SGDClassifier,PassiveAggressiveClassifier

In [22]:
classifiers = SGDClassifier(random_state = 52,max_iter = 1000,n_jobs=-1,loss = 'log_loss')

In [23]:
X_train, y_train = Train_df.loc[:, Train_df.columns != 'status_int'], Train_df['status_int']

In [24]:
classifiers.fit(X_train, y_train)

In [25]:
X_test,y_test=Test_df.loc[:, Test_df.columns != 'status_int'], Test_df['status_int']

In [26]:
classifiers.predict(X_test)

array([1, 0, 1, ..., 1, 0, 1])

In [28]:
classifiers.predict_proba(X_test)

array([[0.41199856, 0.58800144],
       [0.68493309, 0.31506691],
       [0.25926815, 0.74073185],
       ...,
       [0.28128728, 0.71871272],
       [0.59701722, 0.40298278],
       [0.39574716, 0.60425284]])