# Feature Engineering Using Feature Tools

## Pros & Cons

### Pros -

- Takes care of automatic feature generation
- Checks relationship within a single dataset
- Has an exhaustive list of aggregate and Transformation features
- Works very well on normalized datasets having 2 or more tables

### Cons -

- Takes very long to indentify relationship within data if there are no categorical columns present in the dataset
- Many aggregation and transformation functions are provided for numerical columns but the same is not true for categorical  
  columns
- Produces too many features even for a small dataset - using all aggregation and transformative primitives provided to it,  
  thus may take long time to generate features for larger datasets


In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsRegressor
# import matplotlib.pyplot as plt
# import seaborn as sns
from scipy.stats import chisquare,chi2_contingency
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, auc, classification_report, roc_curve
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pickle
import re
# import lightgbm as lgb
import pandas as pd
import numpy as np
import featuretools as ft
import warnings
import autonormalize as an
warnings.filterwarnings("ignore")

In [5]:
## Tools Initialization
stdscaler = StandardScaler()

In [6]:
data = pd.read_csv(r'C:\Users\Administrator\Downloads\ML 360 Data\loan_train.csv')

In [7]:
def column_dtypes(data):
    global train_columns_dtypes
    train_columns_dtypes = {}
    for col in data.columns:
        dtype = data[col].dtype
        train_columns_dtypes[col] = dtype.name
    return train_columns_dtypes

column_dtypes(data)

{'Loan_ID': 'object',
 'Gender': 'object',
 'Married': 'object',
 'Dependents': 'object',
 'Education': 'object',
 'Self_Employed': 'object',
 'Applicant_Income': 'int64',
 'Coapplicant_Income': 'float64',
 'Loan_Amount': 'float64',
 'Loan_Amount_Term': 'float64',
 'Credit_History': 'float64',
 'Property_Area': 'object',
 'Loan_Status': 'object'}

In [8]:
print('Original shape of the data :',data.shape)
data.head()

Original shape of the data : (614, 13)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Removal of columns having all unique values

In [9]:
##Removing columns having all unique values
def all_unique_values(data):

    col_names = data.columns

    col_to_drop = []

    for i in col_names:
        all_unique = len(data[i].value_counts().index)
        if all_unique == len(data):
            col_to_drop.append(i)

    ##Dropping those columns from Dataframe
    data = data.drop(col_to_drop, axis = 1)
    
#     print('Columns dropped :', col_to_drop)
#     print('Shape of the data after removing columns with all unique value :', data.shape)
    
    return data

## Replacing Junk words

In [10]:
## Removing all junk words

def junk_words(data):
    data = data.replace('?', np.nan)
    data = data.replace('*', np.nan)
    data = data.replace('NA.', np.nan)
    data = data.replace('N.A.', np.nan)
    data = data.replace('#', np.nan)
    data = data.replace(' ', np.nan)
    return data

## Removal of columns having more than 85% of missing values

In [11]:
##Getting count and percentage of missing value in each column
def missing_value_column(data):
    count_of_null = data.isnull().sum()
    percent_of_missing = data.isnull().sum() * 100 / len(data)
    missing_value_data = pd.DataFrame({'percent_missing': percent_of_missing,'Count_of_Missing_Values ': count_of_null })

    ##Dropping columns having more than 85% null values

    columns_to_be_removed = missing_value_data[missing_value_data['percent_missing'] >= 80].index
    data = data.drop(columns_to_be_removed, axis = 1)

#     print('\n\nColumns dropped :', columns_to_be_removed)
#     print('Shape of the data after removing columns with missing values more than 85% : ', data.shape)
    
    return data

## Removal of rows having more  than 60% missing value

In [9]:
# def missing_value_row(data):
#     row_wise_null = (data.isnull().sum(axis=1) / data.shape[1]) * 100
#     data['row_wise_null'] = row_wise_null

#     ##Dropping rows having more than 65% missing values

#     i = data[data['row_wise_null'] > 60].index
#     num_of_rows_removed = len(i)
#     data = data.drop(i)

#     data = data.drop('row_wise_null', axis = 1)

# #     print('\n\nNumber of rows dropped :', num_of_rows_removed)
# #     print('Shape of the data after removing rows with missing values more than 65% : ', data.shape)
    
#     return data   

## Removal of columns having only one unique class or values

In [12]:
##Removing columns which has only 1 unique values which will be of no use

def one_unique(data):
    col_names= data.columns

    col_drop = []

    for i in col_names:
        check_unique = len(data[i].value_counts().index)
        if check_unique ==1:
            col_drop.append(i)

    #print('\n\nColumns dropped :',col_drop)

    ##Dropping those columns from dataframe
    data = data.drop(col_drop, axis = 1)

    #print('Shape of the data after removing columns with single unique value :', data.shape)
    
    return data
    


## Basics EDA of Above all four fucntions

In [13]:
def basic_eda(data):
    data = all_unique_values(data)
    data = junk_words(data)
    data = missing_value_column(data)
#     data = missing_value_row(data)
    data = one_unique(data)
    return data

In [14]:
data = basic_eda(data)

## Separation of categorical and numerical columns

In [15]:
##Separating numerical and categorical columns 

def num_cat_separation(data):
    col_names = data.columns
    
    col_names_updated_cat = []
    col_names_updated_num = []

    for i in col_names:
        counts_of_individual_cols = data[i].value_counts()
        check = len(data[i])/len(counts_of_individual_cols.index)
        if check >30:

            col_names_updated_cat.append(i)
            cat_col_names = data[col_names_updated_cat].columns

            for i in cat_col_names:
                data[i] = data[i].astype('object')

        else:
            col_names_updated_num.append(i)
            num_col_names = data[col_names_updated_num].columns

            for i in num_col_names:
                data[i] = data[i].astype('float64')
            
    return list(col_names_updated_num), list(col_names_updated_cat)

## Imputing missing values

In [16]:
def imputation(data):
    
    data = basic_eda(data)
    
    count_of_null = data.isnull().sum()
    percent_of_missing = data.isnull().sum() * 100 / len(data)
    missing_value_data = pd.DataFrame({'percent_missing': percent_of_missing,'Count_of_Missing_Values ': count_of_null })
    
    global numerical_column_names
    global categorical_column_names
    
    numerical_column_names, categorical_column_names = num_cat_separation(data)
    
    global data_null_treated 
    data_null_treated = data.copy()
    label_encoder =  LabelEncoder()
     

    cols_to_be_imputed =  missing_value_data[missing_value_data['percent_missing'] > 0].sort_values('percent_missing', ascending=False).index
    cols_to_be_imputed = list(cols_to_be_imputed)
    #print("cols_to_be_imputed :", cols_to_be_imputed)
#     print(cols_to_be_imputed)
#     if target in cols_to_be_imputed:
#         cols_to_be_imputed.remove(target)
        
# #     cols_to_be_imputed = [ x for x in cols_to_be_imputed if x is not target ]
#     print(cols_to_be_imputed, '\n\n')

    Imputed_column_array = []
    for i in cols_to_be_imputed:
        print(i)

        data_dup = data_null_treated.copy()

        ##Replacing column having below 2 percent missing values with median and mode

        below_2_percent_columns = missing_value_data[missing_value_data['percent_missing'] < 2].index
        below_2_percent_columns = list(below_2_percent_columns)
        if i in below_2_percent_columns:
            below_2_percent_columns.remove(i)
        #print("below_2_percent_columns :", below_2_percent_columns)

        for j in below_2_percent_columns:

            if j in numerical_column_names:
                data_dup[j] = data_dup[[j]].apply(lambda x:x.fillna(x.median()), axis = 0)
            else:
                data_dup[j] = data_dup[[j]].apply(lambda x:x.fillna(data_dup[j].value_counts().index.max()))


        ##Seperating rows without null for train 
        data_dup_train = data_dup[data_dup[i].isna()==False]

        data_dup_train_copy = data_dup_train.copy()

        ##Dropping null values in other columns
        data_dup_train = data_dup_train.dropna() 

        ##Seperating rows with null for test
        data_dup_test = data_dup[data_dup[i].isna()] 

        ##Removing column having above 15 percent missing values except the column to be imputed in particular iteration

        above_15_percent_columns = missing_value_data[missing_value_data['percent_missing'] > 15].index
        above_15_percent_columns = list(above_15_percent_columns)
        #print("above_15_percent_columns :", above_15_percent_columns)
        if i in above_15_percent_columns:
            above_15_percent_columns.remove(i)
        data_dup_train = data_dup_train.drop(above_15_percent_columns, axis = 1)
        data_dup_test = data_dup_test.drop(above_15_percent_columns, axis = 1)

        ##Train test split

        x_test = data_dup_test.drop(i, axis = 1)    
        x_test = pd.get_dummies(x_test, drop_first=True)
        x_test_columns = x_test.columns
        #print("x_test_columns :", x_test_columns)
        for k in x_test_columns:
            if x_test[k].dtype == 'float64':
                x_test[k] = x_test[[k]].apply(lambda x:x.fillna(x.median()), axis = 0)
            else:
                x_test[k] = x_test[[k]].apply(lambda x:x.fillna(x_test[k].value_counts().index.max()))

        x_train = data_dup_train.drop(i, axis = 1)
        x_train = pd.get_dummies(x_train, drop_first=True)
        x_train = x_train[x_test.columns]

        y_train = data_dup_train[[i]]
        if y_train[i].dtype == 'O':
            y_train[i] = label_encoder.fit_transform(y_train[i])
            y_train[[i]] = y_train[[i]].astype('int')


        ##Building model
        #print("numerical_column_names :", numerical_column_names)
        #print("categorical_column_names :", categorical_column_names)
        if i in numerical_column_names:
            model_rf = RandomForestRegressor(n_estimators=100, max_depth=6)
        else:
            model_rf = RandomForestClassifier(n_estimators=100, max_depth=6)
        #print("Random Forest Regressor/Classifier for col :", i)
        model_rf.fit(x_train , y_train)
        print("model_rf.fit done")
        rf_score = model_rf.score(x_train , y_train)
        print('RandomForest Score :' , rf_score)

        if i in numerical_column_names:
            model_lr = LinearRegression()
        else:
            model_lr = LogisticRegression()

        model_lr.fit(x_train , y_train)
        lr_score = model_lr.score(x_train , y_train)
        print('\nLogisticRegression Score :' , lr_score)

        ##Checking which model is better
        if rf_score > lr_score:
            print('\nFor', i, ' RandomForest performs better. So we will go with this.\n')
            model = model_rf
            Imputed_column_array.append({i:'Random Forest'})
        else:
            print('\n\nFor', i ,' Logistic Regression performs better. So we will go with this.')
            model = model_lr   
            Imputed_column_array.append({i:'Logistic Regression'})

        prediction = model.predict(x_test)  
        print(prediction.dtype,'\n\n')
        if prediction.dtype == 'int32':
            prediction = label_encoder.inverse_transform(prediction)

        prediction_df = pd.DataFrame(prediction)
        #print('\n\n Predicted count of ', i , '  :' , prediction_df[0].value_counts())

        data_dup_test = data_dup_test.drop(i , axis = 1)

        data_dup_test[i] = prediction

        data_dup_complete = pd.concat([data_dup_train_copy , data_dup_test])

        data_dup_complete = data_dup_complete.sort_index()

        predicted = data_dup_complete[[i]]

        data_null_treated = data_null_treated.drop(i , axis = 1)

        data_null_treated[i] = predicted  
        
    return Imputed_column_array
          

In [17]:
imputation(data)

Credit_History
model_rf.fit done
RandomForest Score : 0.9104166666666667

LogisticRegression Score : 0.8708333333333333

For Credit_History  RandomForest performs better. So we will go with this.

int32 


Self_Employed
model_rf.fit done
RandomForest Score : 0.8910133843212237

LogisticRegression Score : 0.8604206500956023

For Self_Employed  RandomForest performs better. So we will go with this.

int32 


Loan_Amount
model_rf.fit done
RandomForest Score : 0.7845906991348433

LogisticRegression Score : 0.3835130743065618

For Loan_Amount  RandomForest performs better. So we will go with this.

float64 


Dependents
model_rf.fit done
RandomForest Score : 0.6509598603839442

LogisticRegression Score : 0.5706806282722513

For Dependents  RandomForest performs better. So we will go with this.

int32 


Loan_Amount_Term
model_rf.fit done
RandomForest Score : 0.8654173764906303

LogisticRegression Score : 0.848381601362862

For Loan_Amount_Term  RandomForest performs better. So we will go wi

[{'Credit_History': 'Random Forest'},
 {'Self_Employed': 'Random Forest'},
 {'Loan_Amount': 'Random Forest'},
 {'Dependents': 'Random Forest'},
 {'Loan_Amount_Term': 'Random Forest'},
 {'Gender': 'Random Forest'},
 {'Married': 'Random Forest'}]

In [18]:
data_types = []
for col in data_null_treated.columns:
    data_types.append(data_null_treated[col].dtype)

primitives = ft.list_primitives()
agg_primitives = primitives[primitives['type'] == 'aggregation']['name']
agg_primitives = agg_primitives.to_list()
trans_primitives = primitives[primitives['type'] == "transform"]['name']
trans_primitives = trans_primitives.to_list()

def auto_feature_gen(data):
    
    '''
        Function for generating features automatically
        Takes 1 positional arguments
        
        Parameters:
            data (dataframe): Dataframe to work on
        
        Returns:
            features (dataframe)
            features_names (list) 
    '''
    
    if 'object' not in data_types:
        print(True)
        es = ft.EntitySet(id='UserDefined')
        try:
            es = es.entity_from_dataframe(entity_id = 'data', dataframe = data, make_index = True, index = 'index')
        except RuntimeError:
            es = es.entity_from_dataframe(entity_id = 'data', dataframe = data, index = 'index')
            
#         features, features_names = ft.dfs(entityset=es, target_entity='data', 
#                                       verbose=True, agg_primitives = ['min','max','count','mean','median','mode','std','skew'],
#                                       trans_primitives = ['cum_sum','less_than','not_equal','greater_than','diff','absolute',
#                                                           'percentile','equal'])
        
        features, features_names = ft.dfs(entityset=es, target_entity='data', 
                                      verbose=True, agg_primitives = agg_primitives,
                                      trans_primitives = trans_primitives)

    else:
        print(False)
        es = an.auto_entityset(data, accuracy = 1, name="UserDefined", index = 'index')
        
#         features, features_names = ft.dfs(entityset=es, target_entity='index', 
#                                       verbose=True, agg_primitives = ['min','max','count','mean','median','mode','std','skew'],
#                                       trans_primitives = ['cum_sum','less_than','not_equal','greater_than','diff','absolute',
#                                                           'percentile','equal'])
        
        features, features_names = ft.dfs(entityset=es, target_entity='index', 
                                      verbose=True, agg_primitives = agg_primitives,
                                      trans_primitives = trans_primitives)
    return features, features_names

features, features_names = auto_feature_gen(data_null_treated)

cols_to_drop = features.columns[features.isnull().any()].to_list()
features.drop(cols_to_drop, inplace = True, axis = 1)

num_features = []
for col in features.columns:
    if features[col].dtype in ['int32', 'int64', 'float32', 'float64']:
        num_features.append(col)
        
for col in num_features:
    if np.any(np.isinf(features[col])):
        indices = features[[col]].isin([np.nan, np.inf, -np.inf]).any(1)
        features.loc[indices,col] = 0

features

  0%|          | 0/12 [00:00<?, ?it/s]

False


100%|██████████| 12/12 [00:04<00:00,  2.86it/s]


Built 325 features
Elapsed: 00:00 | Progress: 100%|██████████


Unnamed: 0_level_0,Education,Applicant_Income,Coapplicant_Income,Property_Area,Loan_Status,Credit_History,Self_Employed,Loan_Amount,Dependents,Loan_Amount_Term,...,PERCENTILE(Loan_Amount),0 - Applicant_Income,0 - Coapplicant_Income,0 - Loan_Amount,Applicant_Income - Coapplicant_Income,Applicant_Income - Loan_Amount,Coapplicant_Income - Loan_Amount,Applicant_Income - 0,Coapplicant_Income - 0,Loan_Amount - 0
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Graduate,5849.0,0.0,Urban,Y,1,No,153.059393,0,360,...,0.679153,-5849.0,0.0,-153.059393,5849.0,5695.940607,-153.059393,5849.0,0.0,153.059393
1,Graduate,4583.0,1508.0,Rural,N,1,No,128.000000,1,360,...,0.508143,-4583.0,-1508.0,-128.000000,3075.0,4455.000000,1380.000000,4583.0,1508.0,128.000000
2,Graduate,3000.0,0.0,Urban,Y,1,Yes,66.000000,0,360,...,0.074104,-3000.0,0.0,-66.000000,3000.0,2934.000000,-66.000000,3000.0,0.0,66.000000
3,Not Graduate,2583.0,2358.0,Urban,Y,1,No,120.000000,0,360,...,0.430782,-2583.0,-2358.0,-120.000000,225.0,2463.000000,2238.000000,2583.0,2358.0,120.000000
4,Graduate,6000.0,0.0,Urban,Y,1,No,141.000000,0,360,...,0.624593,-6000.0,0.0,-141.000000,6000.0,5859.000000,-141.000000,6000.0,0.0,141.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,Graduate,2900.0,0.0,Rural,Y,1,No,71.000000,0,360,...,0.099349,-2900.0,0.0,-71.000000,2900.0,2829.000000,-71.000000,2900.0,0.0,71.000000
610,Graduate,4106.0,0.0,Rural,Y,1,No,40.000000,3+,180,...,0.017101,-4106.0,0.0,-40.000000,4106.0,4066.000000,-40.000000,4106.0,0.0,40.000000
611,Graduate,8072.0,240.0,Urban,Y,1,No,253.000000,1,360,...,0.915309,-8072.0,-240.0,-253.000000,7832.0,7819.000000,-13.000000,8072.0,240.0,253.000000
612,Graduate,7583.0,0.0,Urban,Y,1,No,187.000000,2,360,...,0.831433,-7583.0,0.0,-187.000000,7583.0,7396.000000,-187.000000,7583.0,0.0,187.000000


In [20]:
data_null_treated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               614 non-null    int32  
 1   Education           614 non-null    object 
 2   Applicant_Income    614 non-null    float64
 3   Coapplicant_Income  614 non-null    float64
 4   Property_Area       614 non-null    object 
 5   Loan_Status         614 non-null    object 
 6   Credit_History      614 non-null    object 
 7   Self_Employed       614 non-null    object 
 8   Loan_Amount         614 non-null    float64
 9   Dependents          614 non-null    object 
 10  Loan_Amount_Term    614 non-null    object 
 11  Gender              614 non-null    object 
 12  Married             614 non-null    object 
dtypes: float64(3), int32(1), object(9)
memory usage: 60.1+ KB


## Feature selection

In [308]:
def feature_selection(data_null_treated, target, target_type):
    
    final_features_choosed = []
    ## Running Chisqaure for Categorical columns
    if target_type == 'Categorical':
        chi_columns = data_null_treated[categorical_column_names].drop(target, axis = 1).columns
    else:
        chi_columns = data_null_treated[categorical_column_names].columns
    p_value_for_chisq = []
    name = []
    for i in chi_columns:
        cont = pd.crosstab(data_null_treated[i],
                           data_null_treated[target])
        name.append(i)
        p_value_for_chisq.append(chi2_contingency(cont)[1])
        chisqaure_df = pd.DataFrame({'Variables':name,'P_value':p_value_for_chisq})

    ## Getting columns which are dependent to our target column at 90% confidence interval    
    chi_square_imp_feature = chisqaure_df[chisqaure_df['P_value'] < 0.10]['Variables']

    ## Getting dataframe with Categorical(which are dependent) and numerical columns
    data_complete = pd.concat([data_null_treated[chi_square_imp_feature], data_null_treated[numerical_column_names]], axis = 1)
    data_complete[target] = data_null_treated[[target]]
    print(data_complete.info())


    ## Running Random Forest for those important columns from chisquare
    rf = RandomForestClassifier(n_estimators=500, max_depth=6, random_state=5000, min_samples_leaf=2)
    x = data_complete.drop(target, axis = 1)
    #y = data_complete[[target]]
    y = np.asarray(data_complete[target], dtype = '|S6')
    x = pd.get_dummies(x,  drop_first=True)
    rf.fit(x,  y)
    rf.score(x ,y)

#     ## Plotting those feature with their importance
#     variables = x.columns
#     importances = rf.feature_importances_
#     indices = np.argsort(importances)

#     plt.figure(figsize=(10,8))
#     plt.title('Feature Importances')
#     plt.barh(range(len(indices)), importances[indices], color='b', align='center')
#     plt.yticks(range(len(indices)), [variables[i] for i in indices])
#     plt.xlabel('Relative Importance')
#     plt.show()


    ## Creating a dataframe with variables and their importance as well as the running total of the importance
    variables = x.columns
    feature_imp = pd.DataFrame({'Variables':variables, 'Importance':rf.feature_importances_})
    feature_imp = feature_imp.sort_values('Importance', ascending = False).reset_index().drop('index', axis=1)
    variable_imp_values = pd.Series(feature_imp['Importance'])
    running_total = variable_imp_values.cumsum()
    feature_imp['Running_Total'] = running_total
    feature_imp

    ##Proceeding with variables which contribute upto 90% 
    final_variables = list(feature_imp[feature_imp['Running_Total'] < 0.94]['Variables'])

    x_dummies = pd.get_dummies(data_complete.drop(target, axis = 1), drop_first=True)
    global final_data_for_modelling
    final_data_for_modelling = x_dummies[final_variables]
    final_data_for_modelling[target] = data_complete[[target]]
    final_features_choosed = final_data_for_modelling.columns
    print(feature_imp,'\n\n')
    return final_features_choosed

In [309]:
feature_selection(data_null_treated, 'price', 'Numerical')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 999 entries, 0 to 998
Columns: 115 entries, bathrooms to sqft_lot15
dtypes: float64(11), object(104)
memory usage: 905.3+ KB
None
                          Variables  Importance  Running_Total
0                               lat    0.057026       0.057026
1                          sqft_lot    0.054248       0.111273
2                        sqft_lot15    0.053126       0.164399
3                              long    0.050602       0.215001
4                     sqft_living15    0.050590       0.265591
..                              ...         ...            ...
170               yr_renovated_1955    0.000000       1.000000
171               yr_renovated_1954    0.000000       1.000000
172               yr_renovated_1940    0.000000       1.000000
173  bathrooms = sqft_basement_True    0.000000       1.000000
174               yr_renovated_1958    0.000000       1.000000

[175 rows x 3 columns] 




Index(['lat', 'sqft_lot', 'sqft_lot15', 'long', 'sqft_living15', 'sqft_living',
       'sqft_above', 'yr_built', 'zipcode', 'sqft_basement',
       ...
       'view > floors_True', 'floors = view_True',
       'bathrooms = condition_True', 'yr_renovated_2014', 'bathrooms_4.25',
       'floors > view_True', 'condition != grade_True',
       'sqft_basement < yr_built_True', 'yr_built < sqft_basement_True',
       'price'],
      dtype='object', length=101)

## Final Modelling 

In [310]:
final_data_for_modelling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 999 entries, 0 to 998
Columns: 101 entries, lat to price
dtypes: float64(11), uint8(90)
memory usage: 181.5 KB


In [311]:
final_data_for_modelling

Unnamed: 0_level_0,lat,sqft_lot,sqft_lot15,long,sqft_living15,sqft_living,sqft_above,yr_built,zipcode,sqft_basement,...,view > floors_True,floors = view_True,bathrooms = condition_True,yr_renovated_2014,bathrooms_4.25,floors > view_True,condition != grade_True,sqft_basement < yr_built_True,yr_built < sqft_basement_True,price
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,47.3262,9373.0,7316.0,-122.214,2060.0,2400.0,2400.0,1991.0,98002.0,0.0,...,0,0,1,0,0,1,1,1,0,280000.0
1,47.4444,26036.0,21891.0,-122.351,2590.0,2060.0,1160.0,1947.0,98166.0,900.0,...,0,0,0,0,0,1,1,1,0,647500.0
2,47.4434,43000.0,20023.0,-122.347,2250.0,1460.0,1460.0,1952.0,98166.0,0.0,...,0,0,0,0,0,1,1,1,0,400000.0
3,47.4783,7599.0,10320.0,-122.265,1290.0,1430.0,1010.0,1930.0,98168.0,420.0,...,0,0,0,0,0,1,1,1,0,235000.0
4,47.5803,3504.0,3504.0,-122.294,1480.0,1650.0,760.0,1951.0,98144.0,890.0,...,0,0,0,0,0,1,1,1,0,402500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,47.6691,6733.0,6343.0,-122.360,1770.0,2500.0,1770.0,1979.0,98107.0,730.0,...,0,0,0,0,0,1,1,1,0,685000.0
995,47.6698,7975.0,5722.0,-122.361,1080.0,870.0,870.0,1946.0,98107.0,0.0,...,1,0,0,0,0,0,1,1,0,475000.0
996,47.6885,3440.0,5080.0,-122.376,1520.0,2020.0,1480.0,1928.0,98117.0,540.0,...,0,0,0,0,0,1,1,1,0,542000.0
997,47.6886,3740.0,5080.0,-122.375,1090.0,1540.0,770.0,1946.0,98117.0,770.0,...,0,0,0,0,0,1,1,1,0,525000.0


## Label Encoding Target Feature

In [364]:
def encoding(data, target):
    global labelencoder
    labelencoder = LabelEncoder()
    data[[target]] = labelencoder.fit_transform(data[target])
    return data

#### For categorical

In [403]:
# for col in features.columns:
#     if features[col].dtype in ['int32', 'int64', 'float32', 'float64']:
#         print(col, ":",np.max(features[col]))
#     else:
#         pass
#np.any(np.isnan(features))
#np.all(np.isfinite(features))
#features.drop(['index'],axis = 1, inplace=True)
#features.reset_index(inplace=True)
#features.columns
def clean_dataset(df): 
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame" 
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1) 
    return df[indices_to_keep].astype(np.float64)

In [422]:
for col in num_features:
    if np.any(np.isinf(features[col])):
        indices = features[[col]].isin([np.nan, np.inf, -np.inf]).any(1)
        features.loc[indices,col] = 0
    print(np.any(np.isinf(features[col])))

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [423]:
features

Unnamed: 0,Education,Applicant_Income,Coapplicant_Income,Property_Area,Loan_Status,Credit_History,Self_Employed,Loan_Amount,Dependents,Loan_Amount_Term,...,PERCENTILE(Loan_Amount),0 - Applicant_Income,0 - Coapplicant_Income,0 - Loan_Amount,Applicant_Income - Coapplicant_Income,Applicant_Income - Loan_Amount,Coapplicant_Income - Loan_Amount,Applicant_Income - 0,Coapplicant_Income - 0,Loan_Amount - 0
0,Graduate,5849.0,0.0,Urban,Y,1,No,148.156472,0,360,...,0.648208,-5849.0,0.0,-148.156472,5849.0,5700.843528,-148.156472,5849.0,0.0,148.156472
1,Graduate,4583.0,1508.0,Rural,N,1,No,128.000000,1,360,...,0.508143,-4583.0,-1508.0,-128.000000,3075.0,4455.000000,1380.000000,4583.0,1508.0,128.000000
2,Graduate,3000.0,0.0,Urban,Y,1,Yes,66.000000,0,360,...,0.070847,-3000.0,0.0,-66.000000,3000.0,2934.000000,-66.000000,3000.0,0.0,66.000000
3,Not Graduate,2583.0,2358.0,Urban,Y,1,No,120.000000,0,360,...,0.430782,-2583.0,-2358.0,-120.000000,225.0,2463.000000,2238.000000,2583.0,2358.0,120.000000
4,Graduate,6000.0,0.0,Urban,Y,1,No,141.000000,0,360,...,0.622964,-6000.0,0.0,-141.000000,6000.0,5859.000000,-141.000000,6000.0,0.0,141.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,Graduate,2900.0,0.0,Rural,Y,1,No,71.000000,0,360,...,0.099349,-2900.0,0.0,-71.000000,2900.0,2829.000000,-71.000000,2900.0,0.0,71.000000
610,Graduate,4106.0,0.0,Rural,Y,1,No,40.000000,3+,180,...,0.017101,-4106.0,0.0,-40.000000,4106.0,4066.000000,-40.000000,4106.0,0.0,40.000000
611,Graduate,8072.0,240.0,Urban,Y,1,No,253.000000,1,360,...,0.915309,-8072.0,-240.0,-253.000000,7832.0,7819.000000,-13.000000,8072.0,240.0,253.000000
612,Graduate,7583.0,0.0,Urban,Y,1,No,187.000000,2,360,...,0.831433,-7583.0,0.0,-187.000000,7583.0,7396.000000,-187.000000,7583.0,0.0,187.000000


In [404]:
num_features = []
for col in features.columns:
    if features[col].dtype in ['int32', 'int64', 'float32', 'float64']:
        num_features.append(col)
    else:
        pass
num_features

['Applicant_Income',
 'Coapplicant_Income',
 'Loan_Amount',
 'ABSOLUTE(Applicant_Income)',
 'ABSOLUTE(Coapplicant_Income)',
 'ABSOLUTE(Loan_Amount)',
 'Applicant_Income + Coapplicant_Income',
 'Applicant_Income + Loan_Amount',
 'Coapplicant_Income + Loan_Amount',
 'Applicant_Income + 0',
 'Coapplicant_Income + 0',
 'Loan_Amount + 0',
 'CUM_MAX(Applicant_Income)',
 'CUM_MAX(Coapplicant_Income)',
 'CUM_MAX(Loan_Amount)',
 'CUM_MEAN(Applicant_Income)',
 'CUM_MEAN(Coapplicant_Income)',
 'CUM_MEAN(Loan_Amount)',
 'CUM_MIN(Applicant_Income)',
 'CUM_MIN(Coapplicant_Income)',
 'CUM_MIN(Loan_Amount)',
 'CUM_SUM(Applicant_Income)',
 'CUM_SUM(Coapplicant_Income)',
 'CUM_SUM(Loan_Amount)',
 '1 / Applicant_Income',
 '1 / Coapplicant_Income',
 '1 / Loan_Amount',
 'Applicant_Income / Coapplicant_Income',
 'Applicant_Income / Loan_Amount',
 'Coapplicant_Income / Applicant_Income',
 'Coapplicant_Income / Loan_Amount',
 'Loan_Amount / Applicant_Income',
 'Loan_Amount / Coapplicant_Income',
 'Applicant_I

In [415]:
num_features_df = clean_dataset(features[num_features])

In [416]:
num_features_df

Unnamed: 0,Applicant_Income,Coapplicant_Income,Loan_Amount,ABSOLUTE(Applicant_Income),ABSOLUTE(Coapplicant_Income),ABSOLUTE(Loan_Amount),Applicant_Income + Coapplicant_Income,Applicant_Income + Loan_Amount,Coapplicant_Income + Loan_Amount,Applicant_Income + 0,...,PERCENTILE(Loan_Amount),0 - Applicant_Income,0 - Coapplicant_Income,0 - Loan_Amount,Applicant_Income - Coapplicant_Income,Applicant_Income - Loan_Amount,Coapplicant_Income - Loan_Amount,Applicant_Income - 0,Coapplicant_Income - 0,Loan_Amount - 0
1,4583.0,1508.0,128.000000,4583.0,1508.0,128.000000,6091.0,4711.000000,1636.000000,4583.0,...,0.508143,-4583.0,-1508.0,-128.000000,3075.0,4455.000000,1380.000000,4583.0,1508.0,128.000000
3,2583.0,2358.0,120.000000,2583.0,2358.0,120.000000,4941.0,2703.000000,2478.000000,2583.0,...,0.430782,-2583.0,-2358.0,-120.000000,225.0,2463.000000,2238.000000,2583.0,2358.0,120.000000
5,5417.0,4196.0,267.000000,5417.0,4196.0,267.000000,9613.0,5684.000000,4463.000000,5417.0,...,0.934853,-5417.0,-4196.0,-267.000000,1221.0,5150.000000,3929.000000,5417.0,4196.0,267.000000
6,2333.0,1516.0,95.000000,2333.0,1516.0,95.000000,3849.0,2428.000000,1611.000000,2333.0,...,0.195440,-2333.0,-1516.0,-95.000000,817.0,2238.000000,1421.000000,2333.0,1516.0,95.000000
7,3036.0,2504.0,158.000000,3036.0,2504.0,158.000000,5540.0,3194.000000,2662.000000,3036.0,...,0.706026,-3036.0,-2504.0,-158.000000,532.0,2878.000000,2346.000000,3036.0,2504.0,158.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2400.0,3800.0,146.454093,2400.0,3800.0,146.454093,6200.0,2546.454093,3946.454093,2400.0,...,0.643322,-2400.0,-3800.0,-146.454093,-1400.0,2253.545907,3653.545907,2400.0,3800.0,146.454093
606,3400.0,2500.0,173.000000,3400.0,2500.0,173.000000,5900.0,3573.000000,2673.000000,3400.0,...,0.770358,-3400.0,-2500.0,-173.000000,900.0,3227.000000,2327.000000,3400.0,2500.0,173.000000
607,3987.0,1411.0,157.000000,3987.0,1411.0,157.000000,5398.0,4144.000000,1568.000000,3987.0,...,0.699511,-3987.0,-1411.0,-157.000000,2576.0,3830.000000,1254.000000,3987.0,1411.0,157.000000
608,3232.0,1950.0,108.000000,3232.0,1950.0,108.000000,5182.0,3340.000000,2058.000000,3232.0,...,0.301303,-3232.0,-1950.0,-108.000000,1282.0,3124.000000,1842.000000,3232.0,1950.0,108.000000


In [424]:
def LogisticRegression_modelling(data, target):
    global Logistic_Classification
    data_encoded = data.copy()
    data_encoded = encoding(data_encoded, target)
    X = data_encoded.drop(target, axis = 1)
    X = pd.get_dummies(X,drop_first=True)
    #print(X)
    Y = data_encoded[[target]]
    #print(Y)
    #print(type(Y))
    #Y=Y.astype('int')
    model = LogisticRegression()
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 10)
#     print('Number of Samples in Test File:',X_test.shape[0])
    model.fit(X_train, Y_train)
    Logistic_Classification = pickle.dumps(model)
    prediction = model.predict(X_test)
    prediction_prob = model.predict_proba(X_test)
    Accuracy = metrics.accuracy_score(Y_test, prediction)
    print('Test Accuracy :', Accuracy )
    confusion = confusion_matrix(Y_test[target], prediction)
#     print('\nConfusion Matrix :\n',confusion,'\n')
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
#     print('TP:', TP)
#     print('TN:', TN)
#     print('FP:', FP)
#     print('FN:', FN)
    print('True Positive Rate :', (TP / (TP + FN)))
    print('True Negative Rate :', (TN / (TN + FP)))
    comparison = pd.DataFrame()
    comparison['Actual'] = Y_test[target]
    comparison['Predicted'] = prediction
    comparison.to_csv('Logistic_Regression.csv', index = False)

In [430]:
def RandomForest_modelling(data, target):
    data_encoded = data.copy()
    global RF_Classification
    data_encoded = encoding(data_encoded, target)
    X = data_encoded.drop(target,axis=1)
    X = pd.get_dummies(X,drop_first=True)
    Y = data_encoded[[target]]

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 10)

    param_grid = {'n_estimators':(100,150),'min_samples_split':np.arange(2,6),'max_depth':(5,6)}
    gs = GridSearchCV(RandomForestClassifier(),param_grid=param_grid,cv=10)
    gs.fit(X_train,Y_train)
#     print("RandomForest : %0.4f [%s]"%(gs.best_score_,gs.best_params_))
    n_estimators_gv, min_sample_leaf_gv, max_depth_gv = gs.best_params_['n_estimators'], gs.best_params_['max_depth'], gs.best_params_['min_samples_split']

    model = RandomForestClassifier(max_depth =max_depth_gv , n_estimators=n_estimators_gv, min_samples_leaf=min_sample_leaf_gv)
    model.fit(X_train, Y_train)
    RF_Classification = pickle.dumps(model)
    prediction = model.predict(X_test)
#     model_score = model.score(X_train, Y_train)
    prediction_prob = model.predict_proba(X_test)
    Accuracy = metrics.accuracy_score(Y_test, prediction)
    print('Test Accuracy :', Accuracy )
    confusion = confusion_matrix(Y_test[target], prediction)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    print('True Positive Rate :', (TP / (TP + FN)))
    print('True Negative Rate :', (TN / (TN + FP)))
    comparison = pd.DataFrame()
    comparison['Actual'] = Y_test[target]
    comparison['Predicted'] = prediction
    comparison.to_csv('Random_Forest.csv', index = False)

In [431]:
def GB_modelling(data, target):
    data_encoded = data.copy()
    global GB_Classification
    data_encoded = encoding(data_encoded, target)
    X = data_encoded.drop(target,axis=1)
    X = pd.get_dummies(X,drop_first=True)
    Y = data_encoded[[target]]

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 10)

    param_grid = {'n_estimators':(100,150),'min_samples_split':np.arange(2,6),'max_depth':(5,6)}
    gs = GridSearchCV(RandomForestClassifier(),param_grid=param_grid,cv=10)
    gs.fit(X_train,Y_train)
#     print("RandomForest : %0.4f [%s]"%(gs.best_score_,gs.best_params_))
    n_estimators_gv, min_sample_leaf_gv, max_depth_gv = gs.best_params_['n_estimators'], gs.best_params_['max_depth'], gs.best_params_['min_samples_split']

    model = GradientBoostingClassifier(max_depth =max_depth_gv , n_estimators=n_estimators_gv, min_samples_leaf=min_sample_leaf_gv)
    model.fit(X_train, Y_train)
    GB_Classification = pickle.dumps(model)
    prediction = model.predict(X_test)
#     model_score = model.score(X_train, Y_train)
    prediction_prob = model.predict_proba(X_test)
    Accuracy = metrics.accuracy_score(Y_test, prediction)
    print('Test Accuracy :', Accuracy )
    confusion = confusion_matrix(Y_test[target], prediction)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    print('True Positive Rate :', (TP / (TP + FN)))
    print('True Negative Rate :', (TN / (TN + FP)))
    comparison = pd.DataFrame()
    comparison['Actual'] = Y_test[target]
    comparison['Predicted'] = prediction
    comparison.to_csv('Gradient_Boosting.csv', index = False)

In [378]:
features.columns[features.isnull().any()]

Index([], dtype='object')

In [425]:
LogisticRegression_modelling(features, 'Loan_Status')

Test Accuracy : 0.7135135135135136
True Positive Rate : 0.9924812030075187
True Negative Rate : 0.0


In [432]:
RandomForest_modelling(features, 'Loan_Status')

Test Accuracy : 0.7402597402597403
True Positive Rate : 0.9734513274336283
True Negative Rate : 0.0975609756097561


In [434]:
GB_modelling(features,'Loan_Status')

Test Accuracy : 0.7792207792207793
True Positive Rate : 0.9203539823008849
True Negative Rate : 0.3902439024390244


#### For numerical

In [313]:
## Evaluation Metric
def AdjustedRsqaure(N, p, R_square):
    # N = length of dataset, p = Number of predictors 
    result = 1 - (1 - R_square) * (N - 1) / (N - p - 1)
    return result

In [314]:
def LinearRegression_modelling(data, target):
    global Linear_Regression
    X = data.drop(target, axis = 1)
    X = pd.get_dummies(X)
    
    ##Variables for Adjusted R_sqaure
    N = len(X)
    p = X.shape[1]
    
    Y = data[[target]]
    model = LinearRegression()
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 555)
    model.fit(X_train, Y_train)
    Linear_Regression = pickle.dumps(model)
#     print('Train score', model.score(X_train, Y_train))
    prediction = model.predict(X_test)
    RMSE = np.sqrt(metrics.mean_squared_error(Y_test, prediction))
    R_square = metrics.r2_score(Y_test, prediction)
    Adjusted_r_sqaure = AdjustedRsqaure(N, p, R_square)
    print('RMSE :', RMSE)
    comparison = pd.DataFrame()
    comparison['Actual'] = Y_test[target]
    comparison['Predicted'] = prediction.round()
    comparison.to_csv('Linear_Regression_output.csv', index = False)
    print('RSquare :', R_square)
    print('AdjustedRSqaure :', Adjusted_r_sqaure)
#     return RMSE, R_square , Adjusted_r_sqaure

In [315]:
def RandomForest_modelling(data, target):
    global RF_Regression
    X = data.drop(target,axis=1)
    X = pd.get_dummies(X)
    Y = data[[target]]
    
    ##Variables for Adjusted R_sqaure
    N = len(X)
    p = X.shape[1]
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 7)

    param_grid = {'n_estimators':(100,150),'min_samples_split':np.arange(2,6),'max_depth':(5,6)}
    gs = GridSearchCV(RandomForestRegressor(),param_grid=param_grid,cv=10)
    gs.fit(X_train,Y_train)
#     print("RandomForest : %0.4f [%s]"%(gs.best_score_,gs.best_params_))
    n_estimators_gv, min_sample_leaf_gv, max_depth_gv = gs.best_params_['n_estimators'], gs.best_params_['max_depth'], gs.best_params_['min_samples_split']

    model = RandomForestRegressor(max_depth =max_depth_gv , n_estimators=n_estimators_gv, min_samples_leaf=min_sample_leaf_gv)
    model.fit(X_train, Y_train)
    RF_Regression = pickle.dumps(model)
    model_score = model.score(X_train, Y_train)
    prediction = model.predict(X_test)
    RMSE = np.sqrt(metrics.mean_squared_error(Y_test, prediction))
    R_square = metrics.r2_score(Y_test, prediction)
    Adjusted_r_sqaure = AdjustedRsqaure(N, p, R_square)
    print('RMSE : ', RMSE)
    print('RSquare :', R_square)
    print('AdjustedRSqaure :', Adjusted_r_sqaure)
    comparison = pd.DataFrame()
    comparison['Actual'] = Y_test[target]
    comparison['Predicted'] = prediction.round()
    comparison.to_csv('Random_Forest_output.csv', index = False)
#     return RMSE, R_square , Adjusted_r_sqaure

In [316]:
def KNN_modelling(data, target):
    global KNN_Regression
    X = data.drop(target, axis = 1)
    X = pd.get_dummies(X)
    Y = data[[target]]
    
    ##Variables for Adjusted R_sqaure
    N = len(X)
    p = X.shape[1]
    
    model = KNeighborsRegressor()
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 555)
    model.fit(X_train, Y_train)
    KNN_Regression = pickle.dumps(model)
#     print('Train score', model.score(X_train, Y_train))
    prediction = model.predict(X_test)
    RMSE = np.sqrt(metrics.mean_squared_error(Y_test, prediction))
    R_square = metrics.r2_score(Y_test, prediction)
    Adjusted_r_sqaure = AdjustedRsqaure(N, p, R_square)
    print('RMSE : ', RMSE)
    print('RSquare :', R_square)
    print('AdjustedRSqaure :', Adjusted_r_sqaure)
    comparison = pd.DataFrame()
    comparison['Actual'] = Y_test[target]
    comparison['Predicted'] = prediction.round()
    comparison.to_csv('KNN_output.csv', index = False)
#     return RMSE, R_square , Adjusted_r_sqaure

In [317]:
def KNN_modelling_scaled(data, target):
    global KNNScaled_Regression
    X = data.drop(target, axis = 1)
    X = pd.get_dummies(X)
    X = stdscaler.fit_transform(X)
    Y = data[[target]]
    
    ##Variables for Adjusted R_sqaure
    N = len(X)
    p = X.shape[1]
    
    model = KNeighborsRegressor()
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 555)
    model.fit(X_train, Y_train)
    KNNScaled_Regression = pickle.dumps(model)
#     print('Train score', model.score(X_train, Y_train))
    prediction = model.predict(X_test)
    RMSE = np.sqrt(metrics.mean_squared_error(Y_test, prediction))
    R_square = metrics.r2_score(Y_test, prediction)
    Adjusted_r_sqaure = AdjustedRsqaure(N, p, R_square)
    print('RMSE : ', RMSE)
    print('RSquare :', R_square)
    print('AdjustedRSqaure :', Adjusted_r_sqaure)
    comparison = pd.DataFrame()
    comparison['Actual'] = Y_test[target]
    comparison['Predicted'] = prediction.round()
    comparison.to_csv('KNN_Scaled.csv', index = False)
#     return RMSE, R_square , Adjusted_r_sqaure

In [318]:
LinearRegression_modelling(final_data_for_modelling, 'price')

RMSE : 179356.728766591
RSquare : 0.6267186679267454
AdjustedRSqaure : 0.5851505908584542


In [319]:
RandomForest_modelling(final_data_for_modelling, 'price')

RMSE :  246035.24249166274
RSquare : 0.6118619224421493
AdjustedRSqaure : 0.5686394193733464


In [320]:
KNN_modelling(final_data_for_modelling, 'price')

RMSE :  197919.17284732405
RSquare : 0.5454552384330567
AdjustedRSqaure : 0.4948377816884082


In [321]:
KNN_modelling_scaled(final_data_for_modelling, 'price')

RMSE :  196384.30957189697
RSquare : 0.5524778917618032
AdjustedRSqaure : 0.5026424676818257


## Live Predictions

In [202]:
test = pd.read_csv(r'C:\Users\Administrator\Downloads\ML 360 Data\loan_test_1.csv')
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001028,Male,Yes,2,Graduate,No,3073,8106,200,360,1,Urban
1,LP001029,Male,No,0,Graduate,No,1853,2840,114,360,1,Rural


In [203]:
def live_prediction(test, target, target_type, Model):
    ## Changing test column datatypes similar to train column datatypes
    for col in test.columns:
        test[col] = test[col].astype(train_columns_dtypes[col])
    ## Changing the categorical columns datatypes in test data as 'object' using reference from Cat - Num separation Logic
    for i in categorical_column_names:
        if i in test.columns:
            test[i] = test[i].astype('object')
    ## Getting dummy variables for test data
    test_dummies = pd.get_dummies(test, drop_first=False)
    test_columns = test_dummies.columns
    ## Final columns used for modelling
    train_columns = final_data_for_modelling.columns
    ## Checking whatever train columns exist in test columns and separating it
    cols_in = []
    cols_out = []
    for col in train_columns:
        if col in test_columns:
            cols_in.append(col)
        else:
            cols_out.append(col)
    ## Removing target column which won't exist in test columns
    if target in cols_out:
        cols_out.remove(target)
    ## Creating a dataframe with the columns which exist in test
    final_test_data = test_dummies[cols_in]
    final_test_data
    ## Creating new columns and adding 0 for those columns which doesn't exist in test columns
    for col in cols_out:
        final_test_data[col] = 0
    ## Prediction
    model = pickle.loads(Model)
    prediction = model.predict(final_test_data)
    ## If classification problem then reverse_encoding the target column
    if target_type == 'Categorical':
        prediction = labelencoder.inverse_transform(prediction)
        test[target] = prediction
    else:
        test[target] = prediction
    return test

In [204]:
live_prediction(test, 'Loan_Status', 'Categorical', Logistic_Classification)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001028,Male,Yes,2,Graduate,No,3073,8106.0,200.0,360,1,Urban,N
1,LP001029,Male,No,0,Graduate,No,1853,2840.0,114.0,360,1,Rural,N


In [205]:
live_prediction(test, 'Loan_Status', 'Categorical', RF_Classification)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001028,Male,Yes,2,Graduate,No,3073,8106.0,200.0,360,1,Urban,Y
1,LP001029,Male,No,0,Graduate,No,1853,2840.0,114.0,360,1,Rural,N


In [206]:
live_prediction(test, 'Loan_Status', 'Categorical', GB_Classification)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001028,Male,Yes,2,Graduate,No,3073,8106.0,200.0,360,1,Urban,Y
1,LP001029,Male,No,0,Graduate,No,1853,2840.0,114.0,360,1,Rural,N


## Process followed finally for Live Prediction

In [207]:
## Changing test column datatypes similar to train column datatypes
for col in test.columns:
    test[col] = test[col].astype(train_columns_dtypes[col])

In [208]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Loan_ID             2 non-null      object 
 1   Gender              2 non-null      object 
 2   Married             2 non-null      object 
 3   Dependents          2 non-null      object 
 4   Education           2 non-null      object 
 5   Self_Employed       2 non-null      object 
 6   Applicant_Income    2 non-null      int64  
 7   Coapplicant_Income  2 non-null      float64
 8   Loan_Amount         2 non-null      float64
 9   Loan_Amount_Term    2 non-null      float64
 10  Credit_History      2 non-null      float64
 11  Property_Area       2 non-null      object 
 12  Loan_Status         2 non-null      object 
dtypes: float64(4), int64(1), object(8)
memory usage: 336.0+ bytes


In [209]:
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001028,Male,Yes,2,Graduate,No,3073,8106.0,200.0,360.0,1.0,Urban,Y
1,LP001029,Male,No,0,Graduate,No,1853,2840.0,114.0,360.0,1.0,Rural,N


In [210]:
## Changing the categorical columns datatypes in test data as 'object' using reference from Cat - Num separation Logic
for i in categorical_column_names:
    if i in test.columns:
        test[i] = test[i].astype('object')

In [211]:
categorical_column_names

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Loan_Status',
 'Applicant_Income = Coapplicant_Income',
 'Coapplicant_Income = Credit_History',
 'Coapplicant_Income = index',
 'Credit_History = index',
 'Loan_Amount = Loan_Amount_Term',
 'Married = Self_Employed',
 'index = Loan_Amount',
 'index = Loan_Amount_Term',
 'Applicant_Income > Coapplicant_Income',
 'Applicant_Income > Credit_History',
 'Applicant_Income > Loan_Amount',
 'Applicant_Income > Loan_Amount_Term',
 'Coapplicant_Income > Applicant_Income',
 'Coapplicant_Income > Credit_History',
 'Coapplicant_Income > Loan_Amount',
 'Coapplicant_Income > Loan_Amount_Term',
 'Credit_History > Coapplicant_Income',
 'Loan_Amount > Coapplicant_Income',
 'Loan_Amount > Credit_History',
 'Loan_Amount > Loan_Amount_Term',
 'Loan_Amount_Term > Applicant_Income',
 'Loan_Amount_Term > Coapplicant_Income',
 'Loan_Amount_Term > Credit_History',
 'Loan_Amount_Term 

In [212]:
numerical_column_names

['Applicant_Income', 'Coapplicant_Income', 'Loan_Amount']

In [213]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Loan_ID             2 non-null      object 
 1   Gender              2 non-null      object 
 2   Married             2 non-null      object 
 3   Dependents          2 non-null      object 
 4   Education           2 non-null      object 
 5   Self_Employed       2 non-null      object 
 6   Applicant_Income    2 non-null      int64  
 7   Coapplicant_Income  2 non-null      float64
 8   Loan_Amount         2 non-null      float64
 9   Loan_Amount_Term    2 non-null      object 
 10  Credit_History      2 non-null      object 
 11  Property_Area       2 non-null      object 
 12  Loan_Status         2 non-null      object 
dtypes: float64(2), int64(1), object(10)
memory usage: 336.0+ bytes


In [48]:
test.head()

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7400062,2,1,790,5240,1,0,0,4,6,790,0,1925,0,98118,47.5303,-122.288,1430,5320
1,7600057,3,2,1410,2700,2,0,0,4,7,1410,0,1902,0,98122,47.6029,-122.302,1750,4000


In [214]:
## Getting dummy variables for test data
test_dummies = pd.get_dummies(test, drop_first=False)
test_columns = test_dummies.columns

In [215]:
test_columns

Index(['Applicant_Income', 'Coapplicant_Income', 'Loan_Amount',
       'Loan_ID_LP001028', 'Loan_ID_LP001029', 'Gender_Male', 'Married_No',
       'Married_Yes', 'Dependents_0', 'Dependents_2', 'Education_Graduate',
       'Self_Employed_No', 'Loan_Amount_Term_360.0', 'Credit_History_1.0',
       'Property_Area_Rural', 'Property_Area_Urban', 'Loan_Status_N',
       'Loan_Status_Y'],
      dtype='object')

In [216]:
## Final columns used for modelling
train_columns = final_data_for_modelling.columns

In [217]:
train_columns

Index(['Credit_History_1.0', 'Applicant_Income', 'Loan_Amount',
       'Coapplicant_Income = Credit_History_True',
       'Coapplicant_Income != Credit_History_True', 'Coapplicant_Income',
       'Property_Area_Semiurban', 'Married_Yes', 'Education_Not Graduate',
       'Loan_Status'],
      dtype='object')

In [218]:
## Checking whatever train columns exist in test columns and separating it
cols_in = []
cols_out = []
for col in train_columns:
    if col in test_columns:
        cols_in.append(col)
    else:
        cols_out.append(col)

In [219]:
cols_in

['Credit_History_1.0',
 'Applicant_Income',
 'Loan_Amount',
 'Coapplicant_Income',
 'Married_Yes']

In [220]:
cols_out

['Coapplicant_Income = Credit_History_True',
 'Coapplicant_Income != Credit_History_True',
 'Property_Area_Semiurban',
 'Education_Not Graduate',
 'Loan_Status']

In [221]:
## Creating a dataframe with the columns which exist in test
final_test_data = test_dummies[cols_in]
final_test_data

Unnamed: 0,Credit_History_1.0,Applicant_Income,Loan_Amount,Coapplicant_Income,Married_Yes
0,1,3073,200.0,8106.0,1
1,1,1853,114.0,2840.0,0


In [222]:
## Removing target column which won't exist in test columns
cols_out.remove('Loan_Status')

In [223]:
cols_out

['Coapplicant_Income = Credit_History_True',
 'Coapplicant_Income != Credit_History_True',
 'Property_Area_Semiurban',
 'Education_Not Graduate']

In [224]:
## Creating new columns and adding 0 for those columns which doesn't exist in test columns
for col in cols_out:
    final_test_data[col] = 0

In [61]:
final_test_data

Unnamed: 0,Credit_History_1.0,Loan_Amount,Applicant_Income,Coapplicant_Income,Property_Area_Semiurban
0,1,17.0,1299,1086.0,0
1,1,125.0,4950,0.0,0


In [225]:
model = pickle.loads(RF_Classification)
test['Loan_Status'] = labelencoder.inverse_transform(model.predict(final_test_data))
test

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001028,Male,Yes,2,Graduate,No,3073,8106.0,200.0,360,1,Urban,Y
1,LP001029,Male,No,0,Graduate,No,1853,2840.0,114.0,360,1,Rural,N


## Process tried for Live Prediction

In [226]:
X = ['a', 'b', 'c']

y = ['male', 'female', 'female']

X_encoded = labelencoder.fit_transform(X)

rf_model = RandomForestClassifier()

rf_model.fit(X_encoded[:, None], y)

x = ['a']

x_encoded = labelencoder.transform(x)
rf_model.predict(x_encoded[:, None])

array(['male'], dtype='<U6')

In [227]:
X_encoded[:, None]

array([[0],
       [1],
       [2]], dtype=int64)

In [228]:
test['Credit_History'] = test['Credit_History'].astype('object')

In [229]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Loan_ID             2 non-null      object 
 1   Gender              2 non-null      object 
 2   Married             2 non-null      object 
 3   Dependents          2 non-null      object 
 4   Education           2 non-null      object 
 5   Self_Employed       2 non-null      object 
 6   Applicant_Income    2 non-null      int64  
 7   Coapplicant_Income  2 non-null      float64
 8   Loan_Amount         2 non-null      float64
 9   Loan_Amount_Term    2 non-null      object 
 10  Credit_History      2 non-null      object 
 11  Property_Area       2 non-null      object 
 12  Loan_Status         2 non-null      object 
dtypes: float64(2), int64(1), object(10)
memory usage: 336.0+ bytes


In [230]:
pd.get_dummies(test)

Unnamed: 0,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_ID_LP001028,Loan_ID_LP001029,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_2,Education_Graduate,Self_Employed_No,Loan_Amount_Term_360.0,Credit_History_1.0,Property_Area_Rural,Property_Area_Urban,Loan_Status_N,Loan_Status_Y
0,3073,8106.0,200.0,1,0,1,0,1,0,1,1,1,1,1,0,1,0,1
1,1853,2840.0,114.0,0,1,1,1,0,1,0,1,1,1,1,1,0,1,0


In [231]:
final_data_for_modelling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 0 to 613
Data columns (total 10 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Credit_History_1.0                         614 non-null    uint8  
 1   Applicant_Income                           614 non-null    float64
 2   Loan_Amount                                614 non-null    float64
 3   Coapplicant_Income = Credit_History_True   614 non-null    uint8  
 4   Coapplicant_Income != Credit_History_True  614 non-null    uint8  
 5   Coapplicant_Income                         614 non-null    float64
 6   Property_Area_Semiurban                    614 non-null    uint8  
 7   Married_Yes                                614 non-null    uint8  
 8   Education_Not Graduate                     614 non-null    uint8  
 9   Loan_Status                                614 non-null    object 
dtypes: float64(3), object(1), 

In [232]:
train_columns = final_data_for_modelling.columns
print('Train columns:')
print(train_columns)

Train columns:
Index(['Credit_History_1.0', 'Applicant_Income', 'Loan_Amount',
       'Coapplicant_Income = Credit_History_True',
       'Coapplicant_Income != Credit_History_True', 'Coapplicant_Income',
       'Property_Area_Semiurban', 'Married_Yes', 'Education_Not Graduate',
       'Loan_Status'],
      dtype='object')


In [233]:
test_original_columns = test.columns
print('\nTest columns:')
print(test_original_columns)


Test columns:
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Applicant_Income', 'Coapplicant_Income',
       'Loan_Amount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
       'Loan_Status'],
      dtype='object')


In [234]:
Credit_History_1.0, Loan_Amount, Applicant_Income, Coapplicant_Income

SyntaxError: invalid syntax (<ipython-input-234-12c0d10041e0>, line 1)

In [235]:
test_dummies = pd.get_dummies(test,  drop_first=False)
test_dummies_columns = test_dummies.columns
print('\nTest columns after getting dummies:')
print(test_dummies_columns)


Test columns after getting dummies:
Index(['Applicant_Income', 'Coapplicant_Income', 'Loan_Amount',
       'Loan_ID_LP001028', 'Loan_ID_LP001029', 'Gender_Male', 'Married_No',
       'Married_Yes', 'Dependents_0', 'Dependents_2', 'Education_Graduate',
       'Self_Employed_No', 'Loan_Amount_Term_360.0', 'Credit_History_1.0',
       'Property_Area_Rural', 'Property_Area_Urban', 'Loan_Status_N',
       'Loan_Status_Y'],
      dtype='object')


In [236]:
for col in test_original_columns:
    if col not in test_dummies_columns:
        test_dummies = pd.concat([test_dummies, test[col]], axis=1)
test_columns = test_dummies.columns
print('\nTest columns dummies plus original columns:')
print(test_columns)


Test columns dummies plus original columns:
Index(['Applicant_Income', 'Coapplicant_Income', 'Loan_Amount',
       'Loan_ID_LP001028', 'Loan_ID_LP001029', 'Gender_Male', 'Married_No',
       'Married_Yes', 'Dependents_0', 'Dependents_2', 'Education_Graduate',
       'Self_Employed_No', 'Loan_Amount_Term_360.0', 'Credit_History_1.0',
       'Property_Area_Rural', 'Property_Area_Urban', 'Loan_Status_N',
       'Loan_Status_Y', 'Loan_ID', 'Gender', 'Married', 'Dependents',
       'Education', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History',
       'Property_Area', 'Loan_Status'],
      dtype='object')


In [237]:
final_test_columns = []
for col in test_columns:
    if col in train_columns:
        final_test_columns.append(col)
    else:
        pattern = str('^'+col+'+')
#     pattern = '^Credit_History+'
#         print(pattern)
    
        for coll in train_columns:
#             print(coll)
            result = re.match(pattern, coll)
            if result:
                final_test_columns.append(col)
                
test_df = test[final_test_columns]
print('\nTest columns similar to train columns:')
print(test_df.columns)

KeyError: "['Married_Yes', 'Credit_History_1.0'] not in index"

In [238]:
for col in test_df.columns:
    pattern = str('^'+col+'+')
    for coll in train_columns:
        result = re.match(pattern, coll)
        if result:
            test_df.rename(columns = {col:coll}, inplace = True) 

print('\nFinal Test columns after matching the name similar to train columns:')            
test_df.columns

NameError: name 'test_df' is not defined

In [95]:
for col in test_columns:
    if col in train_columns:
        continue
    else:
        pattern = str('^'+col+'_'+'+')    
        for col1 in train_columns:
            result = re.match(pattern, col1)
            if result:
                string1 = col1
                string2 = col+'_'
                if string2 in string1:
                    word_to_be_checked = string1.replace(string2,'')
#                     print(word_to_be_checked)
                    for col2 in test_df.columns:
                        result = re.match(pattern, col2)
                        if result:
                            test_df[col2] = test_df[col2].apply(lambda x:1 if x == word_to_be_checked else 0)

Unnamed: 0,Applicant_Income,Coapplicant_Income,Loan_Amount,Credit_History_1.0,Property_Area_Semiurban
0,1299,1086,17,0,0
1,4950,0,125,0,0


In [79]:
model = pickle.loads(Logistic_Model)
test['Loan_Status'] = model.predict(test_df)
test

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001030,Male,Yes,2,Graduate,No,1299,1086,17,120,1,Urban,1
1,LP001032,Male,No,0,Graduate,No,4950,0,125,360,1,Urban,1


## Date Detection

In [9]:
data = pd.read_excel('RiskBasedModelcopy.xlsx')

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 7 columns):
EmployeeId              56 non-null int64
BDIP Waiver             56 non-null object
Last Audited            56 non-null object
Disciplinary History    56 non-null object
Function                56 non-null object
Selected_For_Audit      56 non-null object
Date                    56 non-null int64
dtypes: int64(2), object(5)
memory usage: 3.1+ KB


In [6]:
## To find if any date present. Not neccessary as of now.

for col in data.columns:
#     if data[col].dtype == 'object':
        try:
            data[col] = pd.to_datetime(data[col])
        except ValueError:
            pass

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 7 columns):
EmployeeId              56 non-null datetime64[ns]
BDIP Waiver             56 non-null object
Last Audited            56 non-null object
Disciplinary History    56 non-null object
Function                56 non-null object
Selected_For_Audit      56 non-null object
Date                    56 non-null datetime64[ns]
dtypes: datetime64[ns](2), object(5)
memory usage: 3.1+ KB


In [8]:
data.head()

Unnamed: 0,EmployeeId,BDIP Waiver,Last Audited,Disciplinary History,Function,Selected_For_Audit,Date
0,1970-01-01 00:00:00.000000001,No,Never Audited,Level 3,Market Development,Yes,1970-01-01 00:00:00.000043419
1,1970-01-01 00:00:00.000000002,Yes,Audit > 5 Years ago,Level 1,Audit,Yes,1970-01-01 00:00:00.000043420
2,1970-01-01 00:00:00.000000003,No,Under 5 years ago,Level 2,AERS Advisory,Yes,1970-01-01 00:00:00.000043421
3,1970-01-01 00:00:00.000000004,Yes,Between 36 and 48 months,Level 3,Tax,Yes,1970-01-01 00:00:00.000043422
4,1970-01-01 00:00:00.000000005,No,Between 24 and 36 months,Level 1,Market Development,No,1970-01-01 00:00:00.000043423


## Light GBM

In [19]:
# label_encoder = LabelEncoder()

# data_null_treated[['Gender']] = label_encoder.fit_transform(data_null_treated['Gender'].astype(str))
# data_null_treated[['Married']] = label_encoder.fit_transform(data_null_treated['Married'].astype(str))
# data_null_treated[['Education']] = label_encoder.fit_transform(data_null_treated['Education'].astype(str))
# data_null_treated[['Self_Employed']] = label_encoder.fit_transform(data_null_treated['Self_Employed'].astype(str))
# data_null_treated[['Property_Area']] = label_encoder.fit_transform(data_null_treated['Property_Area'].astype(str))
# data_null_treated[['Loan_Status']] = label_encoder.fit_transform(data_null_treated['Loan_Status'].astype(str))

In [None]:
x = data_null_treated.drop('Loan_Status', axis = 1)
y = data_null_treated[['Loan_Status']]

In [72]:
x.head()

Unnamed: 0,Education,ApplicantIncome,CoapplicantIncome,Property_Area,Credit_History,Self_Employed,LoanAmount,Dependents,Loan_Amount_Term,Gender,Married
0,0,5849.0,0.0,2,1,0,147.75964,0,360,1,0
1,0,4583.0,1508.0,0,1,0,128.0,1,360,1,1
2,0,3000.0,0.0,2,1,1,66.0,0,360,1,1
3,1,2583.0,2358.0,2,1,0,120.0,0,360,1,1
4,0,6000.0,0.0,2,1,0,141.0,0,360,1,0


In [73]:
x = x.replace('3+', 3)

In [74]:
x['Credit_History'] = x['Credit_History'].astype('int')
x['Dependents'] = x['Dependents'].astype('int')
x['Loan_Amount_Term'] = x['Loan_Amount_Term'].astype('int')

In [75]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 0 to 613
Data columns (total 11 columns):
Education            614 non-null int64
ApplicantIncome      614 non-null float64
CoapplicantIncome    614 non-null float64
Property_Area        614 non-null int64
Credit_History       614 non-null int32
Self_Employed        614 non-null int64
LoanAmount           614 non-null float64
Dependents           614 non-null int32
Loan_Amount_Term     614 non-null int32
Gender               614 non-null int64
Married              614 non-null int64
dtypes: float64(3), int32(3), int64(5)
memory usage: 50.4 KB


In [76]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

In [77]:
data_train = lgb.Dataset(x_train, label=y_train)


params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 10
params['min_data'] = 50
params['max_depth'] = 15


clf = lgb.train(params, data_train, 100)

In [78]:
y_pred=clf.predict(x_test)

#convert into binary values
for i in range(0,99):
    if y_pred[i]>=.5:       # setting threshold to .5
        y_pred[i]=1
    else:  
        y_pred[i]=0
        
y_pred = np.floor(y_pred)

In [79]:
y_pred

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [80]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_pred,y_test)
accuracy

0.518918918918919

## Handling class imbalance in target column

In [24]:
# #Getting each class in target columns as a list

# unique_class = data.Loan_Status.value_counts().index
# unique_class = list(unique_class)

# #Finding the equal percentage for number of classes there in target column
# if len(unique_class) == 2:
#     equal_percentage = ((len(data.Loan_Status) / len(data.Loan_Status.value_counts().index)) / (len(data.Loan_Status))) * 100
#     equal_percentage = equal_percentage / 2
# else:
#     equal_percentage = ((len(data.Loan_Status) / len(data.Loan_Status.value_counts().index)) / (len(data.Loan_Status))) * 100

# #Finding classes with imbalance issue
# columns_with_no_imbalance_issue = []
# columns_with_imbalance_issue = []
# for i in unique_class:
#     if ((data.Loan_Status.value_counts()[i]) / len(data.Loan_Status)) * 100 >= equal_percentage:
#         print('Class with no imbalance issue :', i )
#         columns_with_no_imbalance_issue.append(i)
#     else:
#         print('Class with imbalance issue :', i)
#         columns_with_imbalance_issue.append(i)
        
        
# ##Finding the columns which need to be upsampled        
# each_class_percentage = []
# each_class = []
# for i in unique_class:
#     b = (data.Loan_Status.value_counts()[i] / len(data)) * 100
#     each_class.append(i)
#     each_class_percentage.append(b)
    
# final = pd.DataFrame({'Class':each_class, 'Percentage_present':each_class_percentage})

# max_class = final.sort_values('Percentage_present',ascending=False)['Class'][0]
# max_class_percentage = final.sort_values('Percentage_present',ascending=False)['Percentage_present'][0]

# print('\nTop most Majority class is :', max_class, '\t\tAnd the percentage of that class is :', max_class_percentage)

# classes = list(final['Class'])
# classes_to_be_upsampled = []
# for i in classes:
#     diff_percent = max_class_percentage - final[final['Class'] == i]['Percentage_present']
#     diff_percent = np.array(diff_percent)
#     if diff_percent > equal_percentage:
#         classes_to_be_upsampled.append(i)
        
# print('\nClasses to be upsampled :', classes_to_be_upsampled)