In [None]:
# the main focus of this notebook is on dimensionality reduction. It makes the code faster, more physically understandable, and more predictive.
# this topic is a book by itself, but simple tools can be used to get rid of features that only make the model more complex but not more predictive.
# here, I mention a few simple ones:
# if more than 25% of each feature data is missing, we can drop it
# if std of each feature is very low, here < 0.005, we may drop it
# if the pairwise correlation among features is more than 75%, we may drop one of them
# if VIF among features is more than 5, we may drop one feature
# Decision Tree algorithm can find the importance of each feature for the target, we can set a threshold and ignore less important ones

# not to mention, before this section of the code, we already drop variables that:
# have very low correlation with the target
# have more than 0.05 p_value
# being a categorical feature, have high ratio of value_counts to the length of dataset

In [None]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import f_regression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Lift pandas' display limits so long/wide frames are printed in full.
# NOTE(review): with max_rows set to None, displaying a whole frame prints
# every row -- prefer .head() / .sample() in the cells below.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Put cufflinks in offline mode and apply seaborn's default plot styling.
cf.go_offline()
sns.set()

In [None]:
df = pd.read_csv('../input/loan-predication/train_u6lujuX_CVtuZ9i (1).csv')
df.head()

In [None]:
df.info()

# 1) Defining the target

In [None]:
df['Loan_Status'].value_counts()

In [None]:
# Encode the label as a numeric column: 'Y' -> 1 and 'N' -> 0, then retire the
# raw 'Loan_Status' column so only `target` remains.
label_codes = {'Y' : 1, 'N' : 0}
df = df.assign(target = df['Loan_Status'].map(label_codes)).drop(columns = 'Loan_Status')
df.head()

In [None]:
df.describe()

In [None]:
df = df.drop(['Loan_ID'], axis = 1)

In [None]:
df.head()

In [None]:
# Share of missing entries in each column, in percent, largest first.
missing_counts = df.isnull().sum()
missing_counts.mul(100).div(len(df)).sort_values(ascending = False)

# 2) dropping clearly useless features

In [None]:
# Cardinality check for the categorical (object-dtype) features. For each one we
# record the number of distinct values (`nu`) and that number as a percentage of
# the row count (`Factor`); a high ratio suggests an ID-like column that can be dropped.
object_cols = list(df.select_dtypes(['object']).columns)
distinct_counts = [df[name].nunique() for name in object_cols]

factor_df = pd.DataFrame({
    'Column': object_cols,
    'Factor': [100 * count / len(df) for count in distinct_counts],
    'nu': distinct_counts,
})
factor_df

In [None]:
# above numbers look to be pretty small, so we keep them all

# 3) what features are not important for this target

In [None]:
# Absolute correlation of every numeric feature with the target, weakest first.
# Only numeric columns are passed to .corr(): the frame still holds object
# columns here, and pandas >= 2.0 no longer skips them silently.
# 'target' is removed by name rather than by position, because sort_values puts
# NaN entries last and a positional [:-1] could then drop the wrong row.
target_corr = df.select_dtypes(['number']).corr()['target'].drop('target').abs()
target_corr.sort_values(ascending = True).plot.bar(figsize = (16,8))

In [None]:
# Names of the (up to) six numeric features least correlated with the target.
# Only numeric columns are passed to .corr() (object columns raise on
# pandas >= 2.0), and 'target' itself is excluded so it cannot appear in the list.
df.select_dtypes(['number']).corr()['target'].drop('target').abs().sort_values(ascending = True)[:6].index

In [None]:
# let's drop the 'ApplicantIncome'

In [None]:
# Univariate p-values: regress the target on each numeric feature separately
# and collect the p-value of the slope.

col_list = []
p_list = []
for col in df.select_dtypes(['number']).columns:
    if col == 'target':
        continue  # regressing the target on itself carries no information
    # The frame may still contain missing values at this stage, and linregress
    # returns NaN as soon as either input holds a NaN, so incomplete pairs are
    # dropped for this feature only.
    pair = df[[col, 'target']].dropna()
    result = stats.linregress(pair[col], pair['target'])
    col_list.append(col)
    p_list.append(result.pvalue)

pval_table = pd.DataFrame({'col': col_list, 'p_values': p_list})
pval_table.sort_values(by = 'p_values', ascending = False)

# features whose p_value exceeds the 0.05 limit are candidates for removal

In [None]:
# above, again suggest we can drop 'ApplicantIncome'

In [None]:
df = df.drop('ApplicantIncome', axis = 1)

In [None]:
# Target split within each categorical feature, one figure per feature.
# Columns are passed by keyword: recent seaborn releases interpret the first
# positional argument of countplot as `data`, not as the x variable.
for col in df.select_dtypes(['object']):
    plt.figure(figsize = (16,6))
    sns.countplot(x = col, hue = 'target', data = df)
    plt.show()

In [None]:
# looks they all are important in the target

# 4) look into each categorical column

In [None]:
# Print the frequency table of every object-dtype column, one after another.
object_columns = df.select_dtypes(['object']).columns
for name in object_columns:
    print()
    print('for the feature:     ', name)
    print(df[name].value_counts())

# 5) Dealing with missing values

In [None]:
# Percentage of all cells in the frame that are missing.
# The denominator is the number of cells (rows x columns): dividing the total
# count of missing cells by the number of rows alone overstates the figure and
# can even exceed 100%.
100 * (df.isnull().sum().sum()) / df.size

In [None]:
# lets delete them
df = df.dropna()

# 6) is target balanced?

In [None]:
# is the label balanced?
sns.countplot(x = 'target', data = df)

# it is unbalanced but not too bad, as we dont have that many data, lets keep all

In [None]:
# Balance the target by down-sampling the majority class (binary labels val1 / val2).
# After shuffling, only the first `n_keep` rows of each label are kept, where
# `n_keep` is the size of the smaller class. Rows whose target is neither val1
# nor val2 are left untouched.

# first shuffle the rows; the fixed seed makes the retained subset reproducible
df = df.sample(frac=1, random_state=101).reset_index(drop=True)

# then set val1 and val2 below:
val1 = 0
val2 = 1

# find the number of each label
num_val1 = df[df['target'] == val1].shape[0]
num_val2 = df[df['target'] == val2].shape[0]
n_keep = min(num_val1, num_val2)

# position of each row within its own label group (0, 1, 2, ...) in shuffled order
rank_in_class = df.groupby('target').cumcount()
is_binary_label = df['target'].isin([val1, val2])

# keep a row if it is not one of the two labels, or if it is among the first
# n_keep rows of its label
df = df[~is_binary_label | (rank_in_class < n_keep)].reset_index(drop = True)

In [None]:
# is the label balanced, now?
sns.countplot(x = 'target', data = df)

In [None]:
df.shape

In [None]:
# One-hot encode every object-dtype column in a single call. The first level of
# each feature is dropped and the new columns are named '<feature>_<level>';
# the untouched columns stay in front, followed by the dummy columns.
categorical_cols = list(df.select_dtypes(['object']).columns)
df = pd.get_dummies(df, columns = categorical_cols, drop_first = True)

In [None]:
df.head()

In [None]:
df.shape

# dimensionality reduction

In [None]:
dfcopy = df.drop('target', axis = 1)

# a) based on percent missing

In [None]:
ther = 25

In [None]:
# Start the removal list and flag every column whose share of missing values
# (in percent) exceeds the current `ther` threshold. Each flagged name is
# printed and collected in `cols_to_be_deleted`.
cols_to_be_deleted = []
missing_pct = dfcopy.isnull().sum().mul(100).div(len(dfcopy))
for name, pct in missing_pct.items():
    if pct > ther and name not in cols_to_be_deleted:
        print(name)
        cols_to_be_deleted.append(name)

In [None]:
cols_to_be_deleted

# b) based on low variation in features

In [None]:
ther = 0.05

In [None]:
# Flag near-constant features: every column whose standard deviation is below
# the current `ther` threshold is added to `cols_to_be_deleted`.
feature_stds = {name: dfcopy[name].std() for name in dfcopy.columns}
for name, spread in feature_stds.items():
    if spread < ther and name not in cols_to_be_deleted:
        cols_to_be_deleted.append(name)

# Per-feature standard deviations for inspection.
# NOTE: the column is labelled 'var' but the values are standard deviations.
df_std = pd.DataFrame(data = list(feature_stds.values()), index = dfcopy.columns, columns = ['var'])
#df_std.sort_values(by = 'var')

In [None]:
cols_to_be_deleted

# c) based on pairwise correlation

In [None]:
ther = 0.50

In [None]:
dfcorr = dfcopy.corr()

In [None]:
# Drop one member of every strongly correlated feature pair: for each pair
# (col1, col2) with col2 located after col1, col2 is flagged when the magnitude
# of their correlation exceeds the current `ther` threshold.
#  - abs() is applied so that strong negative correlations are caught as well.
#  - pairs involving an already-flagged column are skipped: once a column is
#    scheduled for removal, its correlations no longer justify dropping others.
for i, col1 in enumerate(dfcorr.columns):
    for col2 in dfcorr.columns[i + 1:]:
        if col1 in cols_to_be_deleted or col2 in cols_to_be_deleted:
            continue
        if abs(dfcorr.loc[col1, col2]) > ther:
            cols_to_be_deleted.append(col2)

In [None]:
cols_to_be_deleted

# d) based on multi-collinearity

In [None]:
ther = 2

In [None]:
# Variance inflation factor (VIF): VIF_j = 1 / (1 - R_j^2), where R_j^2 comes
# from regressing feature j on ALL other remaining features. Regressing on a
# single other feature would only repeat the pairwise-correlation filter.
# Features are removed one at a time, worst first, until every remaining VIF is
# at or below the current `ther` threshold.
# Assumes dfcopy holds no missing values at this point.

def _vif(frame, col):
    """Return the VIF of `col` with respect to the other columns of `frame`.

    Returns inf when `col` is an exact linear combination of the other columns,
    and NaN when `col` is constant or there is no other column to regress on.
    """
    y_vals = frame[col].to_numpy(dtype = float)
    others = frame.drop(columns = col).to_numpy(dtype = float)
    ss_tot = ((y_vals - y_vals.mean()) ** 2).sum()
    if others.shape[1] == 0 or ss_tot == 0:
        return np.nan
    # least-squares fit with an intercept; lstsq copes with rank-deficient designs
    design = np.column_stack([np.ones(len(y_vals)), others])
    coef = np.linalg.lstsq(design, y_vals, rcond = None)[0]
    ss_res = ((y_vals - design @ coef) ** 2).sum()
    r_squared = 1 - ss_res / ss_tot
    return np.inf if r_squared >= 1 else 1 / (1 - r_squared)


remaining = [c for c in dfcopy.columns if c not in cols_to_be_deleted]
while len(remaining) > 1:
    vifs = pd.Series({c: _vif(dfcopy[remaining], c) for c in remaining})
    if not (vifs > ther).any():
        break
    # drop the single worst offender, then recompute: removing one feature
    # lowers the VIF of the features it was collinear with
    worst = vifs.idxmax()
    cols_to_be_deleted.append(worst)
    remaining.remove(worst)

In [None]:
cols_to_be_deleted

# e) based on decision tree feature importance

In [None]:
ther = 0.05

In [None]:
X = dfcopy
y = df['target']

In [None]:
# Create a random forest classifier (an ensemble of decision trees).
# A fixed random_state makes the fitted importances -- and therefore the set of
# features dropped further down -- reproducible between runs.
clf = RandomForestClassifier(random_state = 101)

# Train model
model = clf.fit(X, y)

# Calculate feature importances
importances = model.feature_importances_

In [None]:
df_importance = pd.DataFrame(data = importances, index = dfcopy.columns, columns = ['Importance'])
df_importance.sort_values(by = 'Importance').plot.barh(figsize = (12,24))

In [None]:
# Flag every feature whose importance is below the current `ther` threshold.
# The membership test is made on the feature being examined (`ind`), so each
# low-importance feature is added exactly once, independent of any variable
# left over from earlier loops.
for ind in df_importance.index:
    imp = df_importance.loc[ind, 'Importance']
    if imp < ther and ind not in cols_to_be_deleted:
        cols_to_be_deleted.append(ind)

In [None]:
cols_to_be_deleted

In [None]:
df = df.drop(cols_to_be_deleted, axis = 1)
df.shape

In [None]:
# Separate the features from the label.
y = df['target']
x = df.drop(columns = 'target')

# Hold out 10% of the rows as a test set (fixed seed for a repeatable split).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 101)

# Standardise the features. The scaler is fitted on the training rows only and
# then applied to both parts, so no test-set statistics leak into training.
scalar = StandardScaler()
scalar.fit(x_train)

x_train = scalar.transform(x_train)
x_test = scalar.transform(x_test)

In [None]:
# Fit a logistic-regression classifier on the scaled training data.
# The labels are flattened with np.ravel, which accepts both a pandas Series
# and a NumPy array; calling Series.ravel() directly is deprecated in recent
# pandas releases.
lm = LogisticRegression()
lm.fit(x_train, np.ravel(y_train))

In [None]:
# Side-by-side table of predicted and true labels for the test rows.
# y_test is re-indexed from 0 so it lines up with the positional predictions.
predicted_labels = lm.predict(x_test)
predictions = pd.DataFrame({
    'Predicted Values': predicted_labels,
    'Real Values': y_test.reset_index(drop = True),
})
predictions.head(10)

In [None]:
# Residual = real - predicted: 0 is a correct prediction, +1 a false negative
# (real 1, predicted 0) and -1 a false positive (real 0, predicted 1).
predictions ['Residuals'] = predictions ['Real Values'] - predictions['Predicted Values']
# The column is passed by keyword: recent seaborn releases interpret the first
# positional argument of countplot as `data`, not as the x variable.
sns.countplot(x = 'Residuals', data = predictions)

In [None]:
print(classification_report(y_test, predictions['Predicted Values']))

In [None]:
# Confusion matrix as a labelled table: rows are the actual classes, columns
# the predicted ones (NO = 0, YES = 1).
my_cm = confusion_matrix(y_test, predictions['Predicted Values'])
my_cmdf = pd.DataFrame(
    my_cm,
    index = ['Actual NO', 'Actual YES'],
    columns = ['Predicted NO', 'Predicted YES'],
)
my_cmdf

In [None]:
# Accuracy in percent: correctly classified test rows (the diagonal of the
# confusion matrix) over all test rows; the remainder is the error rate.
n_correct = my_cm[0][0] + my_cm[1][1]
n_total = len(predictions)
correct_percentage_log = 100 * n_correct / n_total
wrong_percentage = 100 - correct_percentage_log
print(f'Model Accuracy is: {correct_percentage_log:.4}%' )