# Loan Prediction Kaggle Contest
Atiya Kailany <br>
May 15, 2021<br>
<br>

------------------------------------------------------


    

## Imports:

Importing the necessary libraries.

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

## Uploading Loan Prediction Data:

The train and test data are loaded into two separate sets of pandas DataFrames: one set for the raw-data KNN model and one set for all other models.

In [None]:
# Parse each competition CSV exactly once. These two frames feed the
# imputation / scaling / PCA pipeline used by the non-raw models.
# NOTE(review): this path is relative, while the file-listing cell walks the
# absolute '/kaggle/input' -- confirm which one matches the run environment.
train_data = pd.read_csv("./kaggle/input/loan-default-prediction/train_v2.csv")
test_data = pd.read_csv("./kaggle/input/loan-default-prediction/test_v2.csv")

# The raw-data KNN pipeline modifies its own frames, so give it independent
# deep copies instead of reading and parsing the same files a second time.
knn_train_data = train_data.copy()
knn_test_data = test_data.copy()

## Processing Data for KNN with raw data:

Eliminating incompatible types of entries and substituting null and missing values with 0’s

In [None]:
# Preprocessing for the k-nearest-neighbour run on (almost) raw data:
# no outlier handling, and missing values are simply replaced with 0.

# Columns that pandas parsed as dtype "object", detected on the training frame.
invalid_columns = knn_train_data.select_dtypes(include=['object']).columns

# For BOTH frames: drop the object-typed columns, drop the 'id' column,
# then replace every remaining null with 0.
knn_train_data = knn_train_data.drop(columns=invalid_columns).drop(columns='id').fillna(0)
knn_test_data = knn_test_data.drop(columns=invalid_columns).drop(columns='id').fillna(0)

# Take the 'loss' target out of the training frame; what is left are the features.
# all_train_data is the very same object as knn_train_data, not a copy.
train_loss = knn_train_data.pop('loss')
all_train_data = knn_train_data

## Processing the Second set of Data for all other models:

- Removing invalid columns
- substituting null and missing values with mean of X_train
- Dropping id columns

In [None]:
# Columns that pandas parsed as dtype "object", detected on the training frame.
invalid_columns = train_data.select_dtypes(include=['object']).columns

# Discard those columns from both frames.
train_data = train_data.drop(columns=invalid_columns)
test_data = test_data.drop(columns=invalid_columns)

# Only the test frame loses its 'id' column here; the training frame keeps it.
test_data = test_data.drop(columns='id')

# Number of null cells per training column ...
null_counts = train_data.isnull().sum()
#print(null_counts)

# ... restricted to the columns that actually contain nulls, as a DataFrame.
null_values = pd.DataFrame(null_counts[null_counts != 0])
#print(null_values)

## Processing the Second set of Data for all other models (Continued):

In addition, a train-test-split is performed for model fitting

In [None]:
# Select features and target by column label rather than by hard-coded
# position (previously iloc[:, 1:751] and iloc[:, 751]), so the selection
# stays correct if the number of surviving columns ever changes.
features = train_data.drop(columns=['id', 'loss'])
target = train_data['loss'].copy()

# Convert the target to binary: any positive loss becomes 1, zero stays 0.
target[target > 0] = 1

# Stratified split so both parts keep the same class proportions.
# NOTE(review): the held-out X_test is not imputed below, and the PCA cell
# later rebinds X_test to the competition test set -- confirm whether the
# hold-out is meant to be evaluated anywhere.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, stratify=target, random_state=0
)

# sum of null values
# print(X_train.isnull().sum().sum())
# print(test_data.isnull().sum().sum())

# Fill nulls with the column means of the training split only. The means are
# computed once, before X_train is modified, and reused for the test set.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
test_data = test_data.fillna(train_means)

# check again sum of null values
# print(X_train.isnull().sum().sum())
# print(test_data.isnull().sum().sum())

## Principal Component Analysis (PCA)

In [None]:
# Standardisation followed by PCA.

# Learn the scaling statistics from the training split and apply them to it.
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)

# NOTE: from here on X_test is the scaled competition test set used for the
# submissions; it replaces the held-out split produced by train_test_split.
X_test = scalar.transform(test_data)

# Reduce to 200 principal components, fitted on the training data only.
# random_state is fixed so the projection is repeatable from run to run even
# when scikit-learn picks a randomized SVD solver, matching the seeded
# train_test_split and LogisticRegression steps.
pca = PCA(n_components=200, random_state=0)
pca.fit(X_train)

# Project both sets onto the components and wrap the arrays in DataFrames.
X_train = pd.DataFrame(data=pca.transform(X_train))
X_test = pd.DataFrame(data=pca.transform(X_test))

## Logistic Regression

In [None]:
# Logistic regression with balanced class weights on the PCA features.
log_reg = LogisticRegression(class_weight='balanced', max_iter=400, random_state=1)

# Fit on the training split and predict the competition test set.
log_reg.fit(X_train, y_train)
log_reg_pred = log_reg.predict(X_test)

# check prediction output
print(log_reg_pred.shape)
print(log_reg_pred)

# Load the sample submission; its 'loss' column is overwritten below.
submission = pd.read_csv("./kaggle/input/loan-default-prediction/sampleSubmission.csv")
#print(submission['loss'].shape)

# Count of each predicted class. The predictions are passed as `x=` because
# positional use is deprecated in seaborn 0.11 and the first positional
# parameter is `data` from 0.12 on, which would not give per-class counts.
sns.countplot(x=log_reg_pred);

submission['loss'] = log_reg_pred
submission.to_csv("log_reg_submission.csv", index=False)

## Gaussian Naive Bayes

In [None]:
# Gaussian naive Bayes on the PCA features.
naive = GaussianNB()

# Fit on the training split and predict the competition test set.
naive.fit(X_train, y_train)
naive_pred = naive.predict(X_test)

# Count of each predicted class. The predictions are passed as `x=` because
# positional use is deprecated in seaborn 0.11 and the first positional
# parameter is `data` from 0.12 on, which would not give per-class counts.
sns.countplot(x=naive_pred);

# Reuses the `submission` frame loaded in the logistic-regression cell.
submission['loss'] = naive_pred
submission.to_csv("naive_submission.csv", index=False)

## Random Forest Classifier

In [None]:
# Random forest (10 trees, entropy criterion) on the PCA features.
# random_state is fixed so repeated runs give the same forest, in line with
# the seeded LogisticRegression model.
ran_forest = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=1)

# Fit on the training split and predict the competition test set.
ran_forest.fit(X_train, y_train)
ran_forest_pred = ran_forest.predict(X_test)

# Count of each predicted class. The predictions are passed as `x=` because
# positional use is deprecated in seaborn 0.11 and the first positional
# parameter is `data` from 0.12 on, which would not give per-class counts.
sns.countplot(x=ran_forest_pred);

# Reuses the `submission` frame loaded in the logistic-regression cell.
submission['loss'] = ran_forest_pred
submission.to_csv("ran_forest_submission.csv", index=False)

## Optimal K value determination Code:
This code will take a very long time to process as it is running the data set multiple times with
different k values and calculating the accuracies in order to determine the optimal k value.

P.S. This code has been commented out as it is unnecessary to run, since we already determined 8 to
yield the best results.

In [None]:
# TO TEST MULTIPLE K VALUES, COMMENTED DUE TO LONG PROCESSING TIMES ~12 mins per k
#
# For each k in 1..9: fit a KNN classifier, predict a held-out set, and record
# the accuracy; then plot accuracy against k.
#
# NOTE(review): x_train / x_test are not defined in the cells visible in this
# notebook (the raw-data KNN cell creates knn_x_train, knn_x_test, knn_y_train
# and knn_y_test) -- confirm the intended variable names before re-enabling.
#
# k_value = range(1,10)
# scores = {}
# scores_list = []
# for k in range(1,10):
#     knn = KNeighborsClassifier(n_neighbors=k)
#     knn.fit(x_train, y_train)
#     y_pred = knn.predict(x_test)
#     scores[k] = metrics.accuracy_score(y_test, y_pred)
#     scores_list.append(metrics.accuracy_score(y_test, y_pred))
#
# plt.plot(k_value, scores_list)
# plt.xlabel('K value for KNN')
# plt.ylabel('Accuracy')

## Results of Optimal K value determination Code:
Note: this is a screenshot of my output from when I ran it. The code takes around 10 minutes for each k value, so around 100 minutes in total.
For convenience, I've provided a screenshot of the results below:

![optimal k](https://user-images.githubusercontent.com/42689178/118394005-5a3a7300-b5ff-11eb-9890-91a484e02dae.jpg)

As seen from the above elbow, 8 is an optimal k value.

## KNN with raw data

In [None]:
# K Nearest Neighbor WITHOUT outlier removal.
# 11 mins running time with gaming PC

# Hold-out split of the raw-data features and target. It is not used by the
# fit below, which trains on the complete training frame.
knn_x_train, knn_x_test, knn_y_train, knn_y_test = train_test_split(all_train_data, train_loss, random_state=1)

# Model with k = 8 neighbours.
knn = KNeighborsClassifier(n_neighbors=8)

# Fit on all of the zero-filled training data.
knn.fit(all_train_data, train_loss)

# Predict the zero-filled competition test set.
knn_pred_outlier = knn.predict(knn_test_data)

# Count of each predicted value. The predictions are passed as `x=` because
# positional use is deprecated in seaborn 0.11 and the first positional
# parameter is `data` from 0.12 on, which would not give per-value counts.
sns.countplot(x=knn_pred_outlier);

# Reuses the `submission` frame loaded in the logistic-regression cell.
submission['loss'] = knn_pred_outlier
submission.to_csv("KNearest_without_outlier_submission.csv", index=False)

## KNN with outlier removal

In [None]:
# K Nearest Neighbor on the second, processed data set (X_train / X_test).

# Model: k = 8 was chosen in the K-value experiment above.
# No need to test again here.
knn = KNeighborsClassifier(n_neighbors=8)

# Fit on the training split and predict the competition test set.
knn.fit(X_train, y_train)
knn_pred_without_outlier = knn.predict(X_test)

# Count of each predicted class. The predictions are passed as `x=` because
# positional use is deprecated in seaborn 0.11 and the first positional
# parameter is `data` from 0.12 on, which would not give per-class counts.
sns.countplot(x=knn_pred_without_outlier);

# Reuses the `submission` frame loaded in the logistic-regression cell.
submission['loss'] = knn_pred_without_outlier
submission.to_csv("KNearest_neighbor_submission.csv", index=False)



