## Importing the necessary libraries

In [None]:
import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score,plot_confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

from sklearn.model_selection import GridSearchCV
from sklearn import linear_model

from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.constraints import max_norm

In [None]:
# Min-max scaler; it is fitted on the training split further down.
scaler = MinMaxScaler()
# NOTE(review): the name is misspelled ("sclaer") and is not referenced in the
# visible cells — confirm nothing else uses it before renaming or removing.
standard_sclaer = StandardScaler()

## Some important functions

In [None]:
def emp_length_mapping(lst):
    """Build a lookup from employment-length labels to whole years.

    "n/a" and "< 1 year" map to 0. Any other label maps to the integer formed
    by its first whitespace-separated token with "+" characters stripped from
    both ends (for example "10+ years" -> 10).

    Args:
        lst: iterable of label strings.

    Returns:
        dict mapping each label to an int.
    """
    zero_year_labels = ("n/a", "< 1 year")
    return {
        label: 0 if label in zero_year_labels else int(label.split()[0].strip("+"))
        for label in lst
    }

In [None]:
def target_mapping(lst):
    """Build a lookup from loan-status labels to a binary default flag.

    The four statuses listed below map to 1; every other label maps to 0.

    Args:
        lst: iterable of status labels.

    Returns:
        dict mapping each label to 1 or 0.
    """
    default_statuses = (
        "Default",
        "Charged Off",
        "Late (31-120 days)",
        "Does not meet the credit policy. Status:Charged Off",
    )
    return {status: int(status in default_statuses) for status in lst}


### Removing the NULLS

In [None]:
def removeNulls(dataframe, axis =1, percent=0.3):
    """Return a copy of *dataframe* with null-heavy rows or columns removed.

    Args:
        dataframe: pandas DataFrame to filter; it is never modified.
        axis: 0 drops rows; any other value drops columns.
        percent: null-fraction threshold. A column is dropped when its share
            of null cells is >= percent; a row is dropped when its share of
            null cells is > percent.

    Returns:
        The filtered copy. A before/after shape summary is printed.
    """
    df = dataframe.copy()
    ishape = df.shape
    if axis == 0:
        # A row's null share is measured against the number of columns, not
        # the number of rows.
        row_null_share = df.isnull().sum(axis=1) / df.shape[1]
        # A positional boolean mask works for any index (string labels,
        # gaps, duplicates), unlike looking the labels up positionally.
        drop_mask = (row_null_share > percent).to_numpy()
        df = df.loc[~drop_mask]
        print("\nNumber of Rows dropped\t: ", int(drop_mask.sum()))
    else:
        col_null_share = df.isnull().sum() / len(df)
        colnames = list(col_null_share[col_null_share.values >= percent].index)
        df = df.drop(columns=colnames)
        print("Number of Columns dropped\t: ", len(colnames))
    print("\nOld dataset rows,columns", ishape, "\nNew dataset rows,columns", df.shape)
    return df

# Loading the data

In [None]:
# NOTE(review): `current_directory` is a machine-specific absolute path and is
# not used by the read below — confirm whether it can be removed.
current_directory = "/Users/mlabhishek/Documents/Assignment for DS Candidates"
# The CSV path is relative, so it is resolved against the working directory.
data = pandas.read_csv(r"loan_data.csv")

In [None]:
# Number of null values in each column.
data.isnull().sum()

In [None]:
# Print the column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Summary statistics of the columns.
data.describe()

### Remove columns where NA values are more than or equal to 30%

In [None]:
# Drop every column in which at least 30% of the values are null.
data = removeNulls(data, axis =1,percent = 0.3)

In [None]:
# Keep only the columns whose null count is at most 1% of the row count.
data = data.loc[:, data.isnull().sum() <= 0.01 * data.shape[0]]

In [None]:
# Remaining null count in each column.
data.isnull().sum()

### Remove all columns with only one unique value.

In [None]:
# Distinct (non-null) value count per column, narrowed to the columns that
# hold exactly one distinct value.
unique = data.nunique().loc[lambda counts: counts == 1]

In [None]:
# Remove, in place, the single-valued columns collected in `unique`.
data.drop(columns=unique.index.tolist(), inplace=True)
print("New shape of the data is :", data.shape , "rows & columns.")

### Now we have to do something with those NULL values. We can:
    ** remove rows containing NULL values, **
    ** fill them with median or mode value, **
    ** or use some imputation and try to predict their missing values. **

In [None]:
# Drop every row that still contains a null and show the fraction of rows kept.
rows_before = data.shape[0]
data = data.dropna()
# The ratio has to use the pre-drop row count; dividing the new row count by
# itself always yields 1.0. An empty input reports 0.0 instead of raising.
data.shape[0] / rows_before if rows_before else 0.0

## Removing the non useful columns

In [None]:
# Drop identifier-like and free-text columns. The earlier cells drop columns
# depending on the data, so some of these may already be gone;
# errors="ignore" skips missing labels instead of raising KeyError.
data = data.drop(
    columns=["id", "member_id", "sub_grade", "url", "zip_code", "title"],
    errors="ignore",
)
data.head()

In [None]:
# Print the value distribution of every column with fewer than 5 distinct
# values (nulls counted as a value).
for label in data.columns:
    if data[label].nunique(dropna=False) < 5:
        print(data[label].value_counts())
        print("\n")

### We can see that the feature "pymnt_plan" has only two possible values, "n" and "y", with only 10 occurrences of "y" (less than 1%), so it is insignificant and can be dropped.

In [None]:
# Remove the near-constant payment-plan flag and preview the result.
data = data.drop(columns="pymnt_plan")
data.head()

In [None]:
plt.figure(figsize=(20,30))
# Restrict the correlation to numeric/bool columns explicitly: the frame still
# holds object columns here, and pandas >= 2.0 raises on them instead of
# silently skipping them.
# NOTE(review): in the visible cell order `loan_status` is mapped to 0/1 only
# after this plot, so it would not appear in the matrix — confirm the order.
seaborn.heatmap(data.select_dtypes(include=["number", "bool"]).corr(), annot=True)
plt.title('Correlation Matrix (for Loan Status)')

In [None]:
# Distinct values of the target column.
list(data["loan_status"].unique())

In [None]:
# Goal: build a predictive model of default. A defaulted client is one whose
# loan_status is 'Charged Off', 'Default', 'Late (31-120 days)' or
# 'Does not meet the credit policy. Status:Charged Off'; those statuses are
# encoded as 1 and every other status as 0.
data["loan_status"] = data["loan_status"].map(target_mapping(data["loan_status"].unique()))

In [None]:
# Preview the non-numerical (object-dtype) features that still need encoding.
data.select_dtypes(include=["object"]).head()

In [None]:
# Encode the listing status as a flag: "f" -> 1, "w" -> 0.
data["initial_list_status"] = data["initial_list_status"].map({"f": 1, "w": 0})
# Keep the first whitespace-separated token of the stringified term as an int
# (presumably values look like "36 months" — TODO confirm against the CSV).
data["term"] = data["term"].apply(str).str.split().str[0].astype("int")
data["last_pymnt_amnt"] = data["last_pymnt_amnt"].astype("float")
# One-hot encode every column that is still object-dtype. This has to run
# after the conversions above so those columns are not expanded as well.
# NOTE(review): emp_length_mapping is never applied in the visible cells, so
# an `emp_length` column, if present, is one-hot encoded here — confirm intent.
data = pandas.get_dummies(data, columns=list(data.select_dtypes(include=["object"])))

In [None]:
# Coerce the continuous features to numeric dtypes, column by column.
numeric_columns = [
    'loan_amnt',
    'funded_amnt',
    'funded_amnt_inv',
    'installment',
    'int_rate',
    'annual_inc',
    'dti',
]
data[numeric_columns] = data[numeric_columns].apply(pandas.to_numeric, axis=0)

In [None]:
# NOTE(review): repeats the conversion from the previous cell verbatim;
# re-running it on already-numeric columns changes nothing.
data[numeric_columns] = data[numeric_columns].apply(pandas.to_numeric)

### Loan Status: remove all records whose loan_status value accounts for less than 1.5% of the rows.

In [None]:
# Share of each loan_status value, in percent of all rows.
(data.loan_status.value_counts()*100)/len(data)

In [None]:
# Percentage share of each loan_status value, narrowed to the values that
# cover less than 1.5% of the rows.
del_loan_status = data["loan_status"].value_counts().mul(100).div(len(data))
del_loan_status = del_loan_status.loc[del_loan_status.lt(1.5)]
# Remove, in place, every row that carries one of those rare values.
data.drop(
    index=data.loc[data["loan_status"].isin(del_loan_status.index)].index,
    inplace=True,
)
print("So now we are left with",data.shape ,"rows & columns.")
print(data["loan_status"].unique())

In [None]:
# Target vector, and the feature matrix made of every other column.
y = data["loan_status"]
X = data.drop(columns=["loan_status"])

In [None]:
# Hold out 15% of the rows for testing. A fixed seed makes the split (and the
# metrics below) reproducible; stratifying on y keeps the default rate the
# same in both splits, which matters for an imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

### scaler is MinMax scaler

In [None]:
# Learn the min/max ranges from the training split only, then apply the same
# scaling to both splits.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## CREATING A KNN CLASSIFIER

In [None]:
# k-nearest-neighbours classifier with the library's default hyperparameters,
# fitted on the training split.
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)

In [76]:
# Class predictions of the KNN model for the held-out split.
preds = KNN.predict(X_test) 

In [None]:
# Per-class precision, recall and F1 of the predictions currently in `preds`.
print(classification_report(y_test,preds))

## CREATING THE XGBOOST MODEL

In [None]:
# fit model to training data
XG_BOOST = XGBClassifier()
XG_BOOST.fit(X_train, y_train)

In [None]:
# Class predictions of the XGBoost model for the held-out split.
preds = XG_BOOST.predict(X_test) 

In [None]:
# Per-class precision, recall and F1 of the predictions currently in `preds`.
print(classification_report(y_test,preds))

In [None]:
# Confusion matrix of the XGBoost model on the held-out split.
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
ConfusionMatrixDisplay.from_estimator(XG_BOOST, X_test, y_test)

## CREATING THE RANDOM FOREST CLASSIFIER

In [None]:
# Random forest with 100 trees, fitted on the training split. No random_state
# is passed, so the fitted trees can differ between runs.
RANDOM_FOREST = RandomForestClassifier(n_estimators=100)
RANDOM_FOREST.fit(X_train,y_train)

In [None]:
# Class predictions of the random forest for the held-out split.
preds = RANDOM_FOREST.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the random-forest predictions,
# followed by the model's confusion matrix on the held-out split.
print(classification_report(y_test,preds))
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in
# 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
ConfusionMatrixDisplay.from_estimator(RANDOM_FOREST, X_test, y_test)

# ANN NETWORK

In [None]:
# Feed-forward binary classifier: four ReLU layers of shrinking width, each
# followed by 10% dropout, and a single sigmoid output unit. No input shape
# is declared on the first layer.
ANN = Sequential([
    # first dense block
    Dense(119, activation='relu'),
    Dropout(0.1),
    # hidden blocks
    Dense(78, activation='relu'),
    Dropout(0.1),
    Dense(39, activation='relu'),
    Dropout(0.1),
    Dense(19, activation='relu'),
    Dropout(0.1),
    # output layer
    Dense(units=1, activation='sigmoid'),
])

# Binary cross-entropy loss optimised with Adam.
ANN.compile(optimizer='adam', loss='binary_crossentropy')


In [None]:
# Train for 10 epochs with mini-batches of 256 rows.
# NOTE(review): the held-out test split doubles as validation data, so the
# reported val_loss comes from the same rows used for the final evaluation.
ANN.fit(x=X_train, 
          y=y_train, 
          epochs=10,
          batch_size=256,
          validation_data=(X_test, y_test), 
          )

In [None]:
# Collect the metrics recorded during training into a frame and plot the
# training loss against the validation loss.
losses = pandas.DataFrame(ANN.history.history)
losses[['loss','val_loss']].plot()

In [None]:
# Turn the sigmoid outputs into class labels with a 0.5 cut-off and report
# per-class precision, recall and F1.
predictions = (ANN.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test,predictions))

In [None]:
# Confusion matrix of the thresholded ANN predictions, drawn as an annotated
# heatmap with integer cell labels.
cm = confusion_matrix(y_test,predictions)
plot = sns.heatmap(cm, annot=True, fmt='d', cmap='viridis', square=True) #plot_confusion_matrix does not work directly for ANN

In [None]:
# Re-evaluate with a stricter 0.7 cut-off for the positive class.
predictions = (ANN.predict(X_test) > 0.7).astype("int32")
print(classification_report(y_test,predictions))