# Bank loan status 

## Summary:
### We are analyzing the bank's data:

In [None]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
from sklearn.feature_selection import RFE
warnings.filterwarnings('ignore')

In [None]:
# Load the credit training CSV into the working DataFrame.
data = pd.read_csv(r'../input/my-dataset/credit_train.csv')

In [None]:
# Beige axes background for every figure created from here on.
plt.rcParams.update({'axes.facecolor': 'Beige'})

# Exploratory Data Analysis (EDA)

## Take a quick look at the data:

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.info()

### Data types: float64 (12 columns), object (7 columns)


In [None]:
data.columns

In [None]:
data.describe()

## Visualizing Data


In [None]:
# One histogram per numeric column; the trailing semicolon suppresses the axes repr.
data.hist(figsize=(20, 10), bins=25);

In [None]:
# Number of loans per purpose, split by loan status.
plt.figure(figsize=(12, 4))
sns.countplot(data=data, x='Purpose', hue='Loan Status')
plt.title('Purpose of taking Loan', fontdict={'fontsize': 20})
plt.xticks(rotation=45)
plt.show()

In [None]:
# Class balance of the target column.
sns.countplot(data=data, x='Loan Status')

In [None]:
# Share of loans per term length.
# value_counts() orders categories by frequency, so the slice labels are taken
# from its index instead of being hard-coded; this keeps every label attached
# to the slice it actually describes.
term_counts = data['Term'].value_counts()
plt.figure(figsize=(6, 6))
plt.pie(
    x=term_counts,
    labels=term_counts.index,
    # Pull every slice except the largest slightly out of the pie.
    explode=[0.09 if i else 0 for i in range(len(term_counts))],
)
plt.title('Time Period of Taking Loan', fontdict={'fontsize': 20})
plt.show()


In [None]:
# Loan status counts for each home-ownership category.
plt.figure(figsize=(12, 5))
sns.countplot(data=data, hue='Loan Status', x='Home Ownership')
plt.title('Own Property vs Loan Status', fontdict={'fontsize': 20})
plt.show()


In [None]:
# Loan status counts for each home-ownership category.
# NOTE(review): this cell is identical to the previous one; presumably a
# different column was meant to be plotted here — confirm or remove.
plt.figure(figsize=(12,5))
sns.countplot(x='Home Ownership',data=data,hue='Loan Status')
plt.title('Own Property vs Loan Status',fontdict={'fontsize':20})
plt.show()


## EDA visual findings:

In [None]:
data.info()

## Understanding our data:
### We use correlation matrix 

In [None]:
# Pairwise correlation of the numeric columns.
# The frame still contains object (string) columns at this point, and
# DataFrame.corr() raises on those in pandas >= 2.0, so they are filtered out
# explicitly; older pandas versions dropped them silently.
corr_matrix = data.select_dtypes(include='number').corr()
corr_matrix

### Plotting the correlation matrix as a heatmap makes it easier to read

In [None]:
# Annotated heatmap of the correlation between numeric columns.
# Object columns are excluded first because DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0.
plt.figure(figsize=(14, 8))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, cmap="YlGnBu")

### How to read the correlation plot: each cell is the correlation between two variables, which can be strongly positive (higher than 0.5), strongly negative (less than -0.5), or show almost no relation (close to zero).
#### Understanding the correlation between columns helps you build a better model, because including less important or redundant columns can bias or degrade the model.
### We found this:
- Bankruptcies have a high correlation with the number of credit problems, probably because a bank account is frozen immediately after a bankruptcy.
- Tax liens also have a high correlation with number of credit problems.
- Monthly debt has a high correlation with current credit balance, annual income, and number of open accounts.


### We drop ID columns as they are features for identification:

In [None]:
# Identifier columns carry no predictive signal, so remove them in place.
data.drop(columns=['Loan ID', 'Customer ID'], inplace=True)


## Missing data:
### Is there missing data?

In [None]:
# True if at least one cell anywhere in the frame is missing.
has_missing = data.isnull().values.any()
print(has_missing)

In [None]:
# Visual map of missing cells (rows x columns of the boolean null mask).
plt.figure(figsize=(10, 5))
null_mask = data.isnull()
sns.heatmap(null_mask)

In [None]:
# Per-column missing-value summary: absolute count and share of all rows.
null_mask = data.isnull()
total = null_mask.sum().sort_values(ascending=False)
# The mean of the boolean mask divides by the full row count. Dividing by
# data['Loan Status'].count() would use only the rows where the target is
# present and therefore overstate the share while fully empty rows exist.
percent = null_mask.mean().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

### We have:
- 51% missing data in 'Months since last delinquent'.
- 19% in both credit score and annual income.

### Drop the columns with > 50% missing


In [None]:
# More than half of this column is missing, so remove it in place.
data.drop(columns=['Months since last delinquent'], inplace=True)

### Some rows have a lot of missing values. We should remove rows with more than 2 missing values, as filling more than 2 values in the same row could corrupt the data. Here we use `thresh=8`, which keeps only the rows that have at least 8 non-missing values.

In [None]:
# Preview only: last rows that would remain if rows with fewer than 8
# non-missing values were dropped (the result is not assigned back to `data`).
sparse_rows_removed = data.dropna(thresh=8)
print(sparse_rows_removed.tail())

### Drop last 514 rows as they are all NaN

In [None]:
# Remove the rows in which every remaining column is missing. Selecting them
# by content instead of a hard-coded tail(514) keeps this correct if the file
# is refreshed or the empty rows are not all at the end.
data.dropna(how='all', inplace=True)
# Keep a contiguous 0..n-1 index so positional and label-based row selection
# further down continue to agree.
data.reset_index(drop=True, inplace=True)

### Check the missing values in data once again:

In [None]:
# Recompute the per-column missing-value summary after the clean-up above.
null_mask = data.isnull()
total = null_mask.sum().sort_values(ascending=False)
# Share of all rows (mean of the boolean mask), independent of how many
# values the target column happens to contain.
percent = null_mask.mean().sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

## filling missing data:
### We should check that all values are numerical.

In [None]:
data.info()

## Encoding categorical columns:

In [None]:
# Integer-encode the categorical columns.
# One encoder per column is kept in `encoders` so the codes can be mapped
# back to the original labels later (a single re-fitted encoder only
# remembers the last column).
categorical_columns = ['Term', 'Years in current job', 'Home Ownership',
                       'Purpose', 'Loan Status']
encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    present = data[column].notna()
    if present.all():
        data[column] = le.fit_transform(data[column])
    else:
        # Encode only the observed values and leave the gaps as NaN, so the
        # imputation step below can actually fill them. Feeding NaN to the
        # encoder would either fail or turn "missing" into a category of its
        # own, depending on the scikit-learn version.
        codes = pd.Series(np.nan, index=data.index)
        codes[present] = le.fit_transform(data.loc[present, column])
        data[column] = codes
    encoders[column] = le

## Before completing the missing values, we check the distribution to decide the better representation of central tendency:

In [None]:
# Distribution of 'Credit Score', used to pick the imputation statistic.
sns.histplot(data=data, x='Credit Score')

### It's better to use median.

In [None]:
# Distribution of 'Annual Income', used to pick the imputation statistic.
sns.histplot(data=data, x='Annual Income')

### It's better to use median.

In [None]:
# Distribution of the encoded 'Years in current job' values.
sns.histplot(data=data, x='Years in current job')

### It's better to use mean.

In [None]:
# Distribution of 'Bankruptcies', used to pick the imputation statistic.
sns.histplot(data=data, x='Bankruptcies')

### It's better to use median.

In [None]:
# Distribution of 'Tax Liens', used to pick the imputation statistic.
sns.histplot(data=data, x='Tax Liens')

### It's better to use mode.

####  For some columns it's better to use mean, other median and other mode.

In [None]:
# Fill the remaining gaps column by column, using the statistic chosen from
# the histograms above.
impute_strategies = {
    'Credit Score': 'median',
    'Maximum Open Credit': 'median',
    'Annual Income': 'median',
    'Tax Liens': 'most_frequent',
    'Bankruptcies': 'median',
    'Years in current job': 'mean',
}
imputers = {}
for column, strategy in impute_strategies.items():
    # Each column gets its own imputer, fitted and applied to that same
    # column. The hand-unrolled version transformed 'Years in current job'
    # with the imputer fitted on 'Bankruptcies', so it was filled with the
    # wrong statistic.
    imputer = SimpleImputer(strategy=strategy, missing_values=np.nan)
    data[column] = imputer.fit_transform(data[[column]]).ravel()
    imputers[column] = imputer

In [None]:
data.tail()

In [None]:
data.info()

In [None]:
# Class balance of the encoded target.
# The column is passed as `x=` explicitly: seaborn >= 0.12 no longer accepts
# it as the first positional argument, and the other countplot cells in this
# notebook already use the keyword form.
sns.countplot(x='Loan Status', data=data)

# Outliers:
## We are going to use two methods to deal with outliers:
## 1) Deletion by scatterplot identification.
## 2) Standardization.

### ______________________________________________

### To look for outliers we use a boxplot

In [None]:
# One box per column, to spot columns with extreme values.
plt.figure(figsize=(25, 8))
u = sns.boxplot(data=data, palette='cool')
# Tilt the column names so they do not overlap.
u.set_xticklabels(u.get_xticklabels(), rotation=45)

### A more accurate way to spot outliers is a scatter plot
#### We make a scatter plot matrix:

### We make the scatterplot for the numerical columns only:

In [None]:
# Scatter of the monetary columns against the target.
# pairplot() builds its own figure, so a preceding plt.figure(...) only
# produced an extra empty figure; the plot size is controlled through
# `height` (per-facet height in inches) instead.
sns.pairplot(
    data=data,
    x_vars=['Maximum Open Credit', 'Annual Income', 'Current Credit Balance', 'Current Loan Amount'],
    y_vars=['Loan Status'],
    height=5,
)

### The scatter plots do not reveal the outliers clearly either, so we use the interquartile range (IQR) method:

In [None]:
# IQR filter on 'Annual Income': drop rows at or beyond 1.5 * IQR outside
# the quartiles.
income = data['Annual Income']
# Series.quantile with interpolation='midpoint' matches the previous
# np.percentile(..., interpolation='midpoint') call without relying on the
# keyword NumPy has deprecated.
Q1 = income.quantile(0.25, interpolation='midpoint')
Q3 = income.quantile(0.75, interpolation='midpoint')
IQR = Q3 - Q1

print("Old Shape: ", data.shape)

# Boolean masks aligned with data.index.
upper = income >= (Q3 + 1.5 * IQR)
lower = income <= (Q1 - 1.5 * IQR)

# Removing the outliers.
# The rows are selected by index label: DataFrame.drop() works on labels,
# whereas np.where() returns positions, and the two only coincide while the
# index is an untouched 0..n-1 range.
data.drop(index=data.index[upper | lower], inplace=True)

print("New Shape: ", data.shape)

In [None]:
data.describe()

In [None]:
# Drop the rows whose 'Maximum Open Credit' is above 7.1**8 (about 6.46 million).
# NOTE(review): 7.1**8 is an exponentiation, not scientific notation. If
# 7.1e8 was intended, this cutoff is roughly 110 times too low — confirm.
data = data.drop(data[data['Maximum Open Credit']>7.1**8].index)

In [None]:
# Same per-column boxplot, redrawn after the outlier rows were removed.
plt.figure(figsize=(25, 8))
u = sns.boxplot(data=data, palette='cool')
# Tilt the column names so they do not overlap.
u.set_xticklabels(u.get_xticklabels(), rotation=45)

## 1) Deletion of outliers:

####  We only deleted the outliers that could actually corrupt data:

### Check for the presence of outliers again:

In [None]:
# Class balance of the target after the outlier rows were removed.
sns.countplot(data=data, x='Loan Status')

In [None]:
# Per-column boxplot of the current frame.
# NOTE(review): `data` is not modified between the previous boxplot cell and
# this one, so this redraws the same figure — confirm whether it is needed.
plt.figure(figsize = (25,8))
u = sns.boxplot(palette = 'cool', data=data)
u.set_xticklabels(u.get_xticklabels(),rotation=45)

In [None]:
# Class balance of the target.
# NOTE(review): repeats the countplot drawn two cells earlier on unchanged data.
sns.countplot(x='Loan Status',data=data)

#### Outliers are nowhere to be found

## VIF:

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Variance inflation factor of every explanatory column.
# The target 'Loan Status' is left out because VIF describes collinearity
# among the predictors; drop() also returns a new frame, so `vif_data` is no
# longer just another name for `data`.
vif_data = data.drop(columns=['Loan Status'])
VIF = pd.Series(
    [variance_inflation_factor(vif_data.values, i) for i in range(vif_data.shape[1])],
    index=vif_data.columns,
)
VIF

## Treatment of multicolinearity:

In [None]:
def MC_remover(data):
    """Drop the single column with the largest VIF if that VIF exceeds 5.

    Computes the variance inflation factor of every column of ``data``.
    When the largest one is above 5, prints the name of that column and
    returns a copy of ``data`` without it; otherwise prints a notice and
    returns ``data`` unchanged. The frame passed in is never modified.
    """
    matrix = data.values
    vif = pd.Series(
        [variance_inflation_factor(matrix, pos) for pos in range(data.shape[1])],
        index=data.columns,
    )
    if not vif.max() > 5:
        print("No multicollinearity present anymore")
        return data
    # First column (in column order) that attains the maximum VIF.
    worst = vif.idxmax()
    print(worst, 'has been removed')
    return data.drop(columns=[worst])

In [None]:
# Up to ten removal passes; each pass drops at most one column, and passes
# after the last removal only print the "no multicollinearity" notice.
for _ in range(10):
    vif_data = MC_remover(vif_data)
vif_data.head()

### calculating VIF for remaining columns

In [None]:
# VIF of the columns that survived the removal passes, plus how many remain.
remaining = vif_data.columns
VIF = pd.Series(
    [variance_inflation_factor(vif_data.values, pos) for pos in range(vif_data.shape[1])],
    index=remaining,
)
VIF, len(remaining)

# Splitting data:

In [None]:
# Unfitted ordinary least-squares regressor.
# NOTE(review): `reg` is not referenced by any later cell visible in this
# notebook, and the models trained below are classifiers — confirm whether
# it can be removed.
reg = linear_model.LinearRegression()

### Rearranging columns:


In [None]:
# Fix the column order: all feature columns first, the target last.
ordered_columns = [
    'Current Loan Amount',
    'Term',
    'Credit Score',
    'Annual Income',
    'Years in current job',
    'Home Ownership',
    'Purpose',
    'Monthly Debt',
    'Years of Credit History',
    'Number of Open Accounts',
    'Number of Credit Problems',
    'Current Credit Balance',
    'Maximum Open Credit',
    'Bankruptcies',
    'Tax Liens',
    'Loan Status',
]
data = data[ordered_columns]
data

### To separate the features from the target, we drop the 'Loan Status' column from x and use it as y


In [None]:
# Feature matrix (every column except the target) and target vector, both
# as NumPy arrays.
target_column = 'Loan Status'
y = data[target_column].values
x = data.drop(columns=[target_column]).values


In [None]:
y.sum()

In [None]:
# Hold out 20% of the rows for testing; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
x_train.shape

In [None]:
y_train.shape

## Balancing data

### 1) SMOTE

In [None]:
from sklearn.datasets import make_classification
# Synthetic, heavily imbalanced toy data set (5% / 95%).
# It is stored under its own names so the real target array `y` built from
# 'Loan Status' is not overwritten.
# NOTE(review): nothing later in the visible notebook uses this toy data —
# confirm whether the cell is still needed.
X_demo, y_demo = make_classification(
    n_classes=2, class_sep=0.5,
    weights=[0.05, 0.95], n_informative=2, n_redundant=0, flip_y=0,
    n_features=2, n_clusters_per_class=1, n_samples=1000, random_state=10,
)

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority class of the training split only.
# The sampler is called `smote` rather than `sm`, so the statsmodels alias
# imported at the top of the notebook (`import statsmodels.api as sm`) is not
# overwritten.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(x_train, y_train)
# NOTE(review): the later cells visible here train on x_train / y_train, not
# on X_res / y_res — confirm whether the balanced set was meant to be used.

# Scaling
### Make sure no column is still categorical (every dtype must be numeric)

In [None]:
data.info()

In [None]:
from sklearn.preprocessing import RobustScaler

#### Robust scaler reduces the effect of outliers


In [None]:
# Scale the features with statistics learned from the training split only.
ro_scaler = RobustScaler()
x_train = ro_scaler.fit_transform(x_train)
# transform(), not fit_transform(): re-fitting on the test split would scale
# it with its own median/IQR, which leaks test information and puts the two
# splits on different scales.
x_test = ro_scaler.transform(x_test)

In [None]:
x_train

In [None]:
x_train.shape

In [None]:
# Independent snapshot of the prepared frame (copy() is deep by default).
datal = data.copy(deep=True)

## Logistic regression:

In [None]:

# Function to calculate the mean cross-validation score
def cross_val(X_train, y_train, model):
    """Return the mean 5-fold cross-validation score of ``model``.

    Parameters
    ----------
    X_train : feature matrix to cross-validate on.
    y_train : target values matching ``X_train``.
    model : estimator passed to ``cross_val_score``.

    Returns
    -------
    The mean of the five fold scores, computed with ``cross_val_score``'s
    default scoring for the estimator.
    """
    # Applying k-Fold Cross Validation
    from sklearn.model_selection import cross_val_score
    # Use the X_train argument; reading the module-level x_train here made
    # the parameter a no-op.
    accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
    return accuracies.mean()

# Takes in a model, trains it, and scores it with cross-validation
def fit_and_evaluate(model):
    """Fit ``model`` on the training split and return its mean CV score.

    Reads the module-level ``x_train`` and ``y_train`` arrays. ``model`` is
    fitted in place, so it can be used for prediction afterwards. The value
    returned is the mean 5-fold cross-validation score from ``cross_val`` on
    the training split; the test split is not used here.
    """
    # Train the model
    model.fit(x_train, y_train)

    # Return the performance metric. The earlier prediction on x_test was
    # computed but never used, so it has been removed.
    return cross_val(x_train, y_train, model)

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
logr = LogisticRegression()
logr_cross = fit_and_evaluate(logr)

# The value is a cross-validation score computed on the training split, so
# the message no longer claims it was measured on the test set.
print('Logistic Regression Performance: Mean 5-Fold Cross Validation Score on the training set = %0.4f' % logr_cross)

## Classification:
### 1) KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 5 nearest neighbours with the Minkowski metric at p=2 (Euclidean distance).
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_cross = fit_and_evaluate(knn)

# The value is a cross-validation score computed on the training split, so
# the message no longer claims it was measured on the test set.
print('KNN Performance: Mean 5-Fold Cross Validation Score on the training set = %0.4f' % knn_cross)

### 2) Naive Bayes


In [None]:
from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
naive_cross = fit_and_evaluate(naive)

# The value is a cross-validation score computed on the training split, so
# the message no longer claims it was measured on the test set.
print('Naive Bayes Performance: Mean 5-Fold Cross Validation Score on the training set = %0.4f' % naive_cross)

### 3) Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Ten trees, split quality measured by entropy.
random = RandomForestClassifier(n_estimators=10, criterion='entropy')
random_cross = fit_and_evaluate(random)

# The value is a cross-validation score computed on the training split, so
# the message no longer claims it was measured on the test set.
print('Random Forest Performance: Mean 5-Fold Cross Validation Score on the training set = %0.4f' % random_cross)

### 4) Decision Tree Classification

In [None]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Fit a default decision tree on the training split (fit() returns the
# estimator itself) and report per-class metrics on the held-out test split.
model = DecisionTreeClassifier().fit(x_train, y_train)
predict = model.predict(x_test)
report = classification_report(y_test, predict)
print(report)

## Confusion matrix

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix of the decision tree on the test split.
# plot_confusion_matrix was removed in scikit-learn 1.2; building the display
# from an explicit confusion matrix works on old and new releases alike.
pl = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, model.predict(x_test), labels=model.classes_),
    display_labels=model.classes_,
)
pl.plot()
# plt.show() takes no figure argument; its first parameter is `block`.
plt.show()

In [None]:
# Thx