In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Loading dataset

In [None]:
# Directory holding the two CSV files. '/content' is the location this notebook
# was written against; change it here (one place) when running elsewhere.
DATA_DIR = '/content'

# Training data: contains the Loan_Status column used as the target later on.
train=pd.read_csv(f'{DATA_DIR}/train_u6lujuX_CVtuZ9i.csv')
# Applications without a Loan_Status column; kept in `predict` for scoring.
predict=pd.read_csv(f'{DATA_DIR}/test_Y3wMUE5_7gLdaTN.csv')

In [None]:
# Keep untouched copies of both frames, so the raw data stays available while
# the cells below modify `train` and `predict`.
train_original=train.copy()
predict_original=predict.copy()

In [None]:
train.head(3)

In [None]:
predict.head(3)

# Data Cleaning - Checking and working with missing values

In [None]:
train.info()

In [None]:
predict.info()

# EDA

In [None]:
# Loan_ID is treated as an unnecessary column for the analysis, so drop it.
# errors='ignore' makes the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
train = train.drop(columns=['Loan_ID'], errors='ignore')

In [None]:
train.info()

In [None]:
# Loan_ID is treated as an unnecessary column for the analysis, so drop it.
# errors='ignore' makes the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
predict = predict.drop(columns=['Loan_ID'], errors='ignore')

In [None]:
predict.info()

In [None]:
train['Loan_Status'].value_counts()

In [None]:
train['Loan_Status'].value_counts().plot.bar(title='Loan Status')

Loans were approved for 422 of the 614 applicants (around 69%).

In [None]:
# 2x2 grid of normalised bar charts for the two-valued features.
# A fresh figure is created here instead of reusing figure number 1: several
# cells in this notebook asked for plt.figure(1), which makes their panels pile
# up on the same canvas on any backend that does not close figures between cells.
plt.figure(figsize=(15, 10))
binary_panels = [
    ('Gender', 'Gender'),
    ('Married', 'Married'),
    ('Self_Employed', 'Self Employed'),
    ('Credit_History', 'Credit_History'),
]
for slot, (column, panel_title) in enumerate(binary_panels, start=1):
    panel_ax = plt.subplot(2, 2, slot)
    # normalize=True plots shares (fractions of rows) rather than raw counts.
    train[column].value_counts(normalize=True).plot.bar(ax=panel_ax, title=panel_title)

About 80% of the applicants in the dataset are male.

Around 65% of the applicants are married.

Around 15% of the applicants in the dataset are self-employed.

Around 85% of the applicants have repaid their previous debts (Credit_History = 1).

In [None]:
# 1x3 row of normalised bar charts for the multi-level categorical features.
# A fresh figure is created instead of reusing figure number 1, so this cell
# never draws on top of panels left over from another cell.
plt.figure(figsize=(20, 5))
for slot, column in enumerate(['Dependents', 'Education', 'Property_Area'], start=1):
    panel_ax = plt.subplot(1, 3, slot)
    # normalize=True plots shares (fractions of rows) rather than raw counts.
    train[column].value_counts(normalize=True).plot.bar(ax=panel_ax, title=column)

In [None]:
# ApplicantIncome: distribution (left) and box plot (right).
# A fresh figure with explicit axes replaces the shared plt.figure(1), so the
# two panels cannot land on a figure another cell already drew on.
plt.figure(figsize=(16, 5))
income_dist_ax = plt.subplot(1, 2, 1)
sns.distplot(train['ApplicantIncome'], ax=income_dist_ax)
income_box_ax = plt.subplot(1, 2, 2)
train['ApplicantIncome'].plot.box(ax=income_box_ax)

In [None]:
# Box plot of ApplicantIncome, one box per Education level.
train.boxplot(column= 'ApplicantIncome', by='Education')
plt.tight_layout()
# Observation: more graduates than non-graduates have very high incomes; these
# points appear as outliers in the plot.

In [None]:
# Count loan decisions per marital status, then divide every row by its own
# total so the stacked bars show shares instead of raw counts.
Married = pd.crosstab(index=train['Married'], columns=train['Loan_Status'])
married_row_totals = Married.sum(axis=1).astype(float)
Married.div(married_row_totals, axis=0).plot(kind='bar', stacked=True, figsize=(6,4))
plt.legend(loc = 'best')

In [None]:
# Count loan decisions per Credit_History value, then divide every row by its
# own total so the stacked bars show shares instead of raw counts.
Credit_History = pd.crosstab(index=train['Credit_History'], columns=train['Loan_Status'])
credit_row_totals = Credit_History.sum(axis=1).astype(float)
Credit_History.div(credit_row_totals, axis=0).plot(kind='bar', stacked=True, figsize=(6,4))
plt.legend(loc = 'best')

In [None]:
# Count loan decisions per property area, then divide every row by its own
# total so the stacked bars show shares instead of raw counts.
Property_Area = pd.crosstab(index=train['Property_Area'], columns=train['Loan_Status'])
area_row_totals = Property_Area.sum(axis=1).astype(float)
Property_Area.div(area_row_totals, axis=0).plot(kind='bar', stacked=True, figsize=(6,4))
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.05,1.0),loc='best')

Proportion of married applicants is higher for the approved loans.

People with credit history as 1 are more likely to get their loan approved.

Proportion of loans getting approved in semiurban area is higher compared to rural or urban area.

In [None]:
# Combined applicant + co-applicant income, binned into four bands for the
# approval-rate plot in the next cell.
train['Total_Income'] = train['ApplicantIncome'] + train['CoapplicantIncome']
# The top bin is open-ended: with a fixed upper edge (81000 before) any larger
# total income falls outside every bin and pd.cut silently labels it NaN.
bins = [0, 2500, 4000, 6000, np.inf]
group= ['Low', 'Average', 'High', 'Very High']
train['Total_Income_bin'] = pd.cut(train['Total_Income'], bins, labels=group)
# Preview a few rows instead of rendering the whole frame.
train.head()

In [None]:
# Count loan decisions per income band, then divide every row by its own total
# so the stacked bars show shares instead of raw counts.
Total_Income_bin = pd.crosstab(index=train['Total_Income_bin'], columns=train['Loan_Status'])
income_bin_totals = Total_Income_bin.sum(axis=1).astype(float)
Total_Income_bin.div(income_bin_totals, axis=0).plot(kind='bar', stacked=True)
plt.xlabel('Total Income')
plt.ylabel('Percentage')
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.05,1.0),loc='best')

The proportion of approved loans is much lower for applicants with low Total_Income than for applicants with Average, High and Very High income.

In [None]:
# Remove the two helper columns that were only needed for the plot above.
# errors='ignore' makes the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
train = train.drop(columns=['Total_Income', 'Total_Income_bin'], errors='ignore')

In [None]:
train.columns

In [None]:
# Recode by assigning the result back to the column. The previous form,
# train['col'].replace(..., inplace=True), mutates a temporary Series: recent
# pandas warns about it and, with Copy-on-Write enabled, `train` is left unchanged.
train['Dependents'] = train['Dependents'].replace('3+', 3)
predict['Dependents'] = predict['Dependents'].replace('3+', 3)
# NOTE(review): only '3+' is converted; the remaining Dependents values
# presumably stay as strings, leaving a mixed-type column - confirm whether a
# full numeric conversion is wanted.

# Encode the target as N -> 0, Y -> 1 in a single pass. The explicit cast
# guarantees an integer column regardless of how replace() infers the dtype.
train['Loan_Status'] = train['Loan_Status'].replace({'N': 0, 'Y': 1}).astype(int)

In [None]:
plt.figure(figsize=(12,6))
# numeric_only=True limits the correlation matrix to numeric columns. `train`
# still holds string-valued columns at this point (they are one-hot encoded
# only later), and DataFrame.corr() raises on those in pandas >= 2.0.
sns.heatmap(train.corr(numeric_only=True), vmax=0.8, square=True, cmap='BuPu')

We can see that the most correlated variable pairs are (ApplicantIncome, LoanAmount) and (Credit_History, Loan_Status). LoanAmount is also correlated with CoapplicantIncome.

In [None]:
train.isnull().sum() #checking null values

In [None]:
# Impute missing values: the mode (most frequent value) for the categorical /
# discrete columns and the median for the numeric LoanAmount.
# Each result is assigned back to the column; calling fillna(..., inplace=True)
# on train['col'] acts on a temporary Series, which recent pandas warns about
# and which leaves `train` unchanged when Copy-on-Write is enabled.
train_mode_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Credit_History',
    'Loan_Amount_Term',
]
for column in train_mode_columns:
    train[column] = train[column].fillna(train[column].mode()[0])
# LoanAmount is numeric, so use the median.
train['LoanAmount'] = train['LoanAmount'].fillna(train['LoanAmount'].median())

In [None]:
train.isnull().sum()#null values done

In [None]:
predict.isnull().sum() #checking null values

In [None]:
# Impute missing values: the mode (most frequent value) for the categorical /
# discrete columns and the median for the numeric LoanAmount.
# Each result is assigned back to the column; calling fillna(..., inplace=True)
# on predict['col'] acts on a temporary Series, which recent pandas warns about
# and which leaves `predict` unchanged when Copy-on-Write is enabled.
predict_mode_columns = [
    'Gender',
    'Dependents',
    'Self_Employed',
    'Loan_Amount_Term',
    'Credit_History',
]
for column in predict_mode_columns:
    predict[column] = predict[column].fillna(predict[column].mode()[0])
# LoanAmount is numeric, so use the median.
predict['LoanAmount'] = predict['LoanAmount'].fillna(predict['LoanAmount'].median())
# NOTE(review): 'Married' is not imputed here - presumably it has no missing
# values in this file; verify against predict.isnull().sum().
# NOTE(review): the fill values come from `predict` itself, not from the
# training data - confirm this is intended.

In [None]:
predict.isnull().sum() #fillna null values done

# Model Building

In [None]:
# Target vector: the encoded loan decision.
y = train['Loan_Status']
# Feature matrix: every remaining column of `train`.
X = train.drop(columns=['Loan_Status'])

In [None]:
# One-hot encode the remaining non-numeric columns.
X = pd.get_dummies(X)
train = pd.get_dummies(train)
# Encode `predict` the same way, then align it to the training feature layout:
# same columns in the same order. A category missing from `predict` becomes an
# all-zero column and a category unseen in training is dropped. Without this, a
# model fitted on X can be handed mismatched columns at prediction time.
predict = pd.get_dummies(predict).reindex(columns=X.columns, fill_value=0)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation.
# random_state fixes the shuffle so the split - and every score computed from it
# below - is the same on each Restart & Run All.
# stratify=y keeps the approved/rejected ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

## Supervised: decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default hyperparameters; random_state pins the tree's
# internal randomness so repeated fits on the same data give the same tree.
model = DecisionTreeClassifier(random_state=1)
# Fit on the training part of the split (the fitted estimator is the cell output).
model.fit(X_train,y_train)

In [None]:
predictions = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
print(classification_report(y_test, predictions))

In [None]:
print(accuracy_score(y_test, predictions))

## Supervised: random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 500-tree random forest. random_state seeds the bootstrap sampling and feature
# sub-sampling, so the reported scores do not change from run to run.
rfmodel = RandomForestClassifier(n_estimators=500, random_state=42)
rfmodel.fit(X_train, y_train)
# Predictions on the held-out rows, evaluated in the next cells.
rfpredictions = rfmodel.predict(X_test)

In [None]:
print(classification_report(y_test, rfpredictions))

In [None]:
print(accuracy_score(y_test, rfpredictions))

## Let's do hyperparameter tuning for the random forest using GridSearchCV and fit the data.

In [None]:
rf=RandomForestClassifier(random_state=42,n_jobs=-1)

In [None]:
# Search space for the random-forest grid search:
# 5 depths x 6 leaf sizes x 6 forest sizes = 180 parameter combinations.
params= {
    'max_depth':[2,3,5,10,20],
    'min_samples_leaf':[5,10,20,50,100,200],
    'n_estimators':[10,25,30,50,100,200]
}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over `params` for the estimator `rf`, using 4-fold
# cross-validation scored by accuracy. n_jobs=-1 runs the fits in parallel on
# all available cores; verbose=1 prints progress while fitting.
grid_search=GridSearchCV(estimator=rf,
                         param_grid=params,
                         cv=4,
                         n_jobs=-1,verbose=1,scoring='accuracy')

In [None]:
%%time
grid_search.fit(X_train,y_train)

In [None]:
grid_search.best_score_

In [None]:
rf_best=grid_search.best_estimator_
rf_best

In [None]:
from sklearn.tree import plot_tree
plt.figure(figsize=(80,40))
# Draw one tree (index 7) of the tuned forest.
# class_names must follow the classes in ascending order. Loan_Status was encoded
# as N -> 0 and Y -> 1, so the labels are Rejected / Approved; the previous
# 'Disease' / 'No' labels described a different dataset.
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index for this parameter.
plot_tree(
    rf_best.estimators_[7],
    feature_names=list(X.columns),
    class_names=['Rejected', 'Approved'],
)

## Let's rank the features with the help of feature importance

In [None]:
rf_best.feature_importances_

In [None]:
imp_df=pd.DataFrame({'Varname':X_train.columns,'Imp':rf_best.feature_importances_})

In [None]:
imp_df.sort_values(by='Imp',ascending=False)

## Supervised classification: KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k = 1: each prediction takes the label of the single nearest training row.
# NOTE(review): no feature scaling is applied before this step in the notebook,
# so large-valued columns likely dominate the distance - consider scaling.
knnmodel = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
knnpredictions = knnmodel.predict(X_test)
knn_report = classification_report(y_test, knnpredictions)
print(knn_report)

In [None]:
print(accuracy_score(y_test, knnpredictions))

## Supervised classification: logistic regression


In [None]:
from sklearn.linear_model import LogisticRegression
# max_iter is raised from the default of 100. The features are not scaled in
# this notebook, and with so few iterations the solver typically stops early
# with a ConvergenceWarning, leaving a model that has not converged.
logreg = LogisticRegression(random_state=16, max_iter=1000)
logreg.fit(X_train, y_train)
# Predictions on the held-out rows.
y_pred = logreg.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
print(accuracy_score(y_test, y_pred))

## Supervised classification: Naive Bayes


In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with default settings.
# NOTE(review): this rebinds `model`, which earlier held the fitted decision
# tree; after this cell `model` refers to the Naive Bayes classifier.
model = GaussianNB()
# The trailing ';' suppresses the estimator repr in the cell output.
model.fit(X_train, y_train);

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    classification_report,
)

# Predictions of the Naive Bayes classifier on the held-out rows.
y_pred = model.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy is
# symmetric, but the weighted F1 weights each class by its support in y_true, so
# passing the arguments swapped weighted by the *predicted* class counts instead.
accuray = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")

print("Accuracy:", accuray)
print("F1 Score:", f1)

# Feature Importance

In [None]:
def feature_imp(df,model):
  """Rank the features of a fitted model by importance.

  Args:
    df: DataFrame whose columns are the features the model was fitted on, in
      the same order.
    model: either a fitted search object exposing ``best_estimator_`` (for
      example a fitted GridSearchCV) or a fitted estimator that exposes
      ``feature_importances_`` directly.

  Returns:
    DataFrame with the columns ``feature`` and ``importance``, sorted by
    importance in descending order.

  Raises:
    AttributeError: if the resolved estimator has no ``feature_importances_``.
    ValueError: if the number of importances differs from the number of columns.
  """
  # Unwrap search objects; plain estimators are used as they are.
  estimator = getattr(model, 'best_estimator_', model)
  feat = pd.DataFrame({
      'feature': list(df.columns),
      'importance': estimator.feature_importances_,
  })
  return feat.sort_values(by='importance', ascending=False)

In [None]:
#feature_imp(X_train,grid_search).plot('feature','importance','barh',figsize=(10,7),legend=False)

In [None]:
train.head()

In [None]:
train.columns

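The raw training data (train_original) has 12 independent variables and 1 target variable, Loan_Status. Note that `train` itself has since had Loan_ID dropped and its categorical columns one-hot encoded, so its column list differs from the raw file.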

In [None]:
predict.columns

We have similar features in the predict dataset as the training dataset except for the Loan_Status. We will predict the Loan_Status using the model built using the train data.

In [None]:
train.dtypes

object: Object format means the variables are categorical. In the raw data, the categorical variables are Loan_ID, Gender, Married, Dependents, Education, Self_Employed, Property_Area and Loan_Status.


int64: It represents the integer variables. ApplicantIncome is of this format.

float64: It represents the variable that has some decimal values involved. They are also numerical

In [None]:
predict.dtypes

In [None]:
train.shape

The raw training dataset has 614 rows and 13 columns.

In [None]:
predict.shape


The raw test dataset has 367 rows and 12 columns.

We fill any remaining missing values in the dataset using the code below, then confirm that none are left.

In [None]:
# Fill any remaining gaps with the column mean in one vectorised call.
# numeric_only=True computes means for numeric columns only, so a non-numeric
# column can no longer make .mean() raise, as it did in the per-column loop.
train = train.fillna(train.mean(numeric_only=True))
# Remaining missing values per column (expected to be all zeros).
train.isna().sum()

In [None]:
train['Loan_Status'].value_counts()

In [None]:
train['Loan_Status'].value_counts(normalize=True)

In [None]:
train['Loan_Status'].value_counts().plot.bar()

Loans were approved for 422 of the 614 applicants (around 69%).

In [None]:
# Distribution of LoanAmount. (A bare train.notna() call used to precede the
# plot; its result was discarded, so it had no effect and has been removed.)
sns.distplot(train['LoanAmount'])
plt.show()

In [None]:
train['LoanAmount'].plot.box(figsize=(16,5))
plt.show()

We see a lot of outliers in this variable and the distribution is fairly normal. We will treat the outliers in later sections.

# Feature Engineering

Based on the domain knowledge, we can come up with new features that might affect the target variable. We will create the following three new features:

Total Income - As discussed during bivariate analysis we will combine the Applicant Income and Coapplicant Income. If the total income is high, chances of loan approval might also be high.

EMI - EMI is the monthly amount to be paid by the applicant to repay the loan. The idea behind this variable is that people who have high EMIs might find it difficult to pay back the loan. We can calculate the EMI as the ratio of the loan amount to the loan amount term.

Balance Income - This is the income left after the EMI has been paid. Idea behind creating this variable is that if this value is high, the chances are high that a person will repay the loan and hence increasing the chances of loan approval.

In [None]:
# Total income = applicant income + co-applicant income, added to both frames.
for frame in (train, predict):
    frame['Total_Income'] = frame['ApplicantIncome'] + frame['CoapplicantIncome']

In [None]:
train.head(5)

In [None]:
predict.head(5)

In [None]:
sns.distplot(train['Total_Income'])

In [None]:
# Natural log of the total income, added to both frames.
# NOTE(review): np.log yields -inf for a total income of 0 - confirm that no
# such rows exist.
for frame in (train, predict):
    frame['Total_Income_log'] = np.log(frame['Total_Income'])
# Distribution of the log-transformed income in the training data.
sns.distplot(train['Total_Income_log']);

In [None]:
# EMI = loan amount divided by the loan term, added to both frames.
for frame in (train, predict):
    frame['EMI'] = frame['LoanAmount'] / frame['Loan_Amount_Term']

In [None]:
sns.distplot(train['EMI'])

In [None]:
# Balance income = total income minus the EMI, added to both frames.
# The EMI is multiplied by 1000 first - presumably because LoanAmount is
# recorded in thousands; TODO confirm against the data dictionary.
for frame in (train, predict):
    frame['Balance Income'] = frame['Total_Income'] - (frame['EMI'] * 1000)
sns.distplot(train['Balance Income'])