In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the training CSV into the DataFrame `df` used throughout the notebook.
train_csv_path = "../input/loan-eligible-dataset/loan-train.csv"
df = pd.read_csv(train_csv_path)

In [3]:
# Preview the first five rows of the training frame.
df.head(n=5)

In [4]:
# Dimensions of the training frame as a (rows, columns) tuple.
df.shape

In [5]:
# Print each column's dtype and non-null count to spot missing data early.
df.info()

In [6]:
# Summary statistics for the numeric columns of the training frame.
df.describe()
# Alternative kept for reference: summarise the object (string) columns instead.
#df.describe(include = 'O')

In [7]:
# Contingency table (crosstab) of two categorical variables:
# rows are the Credit_History values, columns are the Loan_Status values,
# and every cell counts the applicants having that combination.
pd.crosstab(index=df['Credit_History'], columns=df['Loan_Status'])

In [8]:
# Box plot of ApplicantIncome to inspect its spread and extreme values.
income_column = 'ApplicantIncome'
df.boxplot(column=income_column)

In [9]:
# Histogram of ApplicantIncome using 20 bins.
applicant_income = df['ApplicantIncome']
applicant_income.hist(bins=20)

**Applicant Income is right skewed**

In [10]:
# Sample skewness of ApplicantIncome; a positive value indicates a longer right tail.
applicant_income = df['ApplicantIncome']
applicant_income.skew()

In [11]:
# Histogram of CoapplicantIncome using 20 bins.
coapplicant_income = df['CoapplicantIncome']
coapplicant_income.hist(bins=20)

**CoApplicant Income is also right skewed**

In [12]:
# Sample skewness of CoapplicantIncome; a positive value indicates a longer right tail.
coapplicant_income = df['CoapplicantIncome']
coapplicant_income.skew()

In [13]:
# One ApplicantIncome box per Education group, drawn side by side for comparison.
income_column = 'ApplicantIncome'
group_column = 'Education'
df.boxplot(column=income_column, by=group_column)

In [14]:
# Box plot of LoanAmount to inspect its spread and extreme values.
loan_column = 'LoanAmount'
df.boxplot(column=loan_column)

In [15]:
# Histogram of LoanAmount using 20 bins.
loan_amount = df['LoanAmount']
loan_amount.hist(bins=20)

**Loan Amount is right skewed**

In [16]:
# Sample skewness of LoanAmount; a positive value indicates a longer right tail.
loan_amount = df['LoanAmount']
loan_amount.skew()

In [17]:
# Log-transform LoanAmount to reduce its right skew and store it as a new column.
# Rows whose LoanAmount is still missing at this point stay NaN in LoanAmount_log.
loan_amount_log = np.log(df['LoanAmount'])
df['LoanAmount_log'] = loan_amount_log
# Plot the transformed distribution and print its skewness for comparison.
df['LoanAmount_log'].hist(bins=20)
print(df['LoanAmount_log'].skew())

In [18]:
# Number of missing values in each column before imputation.
df.isna().sum()

In [19]:
# Replace missing Gender entries with the most frequent value in the column.
gender_mode = df['Gender'].mode()[0]
df['Gender'] = df['Gender'].fillna(gender_mode)

In [20]:
# Replace missing Married entries with the most frequent value in the column.
married_mode = df['Married'].mode()[0]
df['Married'] = df['Married'].fillna(married_mode)

In [21]:
# Replace missing Dependents entries with the most frequent value in the column.
dependents_mode = df['Dependents'].mode()[0]
df['Dependents'] = df['Dependents'].fillna(dependents_mode)

In [22]:
# Replace missing Self_Employed entries with the most frequent value in the column.
self_employed_mode = df['Self_Employed'].mode()[0]
df['Self_Employed'] = df['Self_Employed'].fillna(self_employed_mode)

In [23]:
# Replace missing LoanAmount entries with the column mean.
loan_amount_mean = df['LoanAmount'].mean()
df['LoanAmount'] = df['LoanAmount'].fillna(loan_amount_mean)

In [24]:
# Replace missing Loan_Amount_Term entries with the most frequent term.
loan_term_mode = df['Loan_Amount_Term'].mode()[0]
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(loan_term_mode)

In [25]:
# Replace missing Credit_History entries with the most frequent value.
credit_history_mode = df['Credit_History'].mode()[0]
df['Credit_History'] = df['Credit_History'].fillna(credit_history_mode)

In [26]:
# Re-check the per-column missing counts after the imputations above.
df.isna().sum()

In [27]:
# LoanAmount_log was derived before LoanAmount was imputed, so it still holds NaN
# for those rows; fill them with the mean of the log-transformed column.
loan_amount_log_mean = df['LoanAmount_log'].mean()
df['LoanAmount_log'] = df['LoanAmount_log'].fillna(loan_amount_log_mean)

In [28]:
# Final check of the per-column missing counts.
df.isna().sum()

In [29]:
# Combined income: applicant plus co-applicant, stored as TotalIncome.
total_income = df['ApplicantIncome'] + df['CoapplicantIncome']
df['TotalIncome'] = total_income
# Log-transformed copy of the combined income, stored as TotalIncome_log.
df['TotalIncome_log'] = np.log(df['TotalIncome'])


In [30]:
# Plot the log-transformed total income and report its skewness.
total_income_log = df['TotalIncome_log']
total_income_log.hist(bins=20)
total_income_log.skew()

**Right Skewness is reduced**

In [31]:
# Preview the frame again, now including the derived columns.
df.head(n=5)

## Dividing the dataset into dependent and independent variables

In [32]:
# Build the feature matrix and target vector by column POSITION.
# The positions depend on the current column order of `df`, including the
# derived columns appended in earlier cells.
# NOTE(review): presumably position 12 is Loan_Status - confirm against df.columns.
feature_positions = np.r_[1:5, 9:11, 13:15]
target_position = 12
X = df.iloc[:, feature_positions].values
y = df.iloc[:, target_position].values

In [33]:
# Display the feature matrix built from the selected columns.
X

In [34]:
# Display the target vector.
y

In [35]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [36]:
# Display the training features before any encoding or scaling.
X_train

In [37]:
from sklearn.preprocessing import LabelEncoder
# Single encoder instance that the following cells re-fit on one feature column at a time.
labelencoder_X = LabelEncoder()

In [38]:
# Integer-encode the first five feature columns of the training matrix in place.
# The one encoder is re-fitted on every column, so after the loop it only
# remembers the classes of the last column it processed.
for col_idx in range(5):
    encoded_column = labelencoder_X.fit_transform(X_train[:, col_idx])
    X_train[:, col_idx] = encoded_column

In [39]:
# Integer-encode the column at index 7 of the training matrix as well.
extra_col = 7
X_train[:, extra_col] = labelencoder_X.fit_transform(X_train[:, extra_col])


In [40]:
# Display the training features after label encoding.
X_train

In [41]:
# Learn the target classes from the training labels, then map them to integers.
labelencoder_y = LabelEncoder()
labelencoder_y.fit(y_train)
y_train = labelencoder_y.transform(y_train)

In [42]:
# Display the integer-encoded training labels.
y_train

In [43]:
# Integer-encode the first five feature columns of the held-out matrix in place.
# NOTE(review): the encoder is re-fitted on the held-out values here, so the
# integer codes only match the training codes when each column contains the
# same set of distinct values in both splits - confirm this is acceptable.
for col_idx in range(5):
    encoded_column = labelencoder_X.fit_transform(X_test[:, col_idx])
    X_test[:, col_idx] = encoded_column

In [44]:
# Integer-encode the column at index 7 of the held-out matrix
# (the encoder is re-fitted on the held-out values).
extra_col = 7
X_test[:, extra_col] = labelencoder_X.fit_transform(X_test[:, extra_col])

In [45]:
# Encode the held-out labels with the mapping learned from y_train.
# Only transform here: re-fitting on y_test would rebuild the class -> integer
# mapping from the evaluation labels, which can differ from the training mapping
# (for example if a class is absent from this split). transform() raises
# ValueError on a label that was never seen during fitting.
y_test = labelencoder_y.transform(y_test)

In [46]:
# Display the integer-encoded held-out labels.
y_test

In [47]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and variance from the training split only.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
# Apply those same statistics to the held-out split. Re-fitting on X_test would
# standardise it with its own mean/variance, which is inconsistent with what the
# model is trained on and leaks evaluation data into the preprocessing step.
X_test = ss.transform(X_test)

In [48]:
# Display the standardised training features.
X_train

# Applying Decision Tree Algorithm

In [49]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree; the fixed seed makes tie-breaking reproducible.
tree_params = {'criterion': 'entropy', 'random_state': 0}
DTClassifier = DecisionTreeClassifier(**tree_params)
DTClassifier.fit(X_train, y_train)

In [50]:
# Predict labels for the held-out split with the fitted decision tree, then display them.
y_pred = DTClassifier.predict(X_test)
y_pred

In [51]:
from sklearn import metrics
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that
# order so the call stays correct if it is later swapped for an asymmetric
# metric. Accuracy itself is symmetric, so the printed value is unchanged.
print("The accuracy of decision tree is: ", metrics.accuracy_score(y_test, y_pred))

# Applying Naive Bayes Algorithm

In [52]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, fitted on the training split.
NBClassifier = GaussianNB()
NBClassifier.fit(X=X_train, y=y_train)

In [53]:
# Predict labels for the held-out split with the fitted Naive Bayes model.
# This rebinds y_pred, replacing the decision-tree predictions.
y_pred = NBClassifier.predict(X_test)

In [54]:
# Display the Naive Bayes predictions for the held-out split.
y_pred

In [55]:
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that
# order. Accuracy is symmetric, so the printed value is unchanged.
print("The accuracy of Naive Bayes is: ", metrics.accuracy_score(y_test, y_pred))

**We are going to use Naive Bayes as it has the better accuracy**

# EDA, PREPROCESSING AND PREDICTION FOR TEST DATA

In [56]:
# Load the test CSV into the DataFrame `testdata`.
test_csv_path = "../input/loan-eligible-dataset/loan-test.csv"
testdata = pd.read_csv(test_csv_path)

In [57]:
# Preview the first five rows of the test frame.
testdata.head(n=5)

In [58]:
# Number of missing values in each test column before imputation.
testdata.isna().sum()

In [59]:
# Fill missing values in the test frame with statistics taken from the TRAINING
# frame `df`, so both files are imputed with the same values and nothing is
# estimated from the data being scored.
# 'Married' is included for parity with the training imputation; fillna is a
# no-op for any column that has no missing entries.
for column in ['Gender', 'Married', 'Dependents', 'Self_Employed',
               'Loan_Amount_Term', 'Credit_History']:
    # Categorical / discrete columns: most frequent training value.
    testdata[column] = testdata[column].fillna(df[column].mode()[0])
# Continuous column: training mean.
testdata['LoanAmount'] = testdata['LoanAmount'].fillna(df['LoanAmount'].mean())

In [60]:
# Re-check the per-column missing counts of the test frame after imputation.
testdata.isna().sum()

In [61]:
# Log-transform the test LoanAmount to reduce its right skew and store it as a new column.
test_loan_amount_log = np.log(testdata['LoanAmount'])
testdata['LoanAmount_log'] = test_loan_amount_log
# Plot the transformed distribution and print its skewness.
testdata['LoanAmount_log'].hist(bins=20)
print(testdata['LoanAmount_log'].skew())

In [62]:
# Combined income for the test frame: applicant plus co-applicant.
test_total_income = testdata['ApplicantIncome'] + testdata['CoapplicantIncome']
testdata['TotalIncome'] = test_total_income
# Log-transformed copy of the combined income, stored as TotalIncome_log.
testdata['TotalIncome_log'] = np.log(testdata['TotalIncome'])
# Plot the transformed distribution and report its skewness.
test_total_income_log = testdata['TotalIncome_log']
test_total_income_log.hist(bins=20)
test_total_income_log.skew()

In [63]:
# Preview the test frame again, now including the derived columns.
testdata.head(n=5)

In [64]:
# Select the same feature columns the model was trained on, by NAME.
# The training matrix was built from `df` by position; those positions are only
# valid for `df`, so translate them into column labels first.
# NOTE(review): presumably the test CSV has no Loan_Status column, in which case
# every column after it sits one position earlier than in `df` and a positional
# slice would pick different features than the model was trained on - confirm.
feature_columns = df.columns[np.r_[1:5, 9:11, 13:15]]
test = testdata[feature_columns].values

In [65]:
# Integer-encode the first five feature columns of the test matrix in place.
# NOTE(review): the encoder is re-fitted on the test values here, so the integer
# codes only match the training codes when each column contains the same set of
# distinct values in both files - confirm this is acceptable.
for col_idx in range(5):
    encoded_column = labelencoder_X.fit_transform(test[:, col_idx])
    test[:, col_idx] = encoded_column

In [66]:
# Integer-encode the column at index 7 of the test matrix
# (the encoder is re-fitted on the test values).
extra_col = 7
test[:, extra_col] = labelencoder_X.fit_transform(test[:, extra_col])

In [67]:
# Display the test feature matrix after label encoding.
test

In [68]:
# Only transform: `ss` was fitted earlier in the notebook. Re-fitting it on the
# rows being scored would standardise them with their own mean/variance instead
# of the statistics learned during model development.
test = ss.transform(test)

In [69]:
# Display the standardised test feature matrix.
test

In [70]:
# Predict a label for every row of the prepared test matrix with the Naive Bayes model.
pred = NBClassifier.predict(test)

In [71]:
# Display the predictions. They are integer codes, because the model was fitted
# on labels encoded by labelencoder_y.
pred