# LOAN PREDICTION

Binary classification using Logistic Regression

In [None]:
from io import IncrementalNewlineDecoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore'),

Importing and Loading Dataset

In [None]:
# from google.colab import files
# uploaded = files.upload()


In [None]:
# Read the training data from the CSV file in the working directory.
df = pd.read_csv('train.csv')

# Preview only the first rows rather than rendering the whole frame.
df.head()

Dataset info

In [None]:
# Print a per-column summary (non-null counts and dtypes) of the raw frame
df.info()

Dataset Shape

In [None]:
# (rows, columns) of the frame as loaded, before any imputation or row drops
df.shape

Checking the Missing Values

In [None]:
# Number of missing entries in each column of the raw data
df.isna().sum()

First, we will fill the missing values in "LoanAmount" and "Credit_History" with the mean and the median of the respective variables (mean for "LoanAmount", median for "Credit_History").

In [None]:
# Replace missing loan amounts with the mean of the observed values
loan_amount_mean = df['LoanAmount'].mean()
df['LoanAmount'] = df['LoanAmount'].fillna(loan_amount_mean)

In [None]:
# Replace missing credit-history flags with the median of the observed values
credit_history_median = df['Credit_History'].median()
df['Credit_History'] = df['Credit_History'].fillna(credit_history_median)

Let's confirm whether there are any missing values left in 'LoanAmount' and 'Credit_History'.

In [None]:
# Missing entries per column after imputing LoanAmount and Credit_History
df.isna().sum()

Now lets drop all the missing values remaining

In [None]:
# Remove, in place, every row that still has at least one missing value
df.dropna(axis=0, how='any', inplace=True)

Lets check missing values for the final time

In [None]:
# Final check: every column should now report zero missing entries
df.isna().sum()

Here, we have dropped all the remaining rows with missing values to avoid disturbances in the model. Loan prediction requires all the details to work efficiently, and thus the missing values are dropped.

Now, lets check the final Dataset Shape

In [None]:
# (rows, columns) after imputation and after dropping incomplete rows
df.shape

# Exploratory Data Analysis

Comparison between Parameters in getting the Loan:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One large canvas holding a 3x3 grid; only the first five slots are used.
plt.figure(figsize=(100, 50))
# NOTE: sns.set changes global plotting defaults, so the enlarged font scale
# also applies to every figure drawn after this cell.
sns.set(font_scale=5)

# Categorical columns to compare against the loan outcome, in panel order
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

for panel, column in enumerate(categorical_columns, start=1):
    plt.subplot(3, 3, panel)
    sns.countplot(data=df, x=column, hue='Loan_Status')

plt.tight_layout()
plt.show()

# Lets replace the Variable values to Numerical form & display the Value Counts

The data in Numerical form avoids disturbances in building the model.

In [None]:
# Encode the target as integers: 'Y' -> 1, 'N' -> 0.
# The result is assigned back to the column instead of calling
# `df['Loan_Status'].replace(..., inplace=True)`: that chained in-place call
# operates on a temporary Series and leaves `df` unchanged under pandas
# Copy-on-Write. Unlike `.map`, `.replace` leaves already-encoded values
# untouched, so re-running this cell is harmless.
# `.astype(int)` guarantees a numeric dtype and fails loudly if any value
# other than 'Y'/'N' (or 1/0) is present.
df['Loan_Status'] = df['Loan_Status'].replace({'Y': 1, 'N': 0}).astype(int)

In [None]:
# Class balance of the encoded target (1 was 'Y', 0 was 'N')
df['Loan_Status'].value_counts()

In [None]:
# Encode Gender numerically; values absent from the mapping become NaN,
# so this cell must be run exactly once per load of the data.
gender_codes = {'Male': 1, 'Female': 0}
df['Gender'] = df['Gender'].map(gender_codes)
df['Gender'].value_counts()

In [None]:
# Encode Married numerically; values absent from the mapping become NaN,
# so this cell must be run exactly once per load of the data.
married_codes = {'Yes': 1, 'No': 0}
df['Married'] = df['Married'].map(married_codes)
df['Married'].value_counts()

In [None]:
# Convert the Dependents labels to integers; the open-ended '3+' bucket is
# stored as 3. Values absent from the mapping become NaN.
dependents_codes = {'0': 0, '1': 1, '2': 2, '3+': 3}
df['Dependents'] = df['Dependents'].map(dependents_codes)
df['Dependents'].value_counts()

In [None]:
# Encode Education numerically; values absent from the mapping become NaN,
# so this cell must be run exactly once per load of the data.
education_codes = {'Graduate': 1, 'Not Graduate': 0}
df['Education'] = df['Education'].map(education_codes)
df['Education'].value_counts()

In [None]:
# Encode Self_Employed numerically; values absent from the mapping become NaN,
# so this cell must be run exactly once per load of the data.
self_employed_codes = {'Yes': 1, 'No': 0}
df['Self_Employed'] = df['Self_Employed'].map(self_employed_codes)
df['Self_Employed'].value_counts()

In [None]:
# Encode Property_Area as Rural=0, Semiurban=1, Urban=2; values absent from
# the mapping become NaN.
property_area_codes = {'Urban': 2, 'Rural': 0, 'Semiurban': 1}
df['Property_Area'] = df['Property_Area'].map(property_area_codes)
df['Property_Area'].value_counts()

In [None]:
# Frequency of each distinct loan amount (includes the mean used for imputation)
df['LoanAmount'].value_counts()

In [None]:
# Frequency of each distinct loan term
df['Loan_Amount_Term'].value_counts()

In [None]:
# Frequency of each credit-history value (missing entries were median-filled earlier)
df['Credit_History'].value_counts()

Credit_History (independent variable) has the maximum correlation with Loan_Status (dependent variable), which denotes that the Loan_Status is heavily dependent on the Credit_History. (Note: the correlation figure this statement refers to is not produced by any cell in this notebook.)

# Final DataFrame

In [None]:
# Preview the first rows of the cleaned, numerically encoded frame
df.head()

# Importing Packages for Classification algorithms

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Splitting the Data into Train and Test set

In [None]:
# Build the feature matrix and the target vector from ALL remaining rows.
# The previous slice `1:542` skipped the first row (iloc is 0-based) and
# hard-coded the row count, which breaks as soon as the number of rows
# left after `dropna` changes.
# Columns 1..11 are taken as features and column 12 as the target
# (assumes column 0 is an identifier and column 12 is Loan_Status - verify
# against `df.columns`).
x = df.iloc[:, 1:12].values
y = df.iloc[:, 12].values

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

# Logistic Regression (LR)

Logistic Regression is a supervised learning classification algorithm used to predict the probability of a target variable.

Mathematically, a logistic regression model predicts P(y=1) as a function of x. It is one of the simplest ML algorithms that can be used for various classification problems such as spam detection, Diabetes prediction, Cancer detection etc.

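Sigmoid Function: $\sigma(z) = \dfrac{1}{1 + e^{-z}}$, which maps any real-valued score $z$ to a probability between 0 and 1.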

In [None]:
# Fit a logistic-regression classifier on the training split.
# The features are not scaled anywhere in this notebook, so the solver may
# need more than the default iteration budget to converge; because all
# warnings are suppressed at the top of the notebook, a ConvergenceWarning
# would otherwise go unnoticed. A larger `max_iter` guards against that.
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# Predict the held-out rows and report accuracy.
# accuracy_score expects (y_true, y_pred); this order also matches the
# decision-tree and random-forest cells.
lr_prediction = model.predict(x_test)
print('Logistic Regression accuracy= ', metrics.accuracy_score(y_test, lr_prediction))

In [None]:
import matplotlib.pyplot as plt

# Test-set accuracy of the logistic-regression predictions
accuracy_logistic_regression = metrics.accuracy_score(lr_prediction,y_test)

# Single-bar chart drawn through the object-oriented Matplotlib interface
fig, ax = plt.subplots(figsize=(50, 20))
ax.bar('Logistic Regression', accuracy_logistic_regression, color='lightblue')
ax.set_title('Accuracy of Logistic Regression Model')
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Write the numeric value just above the bar
ax.text(0, accuracy_logistic_regression + 0.01, f'{accuracy_logistic_regression:.4f}', ha='center')

plt.show()



In [None]:
# Show predicted and true test labels one after the other for a visual comparison
for caption, labels in (('y_predicted= ', lr_prediction), ('y_test= ', y_test)):
    print(caption, labels)

# CONCLUSION

1. The loan status is heavily dependent on the credit history for predictions.

2. The logistic regression algorithm gives us the maximum accuracy (79%) compared to the other machine learning classification algorithms (Decision Tree and Random Forest, evaluated in the sections below).

# Model Development

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Create the decision tree with a fixed seed for repeatable results.
# NOTE: this rebinds `model`, which previously held the fitted
# LogisticRegression; only `lr_prediction` keeps that model's results.
model = DecisionTreeClassifier(random_state=1)

In [None]:
# Fit the decision tree on the same training split used for logistic regression
model.fit(x_train,y_train)

In [None]:
# Decision-tree class predictions for the held-out test rows
predictions = model.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Per-class precision, recall, F1-score and support for the decision tree
print(classification_report(y_test, predictions))

In [None]:
# Share of test rows the decision tree classified correctly
decision_tree_accuracy = accuracy_score(y_test, predictions)
print('Decision Tree Accuracy: ', decision_tree_accuracy)

In [None]:
import matplotlib.pyplot as plt

# Test-set accuracy of the decision-tree predictions
accuracy_decision_tree = accuracy_score(y_test, predictions)

# Single-bar chart drawn through the object-oriented Matplotlib interface
fig, ax = plt.subplots(figsize=(50, 20))
ax.bar('Decision Tree', accuracy_decision_tree, color='lightgreen')
ax.set_title('Accuracy of Decision Tree Model')
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Write the numeric value just above the bar
ax.text(0, accuracy_decision_tree + 0.01, f'{accuracy_decision_tree:.4f}', ha='center')

plt.show()


# Random Forest

Random Forest is a machine learning algorithm used for classification and regression tasks. It is an ensemble learning technique that combines multiple decision trees to improve accuracy and prevent overfitting. The algorithm is based on the concept of bagging, where multiple subsets of the data are created and used to train individual decision trees. The final prediction is made by aggregating the predictions of all decision trees, either by voting for classification tasks or by averaging for regression tasks.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 500-tree random forest on the training split.
# A fixed `random_state` makes the bootstrap samples and feature sub-sampling
# repeatable, so the reported accuracy is the same on every Restart & Run All
# (the split and the decision tree in this notebook are seeded as well).
rfmodel = RandomForestClassifier(n_estimators=500, random_state=0)
rfmodel.fit(x_train, y_train)

In [None]:
# Random-forest class predictions for the held-out test rows
rfpredictions_test = rfmodel.predict(x_test)

In [None]:
from sklearn import metrics
# Share of test rows the random forest classified correctly
random_forest_accuracy = metrics.accuracy_score(y_test, rfpredictions_test)
print("Accuracy on the test set using Random Forest: ", random_forest_accuracy)

In [None]:
import matplotlib.pyplot as plt

# Test-set accuracy of the random-forest predictions
accuracy_random_forest = metrics.accuracy_score(y_test, rfpredictions_test)

# Single-bar chart drawn through the object-oriented Matplotlib interface
fig, ax = plt.subplots(figsize=(50, 20))
ax.bar('Random Forest', accuracy_random_forest, color='pink')
ax.set_title('Accuracy of Random Forest Model')
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Write the numeric value just above the bar
ax.text(0, accuracy_random_forest + 0.01, f'{accuracy_random_forest:.4f}', ha='center')

plt.show()


In [None]:
# Per-class precision, recall, F1-score and support for the random forest
print(classification_report(y_test, rfpredictions_test))

In [None]:
# Show the random-forest predictions next to the true labels of the SAME rows.
# The predictions were made on x_test, so they must be compared with y_test;
# printing y_train here displayed an unrelated vector of a different length.
print(rfpredictions_test)
print(y_test)

In [None]:
import matplotlib.pyplot as plt

# Model names and their test-set accuracies, in matching order
models = ['Logistic Regression', 'Decision Tree', 'Random Forest']
accuracies = [
    metrics.accuracy_score(lr_prediction,y_test),
    accuracy_score(y_test, predictions),
    metrics.accuracy_score(y_test, rfpredictions_test),
]

# Side-by-side bars drawn through the object-oriented Matplotlib interface
fig, ax = plt.subplots(figsize=(50, 20))
ax.bar(models, accuracies, color=['lightblue', 'lightgreen', 'pink'])
ax.set_title('Accuracy of Different Models')
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Write each numeric value just above its bar
for position, value in enumerate(accuracies):
    ax.text(position, value + 0.01, f'{value:.4f}', ha='center')

plt.show()


# GUI

In [1]:
import tkinter as tk
from tkinter import messagebox
import numpy as np
from sklearn.linear_model import LogisticRegression

# Function to train the machine learning model
def train_model(random_state=None):
    """Fit a logistic-regression model on random placeholder data.

    The data is synthetic: 100 rows of 3 uniformly random features with
    random 0/1 labels. The resulting model therefore carries no information
    about real loan applications; replace this body with training on the
    actual dataset before relying on its predictions.

    Parameters
    ----------
    random_state : int or None, optional
        Seed for the placeholder data. ``None`` (the default) draws fresh
        random data on every call, as before; pass an integer to get the
        same data, and hence the same fitted model, on every call.

    Returns
    -------
    LogisticRegression
        The fitted classifier.
    """
    rng = np.random.default_rng(random_state)

    # Example features (applicant income, loan amount, credit history)
    X_train = rng.random((100, 3))
    # Example labels (0 for rejected, 1 for approved)
    y_train = rng.integers(0, 2, size=100)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    return classifier

# Function to predict loan approval
def predict_loan():
    """Read the three entry fields, run the model and show the verdict.

    Reads applicant income, loan amount and credit history from the module
    level entry widgets, passes them as one row to ``model.predict`` and
    updates ``result_label`` with "Loan Approved" (prediction 1) or
    "Loan Rejected" (anything else).

    Any failure (non-numeric input, a credit-history value other than 0 or 1,
    or an error raised by the model) is reported in an error dialog and
    leaves ``result_label`` unchanged.
    """
    try:
        # Get input values from the user
        applicant_income = float(entry_income.get())
        loan_amount = float(entry_loan_amount.get())
        credit_history = int(entry_credit_history.get())

        # The form asks for a 0/1 flag; reject other integers rather than
        # feeding an out-of-range value to the model.
        if credit_history not in (0, 1):
            raise ValueError("Credit History must be 1 (Yes) or 0 (No)")

        # Make prediction using the model
        prediction = model.predict([[applicant_income, loan_amount, credit_history]])

        # Display prediction result
        if prediction[0] == 1:
            result_label.config(text="Loan Approved", fg="green")
        else:
            result_label.config(text="Loan Rejected", fg="red")
    except Exception as e:
        # Surface the problem to the user instead of failing silently
        messagebox.showerror("Error", str(e))

# Fit the classifier used by the Predict button.
# NOTE: train_model fits on random dummy data, and this assignment rebinds
# the notebook-level name `model`.
model = train_model()

# Top-level application window
root = tk.Tk()
root.title("Loan Predictor")


def _add_labelled_entry(row, caption):
    """Place a caption label and a text entry side by side on grid row `row`."""
    label = tk.Label(root, text=caption)
    label.grid(row=row, column=0, padx=10, pady=5)
    entry = tk.Entry(root)
    entry.grid(row=row, column=1, padx=10, pady=5)
    return label, entry


# One input row per value read by predict_loan
label_income, entry_income = _add_labelled_entry(0, "Applicant Income:")
label_loan_amount, entry_loan_amount = _add_labelled_entry(1, "Loan Amount:")
label_credit_history, entry_credit_history = _add_labelled_entry(
    2, "Credit History (1 for Yes, 0 for No):"
)

# Button that runs the prediction callback
predict_button = tk.Button(root, text="Predict", command=predict_loan)
predict_button.grid(row=3, column=0, columnspan=2, pady=10)

# Label that predict_loan fills with the verdict
result_label = tk.Label(root, text="", font=("Helvetica", 16))
result_label.grid(row=4, column=0, columnspan=2, pady=5)

# Hand control to Tk; this call blocks until the window is closed
root.mainloop()


ModuleNotFoundError: No module named 'numpy'