In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier

# Load the loan dataset from the project's GitHub repository.
# NOTE: the CSV is fetched over HTTPS by pandas, so a fresh run of this cell needs network access.
url = 'https://github.com/amitaslanov/Final-Project-Data-Mining-2024/raw/main/LoanData.csv'
data = pd.read_csv(url)

In [2]:
# One independent LabelEncoder per categorical column, so that each fitted
# mapping can be inspected (or inverted) separately later on.
(
    le_state,
    le_emp_length,
    le_homeownership,
    le_verified_income,
    le_loan_purpose,
    le_loan_status,
    le_has_second_income,
    le_grade,
) = (LabelEncoder() for _ in range(8))

In [3]:
# Replace each categorical column with its integer codes, using the encoder
# dedicated to that column.
# NOTE(review): the recorded mapping for 'emp_length' shows NaN encoded as 11,
# i.e. "missing" becomes the largest code - confirm that this is intended.
for _column, _encoder in (
    ('state', le_state),
    ('emp_length', le_emp_length),
    ('homeownership', le_homeownership),
    ('verified_income', le_verified_income),
    ('loan_purpose', le_loan_purpose),
    ('loan_status', le_loan_status),
    ('has_second_income', le_has_second_income),
):
    data[_column] = _encoder.fit_transform(data[_column])

# The original 'grade' column is kept as-is; its codes go into a separate column.
data['grade_encoded'] = le_grade.fit_transform(data['grade'])

In [4]:
# Show, for every encoder, which original value was assigned which integer code.
# Each heading after the first starts with a newline to separate the sections.
for _heading, _encoder in (
    ("State Encoding:", le_state),
    ("\nEmployment Length Encoding:", le_emp_length),
    ("\nHomeownership Encoding:", le_homeownership),
    ("\nVerified Income Encoding:", le_verified_income),
    ("\nLoan Purpose Encoding:", le_loan_purpose),
    ("\nLoan Status Encoding:", le_loan_status),
    ("\nHas Second Income Encoding:", le_has_second_income),
    ("\nGrade Encoding:", le_grade),
):
    _labels = _encoder.classes_
    print(_heading)
    print(dict(zip(_labels, _encoder.transform(_labels))))

State Encoding:
{'AZ': 0, 'CA': 1, 'CT': 2, 'FL': 3, 'HI': 4, 'IL': 5, 'IN': 6, 'MA': 7, 'MD': 8, 'MI': 9, 'MN': 10, 'MO': 11, 'MS': 12, 'NE': 13, 'NH': 14, 'NJ': 15, 'NV': 16, 'NY': 17, 'OH': 18, 'RI': 19, 'SC': 20, 'TX': 21, 'VA': 22, 'WI': 23, 'WV': 24}

Employment Length Encoding:
{0.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, 4.0: 4, 5.0: 5, 6.0: 6, 7.0: 7, 8.0: 8, 9.0: 9, 10.0: 10, nan: 11}

Homeownership Encoding:
{'mortgage': 0, 'own': 1, 'rent': 2}

Verified Income Encoding:
{'Not Verified': 0, 'Source Verified': 1, 'Verified': 2}

Loan Purpose Encoding:
{'car': 0, 'credit_card': 1, 'debt_consolidation': 2, 'home_improvement': 3, 'house': 4, 'other': 5, 'renewable_energy': 6, 'small_business': 7}

Loan Status Encoding:
{'Current': 0, 'Fully Paid': 1}

Has Second Income Encoding:
{False: 0, True: 1}

Grade Encoding:
{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}


In [5]:
# Calculate the correlation matrix over the numeric columns only.
# `data` still contains the string-valued 'grade' column (only 'grade_encoded'
# is numeric), and since pandas 2.0 DataFrame.corr() no longer drops
# non-numeric columns by default - without numeric_only=True it raises.
correlation_matrix = data.corr(numeric_only=True)

# Correlation of every numeric column with the encoded loan grade
target_correlation = correlation_matrix['grade_encoded']

# Rank the other columns by absolute correlation and keep the five strongest
top_5_correlated_features = (
    target_correlation
    .drop('grade_encoded')
    .abs()
    .sort_values(ascending=False)
    .head(5)
)

# Column names only, in ranking order; this is also the model's feature order
top_features_list = list(top_5_correlated_features.index)

In [6]:
# Features: the columns selected by the correlation ranking.
# Target: the original 'grade' column (not 'grade_encoded'), so the model
# predicts the grade labels themselves.
X, y = data[top_features_list], data['grade']

In [7]:
# Split the data into training and testing sets:
# 20% of the rows are held out, and the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a gradient-boosting classifier (default hyperparameters, fixed seed)
# on the training portion. fit() is the cell's last expression, so the
# estimator's repr is what the cell displays.
gbm = GradientBoostingClassifier(random_state=42)
gbm.fit(X_train, y_train)

GradientBoostingClassifier(random_state=42)

In [8]:
import tkinter as tk
from tkinter import ttk

# Function to predict and display the score
def predict_score():
    """Read the five form fields, run the trained model and show the result.

    Used as the ``command`` callback of the "Predict Grade" button. Reads the
    module-level entry widgets and ``gbm`` and writes the outcome to
    ``prediction_label``.

    A blank, non-numeric or non-finite field is reported in the label instead
    of raising ``ValueError`` from inside the Tk callback, where the user
    would get no feedback at all.
    """
    # Order matters: the values are handed to the model positionally, so this
    # sequence must match the column order the model was trained on
    # (``top_features_list``).
    entries = (
        interest_rate_entry,
        verified_income_entry,
        loan_purpose_entry,
        annual_income_entry,
        term_entry,
    )

    try:
        input_data = []
        for entry in entries:
            value = float(entry.get())
            # float() accepts "nan" and "inf"; neither is a usable model input.
            if value != value or abs(value) == float("inf"):
                raise ValueError("non-finite value")
            input_data.append(value)
    except ValueError:
        prediction_label.config(text="Invalid input: please enter a number in every field.")
        return

    # The model was fitted on a DataFrame. Passing a bare list makes
    # scikit-learn warn about missing feature names, so label the single row
    # with the model's own feature names when it exposes them. The labelling
    # is positional, so the prediction is the same as for the bare list.
    feature_names = getattr(gbm, "feature_names_in_", None)
    if feature_names is not None and len(feature_names) == len(input_data):
        row = pd.DataFrame([input_data], columns=list(feature_names))
    else:
        row = [input_data]

    prediction = gbm.predict(row)

    # Display the prediction
    prediction_label.config(text=f"Predicted Score: {prediction[0]}")

# Set up the GUI
root = tk.Tk()
root.title("Loan Grade Prediction")


def _add_field(row, caption, help_text):
    """Add one form row (caption, entry box, help text) and return the entry widget."""
    entry = ttk.Entry(root)
    entry.grid(row=row, column=1)
    ttk.Label(root, text=caption).grid(row=row, column=0)
    ttk.Label(root, text=help_text).grid(row=row, column=2)
    return entry


# One entry per model input. predict_score() reads these module-level names
# directly, so they must keep these exact names.
interest_rate_entry = _add_field(
    0,
    "Interest Rate (0-30%):",
    "The annual interest rate for the loan, expressed as a percentage",
)

# The numeric codes in the help texts below mirror the label encodings
# printed earlier in the notebook.
verified_income_entry = _add_field(
    1,
    "Verified Income (0-2):",
    "0: not verified, 1: source verified, 2: verified",
)

loan_purpose_entry = _add_field(
    2,
    "Loan Purpose (0-7):",
    "0: car, 1: credit card, 2: debt consolidation, 3: home improvement, 4: house, 5: other, 6: renewable energy, 7: small business",
)

annual_income_entry = _add_field(
    3,
    "Annual Income ($):",
    "The annual income of the applicant in dollars",
)

term_entry = _add_field(
    4,
    "Term (months):",
    "The term of the loan in months",
)

# Prediction button
predict_button = ttk.Button(root, text="Predict Grade", command=predict_score)
predict_button.grid(row=5, column=0, columnspan=2)

# Label to display the prediction
prediction_label = ttk.Label(root, text="Predicted Score: ")
prediction_label.grid(row=6, column=0, columnspan=2)

# Blocks until the window is closed; the notebook kernel stays busy meanwhile.
root.mainloop()