# LOAN APPROVAL PREDECTIONS

##### Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pickle

##### 1. Loading the data

In [2]:
train_data = pd.read_csv('train[1].csv')
test_data = pd.read_csv('test[1].csv')

##### 2. Processing and Handling missing values

In [3]:
# Fill missing values in train_data
train_data['LoanAmount'].fillna(train_data['LoanAmount'].mean(), inplace=True)
train_data['Loan_Amount_Term'].fillna(train_data['Loan_Amount_Term'].mode()[0], inplace=True)
train_data['Credit_History'].fillna(train_data['Credit_History'].mode()[0], inplace=True)

# Fill missing values in test_data
test_data['LoanAmount'].fillna(test_data['LoanAmount'].mean(), inplace=True)
test_data['Loan_Amount_Term'].fillna(test_data['Loan_Amount_Term'].mode()[0], inplace=True)
test_data['Credit_History'].fillna(test_data['Credit_History'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['LoanAmount'].fillna(train_data['LoanAmount'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Loan_Amount_Term'].fillna(train_data['Loan_Amount_Term'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method

In [4]:
# Drop Loan_ID
train_data.drop('Loan_ID', axis=1, inplace=True)
test_ids = test_data['Loan_ID']
test_data.drop('Loan_ID', axis=1, inplace=True)

##### 4. Encode categorical values

In [5]:
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
encoder = LabelEncoder()
for col in categorical_columns:
    if col in train_data.columns:
        train_data[col] = encoder.fit_transform(train_data[col].astype(str))
    if col in test_data.columns:
        test_data[col] = encoder.transform(test_data[col].astype(str))

##### 5. Separating features and target

In [6]:
X = train_data.drop('Loan_Status', axis=1)
y = train_data['Loan_Status']

##### 6. Scaling the data

In [7]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
test_data = scaler.transform(test_data)

In [8]:
# Split the training data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

##### 7. Traning Models

In [9]:
log_reg = LogisticRegression(max_iter=1000, random_state=42)
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)
knn = KNeighborsClassifier()
svm = SVC(random_state=42)


In [10]:
#Train and evaluate each model
models = {
    "Logistic Regression": log_reg,
    "Decision Tree": decision_tree,
    "Random Forest": random_forest,
    "K-Nearest Neighbors": knn,
    "Support Vector Machine": svm
}

In [11]:
accuracies = {}
trained_models = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model
    y_pred = model.predict(X_val)  # Predict on validation set
    acc = accuracy_score(y_val, y_pred)  # Calculate accuracy
    accuracies[model_name] = acc  # Store accuracy
    trained_models[model_name] = model  # Store trained model

# Print model accuracies
print("Model Accuracies:")
for model_name, accuracy in accuracies.items():
    print(f"{model_name}: {accuracy:.4f}")

Model Accuracies:
Logistic Regression: 0.8618
Decision Tree: 0.7317
Random Forest: 0.8374
K-Nearest Neighbors: 0.8618
Support Vector Machine: 0.8455


In [12]:
# Sort the models by accuracy
accuracy_results = {
    "Model": list(accuracies.keys()),
    "Accuracy": list(accuracies.values())
}
accuracy_df = pd.DataFrame(accuracy_results)
accuracy_df = accuracy_df.sort_values(by="Accuracy", ascending=False)
print(accuracy_df)

                    Model  Accuracy
0     Logistic Regression  0.861789
3     K-Nearest Neighbors  0.861789
4  Support Vector Machine  0.845528
2           Random Forest  0.837398
1           Decision Tree  0.731707


In [13]:
# Select the best model based on accuracy
best_model_name = max(accuracies, key=accuracies.get)
final_model = trained_models[best_model_name]
print(f"Best Model: {best_model_name} with Accuracy: {accuracies[best_model_name]:.4f}")

Best Model: Logistic Regression with Accuracy: 0.8618


##### 8. Make predictions on the test data using the best model

In [14]:
predictions = final_model.predict(test_data) 

##### 9. Save the final model to a pickle file

In [15]:
pickle_file = f'{best_model_name.replace(" ", "_").lower()}_model.pkl'
with open(pickle_file, 'wb') as file:
    pickle.dump(final_model, file)
print(f"Final model saved to {pickle_file}")

Final model saved to logistic_regression_model.pkl


In [16]:
candidate_data = {
    'Gender': 1,  # 1 for Male, 0 for Female
    'Married': 0,  # 0 for No, 1 for Yes
    'Dependents': 0,  # Number of dependents
    'Education': 0,  # 0 for Graduate, 1 for Not Graduate
    'Self_Employed': 1,  # 1 for Yes, 0 for No
    'Property_Area': 0,  # 0 for Urban, 1 for Semiurban, 2 for Rural
    'LoanAmount': 135000.0,
    'Loan_Amount_Term': 360,  # Term in months (example)
    'Credit_History': 1,  # 1 for Good, 0 for Bad
    'ApplicantIncome': 568800.0,
    'CoapplicantIncome': 20560.5,
}

candidate_data_list = list(candidate_data.values())
candidate_data_list = [candidate_data_list] 
candidate_data_scaled = scaler.transform(candidate_data_list)
y_pred_dt = final_model.predict(candidate_data_scaled)
prediction_label = 'YES' if y_pred_dt[0] == 1 else 'NO'
print(f"Loan Status for this Candidate: {prediction_label}")

Loan Status for this Candidate: YES




In [17]:
Candidate_data = {
    'Gender': 0,
    'Married': 0,
    'Dependents': 1,
    'Education': 1,
    'Self_Employed': 0,
    'Property_Area': 2,
    'LoanAmount': 150000.0,
    'Loan_Amount_Term': 180,
    'Credit_History': 0,
    'ApplicantIncome': 25000.0,
    'CoapplicantIncome': 10000.0,
}

Candidate_data_list = list(Candidate_data.values())
Candidate_data_list = [Candidate_data_list]
Candidate_data_scaled = scaler.transform(Candidate_data_list)
y_pred_no = final_model.predict(Candidate_data_scaled)
prediction_label_no = 'Loan cannot be given' if y_pred_no[0] == 0 else 'NO'
print(f"Loan Status for this Candidate: {prediction_label_no}")


Loan Status for this Candidate: NO


