In [16]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import joblib

# Define dataset paths
train_file_path = "D:/AIT582/Final Project/Loan approval/train.csv"

# Load dataset
data = pd.read_csv(train_file_path)

# Drop the 'id' column and separate features and target
X = data.drop(columns=['id', 'loan_status'])
y = data['loan_status']

# Encode categorical variables using LabelEncoder
categorical_columns = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # Save the encoder for future use

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the feature names
feature_names = X.columns.tolist()

# Scale the numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Logistic Regression model
lr_model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
lr_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = lr_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Save the trained model, scaler, feature names, and LabelEncoders
joblib.dump(lr_model, "logistic_regression_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(feature_names, "feature_names.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")

accuracy, report


(0.7993008781652314,
 '              precision    recall  f1-score   support\n\n           0       0.96      0.80      0.87     10087\n           1       0.39      0.81      0.53      1642\n\n    accuracy                           0.80     11729\n   macro avg       0.68      0.81      0.70     11729\nweighted avg       0.88      0.80      0.82     11729\n')

In [3]:
# Inspect raw prediction probabilities of the Logistic Regression model
raw_probabilities = lr_model.predict_proba(X_test_scaled)

# Display probabilities for the first 5 samples
print(raw_probabilities[:5])


[[0.97994671 0.02005329]
 [0.99505991 0.00494009]
 [0.83427846 0.16572154]
 [0.80370869 0.19629131]
 [0.94346636 0.05653364]]


In [10]:
# Test the model on training or test data
sample_scaled = scaler.transform(X_test.iloc[:5])  # Use a few samples
sample_predictions = lr_model.predict(sample_scaled)
sample_probabilities = lr_model.predict_proba(sample_scaled)

print("Sample Predictions:", sample_predictions)
print("Sample Probabilities:", sample_probabilities)


Sample Predictions: [0 0 0 0 0]
Sample Probabilities: [[0.97994671 0.02005329]
 [0.99505991 0.00494009]
 [0.83427846 0.16572154]
 [0.80370869 0.19629131]
 [0.94346636 0.05653364]]


In [13]:
print("Class distribution in the training data:")
print(y_train.value_counts(normalize=True))


Class distribution in the training data:
0    0.857021
1    0.142979
Name: loan_status, dtype: float64


In [15]:
# Load the provided model files to inspect and verify their contents
import joblib

# Define file paths
feature_names_path = "D:/AIT582/Final Project/FlaskLoan/models/feature_names.pkl"
label_encoders_path = "D:/AIT582/Final Project/FlaskLoan/models/label_encoders.pkl"
logistic_regression_model_path = "D:/AIT582/Final Project/FlaskLoan/models/logistic_regression_model.pkl"
scaler_path = "D:/AIT582/Final Project/FlaskLoan/models/scaler.pkl"

# Load the files
feature_names = joblib.load(feature_names_path)
label_encoders = joblib.load(label_encoders_path)
logistic_regression_model = joblib.load(logistic_regression_model_path)
scaler = joblib.load(scaler_path)

# Inspect the loaded data
{
    "Feature Names": feature_names,
    "Label Encoders": {key: list(le.classes_) for key, le in label_encoders.items()},
    "Scaler": scaler,
    "Model Type": type(logistic_regression_model)
}


{'Feature Names': ['person_age',
  'person_income',
  'person_home_ownership',
  'person_emp_length',
  'loan_intent',
  'loan_grade',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_default_on_file',
  'cb_person_cred_hist_length'],
 'Label Encoders': {'person_home_ownership': ['MORTGAGE',
   'OTHER',
   'OWN',
   'RENT'],
  'loan_intent': ['DEBTCONSOLIDATION',
   'EDUCATION',
   'HOMEIMPROVEMENT',
   'MEDICAL',
   'PERSONAL',
   'VENTURE'],
  'loan_grade': ['A', 'B', 'C', 'D', 'E', 'F', 'G'],
  'cb_person_default_on_file': ['N', 'Y']},
 'Scaler': StandardScaler(),
 'Model Type': sklearn.linear_model._logistic.LogisticRegression}