In [3]:
# Import dependencies for logistic regression model
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib

In [4]:
# Load the loan approval dataset from the Resources folder and preview the first rows
dataset_path = Path('Resources/loan_approval_dataset.csv')
loan_df = pd.read_csv(dataset_path)
loan_df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [5]:
# Strip leading/trailing whitespace from the column names.
# Only the header labels are cleaned here; string values inside the cells are left
# untouched (the dummy columns created later, e.g. `education_ Graduate`, still show it).
loan_df = loan_df.rename(columns=str.strip)

In [6]:
# Remove the loan_id column: an identifier that is not used as a model feature
loan_df = loan_df.drop(columns=['loan_id'])

Separate the features `X` from the target `y`

In [7]:
# The target (y) is the loan_status column; every remaining column is a feature (X)
target_column = 'loan_status'
X = loan_df.drop(columns=[target_column])
y = loan_df[target_column]

In [8]:
# Show the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000


Encode the categorical variables from the features data using `get_dummies`.

In [9]:
# One-hot encode the non-numeric feature columns; numeric columns pass through unchanged
encoded_features = pd.get_dummies(X)
X = encoded_features

In [10]:
# Show the first five rows of the encoded feature matrix
X.head(n=5)

Unnamed: 0,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,education_ Graduate,education_ Not Graduate,self_employed_ No,self_employed_ Yes
0,2,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1,0,1,0
1,0,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0,1,0,1
2,3,9100000,29700000,20,506,7100000,4500000,33300000,12800000,1,0,1,0
3,3,8200000,30700000,8,467,18200000,3300000,23300000,7900000,1,0,1,0
4,5,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0,1,0,1


In [11]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

Scale the data using `StandardScaler`

In [12]:
# Fit the scaler on the training split only, then apply the same transform to both splits
# so that no statistics from the test data are used during fitting.
# Note: X_train / X_test are rebound to the scaled arrays, so re-running this cell
# without re-running the split would scale already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Create model and train it using the training data

In [13]:
# Fit a logistic regression classifier (default hyperparameters) on the scaled training data
model = LogisticRegression().fit(X_train, y_train)
model

In [14]:
# Persist the fitted preprocessing and the trained model.
# The model was trained on standardized features, so the fitted scaler has to be saved
# too; without it the pickled model cannot be applied to new, unscaled data.
joblib.dump(scaler, 'logistic_regression_scaler.pkl')

# Save the model to a file (kept as the last expression so the cell output is unchanged)
joblib.dump(model, 'logistic_regression_model.pkl')

['logistic_regression_model.pkl']

Score the model using the test data

In [15]:
# Report mean accuracy on the training and testing splits
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.9233601070950469
Testing Data Score: 0.9039812646370023


Make predictions

In [16]:
# Predict labels for the test split and line them up with the true labels
predictions = model.predict(X_test)
comparison = {"Prediction": predictions, "Actual": y_test}
results = pd.DataFrame(comparison).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,Rejected,Rejected
1,Approved,Approved
2,Rejected,Rejected
3,Approved,Approved
4,Approved,Approved
5,Approved,Approved
6,Approved,Approved
7,Rejected,Rejected
8,Approved,Approved
9,Rejected,Rejected


Calculate the Accuracy Score

In [17]:
# Fraction of test rows whose predicted label matches the true label
accuracy_score(y_true=y_test, y_pred=predictions)

0.9039812646370023