In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import collections
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pickle

## 4. Feature Engineering & Modelling

In [2]:
df = pd.read_csv('../data/data_preprocessed.csv')
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,LoanAmountTerm,Graduate,SelfEmployed,CreditHistory,LoanApproval,Gender_Female,Gender_Male,...,Married_No,Married_Unknown,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3,PropertyArea_Rural,PropertyArea_Semiurban,PropertyArea_Urban
0,45830.0,15080.0,128000.0,360,1,0,1,0,0,1,...,0,0,1,0,1,0,0,1,0,0
1,30000.0,0.0,66000.0,360,1,1,1,1,0,1,...,0,0,1,1,0,0,0,0,0,1
2,25830.0,23580.0,120000.0,360,0,0,1,1,0,1,...,0,0,1,1,0,0,0,0,0,1
3,60000.0,0.0,141000.0,360,1,0,1,1,0,1,...,1,0,0,1,0,0,0,0,0,1
4,54170.0,41960.0,267000.0,360,1,1,1,1,0,1,...,0,0,1,0,0,1,0,0,0,1


In [3]:
scaler = MinMaxScaler()
cols_to_scale = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'LoanAmountTerm']
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,LoanAmountTerm,Graduate,SelfEmployed,CreditHistory,LoanApproval,Gender_Female,Gender_Male,...,Married_No,Married_Unknown,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3,PropertyArea_Rural,PropertyArea_Semiurban,PropertyArea_Urban
0,0.443788,0.264515,0.46124,0.74359,1,0,1,0,0,1,...,0,0,1,0,1,0,0,1,0,0
1,0.285314,0.0,0.22093,0.74359,1,1,1,1,0,1,...,0,0,1,1,0,0,0,0,0,1
2,0.243568,0.413612,0.430233,0.74359,0,0,1,1,0,1,...,0,0,1,1,0,0,0,0,0,1
3,0.585644,0.0,0.511628,0.74359,1,0,1,1,0,1,...,1,0,0,1,0,0,0,0,0,1
4,0.52728,0.736011,1.0,0.74359,1,1,1,1,0,1,...,0,0,1,0,0,1,0,0,0,1


### Prediction models
a. Logistic regression using 10 PC's: average accuracy is 0.8279002554106736.

b. Random forest using 10 PC's: average accuracy is 0.767934758256038.

c. SVM using 10 PC's: average accuracy is 0.8269230769230769.

d. Logistic regression using the 4 features with p < 0.05 & 10 PC's: accuracy is 0.994.

Since d. likely introduces bias due to high correlations between the 4 features and the target variable, I will take a. as my final model.

In [4]:
# Logistic regression using 10 PC's
X = df.drop("LoanApproval", axis=1)
y = df["LoanApproval"]

pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)

lr = LogisticRegression(penalty='l1', solver='liblinear')
k = 3
kf = KFold(n_splits=k, shuffle=True, random_state=42)

scores = cross_val_score(lr, X_pca, y, cv=kf)
average_accuracy = scores.mean()

print("Average accuracy:", average_accuracy)

Average accuracy: 0.8279002554106736


In [5]:
lr.fit(X_pca, y)
lr.coef_

array([[-0.29108573,  0.64588329,  0.03335136, -0.56273799,  0.        ,
        -1.03030733,  0.        , -3.62058025, -0.80726383,  0.47719039]])

In [8]:
# Define hyperparameters to tune
param_grid = {
    "pca__n_components": [5, 10, 15],
    "lr__penalty": ["l1", "l2"],
    "lr__C": [0.1, 1, 10],
    "lr__solver": ["liblinear"]
}

# Create a logistic regression model with PCA
pca = PCA()
lr = LogisticRegression()

# Combine PCA and logistic regression into a pipeline
pipe = Pipeline([
    ("pca", pca),
    ("lr", lr)
])

# Create a grid search object
grid = GridSearchCV(pipe, param_grid, cv=3)

# Fit the grid search object to the data
grid.fit(X, y)

# Print the best hyperparameters and accuracy score
print("Best hyperparameters:", grid.best_params_)
print("Accuracy score:", grid.best_score_)

Best hyperparameters: {'lr__C': 1, 'lr__penalty': 'l1', 'lr__solver': 'liblinear', 'pca__n_components': 15}
Accuracy score: 0.8279226598557153


In [11]:
estimator = grid.best_estimator_

In [12]:
estimator

In [33]:
# Logistic regression using 15 PC's
X = df.drop("LoanApproval", axis=1)
y = df["LoanApproval"]

pca = PCA(n_components=15)
X_pca = pca.fit_transform(X)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create logistic regression model
lr_model = LogisticRegression(C=1, penalty='l1', solver='liblinear')

# Train model on training data
lr_model.fit(X_train, y_train)

# Make predictions on test data
y_pred = lr_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.8365384615384616


In [31]:
# # Random forest using 10 PC's
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
k = 3
kf = KFold(n_splits=k, shuffle=True, random_state=42)
scores = cross_val_score(rf, X_pca, y, cv=kf)
average_accuracy = scores.mean()
print("Average accuracy:", average_accuracy)

Average accuracy: 0.8085316126719541


In [7]:
# SVM using 10 PC's

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=80)

# Create the SVM classifier with RBF kernel
svm = SVC(kernel='rbf', C=1, gamma='scale', random_state=42)

# Fit the model to the training data
svm.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svm.predict(X_test)

# Evaluate the performance of the model
accuracy = svm.score(X_test, y_test)
print("Accuracy:", accuracy)

# Perform k-fold cross-validation and get the average accuracy
k = 3
kf = KFold(n_splits=k, shuffle=True, random_state=42)
scores = cross_val_score(svm, X_pca, y, cv=kf)
average_accuracy = scores.mean()
print("Average accuracy:", average_accuracy)

Accuracy: 0.8269230769230769
Average accuracy: 0.8162835506564502


In [8]:
# Perform PCA on the scaled data to extract the first 10 principal components
pca = PCA(n_components=10)
X_pca = pca.fit_transform(df)

# Create a dataframe containing the principal components
pc_df = pd.DataFrame(X_pca, columns=[f"PC{i+1}" for i in range(10)])

# Combine the principal components and the original dataframe using pd.concat()
df2 = pd.concat([df, pc_df], axis=1)

# Print the new dataframe
df2.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,LoanAmountTerm,Graduate,SelfEmployed,CreditHistory,LoanApproval,Gender_Female,Gender_Male,...,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,0.443788,0.264515,0.46124,0.74359,1,0,1,0,0,1,...,-0.683034,-0.618931,-0.399576,0.714369,0.333302,0.137692,-0.531829,-0.720086,-0.278208,-0.232137
1,0.285314,0.0,0.22093,0.74359,1,1,1,1,0,1,...,-0.319897,-0.346588,0.526137,-0.780721,-0.250546,0.284425,0.005694,-0.11268,0.902474,-0.072396
2,0.243568,0.413612,0.430233,0.74359,0,0,1,1,0,1,...,-0.377031,-0.457674,0.381955,-0.677468,-0.430136,0.04534,0.910156,-0.071773,-0.057328,-0.098113
3,0.585644,0.0,0.511628,0.74359,1,0,1,1,0,1,...,0.762662,-0.426283,0.688506,-0.735533,0.035441,-0.462285,-0.208053,0.099873,-0.000645,0.090981
4,0.52728,0.736011,1.0,0.74359,1,1,1,1,0,1,...,-1.007717,-0.251496,0.727847,-0.164111,0.392797,0.062938,-0.196278,0.624383,0.898969,-0.367671


In [9]:
# Logistic regression using the 4 features with p < 0.05 & 10 PC's
features = ['Graduate', 'PropertyArea_Semiurban', 'CreditHistory', 'PropertyArea_Rural', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']
target = 'LoanApproval'
kf = KFold(n_splits=3, shuffle=True, random_state=42)
scores = []

for train_index, test_index in kf.split(df2):
    
    # Split the data into training and testing sets
    X_train, X_test = df2.iloc[train_index][features], df2.iloc[test_index][features]
    y_train, y_test = df2.iloc[train_index][target], df2.iloc[test_index][target]
    
    # Create the logistic regression model with L1 regularization
    model = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)
    
    # Fit the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the testing data
    y_pred = model.predict(X_test)
    
    # Calculate the accuracy score and append to the list of scores
    accuracy = accuracy_score(y_test, y_pred)
    scores.append(accuracy)
    
# Calculate the average accuracy score across all k-fold splits
average_accuracy = np.mean(scores)
print(f'Average accuracy score: {average_accuracy:.3f}')
print(scores)



Average accuracy score: 0.994
[0.9942196531791907, 0.9941860465116279, 0.9941860465116279]
