In [24]:
# Load isomap dataset from sklearn manifold
from sklearn.manifold import Isomap
from sklearn.datasets import make_s_curve
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.spatial import distance
import numpy as np
import joblib

In [25]:
import pandas as pd

dataset = pd.read_csv("../../datasets/homeloan_train.xls")
homeloan_test = pd.read_csv("../../datasets/homeloan_test.xls")

In [26]:
train_original = dataset.copy()

In [27]:
# Preprocessing the dataset based on the provided steps

# Handling missing values
dataset["Gender"].fillna(dataset["Gender"].mode()[0], inplace=True)
dataset["Married"].fillna(dataset["Married"].mode()[0], inplace=True)
dataset["Dependents"].fillna(dataset["Dependents"].mode()[0], inplace=True)
dataset["Self_Employed"].fillna(dataset["Self_Employed"].mode()[0], inplace=True)
dataset["Credit_History"].fillna(dataset["Credit_History"].mode()[0], inplace=True)
dataset = dataset.dropna(subset=['LoanAmount', 'Loan_Amount_Term'])

# Removing the Loan_ID column
dataset = dataset.drop("Loan_ID", axis=1)

# Converting Loan_Status to numerical values
dataset.loc[dataset["Loan_Status"] == "Y", "Loan_Status"] = 1
dataset.loc[dataset["Loan_Status"] == "N", "Loan_Status"] = 0
dataset["Loan_Status"] = dataset["Loan_Status"].astype('int')

# store clean homeloan data to csv file
dataset.to_csv("../../datasets/homeloan_clean.csv", index=False)

In [7]:
# Splitting dataset into features and target
X = dataset.drop("Loan_Status", 1)
y = dataset[["Loan_Status"]]

  X = dataset.drop("Loan_Status", 1)


In [8]:
# Normalizing continuous features
continuous_features_list = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
categorical_features_list = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

In [9]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_normalized = X.copy()
X_normalized[continuous_features_list] = scaler.fit_transform(X[continuous_features_list])

# Converting categorical columns to numerical values
for col in categorical_features_list:
    for i in range(X[col].nunique()):
        X_normalized.loc[X[col] == X[col].unique()[i], col] = i

X_normalized.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
1,0,0,0,0,0,0.05483,0.036192,0.172214,0.74359,1.0,0
2,0,0,1,0,1,0.03525,0.0,0.082489,0.74359,1.0,1
3,0,0,1,1,0,0.030093,0.056592,0.160637,0.74359,1.0,1
4,0,1,1,0,0,0.072356,0.0,0.191027,0.74359,1.0,1
5,0,0,2,0,1,0.065145,0.100703,0.373372,0.74359,1.0,1


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

# Training the Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train.values.ravel())

# Evaluating the model on the test set
logistic_accuracy = logistic_model.score(X_test, y_test)

logistic_accuracy

0.8103448275862069

In [11]:
# Save the model to a file
joblib.dump(logistic_model, 'logistic_model.pkl')

['logistic_model.pkl']

## Train Pytorch model

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader



In [14]:
# Neural Network Definition
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Hyperparameters
epochs = 100
learning_rate = 0.01

# Training the Neural Network (assuming you have data tensors and dataloaders set up)
model = SimpleNN(input_dim=X_train.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [22]:
import torch.utils.data as data

# Convert preprocessed data to tensors
train_tensor = torch.tensor(X_train.astype(float).values, dtype=torch.float32)
val_tensor = torch.tensor(X_test.astype(float).values, dtype=torch.float32)
train_labels = torch.tensor(y_train.values, dtype=torch.float32)
val_labels = torch.tensor(y_test.values, dtype=torch.float32)

# Create a dataset and dataloader
train_dataset = data.TensorDataset(train_tensor, train_labels)
val_dataset = data.TensorDataset(val_tensor, val_labels)
train_loader = data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = data.DataLoader(val_dataset, batch_size=32, shuffle=True)

In [23]:
for epoch in range(epochs):
    for data, target in train_loader:
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()

# Evaluation
with torch.no_grad():
    correct = 0
    total = 0
    for data, target in val_loader:
        outputs = model(data)
        predicted = torch.round(outputs)
        total += target.size(0)
        correct += (predicted == target).sum().item()

    print('Accuracy of the network on the validation set: %d %%' % (
            100 * correct / total))


# Saving the trained model
torch.save(model.state_dict(), 'homeloan_pytorch_model.pth')

Accuracy of the network on the validation set: 80 %


In [20]:
# Loading the model
loaded_model = SimpleNN(input_dim=X_train.shape[1])
loaded_model.load_state_dict(torch.load('homeloan_pytorch_model.pth'))
loaded_model.eval()

SimpleNN(
  (fc1): Linear(in_features=11, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

## GP-GOMEA

In [None]:
from gpgomea import GPGOMEA

# Convert data to numpy arrays
X_np = X_train.astype(float).values
y_np = y_train.values.ravel()

# Initialize GPGOMEA
gomea = GPGOMEA(problem='regression')

# Train the model
gomea.fit(X_np, y_np)

# Save the best solution
best_solution = gomea.best_solution()

In [7]:
# count how much unique values in categorical features per column in train dataframe and print it
for col in categorical_features_list:
    print(col, train[col].nunique())
    print(train[col].unique())

Gender 2
['Male' 'Female']
Married 2
['Yes' 'No']
Dependents 4
['1' '0' '2' '3+']
Education 2
['Graduate' 'Not Graduate']
Self_Employed 2
['No' 'Yes']
Property_Area 3
['Rural' 'Urban' 'Semiurban']


In [8]:
train_normalized = train.copy()

In [9]:
# Normalize contrinuous features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train_normalized[continuous_features_list] = scaler.fit_transform(train[continuous_features_list])

In [10]:
# Encode Gender, Marries, Education, Self_Employed with 0 and 1 for train dataframe
for col in categorical_features_list:
    for i in range(train[col].nunique()):
        train_normalized.loc[train[col]==train[col].unique()[i], col] = i

In [11]:
train_normalized.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,0,0,0,0,0,0.05483,0.036192,0.172214,0.74359,1.0,0,0
2,0,0,1,0,1,0.03525,0.0,0.082489,0.74359,1.0,1,1
3,0,0,1,1,0,0.030093,0.056592,0.160637,0.74359,1.0,1,1
4,0,1,1,0,0,0.072356,0.0,0.191027,0.74359,1.0,1,1
5,0,0,2,0,1,0.065145,0.100703,0.373372,0.74359,1.0,1,1


In [12]:
original_point = train_normalized[train_normalized["Loan_Status"]==0].sample(1)

In [13]:
original_point

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
360,1,1,0,0,0,0.056562,0.0,0.104197,0.74359,1.0,2,0


In [14]:
original_point = train_normalized[train_normalized["Loan_Status"]==0].sample(1)

In [15]:
train_normalized[continuous_features_list].head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
1,0.05483,0.036192,0.172214,0.74359,1.0
2,0.03525,0.0,0.082489,0.74359,1.0
3,0.030093,0.056592,0.160637,0.74359,1.0
4,0.072356,0.0,0.191027,0.74359,1.0
5,0.065145,0.100703,0.373372,0.74359,1.0


In [16]:
len(train_normalized)

578

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

# Training the Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train.values.ravel())

# Evaluating the model on the test set
logistic_accuracy = logistic_model.score(X_test, y_test)

In [26]:
# Transform continuous features back to unnormalized values
norm_original_train = train_normalized.copy()
norm_original_train[continuous_features_list] = scaler.inverse_transform(train_normalized[continuous_features_list])
norm_original_train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,0,0,0,0,0,4583.0,1508.0,128.0,360.0,1.0,0,0
2,0,0,1,0,1,3000.0,0.0,66.0,360.0,1.0,1,1
3,0,0,1,1,0,2583.0,2358.0,120.0,360.0,1.0,1,1
4,0,1,1,0,0,6000.0,0.0,141.0,360.0,1.0,1,1
5,0,0,2,0,1,5417.0,4196.0,267.0,360.0,1.0,1,1


In [31]:
train_normalized.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,0,0,0,0,0,0.05483,0.036192,0.172214,0.74359,1.0,0,0
2,0,0,1,0,1,0.03525,0.0,0.082489,0.74359,1.0,1,1
3,0,0,1,1,0,0.030093,0.056592,0.160637,0.74359,1.0,1,1
4,0,1,1,0,0,0.072356,0.0,0.191027,0.74359,1.0,1,1
5,0,0,2,0,1,0.065145,0.100703,0.373372,0.74359,1.0,1,1


In [47]:
# print unique values in Loan_Amount_Term and Credit_History
print(train_normalized["Loan_Amount_Term"].unique())
print(train_normalized["Credit_History"].unique())

[0.74358974 0.23076923 0.48717949 0.35897436 0.1025641  0.61538462
 1.         0.05128205 0.15384615 0.        ]
[1. 0.]


In [49]:
# For every continuous feature print median pairwise distance between points
median_distance = []
for col in continuous_features_list:
    current_median_distance = np.median(distance.pdist(train_normalized[col].values.reshape(-1, 1)))
    median_distance.append(current_median_distance)
    print(col, current_median_distance)

ApplicantIncome 0.024650587507730366
CoapplicantIncome 0.03839969280245758
LoanAmount 0.07525325615050651
Loan_Amount_Term 0.0
Credit_History 0.0


In [50]:
# For every continuous feature print mean pairwise distance between points
mean_distance = []
for col in continuous_features_list:
    current_mean = np.mean(distance.pdist(train_normalized[col].values.reshape(-1, 1)))
    mean_distance.append(current_mean)
    print(col, current_mean)

ApplicantIncome 0.05238814027172516
CoapplicantIncome 0.051630648167546034
LoanAmount 0.11640549356070311
Loan_Amount_Term 0.09408541759983714
Credit_History 0.23640354296474425


In [51]:
overall_median_distance = np.median(median_distance)
overall_mean_distance = np.mean(mean_distance)
print("overall median distance", overall_median_distance)
print("overall mean distance", overall_mean_distance)

overall median distance 0.024650587507730366
overall mean distance 0.11018264851291115
