In [1]:
# Imports: manifold learning, neighbours, plotting, distances and model persistence
from sklearn.manifold import Isomap
from sklearn.datasets import make_s_curve
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.spatial import distance
import numpy as np
import joblib

In [2]:
import os

import pandas as pd

# Resolve the datasets directory relative to the notebook's working directory.
PATH = os.getcwd()
data_path = f'{PATH}/../datasets/'

# The file carries an .xls extension but is read as CSV text.
dataset = pd.read_csv(f'{data_path}diabetes_train.xls')

In [3]:
# Keep an independent copy of the loaded frame; the feature list is derived from it later.
train_original = dataset.copy()

In [4]:
""" 

    dataset 

"""

# Preview the first rows of the untouched copy.
train_original.head()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
""" 

    features

"""

# Label column(s); every other column of the frame is a model input.
target = ['Class variable']
features = [column for column in train_original.columns if column not in target]

print( f'features: { features }' )
print( f'target: { target }' )

# Both selections use a list of labels, so X and y are DataFrames.
X = dataset[features]
y = dataset[target]

print('')
print('dataset summary')
X.describe()

features: ['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)']
target: ['Class variable']

dataset summary


Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years)
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [6]:
# Normalizing continuous features
continuous_features_list = list( X )
categorical_features_list = []

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Min-max rescale every continuous column on a copy, leaving X untouched.
# NOTE(review): the scaler is fitted on all rows of X, so a later train/test
# split of X_normalized has already seen the held-out rows' minima/maxima.
scaler = MinMaxScaler()
X_normalized = X.copy()
X_normalized[continuous_features_list] = scaler.fit_transform(X[continuous_features_list])

# Replace each categorical value with an integer code, in order of first appearance.
# Missing values are skipped explicitly: Series.unique() includes NaN while
# Series.nunique() does not, so indexing unique() by range(nunique()) could stop
# before the last real category. NaN entries are left as they are.
for col in categorical_features_list:
    for code, category in enumerate(X[col].dropna().unique()):
        X_normalized.loc[X[col] == category, col] = code

X_normalized.head()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years)
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2


In [8]:
# NOTE(review): this summarises the raw X again, exactly as an earlier cell does;
# X_normalized.describe() may have been intended here — confirm.
print('', 'dataset summary', sep='\n')
X.describe()


dataset summary


Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years)
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized,
    y,
    test_size=0.2,
    random_state=42,
)

# y_train is a single-column DataFrame, so flatten it to a 1-D label vector for fit().
logistic_model = LogisticRegression(max_iter=1000).fit(X_train, y_train.to_numpy().ravel())

# Mean accuracy on the held-out rows, displayed as the cell's result.
logistic_accuracy = logistic_model.score(X_test, y_test)
logistic_accuracy

0.7662337662337663

In [10]:
# Persist the fitted logistic-regression model; the relative path resolves against the working directory.
joblib.dump(logistic_model, 'logistic_model_diabetes.pkl')

['logistic_model_diabetes.pkl']

## Train PyTorch model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader



In [None]:
# Neural Network Definition
class SimpleNN(nn.Module):
    """Three-layer fully connected network emitting one sigmoid output per row.

    Layer widths are input_dim -> 128 -> 64 -> 1, with ReLU applied after the
    first two linear layers and a sigmoid after the last.
    """

    def __init__(self, input_dim):
        """Create the layers; ``input_dim`` is the width of the first linear layer's input."""
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` through the three layers and return the sigmoid of the final output."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        logits = self.fc3(hidden)
        return self.sigmoid(logits)

# Hyperparameters
epochs = 100
learning_rate = 0.01

# Seed PyTorch's global RNG so the initial weights (and any later shuffling that
# draws from it) are repeatable from a fresh kernel. 42 matches the random_state
# used for the train/test split.
torch.manual_seed(42)

# Build the network, loss and optimiser. Only the feature count of X_train is
# needed here; tensors and data loaders are not required at this point.
model = SimpleNN(input_dim=X_train.shape[1])
criterion = nn.BCELoss()  # takes probabilities, which SimpleNN's final sigmoid produces
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
import torch.utils.data as data

# Labels first, then features: every array becomes a float32 tensor.
train_labels = torch.tensor(y_train.to_numpy(), dtype=torch.float32)
val_labels = torch.tensor(y_test.to_numpy(), dtype=torch.float32)
train_tensor = torch.tensor(X_train.astype(float).to_numpy(), dtype=torch.float32)
val_tensor = torch.tensor(X_test.astype(float).to_numpy(), dtype=torch.float32)

# Pair features with labels and serve them in shuffled mini-batches of 32.
train_dataset = data.TensorDataset(train_tensor, train_labels)
train_loader = data.DataLoader(train_dataset, shuffle=True, batch_size=32)
val_dataset = data.TensorDataset(val_tensor, val_labels)
val_loader = data.DataLoader(val_dataset, shuffle=True, batch_size=32)

In [None]:
# Train: one optimiser step per mini-batch, for the configured number of epochs.
# The loop variables are deliberately not named `data` / `target`: those names are
# already bound at notebook level (the torch.utils.data alias and the list of label
# columns) and would be silently overwritten by the loop.
model.train()
for epoch in range(epochs):
    for batch_features, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

# Evaluation: round the sigmoid outputs to 0/1 and count matches with the labels.
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch_features, batch_labels in val_loader:
        outputs = model(batch_features)
        predicted = torch.round(outputs)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

    # %d truncates the percentage to an integer.
    print('Accuracy of the network on the validation set: %d %%' % (
            100 * correct / total))


# Saving the trained model
# NOTE(review): the file name says "homeloan" although this notebook loads the
# diabetes data; it is kept because the loading cell reads the same name.
torch.save(model.state_dict(), 'homeloan_pytorch_model.pth')

In [None]:
# Loading the model
# Rebuild the architecture, then restore the state dict from the file written by torch.save.
loaded_model = SimpleNN(input_dim=X_train.shape[1])
loaded_model.load_state_dict(torch.load('homeloan_pytorch_model.pth'))
# Switch to inference mode; as the last expression this also displays the module.
loaded_model.eval()

## GP-GOMEA

In [None]:
from gpgomea import GPGOMEA

# Convert data to numpy arrays
X_np = X_train.astype(float).values
y_np = y_train.values.ravel()  # flatten the single-column label frame to 1-D

# Initialize GPGOMEA
# NOTE(review): problem='regression' is used although the label column holds 0/1 classes — confirm this is intended.
gomea = GPGOMEA(problem='regression')

# Train the model
gomea.fit(X_np, y_np)

# Save the best solution
# NOTE(review): this only binds the result to a variable; nothing is written to disk here.
best_solution = gomea.best_solution()

In [None]:
# Count the unique values of each categorical feature and list them.
# Reads from train_original: no frame named `train` is defined in the cells above.
for col in categorical_features_list:
    print(col, train_original[col].nunique())
    print(train_original[col].unique())

In [None]:
# Start the normalised frame from the loaded data (inputs plus label column).
# `train` is not defined in the cells above, so copy train_original instead.
train_normalized = train_original.copy()

In [None]:
# Normalize continuous features
from sklearn.preprocessing import MinMaxScaler

# Note: this rebinds `scaler`, replacing the instance fitted earlier on X.
scaler = MinMaxScaler()
# Fit on train_original (`train` is not defined in the cells above).
train_normalized[continuous_features_list] = scaler.fit_transform(train_original[continuous_features_list])

In [None]:
# Encode each categorical feature with integer codes, in order of first appearance.
# Reads from train_original (`train` is not defined in the cells above) and skips
# NaN: Series.unique() includes NaN while Series.nunique() does not, so indexing
# unique() by range(nunique()) could leave the last real category unencoded.
for col in categorical_features_list:
    for code, category in enumerate(train_original[col].dropna().unique()):
        train_normalized.loc[train_original[col] == category, col] = code

In [None]:
# Preview the first rows of the normalised frame.
train_normalized.head()

In [None]:
# Draw one random row whose label is 0. The label column of this dataset is
# "Class variable"; the loaded frame has no "Loan_Status" column.
original_point = train_normalized[train_normalized["Class variable"] == 0].sample(1)

In [None]:
# Display the sampled row.
original_point

In [None]:
# Draw a fresh random row whose label is 0, replacing the previous sample.
# The label column of this dataset is "Class variable"; there is no "Loan_Status" column.
original_point = train_normalized[train_normalized["Class variable"] == 0].sample(1)

In [None]:
# Preview only the continuous columns of the normalised frame.
train_normalized[continuous_features_list].head()

In [None]:
# Number of rows in the normalised frame.
len(train_normalized)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Re-create the 80/20 split of X_normalized / y with the same random_state as before.
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized, y, random_state=42, test_size=0.2
)

# Fit a fresh logistic regression on the flattened training labels.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train.to_numpy().ravel())

# Mean accuracy on the held-out rows (stored, not displayed).
logistic_accuracy = logistic_model.score(X_test, y_test)

In [None]:
# Transform continuous features back to unnormalized values
# Uses whichever MinMaxScaler is currently bound to `scaler`; it must have been
# fitted on these same columns for the inverse mapping to be meaningful.
norm_original_train = train_normalized.copy()
norm_original_train[continuous_features_list] = scaler.inverse_transform(train_normalized[continuous_features_list])
norm_original_train.head()

In [None]:
# Show the normalised frame again, after the inverse-transformed copy was built from it.
train_normalized.head()

In [None]:
# print unique values in Loan_Amount_Term and Credit_History
# NOTE(review): neither column appears among the diabetes columns printed earlier in
# this notebook, so these lookups look like leftovers from another dataset — confirm.
print(train_normalized["Loan_Amount_Term"].unique())
print(train_normalized["Credit_History"].unique())

In [None]:
# Median of all pairwise distances within each continuous feature, one value per column.
median_distance = []
for col in continuous_features_list:
    # Selecting with a one-element list yields the (n, 1) array that pdist needs.
    current_median_distance = np.median(distance.pdist(train_normalized[[col]].to_numpy()))
    print(col, current_median_distance)
    median_distance.append(current_median_distance)

In [None]:
# Mean of all pairwise distances within each continuous feature, one value per column.
mean_distance = []
for col in continuous_features_list:
    # Selecting with a one-element list yields the (n, 1) array that pdist needs.
    current_mean = distance.pdist(train_normalized[[col]].to_numpy()).mean()
    print(col, current_mean)
    mean_distance.append(current_mean)

In [None]:
# Collapse the per-feature statistics into single numbers: the median of the
# per-feature medians and the mean of the per-feature means.
overall_median_distance = np.median(median_distance)
overall_mean_distance = np.mean(mean_distance)
print("overall median distance", overall_median_distance)
print("overall mean distance", overall_mean_distance)