In [1]:
import pandas as pd
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import GridSearchCV

# Load the dataset
X, y = load_svmlight_file("scaled.txt")
X = pd.DataFrame(X.toarray())
features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
X.columns = features
y[y == -1] = 0

for column in X.columns:
    median = X[X[column] != 0][column].median()
    X[column] = X[column].replace(0, median)

# Split data training 0.6, test 0.2, validation 0.2
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

epoch = 1000
patience = 50
best_val_accuracy = 0
no_improvement_rounds = 0

train_accuracies = []
val_accuracies = []

model = Perceptron()

for i in range(epoch):
    model.partial_fit(X_train, y_train, classes=[0, 1])
    
    train_acc = accuracy_score(y_train, model.predict(X_train))
    val_acc = accuracy_score(y_val, model.predict(X_val))
    
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)
    
    if val_acc > best_val_accuracy:
        best_val_accuracy = val_acc
        no_improvement_rounds = 0
    else:
        no_improvement_rounds += 1
        
    if no_improvement_rounds > patience:
        print(f"Early stopping at epoch {i}")
        break

        
param_grid = {
    'eta0': [0.1, 0.01, 0.001],
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'alpha': [0.0001, 0.001, 0.01, 0.1]
}

grid_search = GridSearchCV(Perceptron(max_iter=1000, tol=1e-3), param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Print best parameters and their score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Fit the best model and evaluate on test set
best_model = grid_search.best_estimator_
test_acc = accuracy_score(y_test, best_model.predict(X_test))
print(f"Test Accuracy: {test_acc:.4f}")



Early stopping at epoch 103
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters found:  {'alpha': 0.0001, 'eta0': 0.01, 'penalty': 'elasticnet'}
Best cross-validation score: 0.74
Test Accuracy: 0.7143
