In [None]:
# ANDREW JOYNER
# 801293231
# HOMEWORK 3

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import warnings
warnings.filterwarnings('ignore')

In [None]:
## Problem 1: Logistic Regression Binary Classifier for Diabetes Dataset

In [None]:
### Step 1: Load the Diabetes Dataset

In [None]:
# Read the diabetes CSV directly from its raw GitHub URL into a DataFrame,
# then preview the first five rows to confirm it loaded.
url = "https://raw.githubusercontent.com/HamedTabkhi/Intro-to-ML/main/Dataset/diabetes.csv"
df = pd.read_csv(filepath_or_buffer=url)
display(df.head(n=5))

In [None]:
### Step 2: Prepare the Data (Features and Target)

In [None]:
# Split the frame into the label vector and the predictor matrix:
# 'Outcome' is the target column, every remaining column is a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Report the dimensions of both pieces as a quick sanity check.
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
### Step 3: Split the Data (80% Training, 20% Test)

In [None]:
# Hold out 20% of the rows as the test set. The split is stratified on y
# and seeded with random_state=42 so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Partition sizes first, then the per-class counts of each partition.
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

print(f"\nTraining set class distribution:\n{y_train.value_counts()}")
print(f"\nTest set class distribution:\n{y_test.value_counts()}")

In [None]:
### Step 4: Scaling and Standardization

In [None]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: mean and std taken over every value of the scaled training array.
print("Data standardized successfully!")
print(f"Training data mean: {X_train_scaled.mean():.6f}")
print(f"Training data std: {X_train_scaled.std():.6f}")

### Step 5: Train Logistic Regression Model with Verbose Output

In [None]:
# Build the logistic regression classifier (lbfgs solver, verbose=1 so the
# solver reports its progress while fitting) and fit it on the scaled training data.
model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
    verbose=1,
).fit(X_train_scaled, y_train)

print("\nModel training completed!")

# Class predictions of the fitted logistic regression model for the scaled test set
y_pred = model.predict(X_test_scaled)

In [None]:
# Confusion matrix of the test-set predictions held in `y_pred`.
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as a heat map with integer cell annotations.
fig, ax = plt.subplots(figsize=(8, 6))
disp = ConfusionMatrixDisplay(cm, display_labels=['No Diabetes (0)', 'Diabetes (1)'])
disp.plot(ax=ax, cmap='Blues', values_format='d')

ax.set_title('Confusion Matrix for Diabetes Binary Classifier', fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)

fig.tight_layout()
plt.show()

# Textual breakdown of the four cells (row index = true class, column index = predicted class).
print("\nConfusion Matrix Analysis:")
print("="*60)
print(f"True Negatives (TN):  {cm[0, 0]:3d} - Correctly predicted No Diabetes")
print(f"False Positives (FP): {cm[0, 1]:3d} - Incorrectly predicted Diabetes")
print(f"False Negatives (FN): {cm[1, 0]:3d} - Incorrectly predicted No Diabetes")
print(f"True Positives (TP):  {cm[1, 1]:3d} - Correctly predicted Diabetes")
print("="*60)

### Step 10: Plot Confusion Matrix

In [None]:
# Score the test-set predictions in `y_pred` against the true labels in `y_test`
# with accuracy, precision, recall and F1, then print a formatted summary.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("="*60)
print("MODEL EVALUATION RESULTS")
print("="*60)
print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f} ({precision*100:.2f}%)")
print(f"Recall:    {recall:.4f} ({recall*100:.2f}%)")
print(f"F1 Score:  {f1:.4f} ({f1*100:.2f}%)")
print("="*60)

# Additional interpretation.
# The "out of 100" figures are rounded to the nearest integer. Truncating with
# int() under-reports (0.7597 -> 75) and can lose a whole point to floating-point
# error (0.29 * 100 == 28.999999999999996 -> 28). float() normalizes NumPy scalars
# so round() always yields a plain Python int.
accuracy_per_100 = int(round(float(accuracy) * 100))
precision_per_100 = int(round(float(precision) * 100))
recall_per_100 = int(round(float(recall) * 100))

print("\nInterpretation:")
print(f"- Out of 100 predictions, approximately {accuracy_per_100} are correct")
print(f"- Out of 100 positive predictions, approximately {precision_per_100} are true positives")
print(f"- Out of 100 actual positive cases, approximately {recall_per_100} are correctly identified")

### Step 9: Evaluate Model Performance (Accuracy, Precision, Recall, F1 Score)

In [None]:
# Train a separate SGD-based logistic regression (loss='log_loss') on the scaled
# training data and use it to predict the test set.
from sklearn.linear_model import SGDClassifier

sgd_model = SGDClassifier(loss='log_loss', max_iter=1000, random_state=42)
sgd_model.fit(X_train_scaled, y_train)

# Keep these predictions under their own name. `y_pred` holds the
# LogisticRegression predictions that the confusion-matrix and metrics cells
# read; overwriting it here would make a re-run of those cells silently
# evaluate this SGD model instead.
y_pred_sgd = sgd_model.predict(X_test_scaled)

print("Predictions completed!")
print(f"Sample predictions (first 10): {y_pred_sgd[:10]}")
print(f"Actual values (first 10): {y_test.values[:10]}")

### Step 8: Make Predictions on Test Set

In [None]:
# Train an SGD logistic regression one pass at a time, record the training loss
# and the train/test accuracy after every pass, then plot those curves.
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, accuracy_score
import numpy as np

# Each partial_fit() call below performs exactly one pass over the training data
# and continues from the current weights. Per the scikit-learn documentation,
# fit() resets the sample counter that drives the 'optimal' learning-rate
# schedule, whereas partial_fit() keeps increasing it -- so partial_fit() is what
# makes the recorded curve correspond to one continuous SGD run.
sgd_model = SGDClassifier(loss='log_loss', learning_rate='optimal', random_state=42)

# partial_fit() needs the full set of class labels on its first call.
sgd_classes = np.unique(y_train)

# Per-iteration history
iterations = []
train_losses = []
train_accuracies = []
test_accuracies = []

n_iterations = 100

for i in range(n_iterations):
    sgd_model.partial_fit(X_train_scaled, y_train, classes=sgd_classes)

    # Predictions for training and test sets
    y_train_pred = sgd_model.predict(X_train_scaled)
    y_test_pred = sgd_model.predict(X_test_scaled)

    # Calculate metrics (log loss is computed from the predicted class probabilities)
    y_train_proba = sgd_model.predict_proba(X_train_scaled)
    train_loss = log_loss(y_train, y_train_proba, labels=sgd_classes)
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)

    # Store metrics
    iterations.append(i + 1)
    train_losses.append(train_loss)
    train_accuracies.append(train_acc)
    test_accuracies.append(test_acc)

    # Progress report every 20 passes
    if (i + 1) % 20 == 0:
        print(f"Iteration {i+1}: Train Loss = {train_loss:.4f}, Train Acc = {train_acc:.4f}, Test Acc = {test_acc:.4f}")

print("\nTraining completed!")

# Two side-by-side panels: loss on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Plot training loss
ax1.plot(iterations, train_losses, 'b-', linewidth=2, label='Training Loss')
ax1.set_xlabel('Iteration', fontsize=12)
ax1.set_ylabel('Loss', fontsize=12)
ax1.set_title('Training Loss over Iterations', fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3)
ax1.legend()

# Plot training and test accuracy
ax2.plot(iterations, train_accuracies, 'g-', linewidth=2, label='Training Accuracy')
ax2.plot(iterations, test_accuracies, 'r--', linewidth=2, label='Test Accuracy')
ax2.set_xlabel('Iteration', fontsize=12)
ax2.set_ylabel('Accuracy', fontsize=12)
ax2.set_title('Classification Accuracy over Iterations', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3)
ax2.legend()

plt.tight_layout()
plt.show()

print(f"Final Training Accuracy: {train_accuracies[-1]:.4f}")
print(f"Final Test Accuracy: {test_accuracies[-1]:.4f}")

### Step 7: Plot Training Results (Loss and Accuracy over Iterations)

In [None]:
# Custom training loop to track loss and accuracy over iterations
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss

# Use SGDClassifier for iterative training to track progress.
# The loop advances the model with partial_fit(): each call is one pass over the
# training data starting from the current weights. Per the scikit-learn
# documentation, fit() resets the sample counter used by the 'optimal'
# learning-rate schedule while partial_fit() keeps increasing it, so
# partial_fit() yields metrics from one continuous SGD run.
sgd_model = SGDClassifier(loss='log_loss', learning_rate='optimal', random_state=42)

# partial_fit() needs the full set of class labels on its first call.
sgd_classes = np.unique(y_train)

# Track metrics over iterations
iterations = []
train_losses = []
train_accuracies = []
test_accuracies = []

n_iterations = 100

for i in range(n_iterations):
    sgd_model.partial_fit(X_train_scaled, y_train, classes=sgd_classes)

    # Predictions for training and test sets
    y_train_pred = sgd_model.predict(X_train_scaled)
    y_test_pred = sgd_model.predict(X_test_scaled)

    # Calculate metrics (log loss is computed from the predicted class probabilities)
    y_train_proba = sgd_model.predict_proba(X_train_scaled)
    train_loss = log_loss(y_train, y_train_proba, labels=sgd_classes)
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)

    # Store metrics
    iterations.append(i + 1)
    train_losses.append(train_loss)
    train_accuracies.append(train_acc)
    test_accuracies.append(test_acc)

    # Progress report every 20 passes
    if (i + 1) % 20 == 0:
        print(f"Iteration {i+1}: Train Loss = {train_loss:.4f}, Train Acc = {train_acc:.4f}, Test Acc = {test_acc:.4f}")

print("\nTraining completed!")

### Step 6: Custom Training Loop to Track Loss and Accuracy