In [18]:
# (Programming, data available here) The data includes volumes of 10 brain regions of 50 control 
# participants and 50 patients. Use the volumes of the 10 regions as input features and build
#   (a) a two-layer neural network classifier to predict the disease diagnosis. Use 5-fold inner cross-validation (as discussed in the class) to tune the learning rate hyperparameter. 
#   (b) train a linear SVM and use inner cross-validation to tune the hyperparameter C of the SVM. 
#   (c) For each classifier, report Accuracy, Precision, Recall, and AUC (average and standard deviation 
#       over the five folds), and compare the results in a table. (3’)1,400 × 563


####Generate the data
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression



data = np.loadtxt('data_assignment_1.csv', delimiter=',', skiprows=1)

### generate data points
y = data[:, 2]  # Assuming the target variable is in the third column
X = data[:, 3:13]  # Assuming features are in columns 3 to 12 (10 features total)

In [19]:
#   (a) a two-layer neural network classifier to predict the disease diagnosis. 
# Use 5-fold inner cross-validation (as discussed in the class) to tune the learning rate hyperparameter. 

### Build 2 linear layers for this task (train, testing, validate)
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from torch.utils.data import TensorDataset, DataLoader

from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score

# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# Define k-fold cross-validation
k = 5  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)  # Initialize KFold

# Hyperparameters
batch_size = 16
epochs = 10
learning_rate = 0.01

# Define the model
class MyModel(nn.Module):
    def __init__(self, input_dim):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 2)  # Output layer for binary classification

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Metrics storage
fold_metrics = []
    

# Perform k-fold cross-validation
fold = 1
for train_idx, val_idx in kf.split(X):  # Use kf.split(X) instead of k.split(X)
    print(f"Fold {fold}/{k}")
    
    # Split data into training and validation sets
    X_train, X_val = X_tensor[train_idx], X_tensor[val_idx]
    y_train, y_val = y_tensor[train_idx], y_tensor[val_idx]
    
    # Create DataLoader for batching
    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # Initialize the model, loss function, and optimizer
    model = MyModel(input_dim=X.shape[1])  # Input dimension = number of features
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    # Training loop
    for epoch in range(epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            predictions = model(batch_X)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluate on validation data
    model.eval()
    fold_y_val = []
    fold_y_pred = []
    fold_y_prob = []
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            predictions = model(batch_X)
            probabilities = F.softmax(predictions, dim=1)  # Get probabilities
            _, predicted_classes = torch.max(probabilities, dim=1)

            fold_y_val.extend(batch_y.cpu().numpy())
            fold_y_pred.extend(predicted_classes.cpu().numpy())
            fold_y_prob.extend(probabilities[:, 1].cpu().numpy())  # Probabilities for class 1
    
    # Compute metrics for this fold
    accuracy = accuracy_score(fold_y_val, fold_y_pred)
    precision = precision_score(fold_y_val, fold_y_pred, average='binary')
    recall = recall_score(fold_y_val, fold_y_pred, average='binary')
    auc = roc_auc_score(fold_y_val, fold_y_prob)
    
    # Store the metrics
    fold_metrics.append({'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'AUC': auc})
    print(f"Fold {fold} Metrics: Accuracy={accuracy:.4f}, Precision={precision:.4f}, Recall={recall:.4f}, AUC={auc:.4f}")
    fold += 1

# Aggregate results across folds
metrics_summary = {
    'Accuracy': [m['Accuracy'] for m in fold_metrics],
    'Precision': [m['Precision'] for m in fold_metrics],
    'Recall': [m['Recall'] for m in fold_metrics],
    'AUC': [m['AUC'] for m in fold_metrics]
}

# Compute averages and standard deviations
metrics_avg_std = {
    metric: (np.mean(values), np.std(values)) for metric, values in metrics_summary.items()
}

# Print table of results
print("\nMetrics Summary (Averages and Standard Deviations):")
for metric, (avg, std) in metrics_avg_std.items():
    print(f"{metric}: Mean={avg:.4f}, Std={std:.4f}")

Fold 1/5
Fold 1 Metrics: Accuracy=0.6000, Precision=0.8000, Recall=0.3636, AUC=0.6667
Fold 2/5
Fold 2 Metrics: Accuracy=0.6500, Precision=0.6667, Recall=0.6000, AUC=0.7800
Fold 3/5
Fold 3 Metrics: Accuracy=0.5500, Precision=0.5263, Recall=1.0000, AUC=0.8700
Fold 4/5
Fold 4 Metrics: Accuracy=0.6500, Precision=0.6250, Recall=0.9091, AUC=0.6970
Fold 5/5
Fold 5 Metrics: Accuracy=0.6000, Precision=0.5000, Recall=0.3750, AUC=0.5521

Metrics Summary (Averages and Standard Deviations):
Accuracy: Mean=0.6100, Std=0.0374
Precision: Mean=0.6236, Std=0.1074
Recall: Mean=0.6495, Std=0.2645
AUC: Mean=0.7131, Std=0.1072


In [23]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
import numpy as np  # Added for calculations and table summary [Added in blue]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define hyperparameter grid for C
param_grid = {'C': [0.1, 1, 10, 100]}

# Initialize k-fold cross-validation [Added in blue]
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # Enables fold splitting for cross-validation

# Storage for metrics across folds [Added in blue]
fold_metrics = []  # This will store results for accuracy, precision, recall, and AUC per fold

# Perform k-fold cross-validation [Updated in blue to include cross-validation]
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), 1):  # Enumerate for fold tracking
    print(f"Fold {fold}/5")  # Track which fold is running

    # Split into training and validation sets for this fold
    X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
    y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

    # Define the SVM model with grid search for hyperparameter tuning
    svm = SVC(kernel='linear', probability=True, random_state=42)  # `probability=True` needed for AUC calculation
    grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, scoring='accuracy', verbose=0)  # Inner CV
    grid_search.fit(X_fold_train, y_fold_train)

    # Get the best model for this fold
    best_model = grid_search.best_estimator_

    # Evaluate the best model on the validation set
    y_val_pred = best_model.predict(X_fold_val)
    y_val_prob = best_model.predict_proba(X_fold_val)[:, 1]  # Probabilities for the positive class

    # Compute metrics for this fold [Added in blue]
    accuracy = accuracy_score(y_fold_val, y_val_pred)
    precision = precision_score(y_fold_val, y_val_pred, average='binary', zero_division=1)  # Avoid undefined warning
    recall = recall_score(y_fold_val, y_val_pred, average='binary', zero_division=1)
    auc = roc_auc_score(y_fold_val, y_val_prob)

    # Store the metrics [Added in blue]
    fold_metrics.append({'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'AUC': auc})
    print(f"Fold {fold} Metrics: Accuracy={accuracy:.4f}, Precision={precision:.4f}, Recall={recall:.4f}, AUC={auc:.4f}")

    
#   (c) For each classifier, report Accuracy, Precision, Recall, and AUC (average and standard deviation 
#       over the five folds), and compare the results in a table. (3’)

    
# Compute averages and standard deviations across folds [Added in blue for summary reporting]
metrics_summary = {
    'Accuracy': [m['Accuracy'] for m in fold_metrics],
    'Precision': [m['Precision'] for m in fold_metrics],
    'Recall': [m['Recall'] for m in fold_metrics],
    'AUC': [m['AUC'] for m in fold_metrics]
}

# Prepare a summary table [Added in blue for clear reporting]
metrics_table = pd.DataFrame({
    'Metric': metrics_summary.keys(),
    'Mean': [f"{np.mean(values):.4f}" for values in metrics_summary.values()],
    'StdDev': [f"{np.std(values):.4f}" for values in metrics_summary.values()],
    'All Folds': [", ".join([f"{value:.4f}" for value in values]) for values in metrics_summary.values()]
})
print("\nMetrics Summary (Averages, Standard Deviations, and Fold-wise Results):")  # Explain the summary table
print(metrics_table.to_string(index=False))

# Evaluate the final model on the test set (optional) [Added in blue for final test set evaluation]
final_model = SVC(kernel='linear', C=grid_search.best_params_['C'], probability=True, random_state=42)
final_model.fit(X_train, y_train)
y_test_pred = final_model.predict(X_test)
y_test_prob = final_model.predict_proba(X_test)[:, 1]

# Test set metrics [Added in blue for detailed testing]
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='binary', zero_division=1)
test_recall = recall_score(y_test, y_test_pred, average='binary', zero_division=1)
test_auc = roc_auc_score(y_test, y_test_prob)

print("\nTest Set Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"AUC: {test_auc:.4f}")



Fold 1/5
Fold 1 Metrics: Accuracy=0.5000, Precision=0.3333, Recall=0.7500, AUC=0.5250
Fold 2/5
Fold 2 Metrics: Accuracy=0.7857, Precision=0.7500, Recall=0.8571, AUC=0.8163
Fold 3/5
Fold 3 Metrics: Accuracy=0.6429, Precision=0.7143, Recall=0.6250, AUC=0.6875
Fold 4/5
Fold 4 Metrics: Accuracy=0.5714, Precision=0.5000, Recall=0.3333, AUC=0.7083
Fold 5/5
Fold 5 Metrics: Accuracy=0.7143, Precision=0.8571, Recall=0.6667, AUC=0.7556

Metrics Summary (Averages, Standard Deviations, and Fold-wise Results):
   Metric   Mean StdDev                              All Folds
 Accuracy 0.6429 0.1010 0.5000, 0.7857, 0.6429, 0.5714, 0.7143
Precision 0.6310 0.1887 0.3333, 0.7500, 0.7143, 0.5000, 0.8571
   Recall 0.6464 0.1755 0.7500, 0.8571, 0.6250, 0.3333, 0.6667
      AUC 0.6985 0.0974 0.5250, 0.8163, 0.6875, 0.7083, 0.7556

Test Set Metrics:
Accuracy: 0.6000
Precision: 0.6429
Recall: 0.5625
AUC: 0.6585
