In [31]:
# (Programming, data available here) The data includes volumes of 10 brain regions of 50 control 
# participants and 50 patients. Use the volumes of the 10 regions as input features and build
#   (a) a two-layer neural network classifier to predict the disease diagnosis. Use 5-fold inner cross-validation (as discussed in the class) to tune the learning rate hyperparameter. 
#   (b) train a linear SVM and use inner cross-validation to tune the hyperparameter C of the SVM. 
#   (c) For each classifier, report Accuracy, Precision, Recall, and AUC (average and standard deviation 
#       over the five folds), and compare the results in a table. (3’)1,400 × 563


####Generate the data
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression



data = np.loadtxt('data_assignment_1.csv', delimiter=',', skiprows=1)

### generate data points
y = data[:, 2]  # Assuming the target variable is in the third column
X = data[:, 3:13]  # Assuming features are in columns 3 to 12 (10 features total)


In [50]:
#   (a) a two-layer neural network classifier to predict the disease diagnosis. 
# Use 5-fold inner cross-validation (as discussed in the class) to tune the learning rate hyperparameter. 

### Build 2 linear layers for this task (train, testing, validate)
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from torch.utils.data import TensorDataset, DataLoader

from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score

# Metrics storage
fold_metrics = []

# Perform k-fold cross-validation
fold = 1
for train_idx, val_idx in kf.split(X):
    print(f"Fold {fold}/{k}")
    
    # Split data into training and validation sets
    X_train, X_val = X_tensor[train_idx], X_tensor[val_idx]
    y_train, y_val = y_tensor[train_idx], y_tensor[val_idx]
    
    # Create DataLoader for batching
    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # Initialize the model, loss function, and optimizer
    model = MyModel(input_dim=X.shape[1])  # Input dimension = number of features
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    # Training loop
    for epoch in range(epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            predictions = model(batch_X)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluate on validation data
    model.eval()
    fold_y_val = []
    fold_y_pred = []
    fold_y_prob = []
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            predictions = model(batch_X)
            probabilities = F.softmax(predictions, dim=1)  # Get probabilities
            _, predicted_classes = torch.max(probabilities, dim=1)

            fold_y_val.extend(batch_y.cpu().numpy())
            fold_y_pred.extend(predicted_classes.cpu().numpy())
            fold_y_prob.extend(probabilities[:, 1].cpu().numpy())  # Probabilities for class 1
    
    # Compute metrics for this fold
    accuracy = accuracy_score(fold_y_val, fold_y_pred)
    precision = precision_score(fold_y_val, fold_y_pred, average='binary')
    recall = recall_score(fold_y_val, fold_y_pred, average='binary')
    auc = roc_auc_score(fold_y_val, fold_y_prob)
    
    # Store the metrics
    fold_metrics.append({'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'AUC': auc})
    print(f"Fold {fold} Metrics: Accuracy={accuracy:.4f}, Precision={precision:.4f}, Recall={recall:.4f}, AUC={auc:.4f}")
    fold += 1

# Aggregate results across folds
metrics_summary = {
    'Accuracy': [m['Accuracy'] for m in fold_metrics],
    'Precision': [m['Precision'] for m in fold_metrics],
    'Recall': [m['Recall'] for m in fold_metrics],
    'AUC': [m['AUC'] for m in fold_metrics]
}

# Compute averages and standard deviations
metrics_avg_std = {
    metric: (np.mean(values), np.std(values)) for metric, values in metrics_summary.items()
}

# Print table of results
print("\nMetrics Summary (Averages and Standard Deviations):")
for metric, (avg, std) in metrics_avg_std.items():
    print(f"{metric}: Mean={avg:.4f}, Std={std:.4f}")

Fold 1/5
Fold 1 Metrics: Accuracy=0.5500, Precision=0.6667, Recall=0.3636, AUC=0.5354
Fold 2/5
Fold 2 Metrics: Accuracy=0.6000, Precision=0.7500, Recall=0.3000, AUC=0.7100
Fold 3/5
Fold 3 Metrics: Accuracy=0.8000, Precision=0.7143, Recall=1.0000, AUC=0.8400
Fold 4/5
Fold 4 Metrics: Accuracy=0.5500, Precision=0.5833, Recall=0.6364, AUC=0.6566
Fold 5/5
Fold 5 Metrics: Accuracy=0.4500, Precision=0.4211, Recall=1.0000, AUC=0.6979

Metrics Summary (Averages and Standard Deviations):
Accuracy: Mean=0.5900, Std=0.1158
Precision: Mean=0.6271, Std=0.1172
Recall: Mean=0.6600, Std=0.2997
AUC: Mean=0.6880, Std=0.0980


In [51]:
#   (b) train a linear SVM and use inner cross-validation to tune the hyperparameter C of the SVM. 

from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the SVM model
svm = SVC(kernel='linear', probability=True, random_state=42)  # Enable `probability=True` for AUC computation

# Establish hyperparameter C
param_grid = {'C': [0.1, 1, 10, 100]}

# Set up GridSearchCV for inner cross-validation
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Perform the grid search on the training set
grid_search.fit(X_train, y_train)

# Get the best hyperparameter and corresponding model
best_model = grid_search.best_estimator_
best_C = grid_search.best_params_['C']
print(f"Best C: {best_C}")

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
auc = roc_auc_score(y_test, y_prob)

# Print the metrics
print("Test Accuracy:", accuracy)
print("Test Precision:", precision)
print("Test Recall:", recall)
print("Test AUC:", auc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best C: 100
Test Accuracy: 0.6
Test Precision: 0.6428571428571429
Test Recall: 0.5625
Test AUC: 0.6584821428571429

Classification Report:
               precision    recall  f1-score   support

         0.0       0.56      0.64      0.60        14
         1.0       0.64      0.56      0.60        16

    accuracy                           0.60        30
   macro avg       0.60      0.60      0.60        30
weighted avg       0.61      0.60      0.60        30



In [None]:
#   (c) For each classifier, report Accuracy, Precision, Recall, and AUC (average and standard deviation 
#       over the five folds), and compare the results in a table. (3’) 1,400 × 563