In [31]:
# (Programming, data available here) The data includes volumes of 10 brain regions of 50 control 
# participants and 50 patients. Use the volumes of the 10 regions as input features and build
#   (a) a two-layer neural network classifier to predict the disease diagnosis. Use 5-fold inner cross-validation (as discussed in the class) to tune the learning rate hyperparameter. 
#   (b) train a linear SVM and use inner cross-validation to tune the hyperparameter C of the SVM. 
#   (c) For each classifier, report Accuracy, Precision, Recall, and AUC (average and standard deviation 
#       over the five folds), and compare the results in a table. (3’)1,400 × 563
# Note: For P5 and P6, please include source code, necessary comments and a description of models and
# procedures. Using Jupyter notebook is encouraged but other tools/languages are also acceptable.

####Generate the data
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression



data = np.loadtxt('data_assignment_1.csv', delimiter=',', skiprows=1)

### generate data points
y = data[:, 2]  # Assuming the target variable is in the third column
X = data[:, 3:13]  # Assuming features are in columns 3 to 12 (10 features total)


In [46]:
#   (a) a two-layer neural network classifier to predict the disease diagnosis. 
# Use 5-fold inner cross-validation (as discussed in the class) to tune the learning rate hyperparameter. 

### Build 2 linear layers for this task (train, testing, validate)
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from torch.utils.data import TensorDataset, DataLoader

# Define the model
class MyModel(nn.Module):
    def __init__(self, input_dim):
        super(MyModel, self).__init__()
        # First layer: Transforms 10 input features to 6
        self.fc1 = nn.Linear(input_dim, 6)
        # Output layer: Transforms 6 features to 2 (binary classification)
        self.fc2 = nn.Linear(6, 2)
    
    def forward(self, x):
        # First layer with ReLU activation
        x = F.relu(self.fc1(x))
        # Output layer with softmax activation (for probabilities)
        x = F.softmax(self.fc2(x), dim=1)
        return x

# Define the input dimension based on your training data
input_dim = X_train.shape[-1]

# Instantiate the model
model = MyModel(input_dim)

# Check the model structure
print(model)

### Convert data to PyTorch tensor
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# Metrics storage
train_accuracies = []
val_accuracies = []

# Hyperparameters
learning_rate = 0.01
epochs = 50
batch_size = 5

### Set up cross validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Perform k-fold cross-validation
fold = 1
for train_idx, val_idx in kf.split(X):
    print(f"Fold {fold}/{k}")
    
    # Split data into training and validation sets
    X_train, X_val = X_tensor[train_idx], X_tensor[val_idx]
    y_train, y_val = y_tensor[train_idx], y_tensor[val_idx]
    
    # Create DataLoader for batching
    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # Initialize the model, loss function, and optimizer
    model = MyModel(input_dim=X.shape[1])  # Input dimension = number of features
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    # Training loop
    for epoch in range(epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            predictions = model(batch_X)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluate on training data
    model.eval()
    correct_train = 0
    total_train = 0
    with torch.no_grad():
        for batch_X, batch_y in train_loader:
            predictions = model(batch_X)
            _, predicted_classes = torch.max(predictions, dim=1)
            correct_train += (predicted_classes == batch_y).sum().item()
            total_train += batch_y.size(0)
    train_accuracy = correct_train / total_train
    train_accuracies.append(train_accuracy)

    # Evaluate on validation data
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            predictions = model(batch_X)
            _, predicted_classes = torch.max(predictions, dim=1)
            correct_val += (predicted_classes == batch_y).sum().item()
            total_val += batch_y.size(0)
    val_accuracy = correct_val / total_val
    val_accuracies.append(val_accuracy)

    print(f"Train Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    fold += 1

# Final Results
print("\nFinal Results:")
print(f"Average Training Accuracy: {np.mean(train_accuracies):.4f}")
print(f"Average Validation Accuracy: {np.mean(val_accuracies):.4f}")

# print("Classification Report:\n", classification_report(y_test, y_pred))

MyModel(
  (fc1): Linear(in_features=10, out_features=6, bias=True)
  (fc2): Linear(in_features=6, out_features=2, bias=True)
)
Fold 1/5
Train Accuracy: 0.6625, Validation Accuracy: 0.6000
Fold 2/5
Train Accuracy: 0.7125, Validation Accuracy: 0.7000
Fold 3/5
Train Accuracy: 0.6500, Validation Accuracy: 0.7000
Fold 4/5
Train Accuracy: 0.7000, Validation Accuracy: 0.5500
Fold 5/5
Train Accuracy: 0.6625, Validation Accuracy: 0.6500

Final Results:
Average Training Accuracy: 0.6775
Average Validation Accuracy: 0.6400


In [47]:
#   (b) train a linear SVM and use inner cross-validation to tune the hyperparameter C of the SVM. 

from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the SVM model
svm = SVC(kernel='linear', random_state=42)

# Establish hyperparameter C  (otherwise could be defined above as argument) 
param_grid = {'C': [0.1, 1, 10, 100]}

# Set up GridSearchCV for inner cross-validation
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Perform the grid search on the training set
grid_search.fit(X_train, y_train)

# Get the best hyperparameter and corresponding model
best_model = grid_search.best_estimator_
best_C = grid_search.best_params_['C']
print(f"Best C: {best_C}")

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best C: 100
Test Accuracy: 0.6
Classification Report:
               precision    recall  f1-score   support

         0.0       0.56      0.64      0.60        14
         1.0       0.64      0.56      0.60        16

    accuracy                           0.60        30
   macro avg       0.60      0.60      0.60        30
weighted avg       0.61      0.60      0.60        30



In [None]:
#   (c) For each classifier, report Accuracy, Precision, Recall, and AUC (average and standard deviation 
#       over the five folds), and compare the results in a table. (3’) 1,400 × 563