In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [2]:
# Locations of the two input CSVs passed to load_data below: file_path1 is read as the
# feature table, file_path2 is the file whose 'PAM50' column supplies the labels.
# NOTE(review): these are absolute, machine-specific paths; the notebook will not run
# elsewhere until they are pointed at a local copy of the data.
file_path1 = '/Users/yamdaniel/Feature_Selection/datasets/breast_cancer/SCANB.csv'
file_path2 = '/Users/yamdaniel/Feature_Selection/datasets/breast_cancer/sampleinfo_SCANB_t.csv'

In [3]:
def load_data(filename1, filename2):
    """Read the feature table and the PAM50 label column from two CSV files.

    Args:
        filename1: Path to a CSV whose first column is used as the row index.
            The table is returned transposed, so that index ends up as the
            column labels of the result.
        filename2: Path to a CSV that contains a ``PAM50`` column.

    Returns:
        A ``(features, labels)`` tuple: the transposed DataFrame built from
        ``filename1`` and the ``PAM50`` Series read from ``filename2``.
        No alignment between the two is performed here.
    """
    table = pd.read_csv(filename1)
    index_column = table.columns[0]
    features = table.set_index(index_column).transpose()
    labels = pd.read_csv(filename2)['PAM50']
    return features, labels

In [4]:
# Load the inputs: X is the transposed table read from file_path1, y is the 'PAM50'
# column read from file_path2 (see load_data).
X,y = load_data(file_path1, file_path2)

In [None]:
class SoftmaxClassifier(nn.Module):
    """Single linear layer followed by softmax, trained with a weight penalty.

    ``forward`` returns class probabilities. ``training_step`` computes the
    cross-entropy on the raw linear outputs (logits) and adds ``group_loss``.

    Args:
        num_features: Size of each input row.
        num_classes: Number of output classes.
        L1: Scalar coefficient of the max-|w| term in ``group_loss``.
        L2: Scalar coefficient of the squared-weight term in ``group_loss``.
    """

    def __init__(self, num_features, num_classes, L1=0.01, L2=0.01):
        super(SoftmaxClassifier, self).__init__()
        # Linear map x -> x @ W.T + b; W has shape (num_classes, num_features).
        # The penalties are not part of this layer; they are added to the loss
        # in training_step via group_loss.
        self.fc = nn.Linear(num_features, num_classes, bias=True)
        self.L1 = L1  # scalar penalty coefficient
        self.L2 = L2  # scalar penalty coefficient

    def forward(self, x):
        """Return class probabilities of shape (batch, num_classes)."""
        return torch.softmax(self.fc(x), dim=1)

    def group_loss(self):
        """Return the scalar weight penalty added to the training loss.

        Two terms are computed per row of the weight matrix (one row per
        class) and averaged over rows:

        * ``self.L1`` times the largest absolute weight in the row. Despite
          the attribute name this is a max-abs value, not a sum of absolute
          values.
        * ``self.L2`` times the sum of squared weights in the row.
        """
        W = self.fc.weight  # Shape: (num_classes, num_features)

        row_max_abs = torch.max(torch.abs(W), dim=1)[0]
        max_term = self.L1 * torch.mean(row_max_abs)

        row_sq_sum = torch.sum(W * W, dim=1)
        sq_term = self.L2 * torch.mean(row_sq_sum)

        return max_term + sq_term

    def training_step(self, batch):
        """
        Computes the loss for a given batch.

        Args:
            batch: ``(inputs, labels)`` pair; ``labels`` holds integer class
                indices as required by ``nn.CrossEntropyLoss``.

        Returns:
            Scalar tensor: cross-entropy of the logits plus ``group_loss()``.
        """
        inputs, labels = batch  # Unpack batch (features, labels)
        # nn.CrossEntropyLoss applies log-softmax internally, so it must be fed
        # the raw logits. Feeding it the probabilities from forward() would
        # apply softmax twice and flatten the gradients.
        logits = self.fc(inputs)
        criterion = nn.CrossEntropyLoss()
        loss = criterion(logits, labels)
        return loss + self.group_loss()  # Add regularization loss

    def configure_optimizers(self, learning_rate=0.001):
        """
        Configures the optimizer for training.

        Returns an Adam optimizer over all parameters of this module.
        """
        optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        return optimizer

# Cross-validation function
def cross_validate(model, X, y, k=5, batch_size=32, epochs=10):
    """Estimate classification accuracy with shuffled k-fold cross-validation.

    For every fold a fresh ``model(num_features, num_classes)`` is built,
    trained with mini-batches for ``epochs`` passes over the training split,
    and scored on the held-out split. Fold assignment is fixed
    (``random_state=42``); weight initialisation and batch shuffling use the
    current torch RNG state and are not seeded here.

    Args:
        model: Callable (e.g. a class) invoked as
            ``model(num_features, num_classes)``. The returned object must
            provide ``configure_optimizers()``, ``training_step(batch)``,
            ``train()``, ``eval()`` and be callable on a feature tensor,
            returning one score per class for each row.
        X: 2-D array-like of numeric features, one row per sample
            (NumPy array or pandas DataFrame).
        y: 1-D array-like of class labels. Any values ``np.unique`` can sort
            are accepted (integers or strings); they are mapped to indices
            ``0..num_classes-1`` internally.
        k: Number of folds.
        batch_size: Mini-batch size used during training.
        epochs: Number of passes over each training split.

    Returns:
        Mean validation accuracy over the ``k`` folds.

    Raises:
        ValueError: If ``X`` is not 2-D or ``X`` and ``y`` disagree on the
            number of samples.
    """
    # Coerce once to plain arrays so that positional fold indices work for
    # DataFrame/Series inputs too (X[idx] on a DataFrame would select columns).
    features = np.asarray(X, dtype=np.float32)
    labels = np.asarray(y).reshape(-1)
    if features.ndim != 2:
        raise ValueError(f"X must be 2-D (samples, features); got shape {features.shape}")
    if features.shape[0] != labels.shape[0]:
        raise ValueError(
            f"X has {features.shape[0]} samples but y has {labels.shape[0]} labels"
        )

    # Encode labels as contiguous class indices; the loss requires integer
    # targets in [0, num_classes). Labels already equal to 0..C-1 are unchanged.
    classes, targets = np.unique(labels, return_inverse=True)
    targets = targets.reshape(-1).astype(np.int64)
    num_classes = len(classes)

    kf = KFold(n_splits=k, shuffle=True, random_state=42)  # K-fold cross-validation
    fold_accuracies = []  # To store accuracy for each fold

    for fold, (train_idx, val_idx) in enumerate(kf.split(features)):
        print(f"Training fold {fold + 1}/{k}")

        # Split the data and convert to torch tensors
        X_train_tensor = torch.tensor(features[train_idx], dtype=torch.float32)
        y_train_tensor = torch.tensor(targets[train_idx], dtype=torch.long)
        X_val_tensor = torch.tensor(features[val_idx], dtype=torch.float32)
        y_val = targets[val_idx]

        # Initialize model and optimizer
        model_instance = model(features.shape[1], num_classes)  # num_features, num_classes
        optimizer = model_instance.configure_optimizers()

        # Train the model on this fold
        n_train = X_train_tensor.size(0)
        for epoch in range(epochs):
            model_instance.train()
            permutation = torch.randperm(n_train)  # reshuffle every epoch

            for i in range(0, n_train, batch_size):
                batch_indices = permutation[i:i+batch_size]
                batch_X, batch_y = X_train_tensor[batch_indices], y_train_tensor[batch_indices]

                optimizer.zero_grad()
                loss = model_instance.training_step((batch_X, batch_y))
                loss.backward()
                optimizer.step()

        # Evaluate the model on the validation set
        model_instance.eval()
        with torch.no_grad():
            outputs = model_instance(X_val_tensor)
            _, predicted = torch.max(outputs, 1)
            # Both sides are encoded class indices, so accuracy is unaffected
            # by the label encoding.
            accuracy = accuracy_score(y_val, predicted.numpy())  # Compute accuracy
            fold_accuracies.append(accuracy)

        print(f"Accuracy for fold {fold + 1}: {accuracy:.4f}")

    # Compute the average accuracy across all folds
    avg_accuracy = np.mean(fold_accuracies)
    print(f"\nAverage accuracy across {k} folds: {avg_accuracy:.4f}")
    return avg_accuracy