In [1]:
import torchvision
import torchvision.transforms as transforms
import torch
from torch.utils import data
from torch.utils.data import DataLoader
from torch import nn
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Subset
import numpy as np
import random
from sklearn.metrics import f1_score, accuracy_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import copy
import torch.nn as nn
import torch.optim as optim
import pandas as pd



In [2]:
# Set random seeds for reproducibility
def set_random_seeds(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True




In [3]:
# Create an imbalanced subset of the dataset
def create_imbalanced_subset(dataset, imbalance_ratio=0.01, seed=42):
    random.seed(seed)
    np.random.seed(seed)
    # Each class in CIFAR-100 has 500 samples
    samples_per_class = 500
    
    # Calculate the start and end index for class 0
    end_idx = int(samples_per_class * imbalance_ratio)
    
    # Use slicing to directly obtain the required number of samples for class 0
    main_class_subset_data = dataset.data[:end_idx]
    main_class_subset_labels = [0] * end_idx
    
    # Get samples for other classes
    other_data = dataset.data[samples_per_class:]
    other_labels = dataset.targets[samples_per_class:]
    
    # Merge the data
    imbalanced_data = np.concatenate([main_class_subset_data, other_data])
    imbalanced_labels = main_class_subset_labels + other_labels
    
    return imbalanced_data, imbalanced_labels




In [4]:
# Define the VGG block
def vgg_block(num_convs, in_channels, out_channels):
    layers = []
    for _ in range(num_convs):
        layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        layers.append(nn.ReLU())
        in_channels = out_channels
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*layers)


In [5]:
# Define the VGG network
def vgg(conv_arch):
    conv_blks = []
    in_channels = 3
    for (num_convs, out_channels) in conv_arch:
        conv_blks.append(vgg_block(num_convs, in_channels, out_channels))
        in_channels = out_channels

    return nn.Sequential(
    *conv_blks, nn.Flatten(),
    nn.Linear(512, 4096), nn.ReLU(), nn.Dropout(0.5),
    nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(0.5),
    nn.Linear(4096, 100)
)

In [6]:
def vgg11():
    conv_arch11 = [(1, 64), (1, 128), (2, 256), (2, 512), (2, 512)]
    model = vgg(conv_arch11)
    model.name = "VGG11"
    return model

def vgg13():
    conv_arch13 = [(2, 64), (2, 128), (2, 256), (2, 512), (2, 512)]
    model = vgg(conv_arch13)
    model.name = "VGG13"
    return model

def vgg16():
    conv_arch16 = [(2, 64), (2, 128), (3, 256), (3, 512), (3, 512)]
    model = vgg(conv_arch16)
    model.name = "VGG16"
    return model


In [7]:
def train_model(model, train_loader, n_epochs, lr, device):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    
    model.train()
    for epoch in range(n_epochs):
        for inputs, labels in train_loader:
            inputs = inputs.view(inputs.size(0), -1)  # Flatten the input data
            inputs = inputs.float()  # Ensure the data is float type
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    
    return loss.item()

In [8]:
def train_specific_model(processed_trainset, model, imbalance_ratios, batch_size, n_epochs, lr, device):
    avg_losses = []  # List to store average loss for each imbalance ratio
    
    for ratio in imbalance_ratios:
        X, y = zip(*processed_trainset)
        fold_losses = []  # List to store loss for each fold
        
        # Split the data into training and validation sets for each fold
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        for train_idx, val_idx in skf.split(X, y):
            train_fold = [(X[i], y[i]) for i in train_idx]
            
            train_fold_loader = DataLoader(train_fold, batch_size=batch_size, shuffle=True)
            
            # Clone the model to ensure fresh training for each fold
            net = copy.deepcopy(model)
            net.to(device)
            
            train_loss = train_model(net, train_fold_loader, n_epochs, lr, device)
            
            fold_losses.append(train_loss)  # Store the last loss value for this fold
        
        # Calculate the average loss of cross-validation
        avg_train_loss = sum(fold_losses) / len(fold_losses)
        avg_losses.append(avg_train_loss)  # Store the average loss for this imbalance ratio   
    
    return avg_losses


In [9]:
def load_data():
    transform = transforms.Compose([
        transforms.RandomCrop(36, padding=4),
        transforms.CenterCrop(32),
        transforms.RandomHorizontalFlip(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])
    ])

    trainset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
    testset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)
    
    return trainset, testset

In [1]:
def ensure_same_shape_and_range(data, original_shape):
    # Ensure the shape is the same as the original shape
    data = data.reshape(original_shape)
    
    # Find the minimum and maximum values of the data
    min_value = np.amin(data)
    max_value = np.amax(data)
    
    # Scale the data to the [0, 255] range
    scaled_data = ((data - min_value) / (max_value - min_value) * 255).astype(np.uint8)
    
    return scaled_data

In [10]:
def initialize_model(conv_arch):
    return vgg(conv_arch)


In [11]:
def preprocess_data(method, trainset, main_class=0, imbalance_ratio=0.01, seed=42):
    """
    Preprocess the data using the specified method.
    
    Parameters:
    - method: The data augmentation method to use. One of ['smote', 'random_over', 'random_under'].
    - trainset: The training dataset.
    - main_class: The class to be imbalanced.
    - imbalance_ratio: The ratio of the main class in the imbalanced dataset.
    - seed: Random seed for reproducibility.
    
    Returns:
    - Processed dataset.
    """
    
    # Get the imbalanced data and labels
    imbalanced_data, imbalanced_labels = create_imbalanced_subset(trainset, imbalance_ratio)

    # Convert the data to numpy arrays
    X = np.array(imbalanced_data)
    y = np.array(imbalanced_labels)

    # Apply the specified method
    if method == 'smote':
        smote = SMOTE(random_state=seed)
        X_resampled, y_resampled = smote.fit_resample(X.reshape(X.shape[0], -1), y)
    elif method == 'random_over':
        ros = RandomOverSampler(random_state=seed)
        X_resampled, y_resampled = ros.fit_resample(X.reshape(X.shape[0], -1), y)
    elif method == 'random_under':
        rus = RandomUnderSampler(random_state=seed)
        X_resampled, y_resampled = rus.fit_resample(X.reshape(X.shape[0], -1), y)
    elif method == 'none':
        X_resampled, y_resampled = X.reshape(X.shape[0], -1), y
        
    # Convert the resampled data back to the original shape
    X_resampled = X_resampled.reshape(X_resampled.shape[0], 3, 32, 32)
    X_resampled = [torch.tensor(x, dtype=torch.float32) for x in X_resampled]
    
    # Combine the resampled features and labels into a dataset
    processed_data = list(zip(X_resampled, y_resampled))
    
    return processed_data

In [12]:
def test_model(testset, models, methods, imbalance_ratios, batch_size, device):
    """
    Test the models on imbalanced datasets.
    
    Args:
    - testset: The original test dataset.
    - models: A list of models to test.
    - methods: A list of methods to apply (e.g., ['smote', 'random_over', 'random_under']).
    - imbalance_ratios: A list of imbalance ratios to create subsets.
    - batch_size: Batch size for DataLoader.
    - device: Device to run the model on ('cuda' or 'cpu').
    
    Returns:
    - A dictionary containing average accuracy and F1 score for each model-method combination.
    - A dictionary containing accuracy and F1 score for each imbalance ratio for each model-method combination.
    """
    
    avg_results = {}
    detailed_results = {}
    
    for model in models:
        for method in methods:
            all_accuracies = []
            all_f1_scores = []
            
            for ratio in imbalance_ratios:
                # Create imbalanced subset
                subset_data, subset_labels = create_imbalanced_subset(testset, imbalance_ratio=ratio)
                test_loader = DataLoader(testset, batch_size=batch_size, shuffle=False)
                
                model.to(device)
                model.eval()  # Set the model to evaluation mode
                
                all_preds = []
                all_labels = []
                
                with torch.no_grad():
                    for data, labels in test_loader:
                        data = torch.tensor(data).permute(0, 3, 1, 2).float() / 255.0  # Convert and normalize
                        data, labels = data.to(device), labels.to(device)
                        outputs = model(data)
                        _, predicted = torch.max(outputs.data, 1)
                        
                        all_preds.extend(predicted.cpu().numpy())
                        all_labels.extend(labels.cpu().numpy())
                
                # Calculate accuracy and F1 score
                accuracy = accuracy_score(all_labels, all_preds)
                f1 = f1_score(all_labels, all_preds, average='macro')
                
                all_accuracies.append(accuracy)
                all_f1_scores.append(f1)
                
                # Store detailed results
                key = f"{model.name}_{method}_{ratio}"
                detailed_results[key] = {'accuracy': accuracy, 'f1_score': f1}
            
            # Calculate average accuracy and F1 score
            avg_accuracy = sum(all_accuracies) / len(all_accuracies)
            avg_f1 = sum(all_f1_scores) / len(all_f1_scores)
            
            avg_key = f"{model.name}_{method}"
            avg_results[avg_key] = {'avg_accuracy': avg_accuracy, 'avg_f1_score': avg_f1}
    
    return avg_results, detailed_results


In [13]:
import pandas as pd

IMBALANCE_RATIOS = [0.01, 0.05, 0.1, 0.5, 1]
BATCH_SIZE = 1024
N_EPOCHS = 50
LR = 0.001
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

model = vgg16()
method = 'smote'

# Load datasets
trainset, testset = load_data()

# Preprocess the data
processed_trainset = preprocess_data(method, trainset)

# Train the model and get average losses
avg_losses = train_specific_model(processed_trainset, model, IMBALANCE_RATIOS, BATCH_SIZE, N_EPOCHS, LR, DEVICE)

# Save the model state
torch.save(model.state_dict(), f'/Project/Yi/VGG16_CIFAR100_{method}.pth')

# Save the average losses to CSV
df_avg_losses = pd.DataFrame(avg_losses, index=IMBALANCE_RATIOS, columns=[f"{model.name}_{method}"])
df_avg_losses.to_csv('average_losses16.csv', mode='a')


Files already downloaded and verified
Files already downloaded and verified


In [14]:
import pandas as pd

IMBALANCE_RATIOS = [0.01, 0.05, 0.1, 0.5, 1]
BATCH_SIZE = 1024
N_EPOCHS = 150
LR = 0.01
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

model = vgg16()
method = 'random_over'

# Load datasets
trainset, testset = load_data()

# Preprocess the data
processed_trainset = preprocess_data(method, trainset)

# Train the model and get average losses
avg_losses = train_specific_model(processed_trainset, model, IMBALANCE_RATIOS, BATCH_SIZE, N_EPOCHS, LR, DEVICE)

# Save the model state
torch.save(model.state_dict(), f'/Project/Yi/VGG16_CIFAR100_{method}.pth')

# Save the average losses to CSV
df_avg_losses = pd.DataFrame(avg_losses, index=IMBALANCE_RATIOS, columns=[f"{model.name}_{method}"])
df_avg_losses.to_csv('average_losses16.csv', mode='a')



Files already downloaded and verified
Files already downloaded and verified


In [15]:
import pandas as pd

IMBALANCE_RATIOS = [ 0.01, 0.05, 0.1, 0.5, 1]
BATCH_SIZE = 1024
N_EPOCHS = 150
LR = 0.01
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

model = vgg16()
method = 'random_under'

# Load datasets
trainset, testset = load_data()

# Preprocess the data
processed_trainset = preprocess_data(method, trainset)

# Train the model and get average losses
avg_losses = train_specific_model(processed_trainset, model, IMBALANCE_RATIOS, BATCH_SIZE, N_EPOCHS, LR, DEVICE)

# Save the model state
torch.save(model.state_dict(), f'/Project/Yi/VGG16_CIFAR100_{method}.pth')

# Save the average losses to CSV
df_avg_losses = pd.DataFrame(avg_losses, index=IMBALANCE_RATIOS, columns=[f"{model.name}_{method}"])
df_avg_losses.to_csv('average_losses16.csv', mode='a')


Files already downloaded and verified
Files already downloaded and verified


In [16]:
import pandas as pd

IMBALANCE_RATIOS = [0.01, 0.05, 0.1, 0.5, 1]
BATCH_SIZE = 1024
N_EPOCHS = 150
LR = 0.01
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

model = vgg16()
method = 'none'

# Load datasets
trainset, testset = load_data()

# Preprocess the data
processed_trainset = preprocess_data(method, trainset)

# Train the model and get average losses
avg_losses = train_specific_model(processed_trainset, model, IMBALANCE_RATIOS, BATCH_SIZE, N_EPOCHS, LR, DEVICE)

# Save the model state
torch.save(model.state_dict(), f'/Project/Yi/VGG16_CIFAR100_{method}.pth')

# Save the average losses to CSV
df_avg_losses = pd.DataFrame(avg_losses, index=IMBALANCE_RATIOS, columns=[f"{model.name}_{method}"])
df_avg_losses.to_csv('average_losses16.csv', mode='a')


Files already downloaded and verified
Files already downloaded and verified


In [17]:
# Initialize dictionaries to store results
all_avg_results = {}
all_detailed_results = {}
IMBALANCE_RATIOS = [ 0.01, 0.05, 0.1, 0.5, 1]
trainset, testset = load_data()
BATCH_SIZE = 1024
N_EPOCHS = 150
LR = 0.01
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# DataFrames to store results
df_avg_results = pd.DataFrame(columns=['avg_accuracy', 'avg_f1_score'])
df_detailed_results = pd.DataFrame(columns=IMBALANCE_RATIOS)

# Load the model
model = vgg16()

# Iterate over each method for testing
METHODS = ['smote', 'random_over', 'random_under', 'none']
for method in METHODS:
    # Load the model state
    key = f"{model.name}_{method}"
    model.load_state_dict(torch.load(f'/Project/Yi/VGG16_CIFAR100_{method}.pth'))
    model.eval()
    
    # Test the current model-method combination
    avg_results, detailed_results = test_model(testset, [model], [method], IMBALANCE_RATIOS, BATCH_SIZE, DEVICE)
    
    # Store results
    all_avg_results[key] = avg_results[key]
    all_detailed_results[key] = detailed_results

    # Save results to the DataFrames and then to CSV
    df_avg_results.loc[key] = [avg_results[key]['avg_accuracy'], avg_results[key]['avg_f1_score']]
    for ratio in IMBALANCE_RATIOS:
        detailed_key = f"{key}_{ratio}"
        df_detailed_results.at[key, ratio] = detailed_results[detailed_key]['accuracy']
    
    df_avg_results.to_csv('average_results16.csv', mode='a')
    df_detailed_results.to_csv('detailed_results16.csv', mode='a')


  data = torch.tensor(data).permute(0, 3, 1, 2).float() / 255.0  # Convert and normalize
  data = torch.tensor(data).permute(0, 3, 1, 2).float() / 255.0  # Convert and normalize
  data = torch.tensor(data).permute(0, 3, 1, 2).float() / 255.0  # Convert and normalize
  data = torch.tensor(data).permute(0, 3, 1, 2).float() / 255.0  # Convert and normalize
  data = torch.tensor(data).permute(0, 3, 1, 2).float() / 255.0  # Convert and normalize
  data = torch.tensor(data).permute(0, 3, 1, 2).float() / 255.0  # Convert and normalize
  data = torch.tensor(data).permute(0, 3, 1, 2).float() / 255.0  # Convert and normalize
  data = torch.tensor(data).permute(0, 3, 1, 2).float() / 255.0  # Convert and normalize
  data = torch.tensor(data).permute(0, 3, 1, 2).float() / 255.0  # Convert and normalize
  data = torch.tensor(data).permute(0, 3, 1, 2).float() / 255.0  # Convert and normalize
  data = torch.tensor(data).permute(0, 3, 1, 2).float() / 255.0  # Convert and normalize
  data = torch.tensor