# DGL Project

## Imports

In [None]:
import random
import numpy as np
import torch
from MatrixVectorizer import *
from torch.utils.data import Dataset, DataLoader
from preprocessing import *
from evaluation import *
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import pandas as pd
from model import *
from train import *
import psutil
import time 
import os

In [None]:
# Seed Python, NumPy and PyTorch with one shared value so runs are repeatable
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Pick the compute device first, then apply the CUDA-only settings if needed
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("CUDA is available. Using GPU.")
    # Seed the current GPU as well as all GPUs (multi-GPU setups)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    # Reproducibility settings for cuDNN: deterministic mode on, benchmark mode off
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
else:
    print("CUDA not available. Using CPU.")

## Load data

In [None]:
def anti_vectorize_samples(dataset, dim):
    """
    Convert every vectorized sample of a dataset back into a square matrix.

    Each row of the dataset is passed to MatrixVectorizer.anti_vectorize with
    include_diagonal=False, and the results are collected in one array.

    Args:
    - dataset (np.ndarray): Samples in vectorized form, of shape
        (num_samples, vectorized form size).
    - dim (int): Number of rows/columns of each reconstructed matrix.

    Returns:
    - np.ndarray: Reconstructed matrices of shape (num_samples, dim, dim).
    """
    sample_count = dataset.shape[0]

    # Allocate the output once; every slot is overwritten in the loop below
    matrices = np.empty((sample_count, dim, dim))
    for idx in range(sample_count):
        vector = dataset[idx, :]
        matrices[idx] = MatrixVectorizer.anti_vectorize(
            vector, dim, include_diagonal=False
        )

    return matrices

In [None]:
# Read the three low-resolution and three high-resolution splits from CSV
# (comma-separated, first line of each file is skipped as a header)
lr_train_split1, lr_train_split2, lr_train_split3 = (
    np.genfromtxt(csv_path, delimiter=",", skip_header=1)
    for csv_path in (
        "./new_data/lr_split_1.csv",
        "./new_data/lr_split_2.csv",
        "./new_data/lr_split_3.csv",
    )
)
hr_train_split1, hr_train_split2, hr_train_split3 = (
    np.genfromtxt(csv_path, delimiter=",", skip_header=1)
    for csv_path in (
        "./new_data/hr_split_1.csv",
        "./new_data/hr_split_2.csv",
        "./new_data/hr_split_3.csv",
    )
)

# Turn the vectorized samples back into square matrices:
# 160 x 160 for the low-resolution data, 268 x 268 for the high-resolution data
lr_train_split1, lr_train_split2, lr_train_split3 = (
    anti_vectorize_samples(vectors, 160)
    for vectors in (lr_train_split1, lr_train_split2, lr_train_split3)
)
hr_train_split1, hr_train_split2, hr_train_split3 = (
    anti_vectorize_samples(vectors, 268)
    for vectors in (hr_train_split1, hr_train_split2, hr_train_split3)
)

# Keep the splits together, in order, for the cross-validation routine
lr_train_matrices = [lr_train_split1, lr_train_split2, lr_train_split3]
hr_train_matrices = [hr_train_split1, hr_train_split2, hr_train_split3]

## 3-fold cross-validation

In [None]:
def predict(model, lr_matrices, args):
    """
    Run the model on each low-resolution matrix and collect its predictions.

    The model is switched to evaluation mode and called without gradient
    tracking. Only the first of the four values returned by the model is
    kept; it is passed through unpad with args['padding'] and converted to
    a NumPy array.

    Args:
    - model (torch.nn.Module): Trained model used for making predictions.
    - lr_matrices (iterable of np.ndarray): Low-resolution matrices.
    - args (dict): Additional arguments; 'padding' is read here.

    Returns:
        np.ndarray: Array holding one prediction per low-resolution matrix.
    """
    model.eval()

    collected = []
    with torch.no_grad():
        for sample in lr_matrices:
            sample_tensor = torch.from_numpy(sample).type(torch.FloatTensor)
            # The model returns four values; the last three are not needed here
            hr_pred, _, _, _ = model(sample_tensor)
            hr_pred = unpad(hr_pred, args['padding'])
            collected.append(hr_pred.detach().cpu().numpy())

    return np.array(collected)

In [None]:
def plot_metrics(metrics, title="Metrics"):
    """
    Plot metrics with individual folds and average across folds.

    One bar chart is drawn per fold, followed by a final chart showing the
    mean of each metric across folds with its standard deviation as an error
    bar. Panels are laid out two per row (a 2x2 grid for three folds). The
    figure is saved to ./images/barplots.png (the directory is created if it
    is missing) and then displayed.

    Args:
    - metrics (dict): Dictionary containing metric names as keys and lists of 
        metric values for each fold as values.
    - title (str, optional): Title of the plot. Defaults to "Metrics".
    """
    # Create a custom color palette (you can adjust the colors as needed)
    colors = ['#FF5733', '#3498DB', '#27AE60', '#F39C12', '#9B59B6', '#7D3C98']

    metric_names = list(metrics.keys())
    positions = range(len(metric_names))

    # Number of folds for which every metric has a value
    num_folds = min((len(values) for values in metrics.values()), default=0)

    # One panel per fold plus one for the average, two panels per row
    num_cols = 2
    num_panels = num_folds + 1
    num_rows = -(-num_panels // num_cols)  # ceiling division
    fig, axs = plt.subplots(num_rows, num_cols,
                            figsize=(10, 4 * num_rows), squeeze=False)
    plt.subplots_adjust(hspace=0.5)
    panels = axs.ravel()  # row-major order: left to right, top to bottom

    # Plot individual folds
    for fold in range(num_folds):
        ax = panels[fold]
        fold_values = [values[fold] for values in metrics.values()]

        ax.bar(positions, fold_values, align='center', color=colors)
        ax.set_xticks(positions)
        ax.set_xticklabels(metric_names, rotation=45)
        ax.set_title(f'Fold {fold+1}')

    # Final plot with average values, placed right after the last fold
    ax = panels[num_folds]
    avg_values = [np.mean(values) for values in metrics.values()]
    std_values = [np.std(values) for values in metrics.values()]
    ax.bar(positions, avg_values, align='center', color=colors)
    ax.errorbar(positions, avg_values, yerr=std_values, fmt='o', color='black', capsize=5, label='Average', elinewidth=1)
    ax.set_xticks(positions)
    ax.set_xticklabels(metric_names, rotation=45)
    ax.set_title("Avg. Across Folds")

    # Hide any grid cell left over when the panel count is odd
    for unused_ax in panels[num_panels:]:
        unused_ax.set_visible(False)

    plt.suptitle(title)
    plt.tight_layout()

    # savefig does not create missing directories
    os.makedirs("./images", exist_ok=True)
    plt.savefig("./images/barplots.png")
    plt.show()

In [None]:
def save_preds_to_csv(preds_matrices, fold_num):
    """
    Write predictions to predictions_fold_<fold_num>.csv.

    Every prediction matrix is vectorized with MatrixVectorizer.vectorize
    (include_diagonal=False); all vectors are then flattened into a single
    column. The file has two columns: 'ID' (1-based running index) and
    'Predicted'.

    Args:
    - preds_matrices (list of np.ndarray): List of prediction matrices.
    - fold_num (int): Fold number used in the output file name.
    """
    vectorized = [
        MatrixVectorizer.vectorize(matrix, include_diagonal=False)
        for matrix in preds_matrices
    ]

    # Collapse all per-sample vectors into one long 1D array
    flat_values = np.array(vectorized).flatten()

    table = pd.DataFrame({
        'ID': np.arange(1, len(flat_values) + 1),
        'Predicted': flat_values,
    })
    table.to_csv(f"predictions_fold_{fold_num}.csv", index=False)

In [None]:
def run_kfold(lr_train_matrices, hr_train_matrices, ks, args):
    """
    Run 3-fold cross-validation over three pre-made data splits.

    For each fold, two splits are concatenated for training and the third is
    held out. A fresh EAGSRNet is created per fold, trained with
    train_with_early_stopping (which also receives the held-out split),
    used to predict the held-out low-resolution matrices, and the
    predictions are passed to evaluate_all together with the held-out
    high-resolution matrices. Elapsed time and the change in resident
    memory of the current process are printed at the end.

    Args:
    - lr_train_matrices (list of np.ndarray): Three low-resolution splits.
    - hr_train_matrices (list of np.ndarray): Three high-resolution splits,
        in the same order as the low-resolution ones.
    - ks (list of float): Passed through to EAGSRNet.
    - args (dict): Passed through to EAGSRNet, train_with_early_stopping
        and predict.
    """
    start_time = time.time()  # Record the start time for training

    process = psutil.Process(os.getpid())
    base_memory_usage = process.memory_info().rss
    print(f"Base RAM usage: {base_memory_usage/(1024*1024)} MiB")

    # ((training split indices), held-out split index) for each fold
    split_plan = (((0, 1), 2), ((1, 2), 0), ((0, 2), 1))

    def build_folds(splits):
        # Each fold is [concatenated training data, held-out data]
        return [
            [np.concatenate((splits[first], splits[second]), axis=0), splits[held_out]]
            for (first, second), held_out in split_plan
        ]

    folds_lr = build_folds(lr_train_matrices)
    folds_hr = build_folds(hr_train_matrices)

    for fold_idx, (lr_fold, hr_fold) in enumerate(zip(folds_lr, folds_hr)):
        print(f"\nFold {fold_idx+1}")
        model = EAGSRNet(ks, args)

        # Get train-test split
        train_lr, test_lr = lr_fold
        train_hr, gt_matrices = hr_fold

        # Train and test the model
        train_with_early_stopping(model, train_lr, train_hr,
                                  test_lr, gt_matrices, args)

        preds_matrices = predict(model, test_lr, args)

        # NOTE(review): the returned metrics are not used further in this function
        fold_metrics = evaluate_all(preds_matrices, gt_matrices)

    # Calculate total training time in minutes
    total_training_time = (time.time() - start_time) / 60
    print(f"Total Training Time for 3F-CF: {total_training_time} minutes")

    # Report total RAM usage
    memory_usage = (process.memory_info().rss - base_memory_usage)/(1024*1024)
    print(f"Total RAM used: {memory_usage} MiB")

In [None]:
# Hyperparameters

# ks is the top percentile that will be sampled at each pooling layer
ks = [0.9, 0.7, 0.6, 0.5]

args = {
    # Training schedule and loss weighting
    'epochs': 200,
    'lr': 1e-4,
    'lmbda': 0.1,

    # Matrix sizes: hr_dim is the high-resolution size (268) plus the
    # padding on both sides, i.e. 268 + 2 * 26 = 320
    'lr_dim': 160,
    'hr_dim': 268 + 2 * 26,
    'hidden_dim': 320,
    'padding': 26,

    # Mean / standard deviation pairs (presumably for weight or noise
    # initialisation in the model code; verify against model.py)
    'mean_dense': 0.0,
    'std_dense': 0.01,
    'mean_gaussian': 0.0,
    'std_gaussian': 0.1,

    'zero_penalty': 2.0,
    'device': device,
    'early_stopping_threshold': 1e-4,
}

In [None]:
# Launch the 3-fold cross-validation on the loaded splits
run_kfold(
    lr_train_matrices=lr_train_matrices,
    hr_train_matrices=hr_train_matrices,
    ks=ks,
    args=args,
)