## Preparation

In [None]:
import os
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Switch the working directory to the project folder on Drive; the relative
# paths used later (e.g. 'data/lr_train.csv') and the `utils` imports
# presumably resolve against this directory.
os.chdir("/content/drive/MyDrive/DeepLearning/DGL/")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, TensorDataset, Subset

import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

from utils.preprocessing import *
from utils.metrics import get_metrics
from utils.models import GraphCycleGAN
from utils.training import train_graph_cyclegan, train_graph_cyclegan_es
from evaluation import evaluate_all

In [None]:
# Seed every random number generator used in this notebook with the same value
# so that runs are repeatable.
random_seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(random_seed)

# Pick the compute device: GPU when CUDA is present, CPU otherwise.
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ok else "cpu")

if not cuda_ok:
    print("CUDA not available. Using CPU.")
else:
    print("CUDA is available. Using GPU.")
    # Seed the current GPU and then every visible GPU (multi-GPU setups).
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    # Ask cuDNN for deterministic kernels and turn off its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

## Data Loading

In [None]:
# Read the pre-processed CSV files into NumPy arrays.
# Every file is comma-separated and its first line (the header) is skipped.
csv_options = dict(delimiter=',', skip_header=1)
lr_train = np.genfromtxt('data/lr_train.csv', **csv_options)
hr_train = np.genfromtxt('data/hr_train.csv', **csv_options)
lr_test = np.genfromtxt('data/lr_test.csv', **csv_options)

In [None]:
# Show the shape of each loaded array, in the order: lr_train, hr_train, lr_test.
for loaded in (lr_train, hr_train, lr_test):
    print(loaded.shape)

In [None]:
# Sanity-check the loaded arrays: flag any NaN entry and any negative entry.
loaded_arrays = (lr_train, hr_train, lr_test)
contains_nan = any(np.isnan(arr).any() for arr in loaded_arrays)
contains_negative = any((arr < 0).any() for arr in loaded_arrays)

print("Contains NaN:", contains_nan)
print("Contains negative numbers:", contains_negative)

## Visualization

In [None]:
def plot_evaluation_metrics(fold_results):
    """Draw one bar chart per fold plus one chart of the cross-fold average.

    Args:
        fold_results: Sequence with one entry per fold. Each entry holds that
            fold's metric values, expected in the order
            MAE, PCC, JSD, MAE-PC, MAE-EC, MAE-BC.

    Raises:
        ValueError: If ``fold_results`` is empty (there is nothing to plot).
    """
    n_folds = len(fold_results)
    if n_folds == 0:
        # Without this guard the function fails later with an unrelated error:
        # a 1x1 subplot grid returns a single Axes that cannot be indexed.
        raise ValueError("fold_results is empty; need the metrics of at least one fold.")

    # Rows are folds, columns are metrics.
    metrics = np.array(fold_results)

    # Mean and standard deviation of every metric across folds.
    metrics_mean = metrics.mean(axis=0)
    metrics_std = metrics.std(axis=0)

    # Bar labels, one per metric column.
    metric_names = ['MAE', 'PCC', 'JSD', 'MAE-PC', 'MAE-EC', 'MAE-BC']

    # One panel per fold and a final panel for the average
    # (n_folds >= 1, so `axs` is always an array of at least two Axes).
    fig, axs = plt.subplots(1, n_folds + 1, figsize=(20, 5))

    # One distinct colour per metric, reused in every panel.
    colors = plt.cm.viridis(np.linspace(0, 1, len(metric_names)))

    # Per-fold panels.
    for i in range(n_folds):
        axs[i].bar(metric_names, metrics[i], color=colors)
        axs[i].set_title(f'Fold {i+1}')

    # Average panel; error bars show the standard deviation across folds.
    axs[-1].bar(metric_names, metrics_mean, yerr=metrics_std, capsize=5, color=colors)
    axs[-1].set_title('Avg. Across Folds')

    plt.tight_layout()
    plt.show()

## Training and Validation

In [None]:
# Every LR row must have a matching HR row, otherwise the pairs are misaligned.
assert lr_train.shape[0] == hr_train.shape[0], "lr_train and hr_train must have the same number of samples"

# Rebuild one matrix per sample from its vectorized form (160 for LR, 268 for HR).
# The sample count is taken from the data instead of being hard-coded.
lr_train_matrix = torch.from_numpy(np.array([anti_vectorize(vec, 160) for vec in lr_train])).float().to(device)
hr_train_matrix = torch.from_numpy(np.array([anti_vectorize(vec, 268) for vec in hr_train])).float().to(device)

# 3-fold split with a fixed seed so the folds are the same on every run.
kf = KFold(n_splits=3, shuffle=True, random_state=42)
# Each dataset item is (lr_vector, hr_vector, lr_matrix, hr_matrix).
train_data = TensorDataset(torch.from_numpy(lr_train), torch.from_numpy(hr_train), lr_train_matrix, hr_train_matrix)

In [None]:
num_epochs = 20
batch_size = 8

# Collects whatever train_graph_cyclegan returns for each fold.
fold_results = []

for fold_idx, (train_ids, val_ids) in enumerate(kf.split(train_data), start=1):
    print('-'*10, f"Fold {fold_idx}", '-'*10)

    # Loaders over this fold's index subsets: training batches are shuffled,
    # validation batches keep their order.
    train_loader = DataLoader(Subset(train_data, train_ids), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(Subset(train_data, val_ids), batch_size=batch_size, shuffle=False)

    # A brand-new model per fold so nothing learned carries over between folds.
    model = GraphCycleGAN(12720, 35778).to(device)
    fold_outcome = train_graph_cyclegan(
        model,
        train_loader,
        val_loader,
        epochs=num_epochs,
        device=device,
        fold_num=fold_idx,
    )
    fold_results.append(fold_outcome)

In [None]:
# Visualize the results gathered in `fold_results` by the cross-validation loop:
# one bar chart per fold plus the average across folds.
plot_evaluation_metrics(fold_results)

## Export Test Results

In [None]:
# Train on the whole training set; no validation loader is passed here.
num_epochs = 20
# `batch_size` is the value set in the cross-validation cell above.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# Fresh model; this `model` is the one used by the export cell that follows.
model = GraphCycleGAN(12720, 35778).to(device)
train_graph_cyclegan(model, train_loader, epochs=num_epochs, device=device)

In [None]:
# Inference only: switch the model to evaluation mode and disable gradient
# tracking so that no autograd graph is built for the test predictions.
model.eval()
with torch.no_grad():
    hr_test = model.G_A2B(torch.from_numpy(lr_test).float().to(device)).cpu().detach().numpy()

# Flatten all predictions into one long column and clamp them to [0, 1].
hr_test_export = np.clip(hr_test.flatten(), 0, 1)
# Submission IDs are 1-based and run over every flattened value.
id_column = np.arange(1, len(hr_test_export)+1)

submission_df = pd.DataFrame({
    'ID': id_column,
    'Predicted': hr_test_export
})

submission_df.to_csv('submission.csv', index=False)

# NOTE: The following code runs the Cluster-CV and Random-CV experiments for the paper "A Benchmark for Graph Super-resolution GNNs". It is not part of the model.

## Loading the Cluster-CV and Random-CV Data

In [None]:
# Read the three pre-split folds; lrs[k] / hrs[k] hold the LR / HR arrays of fold k+1.
lrs, hrs = [], []

for fold_id in range(1, 4):
    # Change the path to the location of the data on your machine for Cluster-CV and Random-CV
    lr_train = np.genfromtxt(f'../../Random-CV2/Fold{fold_id}/lr_split_{fold_id}.csv', delimiter=',', skip_header=1)
    hr_train = np.genfromtxt(f'../../Random-CV2/Fold{fold_id}/hr_split_{fold_id}.csv', delimiter=',', skip_header=1)
    print(lr_train.shape)
    print(hr_train.shape)

    # Same sanity checks as for the main data: any NaN, any negative value.
    fold_arrays = (lr_train, hr_train)
    contains_nan = any(np.isnan(arr).any() for arr in fold_arrays)
    contains_negative = any((arr < 0).any() for arr in fold_arrays)
    print("Contains NaN:", contains_nan)
    print("Contains negative numbers:", contains_negative)

    lrs.append(lr_train)
    hrs.append(hr_train)

## Search for Early Stopping Point by K-Fold Cross Validation (Fair Comparison)

In [None]:
# Results of the early-stopping search, one entry per held-out fold.
models = []
losseses = []
ess = []

batch_size = 8
num_epochs = 300
# Number of samples taken from the END of each training fold for validation.
n_val_per_fold = 10

for test_fold in range(3):
    # The two folds that are not held out supply both training and validation data.
    # The held-out fold itself is deliberately not touched in this search.
    source_folds = [k for k in range(3) if k != test_fold]

    # Split every source fold into a training head and a validation tail.
    # The tail is excluded from training for ALL folds: previously only the
    # first iteration did this, so in the other two the validation samples were
    # also trained on, which makes the validation loss useless for early stopping.
    lr_train = np.concatenate([lrs[k][:-n_val_per_fold] for k in source_folds], axis=0)
    hr_train = np.concatenate([hrs[k][:-n_val_per_fold] for k in source_folds], axis=0)
    lr_validate = np.concatenate([lrs[k][-n_val_per_fold:] for k in source_folds], axis=0)
    hr_validate = np.concatenate([hrs[k][-n_val_per_fold:] for k in source_folds], axis=0)

    # Rebuild one matrix per sample from its vectorized form (160 for LR, 268 for HR).
    lr_train_matrix = torch.from_numpy(np.array([anti_vectorize(vec, 160) for vec in lr_train])).float().to(device)
    hr_train_matrix = torch.from_numpy(np.array([anti_vectorize(vec, 268) for vec in hr_train])).float().to(device)
    lr_validate_matrix = torch.from_numpy(np.array([anti_vectorize(vec, 160) for vec in lr_validate])).float().to(device)
    hr_validate_matrix = torch.from_numpy(np.array([anti_vectorize(vec, 268) for vec in hr_validate])).float().to(device)

    # Each dataset item is (lr_vector, hr_vector, lr_matrix, hr_matrix).
    train_data = TensorDataset(torch.from_numpy(lr_train), torch.from_numpy(hr_train), lr_train_matrix, hr_train_matrix)
    validate_data = TensorDataset(torch.from_numpy(lr_validate), torch.from_numpy(hr_validate), lr_validate_matrix, hr_validate_matrix)

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    validate_loader = DataLoader(validate_data, batch_size=batch_size, shuffle=False)

    # Fresh model per held-out fold; the training helper returns a model, its
    # losses and an early-stopping value, all stored for inspection below.
    model = GraphCycleGAN(12720, 35778).to(device)
    model_n, losses, es = train_graph_cyclegan_es(model, train_loader, validate_loader, epochs=num_epochs, device=device)
    models.append(model_n)
    losseses.append(losses)
    ess.append(es)

print(f'models: {models}')
print(f'losses: {losseses}')
print(f'ess: {ess}')

## Training and Evaluation on Random-CV and Cluster-CV (Paper Benchmark)

In [None]:
batch_size = 8
num_epochs = 100

for test_fold in range(3):
    # Train on the two folds that are not held out; evaluate on the third.
    source_folds = [k for k in range(3) if k != test_fold]
    lr_train = np.concatenate([lrs[k] for k in source_folds], axis=0)
    hr_train = np.concatenate([hrs[k] for k in source_folds], axis=0)
    lr_test = lrs[test_fold]
    hr_test = hrs[test_fold]

    # Rebuild one matrix per sample from its vectorized form (160 for LR, 268 for HR).
    lr_train_matrix = torch.from_numpy(np.array([anti_vectorize(vec, 160) for vec in lr_train])).float().to(device)
    hr_train_matrix = torch.from_numpy(np.array([anti_vectorize(vec, 268) for vec in hr_train])).float().to(device)
    # Ground-truth HR matrices of the held-out fold, used only for evaluation.
    hr_test_matrix = torch.from_numpy(np.array([anti_vectorize(vec, 268) for vec in hr_test])).float().to(device)

    # Each dataset item is (lr_vector, hr_vector, lr_matrix, hr_matrix).
    train_data = TensorDataset(torch.from_numpy(lr_train), torch.from_numpy(hr_train), lr_train_matrix, hr_train_matrix)

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    # Fresh model per held-out fold.
    model = GraphCycleGAN(12720, 35778).to(device)
    train_graph_cyclegan(model, train_loader, epochs=num_epochs, device=device)

    # Inference only: evaluation mode and no gradient tracking, so that
    # training-only layer behaviour and autograd bookkeeping are switched off.
    model.eval()
    with torch.no_grad():
        hr_test_predict = model.G_A2B(torch.from_numpy(lr_test).float().to(device)).cpu().detach().numpy()
    print(f'shape: {hr_test_predict.shape}')

    # Turn every predicted vector back into a matrix and zero out negative entries.
    hr_predict_matrix = torch.from_numpy(np.array([anti_vectorize(vec, 268) for vec in hr_test_predict])).float().to(device)
    hr_predict_matrix = torch.where(hr_predict_matrix < 0, torch.tensor(0.0).to(device), hr_predict_matrix)
    print(f'anti shape: {hr_predict_matrix.shape}')

    # The last argument tags this run with the 1-based index of the held-out fold.
    evaluate_all(hr_test_matrix.cpu().detach().numpy(), hr_predict_matrix.cpu().detach().numpy(), f'Paper_RANDOM2_{test_fold+1}')
    
    