In [None]:
import os
import copy

from evaluation_metric import evaluate_all
import pandas as pd
import torch
import random 
import numpy as np
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split, KFold
import numpy as np
import torch.optim as optim
from pprint import pprint
from tqdm import tqdm

from MatrixVectorizer import MatrixVectorizer           
from preprocessing import antivectorize_df
from model import GSRNet, Discriminator
from train import train_gan, test_gan
from utils import track_memory, compute_degree_matrix_normalization_batch_numpy, get_parser, evaluate, plot_metrics_fold, LR_size, HR_size

### Reproducibility code
 - Our code is adjusted to run on gpu

In [None]:
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Check for CUDA (GPU support) and set device accordingly
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Using GPU.")
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # For multi-GPU setups
    # Additional settings for ensuring reproducibility on CUDA
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device)

# Import Data
 - The following code is used on the first run to read the dataset and antivectorize it in a form where it can be loaded for further runs. For first time run uncomment the following code 

In [7]:
A_LR_train1 = pd.read_csv("lr_train_split_1.csv")
A_HR_train1 = pd.read_csv("hr_train_split_1.csv")

A_LR_train2 = pd.read_csv("lr_train_split_2.csv")
A_HR_train2 = pd.read_csv("hr_train_split_2.csv")

A_LR_train3 = pd.read_csv("lr_train_split_3.csv")
A_HR_train3 = pd.read_csv("hr_train_split_3.csv")

np.save('A_LR_train_matrix1.npy', antivectorize_df(A_LR_train1, LR_size))
np.save('A_HR_train_matrix1.npy', antivectorize_df(A_HR_train1, HR_size))
np.save('A_LR_train_matrix2.npy', antivectorize_df(A_LR_train2, LR_size))
np.save('A_HR_train_matrix2.npy', antivectorize_df(A_HR_train2, HR_size))
np.save('A_LR_train_matrix3.npy', antivectorize_df(A_LR_train3, LR_size))
np.save('A_HR_train_matrix3.npy', antivectorize_df(A_HR_train3, HR_size))

In [None]:
A_LR_train_matrix1 = np.load('A_LR_train_matrix1.npy')
A_HR_train_matrix1 = np.load('A_HR_train_matrix1.npy')
A_LR_train_matrix2 = np.load('A_LR_train_matrix2.npy')
A_HR_train_matrix2 = np.load('A_HR_train_matrix2.npy')
A_LR_train_matrix3 = np.load('A_LR_train_matrix3.npy')
A_HR_train_matrix3 = np.load('A_HR_train_matrix3.npy')


print(A_LR_train_matrix1.shape)
print(A_HR_train_matrix1.shape)
print(A_LR_train_matrix2.shape)
print(A_HR_train_matrix2.shape)
print(A_LR_train_matrix3.shape)
print(A_HR_train_matrix3.shape)


# Parameters
 - The optimal parameters based on our carried out experiments.

In [None]:
parser = get_parser()
# Create an empty Namespace to hold the default arguments
args = parser.parse_args([])
pprint(args.__dict__)

In [None]:
graph_items_train = [A_LR_train_matrix1, A_LR_train_matrix2, A_LR_train_matrix3]
graph_items_test = [ A_HR_train_matrix1, A_HR_train_matrix2, A_HR_train_matrix3]
for matrix1, matrix2 in zip(graph_items_train, graph_items_test):
    matrix1 = compute_degree_matrix_normalization_batch_numpy(matrix1)
    matrix2 = compute_degree_matrix_normalization_batch_numpy(matrix2)


# K-Fold Cross Validation
 - This is the cross-validation loop. We use KFold cross-validation to split 
the data into training and validation sets.

In [None]:
# cv = KFold(n_splits=args.splits, random_state=random_seed, shuffle=True)

best_model_fold_list = []
data_fold_list = []
i = 1
# Store the fold results
fold_results = []

print(f"Starting Cross Validation.")
track_memory()
for index in range(len(graph_items_test)):
    print(f"----- Fold {i} -----")
    track_memory()

    train = graph_items_train[index]
    test = graph_items_test[index]

    # subjects_adj, test_adj, subjects_ground_truth, test_ground_truth = (
    #     X[train_index],
    #     X[test_index],
    #     Y[train_index],
    #     Y[test_index],
    # )
    # data_fold_list.append(
    #     (subjects_adj, test_adj, subjects_ground_truth, test_ground_truth)
    # )
    # Create a deep copy of list1
    new_train = copy.deepcopy(graph_items_train)
    new_train.pop(index)
    new_test = copy.deepcopy(graph_items_test)
    new_test.pop(index)
    new_train = np.concatenate(new_train, axis=0)
    new_test = np.concatenate(new_test, axis=0)

    # # Remove the item at the given index
    # new_list.pop(index_to_remove)

    netG = GSRNet(args).to(device)
    optimizerG = optim.Adam(netG.parameters(), lr=args.lr)

    netD = Discriminator(args).to(device)
    optimizerD = optim.Adam(netD.parameters(), lr=args.lr)

    # track_memory()
    # GAN model

    return_model = train_gan(
        netG,
        optimizerG,
        netD,
        optimizerD,
        new_train,
        new_test,
        args,
        test_adj=train,
        test_ground_truth=test,
    )

    test_mae, _ = test_gan(return_model, train, test, args, to_file=False)
    train_mae,_ = test_gan(return_model, new_train, new_test, args)
    pred_val_matrices = np.zeros((268, 268))
    with torch.no_grad():
        pred_train_matrices = []
        for j, test_adj in enumerate(train):
            return_model.eval()
            pred = return_model(torch.from_numpy(test_adj))[0]
            pred = torch.clamp(pred, min=0.0, max=1.0)
            pred = pred.cpu()
            pred_train_matrices.append(pred)

        print("Train")
        pred_train_matrices = np.array(pred_train_matrices)
        evaluate_all(test, pred_train_matrices)
    print(f"Train MAE: {train_mae:.6f}, Val MAE: {test_mae:.6f}")
    best_model_fold_list.append(return_model)
    # Evaluate the model on the test set and log the results
    # predicted_test_output_matrices = return_model(torch.Tensor(train))
    # metrics = evaluate_all(
    #     train, predictions
    # )

    track_memory()

    i += 1


In [None]:
CAL_GRAPH = False

res_list = []

for i in range(args.splits):
    _, test_adjs, _, gt_matrices = data_fold_list[i]
    model = best_model_fold_list[i]
    model.eval()
    pred_matrices = np.zeros(gt_matrices.shape)
    with torch.no_grad():
        for j, test_adj in enumerate(test_adjs):
            pred = model(torch.from_numpy(test_adj))[0]
            pred = torch.clamp(pred, min=0.0, max=1.0)
            pred = pred.cpu()
            pred_matrices[j] = pred
    res_list.append(evaluate(pred_matrices, gt_matrices, cal_graph=CAL_GRAPH))

pd.DataFrame(res_list)

In [None]:
plot_metrics_fold(res_list)

*Discussion:* This is the final scores of our 3-Fold cross validation based on the MAE, PCC and JSD metrics. The first plot is generated by setting the CAL_GRAPH Flag to false. On our complete final plot we can see that Mean Absolute Error (MAE) ranges from 0.1281 to 0.1378. The model predicts HR samples with a level of accuracy, but there is still room for improvement. Pearson Correlation Coefficients (PCC) are consistently above 0.63, indicating a moderately strong positive correlation between the predicted HR value and the ground truth. This shows that the model successfully captures the general trend of the data. Jensen-Shannon Distance (JSD) remains around 0.28, showing that the predicted HR value partially diverged from the ground truth. Lastly, the average MAE with 3 different centrality types is very low, signifying that the model’s prediction captures ground truth’s network structure very well.


In [None]:
for i in range(args.splits):
    _, test_adjs, _, gt_matrices = data_fold_list[i]
    model = best_model_fold_list[i]
    model.eval()

    output_pred_list = []
    with torch.no_grad():
        for test_adj in tqdm(test_adjs):
            output_pred = model(torch.from_numpy(test_adj))[0].cpu()
            output_pred = torch.clamp(output_pred, min=0.0, max=1.0)
            output_pred = MatrixVectorizer.vectorize(output_pred).tolist()
            output_pred_list.append(output_pred)

    output_pred_stack = np.stack(output_pred_list, axis=0)
    output_pred_1d = output_pred_stack.flatten()

    df = pd.DataFrame(
        {
            "ID": [i + 1 for i in range(len(output_pred_1d))],
            "Predicted": output_pred_1d.tolist(),
        }
    )

    df.to_csv("predictions_fold_" + str(i + 1) + ".csv", index=False)

# Final Model
 - Below is the final train split performed on the parameter combination that performed the best on our KFold cross validation experiment.

In [None]:
A_HR_train = pd.read_csv("../data/hr_train.csv")

pca = PCA(n_components=0.99, whiten=False)
A_HR_train_pca = pca.fit_transform(A_HR_train)
print(f"HR Train PCA shape: {A_HR_train_pca.shape}")

gm = GaussianMixture(n_components=5, random_state=random_seed)
A_HR_train_label = gm.fit_predict(A_HR_train_pca)
unique, counts = np.unique(A_HR_train_label, return_counts=True)
print(np.asarray((unique, counts)).T)

X = np.load("A_LR_train_matrix.npy")
y = np.load("A_HR_train_matrix.npy")

X = compute_degree_matrix_normalization_batch_numpy(X)

n_sample = X.shape[0]
X_train, X_val, y_train, y_val = train_test_split(
    X.reshape(n_sample, -1),
    y.reshape(n_sample, -1),
    test_size=0.10,
    random_state=random_seed,
    stratify=A_HR_train_label,
)

X_train = X_train.reshape(-1, LR_size, LR_size)
X_val = X_val.reshape(-1, LR_size, LR_size)
y_train = y_train.reshape(-1, HR_size, HR_size)
y_val = y_val.reshape(-1, HR_size, HR_size)

print("Train size:", len(X_train))
print("Val size:", len(X_val))

netG = GSRNet(args).to(device)
optimizerG = optim.Adam(netG.parameters(), lr=args.lr)

netD = Discriminator(args).to(device)
optimizerD = optim.Adam(netD.parameters(), lr=args.lr)

track_memory()
# GAN model
final_model = train_gan(
    netG,
    optimizerG,
    netD,
    optimizerD,
    X_train,
    y_train,
    args,
    test_adj=X_val,
    test_ground_truth=y_val,
)
track_memory()

In [None]:
pprint(args.__dict__)

In [None]:
from evaluation_metric import evaluate_all
# final_model = model
# final_model.eval()
pred_val_matrices = np.zeros((268, 268))
with torch.no_grad():
    for model in fold_results:
        pred_train_matrices = []
        for j, test_adj in enumerate(train):
            model.eval()
            pred = model(torch.from_numpy(test_adj))[0]
            pred = torch.clamp(pred, min=0.0, max=1.0)
            pred = pred.cpu()
            pred_train_matrices.append(pred)

        print("Train")
        pred_train_matrices = np.array(pred_train_matrices)
        evaluate_all(test, pred_train_matrices)

    # for j, test_adj in enumerate(new_train):
    #     pred = final_model(torch.from_numpy(test_adj))[0]
    #     pred = torch.clamp(pred, min=0.0, max=1.0)
    #     pred = pred.cpu()
    #     pred_val_matrices[j] = pred

    # print("Val")
    # evaluate_all(new_test, pred_val_matrices)

In [None]:
output_pred_list = []
final_model = return_model
final_model.eval()
with torch.no_grad():
    for i in tqdm(range(A_LR_test_matrix.shape[0])):
        output_pred = final_model(torch.Tensor(A_LR_test_matrix[i]))[0]
        output_pred = torch.clamp(output_pred, min=0.0, max=1.0)
        output_pred = output_pred.cpu()
        output_pred = MatrixVectorizer.vectorize(output_pred).tolist()
        output_pred_list.append(output_pred)

In [None]:
output_pred_stack = np.stack(output_pred_list, axis=0)
output_pred_1d = output_pred_stack.flatten()
assert output_pred_1d.shape == (4007136,)

In [None]:
df = pd.DataFrame(
    {
        "ID": [i + 1 for i in range(len(output_pred_1d))],
        "Predicted": output_pred_1d.tolist(),
    }
)

df

*Note:* These are the predicted outputs of our best model that were submitted to the Kaggle competition.

In [None]:
df.to_csv("final_model.csv", index=False)