In [None]:
cd ..

In [None]:
import os
# Restrict CUDA device visibility for this process to device '3'.
# Set in its own cell, ahead of the cell that imports torch.
os.environ.update({"CUDA_VISIBLE_DEVICES": '3'})

In [None]:
# import gc
import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import pickle
from rga.data.diag_repr_graph_data_module import DiagonalRepresentationGraphDataModule
from rga.data.graph_loaders import RealGraphLoader, SyntheticGraphLoader
# from rga.experiments.decorators import add_graphloader_args
from rga.models.autoencoder_components import GraphEncoder
from rga.models.edge_encoders import MemoryEdgeEncoder
from rga.util.load_model import *
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import *
# from sklearn.neural_network import MLPClassifier

In [None]:
# Root directories; both end with '/'.
dataset_folder = "/usr/local/datasets/"
checkpoints_folder = "/home/jgrzechocinski/recurrent-graph-autoencoder/best_checkpoints/"

# Names of the benchmark datasets used as keys of the result tables.
datasets = ["COLLAB", "IMDB-BINARY", "IMDB-MULTI", "REDDIT-BINARY"]

In [None]:
# ls '/home/jgrzechocinski/recurrent-graph-autoencoder/best_checkpoints/COLLAB'

In [None]:
# dataset_index = 1
# pickle_index = 0

In [None]:
class RealSaver(DiagonalRepresentationGraphDataModule):
    """`DiagonalRepresentationGraphDataModule` whose graph loader class is `RealGraphLoader`."""
    # Loader class picked up by the base data module
    # (presumably used to read the pickled dataset -- confirm in `rga.data`).
    graphloader_class = RealGraphLoader
    
def prepare_model(model_path, hparams):
    """Build a `GraphEncoder` and load its weights from a training checkpoint.

    Args:
        model_path: Path of the checkpoint file; the loaded object must have a
            ``"state_dict"`` entry.
        hparams: Mapping of hyper-parameters, forwarded as keyword arguments to
            ``GraphEncoder``.

    Returns:
        The encoder with the checkpoint weights loaded (mapped to CPU) and
        switched to evaluation mode.
    """
    encoder = GraphEncoder(edge_encoder_class=MemoryEdgeEncoder, **hparams)

    # NOTE(security): torch.load unpickles the file -- only load checkpoints
    # that come from a trusted source.
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))

    # Keep the entries whose name mentions "encoder" and rename the
    # "encoder.edge_encoder." prefix to "edge_encoder." so that the keys match
    # the stand-alone encoder's parameter names.
    encoder_state = {
        key.replace("encoder.edge_encoder.", "edge_encoder."): value
        for key, value in checkpoint["state_dict"].items()
        if "encoder" in key
    }
    encoder.load_state_dict(encoder_state)

    # The encoder is only used to extract embeddings: leave training mode so
    # that train-time layers (e.g. dropout, if the encoder has any) do not add
    # noise to the embeddings.
    encoder.eval()
    return encoder

def prepare_dataset(dataset_path, hparams):
    """Create the `RealSaver` data module for one pickled dataset.

    Args:
        dataset_path: Passed to the data module as ``pickled_dataset_path``.
        hparams: Hyper-parameter mapping; only ``hparams['block_size']`` is read.

    Returns:
        A ``RealSaver`` with labels enabled, deduplication disabled, batches of
        32 for every split, no worker processes and the ``'none'`` subgraph
        scheduler.
    """
    module_kwargs = {
        'pickled_dataset_path': dataset_path,
        'use_labels': True,
        'bfs': True,
        'deduplicate_train': False,
        'deduplicate_val_test': False,
        'batch_size': 32,
        'batch_size_val': 32,
        'batch_size_test': 32,
        'workers': 0,
        'block_size': hparams['block_size'],
        'subgraph_scheduler_name': 'none',
        'subgraph_scheduler_params': {},
    }
    return RealSaver(**module_kwargs)

def get_embeddings(model, dataloader, N = 1):
    """Run `model` over every batch of `dataloader` and collect the outputs.

    Args:
        model: Callable applied to each batch; must return a torch tensor.
        dataloader: Iterable of indexable batches. Element 3 of each batch is
            collected as ``Y`` and element 2 as ``sizes`` (presumably the graph
            labels and graph sizes -- confirm against the data module).
        N: Unused; kept so that existing calls passing ``N=...`` keep working.

    Returns:
        Tuple ``(X, Y, sizes)`` of numpy arrays, each concatenated over all
        batches.

    Raises:
        ValueError: Raised by ``np.concatenate`` when the dataloader yields no
            batches.
    """
    embeddings, labels, sizes = [], [], []

    # Inference only: do not record an autograd graph for the forward passes.
    with torch.no_grad():
        # Iterating the dataloader directly lets tqdm show the total when the
        # dataloader defines a length.
        for batch in tqdm(dataloader):
            # .cpu() keeps the conversion valid if the output is on another device.
            embeddings.append(model(batch).detach().cpu().numpy())
            labels.append(batch[3])
            sizes.append(batch[2])

    return np.concatenate(embeddings), np.concatenate(labels), np.concatenate(sizes)

In [None]:
def process_model(hparams_path, model_path, dataset_path, PCA_dim = None, random_state = 0):
    """Embed one dataset with a trained encoder and score baseline classifiers on it.

    Args:
        hparams_path: Path handed to ``load_hparams``.
        model_path: Checkpoint path handed to ``prepare_model``.
        dataset_path: Pickled dataset path handed to ``prepare_dataset``.
        PCA_dim: If not ``None``, number of PCA components the embeddings are
            projected to. The PCA is fitted on the train embeddings only.
        random_state: Seed for the PCA and the gradient-boosting classifier so
            that repeated runs give the same numbers.

    Returns:
        ``{'train': {...}, 'val': {...}, 'test': {}}`` mapping classifier name
        to accuracy. ``'test'`` is always empty: the test split is not scored.
    """
    hparams = load_hparams(hparams_path)

    model = prepare_model(model_path, hparams)
    dataset = prepare_dataset(dataset_path, hparams)

    # The sizes are returned by get_embeddings but not needed here.
    train_X, train_Y, _train_sizes = get_embeddings(model, dataset.train_dataloader(), N = 128)
    # val_dataloader() is indexed, i.e. it returns a sequence; its first loader is used.
    val_X, val_Y, _val_sizes = get_embeddings(model, dataset.val_dataloader()[0], N = 128)

    if PCA_dim is not None:
        pca = PCA(n_components=PCA_dim, random_state=random_state)
        train_X = pca.fit(train_X).transform(train_X)
        val_X = pca.transform(val_X)

    return _score_classifiers(train_X, train_Y, val_X, val_Y, random_state)


def _score_classifiers(train_X, train_Y, val_X, val_Y, random_state=0):
    """Fit every baseline classifier on the train split; return train/val accuracies."""
    sklearn_models = {
        'NB': GaussianNB(),
        'SVM': SVC(),
        'Logistic regression': LogisticRegression(),
        # Despite the key, this is scikit-learn's GradientBoostingClassifier.
        'xgboost': GradientBoostingClassifier(
            min_samples_leaf=10, verbose=False, random_state=random_state
        ),
    }

    stats = {'train': {}, 'val': {}, 'test': {}}
    for name, sklearn_model in sklearn_models.items():
        sklearn_model.fit(train_X, train_Y)
        stats['train'][name] = accuracy_score(train_Y, sklearn_model.predict(train_X))
        stats['val'][name] = accuracy_score(val_Y, sklearn_model.predict(val_X))
    return stats

In [None]:
# Result tables: split -> dataset name -> dict that the per-dataset cells fill
# with {(run index, classifier name): accuracy}.
classification_results = {
    split: {name: {} for name in datasets}
    for split in ('train', 'val', 'test')
}

In [None]:
dataset_name = 'IMDB-BINARY'
# The loop variable is deliberately not called `pickle`: that name would shadow
# the `pickle` module imported at the top of the notebook.
for run_idx in tqdm([0, 1, 2, 3, 4]):
    # Each run has its own hparams file, checkpoint and pickled dataset.
    hparams_path = f"{checkpoints_folder}{dataset_name}/{run_idx}_hparams.yaml"
    model_path = f"{checkpoints_folder}{dataset_name}/{run_idx}.ckpt"
    dataset_path = f"{dataset_folder}{dataset_name}/{run_idx}.pkl"

    stats = process_model(hparams_path, model_path, dataset_path)

    # Store every accuracy under the key (run index, classifier name).
    for split in ('train', 'val'):
        classification_results[split][dataset_name].update(
            {(run_idx, clf_name): accuracy for clf_name, accuracy in stats[split].items()}
        )
    


In [None]:
# Display the accuracies collected so far (after the IMDB-BINARY runs).
classification_results

In [None]:
dataset_name = 'IMDB-MULTI'
# The loop variable is deliberately not called `pickle`: that name would shadow
# the `pickle` module imported at the top of the notebook.
for run_idx in tqdm([0, 1, 2, 3, 4]):
    # Each run has its own hparams file, checkpoint and pickled dataset.
    hparams_path = f"{checkpoints_folder}{dataset_name}/{run_idx}_hparams.yaml"
    model_path = f"{checkpoints_folder}{dataset_name}/{run_idx}.ckpt"
    dataset_path = f"{dataset_folder}{dataset_name}/{run_idx}.pkl"

    stats = process_model(hparams_path, model_path, dataset_path)

    # Store every accuracy under the key (run index, classifier name).
    for split in ('train', 'val'):
        classification_results[split][dataset_name].update(
            {(run_idx, clf_name): accuracy for clf_name, accuracy in stats[split].items()}
        )
    
    

In [None]:
# Display the accuracies collected so far (after the IMDB-MULTI runs).
classification_results

In [None]:
dataset_name = 'REDDIT-BINARY'
# The loop variable is deliberately not called `pickle`: that name would shadow
# the `pickle` module imported at the top of the notebook.
for run_idx in tqdm([0, 1, 2, 3, 4]):
    # Each run has its own hparams file, checkpoint and pickled dataset.
    hparams_path = f"{checkpoints_folder}{dataset_name}/{run_idx}_hparams.yaml"
    model_path = f"{checkpoints_folder}{dataset_name}/{run_idx}.ckpt"
    dataset_path = f"{dataset_folder}{dataset_name}/{run_idx}.pkl"

    # Embeddings of this dataset are reduced to 256 PCA components before classification.
    stats = process_model(hparams_path, model_path, dataset_path, PCA_dim = 256)

    # Store every accuracy under the key (run index, classifier name).
    for split in ('train', 'val'):
        classification_results[split][dataset_name].update(
            {(run_idx, clf_name): accuracy for clf_name, accuracy in stats[split].items()}
        )
    
    

In [None]:
# Display the accuracies collected so far (after the REDDIT-BINARY runs).
classification_results

In [None]:
dataset_name = 'COLLAB'
# The loop variable is deliberately not called `pickle`: that name would shadow
# the `pickle` module imported at the top of the notebook.
for run_idx in tqdm([0, 1, 2, 3, 4]):
    # Each run has its own hparams file, checkpoint and pickled dataset.
    hparams_path = f"{checkpoints_folder}{dataset_name}/{run_idx}_hparams.yaml"
    model_path = f"{checkpoints_folder}{dataset_name}/{run_idx}.ckpt"
    dataset_path = f"{dataset_folder}{dataset_name}/{run_idx}.pkl"

    # Embeddings of this dataset are reduced to 256 PCA components before classification.
    stats = process_model(hparams_path, model_path, dataset_path, PCA_dim = 256)

    # Store every accuracy under the key (run index, classifier name).
    for split in ('train', 'val'):
        classification_results[split][dataset_name].update(
            {(run_idx, clf_name): accuracy for clf_name, accuracy in stats[split].items()}
        )
    

In [None]:
# Display the accuracies collected so far (after the COLLAB runs).
classification_results

In [None]:
# dataset_name = 'REDDIT-MULTI-5K'
# classification_results[dataset_name] = {}
# for pickle in tqdm([0, 1, 2, 3, 4]):
#     hparams_path = checkpoints_folder + dataset_name + '/' + str(pickle) + '_hparams.yaml'
#     model_path = checkpoints_folder + dataset_name + '/' + str(pickle) + '.ckpt'
#     dataset_path = dataset_folder + dataset_name + '/' + str(pickle) + '.pkl'

#     selected_stats = {(pickle, k):v for (k,v) in stats['train'].items()}
#     classification_results['train'][dataset_name].update(selected_stats)
    
#     selected_stats = {(pickle, k):v for (k,v) in stats['val'].items()}
#     classification_results['val'][dataset_name].update(selected_stats)
    

In [None]:
# dataset_name = 'REDDIT-MULTI-12K'
# classification_results[dataset_name] = {}
# for pickle in tqdm([0, 1, 2, 3, 4]):
#     hparams_path = checkpoints_folder + dataset_name + '/' + str(pickle) + '_hparams.yaml'
#     model_path = checkpoints_folder + dataset_name + '/' + str(pickle) + '.ckpt'
#     dataset_path = dataset_folder + dataset_name + '/' + str(pickle) + '.pkl'

#     selected_stats = {(pickle, k):v for (k,v) in stats['train'].items()}
#     classification_results['train'][dataset_name].update(selected_stats)
    
#     selected_stats = {(pickle, k):v for (k,v) in stats['val'].items()}
#     classification_results['val'][dataset_name].update(selected_stats)
    

In [None]:
# Final view of all collected accuracies.
classification_results