In [1]:
cd ..

/home/adam/phd/recurrent-graph-autoencoder


In [16]:
import gc
import torch
import random
import numpy as np
from tqdm.auto import tqdm

from rga.data.diag_repr_graph_data_module import DiagonalRepresentationGraphDataModule
from rga.data.graph_loaders import RealGraphLoader, SyntheticGraphLoader
from rga.experiments.decorators import add_graphloader_args
from rga.models.autoencoder_components import GraphEncoder
from rga.models.edge_encoders import MemoryEdgeEncoder
from rga.util.load_model import *
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
# from sklearn.neural_network import MLPClassifier

In [4]:
class RealSaver(DiagonalRepresentationGraphDataModule):
    graphloader_class = RealGraphLoader

In [5]:
train_val_test_split = [0.7, 0.15, 0.15]
train_val_test_permutation_split = [1, 0, 0.0]
num_dataset_graph_permutations = 10

In [6]:
dataset = RealSaver(
    pickled_dataset_path='./datasets/imdb_multi_labels.pkl',
    use_labels=True,
    bfs=True,
    deduplicate_train = False,
    deduplicate_val_test = False,
    batch_size=500000,
    batch_size_val=500000,
    batch_size_test=500000,
    workers=0,
    block_size=8,
    subgraph_scheduler_name='none',
    subgraph_scheduler_params={}
)

Dataset successfully loaded!
File path: ./datasets/imdb_multi_labels.pkl
Statistic of set:  Train dataset
             Dataset size : 1050
                   Labels : True
           Min node count : 7
       Average node count : 12.89
           Max node count : 78
           Min edge count : 12.0
       Average edge count : 64.18
           Max edge count : 982.0
     Min filling fraction : 0.13
 Average filling fraction : 0.78
     Max filling fraction : 1.0
          Label "1" count : 342
          Label "2" count : 340
          Label "3" count : 368
----------------------------------------------------------------
Statistic of set:  Validation dataset 0
             Dataset size : 225
                   Labels : True
           Min node count : 7
       Average node count : 13.6
           Max node count : 89
           Min edge count : 12.0
       Average edge count : 73.44
           Max edge count : 1467.0
     Min filling fraction : 0.14
 Average filling fraction : 0.74
     M

preparing dataset train for autoencoder:   0%|          | 0/1050 [00:00<?, ?it/s]

preparing dataset val 0 for autoencoder:   0%|          | 0/225 [00:00<?, ?it/s]

preparing dataset test 0 for autoencoder:   0%|          | 0/225 [00:00<?, ?it/s]

In [9]:
checkpoint_path = 'tb_logs/RecursiveGraphAutoencoder/version_8/checkpoints/epoch=77-step=1325-v1.ckpt'
hparams_path = 'tb_logs/RecursiveGraphAutoencoder/version_8/hparams.yaml'
hparams = load_hparams(hparams_path)
encoder = GraphEncoder(edge_encoder_class = MemoryEdgeEncoder, **hparams)


checkpoint = torch.load(checkpoint_path)
encoder_checkpoint = {
    k.replace("encoder.edge_encoder.", "edge_encoder."): v
    for (k, v) in checkpoint["state_dict"].items()
    if "encoder" in k
}
encoder.load_state_dict(encoder_checkpoint)

<All keys matched successfully>

In [10]:
from rga.models.classifier_components import MLPClassifier
classifier_checkpoint_model = MLPClassifier(**hparams)

classifier_checkpoint = {
    k.replace("classifier.nn.", "nn."): v
    for (k, v) in checkpoint["state_dict"].items()
    if "class" in k
}

classifier_checkpoint_model.load_state_dict(classifier_checkpoint)

<All keys matched successfully>

In [11]:
train_batch = next(iter(dataset.train_dataloader()))
train_batch_labels = train_batch[3]
print(len(train_batch_labels))

val_batch = next(iter(dataset.val_dataloader()[0]))
val_batch_labels = val_batch[3]
print(len(val_batch_labels))

test_batch = next(iter(dataset.test_dataloader()[0]))
test_batch_labels = test_batch[3]
print(len(test_batch_labels))


1050
225
225


In [12]:
train_batch_X = encoder(train_batch).detach().numpy()
val_batch_X = encoder(val_batch).detach().numpy()
test_batch_X = encoder(test_batch).detach().numpy()

In [21]:
# classifier_checkpoint_model(torch.tensor(val_batch_X))

In [25]:
pred_val = torch.argmax(classifier_checkpoint_model(torch.tensor(val_batch_X)), dim=1)
print(classification_report(val_batch_labels-1, pred_val))

              precision    recall  f1-score   support

           0       0.49      0.27      0.35        85
           1       0.44      0.61      0.51        77
           2       0.48      0.54      0.51        63

    accuracy                           0.46       225
   macro avg       0.47      0.47      0.46       225
weighted avg       0.47      0.46      0.45       225



  input = module(input)


In [27]:
from pytorch_lightning.core.lightning import LightningModule
from torch import nn
from rga.models.utils.layers import sequential_from_layer_sizes
from torch.optim import Adam
from torch.utils.data import DataLoader
from pytorch_lightning import Trainer

class EmbeddingAE(LightningModule):
    def __init__(self, embedding_size, layers):
        super().__init__()
        self.compressing_layer = torch.argmin(torch.tensor(layers))
        self.nn = sequential_from_layer_sizes(embedding_size, embedding_size, layers)
        self.loss = nn.MSELoss()

    def forward(self, x):
        return self.nn(x)

    def get_compressed_embeddings(self, x):
        return self.nn[:self.compressing_layer*2+1](x)

    def training_step(self, batch, batch_idx):
        loss = self.loss(batch, self(batch))
        self.log("loss/train", loss, on_step=True, on_epoch=False, prog_bar=True, logger=False)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.loss(batch, self(batch))
        self.log("loss/val", loss, on_step=False, on_epoch=True, prog_bar=True, logger=False)
        return loss
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=1e-3)

In [29]:
# import pytorch_lightning as pl
# from torch import nn
# import torch



# class VAE(pl.LightningModule):
#     def __init__(self):
#         super().__init__()

#         # self.save_hyperparameters()

#         # encoder, decoder
#         self.encoder = sequential_from_layer_sizes(256, 32, [128, 64])
#         self.decoder = sequential_from_layer_sizes(16, 256, [32, 64, 128])
#         self.loss = nn.MSELoss()
#         # distribution parameters
#         self.fc_mu = nn.Linear(32, 16)
#         self.fc_var = nn.Linear(32, 16)

#         # for the gaussian likelihood
#         self.log_scale = nn.Parameter(torch.Tensor([0.0]))

#     def configure_optimizers(self):
#         return torch.optim.Adam(self.parameters(), lr=1e-4)

#     def gaussian_likelihood(self, x_hat, logscale, x):
#         scale = torch.exp(logscale)
#         mean = x_hat
#         dist = torch.distributions.Normal(mean, scale)

#         # measure prob of seeing image under p(x|z)
#         log_pxz = dist.log_prob(x)
#         return log_pxz.sum(dim=(1, 2, 3))

#     def kl_divergence(self, z, mu, std):
#         # --------------------------
#         # Monte carlo KL divergence
#         # --------------------------
#         # 1. define the first two probabilities (in this case Normal for both)
#         p = torch.distributions.Normal(torch.zeros_like(mu), torch.ones_like(std))
#         q = torch.distributions.Normal(mu, std)

#         # 2. get the probabilities from the equation
#         log_qzx = q.log_prob(z)
#         log_pz = p.log_prob(z)

#         # kl
#         kl = (log_qzx - log_pz)
#         kl = kl.sum(-1)
#         return kl

#     def training_step(self, batch, batch_idx):
#         x = batch

#         # encode x to get the mu and variance parameters
#         x_encoded = self.encoder(x)
#         mu, log_var = self.fc_mu(x_encoded), self.fc_var(x_encoded)

#         # sample z from q
#         std = torch.exp(log_var / 2)
#         q = torch.distributions.Normal(mu, std)
#         z = q.rsample()

#         # decoded
#         x_hat = self.decoder(z)

#         # reconstruction loss
#         recon_loss = self.loss(x_hat, x)#self.gaussian_likelihood(x_hat, self.log_scale, x)

#         # kl
#         kl = self.kl_divergence(z, mu, std)

#         # elbo
#         elbo = (kl - recon_loss)
#         elbo = elbo.mean()

#         self.log_dict({
#             'elbo': elbo,
#             'kl': kl.mean(),
#             'recon_loss': recon_loss.mean(),
#             'reconstruction': recon_loss.mean(),
#             'kl': kl.mean(),
#         })

#         return elbo

#     def get_compressed_embeddings(self, batch):
#         x_encoded = self.encoder(batch)
#         return self.fc_mu(x_encoded)




# ae = VAE()#EmbeddingAE(256, [128, 64, 32, 16, 32, 64, 128])
# from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# dataloaders_ae = {
#     'train':DataLoader(train_batch_X, batch_size=32, num_workers=0),
#     'val':DataLoader(val_batch_X, batch_size=32, num_workers=0),
#     'test':DataLoader(test_batch_X, batch_size=32, num_workers=0)
# }


# trainer = Trainer(max_epochs=50, log_every_n_steps=5) #, callbacks=[EarlyStopping(monitor="loss/val")]
# trainer.fit(ae, train_dataloaders=dataloaders_ae.get('train'), val_dataloaders=dataloaders_ae.get('val'))

In [30]:
# train_batch_X_compressed = ae.get_compressed_embeddings(torch.tensor(train_batch_X)).detach().numpy()
# val_batch_X_compressed = ae.get_compressed_embeddings(torch.tensor(val_batch_X)).detach().numpy()
# test_batch_X_compressed = ae.get_compressed_embeddings(torch.tensor(test_batch_X)).detach().numpy()

In [31]:
# train_batch_X_compressed.shape

In [32]:
from pytorch_lightning.core.lightning import LightningModule
from torch import nn
from rga.models.utils.layers import sequential_from_layer_sizes
from torch.optim import Adam
from torch.utils.data import DataLoader
from pytorch_lightning import Trainer
from rga.models.base import BaseModel
from rga.models.utils.getters import * 

class EmbeddingClassifier(BaseModel):
    def __init__(
        self,
        embedding_size: int,
        class_count: int,
        classifier_hidden_layer_sizes,
        classifier_activation_function: str,
        classifier_dropout: float,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embedding_size = embedding_size
        input_size = embedding_size

        activation_f = get_activation_function(classifier_activation_function)
        self.class_count = class_count
        output_function = nn.Sigmoid if class_count == 2 else nn.Softmax

        self.gru_depth = 8

        self.gru = torch.nn.GRU(
            input_size = 0,
            hidden_size = embedding_size,
            num_layers = 1,
            batch_first = True
        )

        self.nn = sequential_from_layer_sizes(
            input_size,
            class_count if class_count != 2 else 1,
            classifier_hidden_layer_sizes,
            activation_f,
            output_function=output_function,
            dropout=classifier_dropout,
        )

    def forward(self, graphs: Tensor) -> Tensor:
        gru_in = torch.empty(size=[graphs.shape[0], self.gru_depth, 0])
        gru_h_0 = graphs[None, :, :]
        gru_out, gru_hidden = self.gru(gru_in, gru_h_0)
        graphs = gru_hidden[0]

        return self.nn(graphs)

    def step(self, batch, metrics: List = []) -> Tensor:
        y_pred = self(batch[:, :-1])
        labels = (batch[:, -1] - 1).long()

        if self.class_count == 2:
            loss = self.loss_function(y_pred[:, 0], labels.float())
            y_pred_labels = torch.round(y_pred[:, 0]).int()
        else:
            loss = self.loss_function(y_pred, labels)
            y_pred_labels = torch.argmax(y_pred, dim=1)

        for metric in metrics:
            metric(y_pred_labels, labels)

        return loss


model = EmbeddingClassifier(
    embedding_size = 104, 
    class_count = 3, 
    classifier_hidden_layer_sizes = [128, 64], 
    classifier_activation_function = 'ReLU', 
    classifier_dropout = 0.1, 
    loss_function='CrossEntropy',
    metrics=['Accuracy'],
    metric_update_interval=1,
    lr = 0.0001
)

data_loader_embedding_classifier = DataLoader(torch.cat([torch.tensor(train_batch_X), train_batch_labels[:, None]], axis = 1), batch_size=32)
data_loader_val_embedding_classifier = DataLoader(torch.cat([torch.tensor(val_batch_X), val_batch_labels[:, None]], axis = 1), batch_size=32)


trainer = Trainer(max_epochs = 100, check_val_every_n_epoch=5,)
trainer.fit(model, train_dataloaders=data_loader_embedding_classifier, val_dataloaders=data_loader_val_embedding_classifier)
# trainer.test(model, dataloaders=data_loader_val_embedding_classifier)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name          | Type             | Params
---------------------------------------------------
0 | loss_function | CrossEntropyLoss | 0     
1 | metrics_train | ModuleList       | 0     
2 | metrics_val   | ModuleList       | 0     
3 | metrics_test  | ModuleList       | 0     
4 | gru           | GRU              | 33.1 K
5 | nn            | Sequential       | 21.9 K
---------------------------------------------------
55.0 K    Trainable params
0         Non-trainable params
55.0 K    Total params
0.220     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
  input = module(input)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [34]:
pred_val = torch.argmax(model(torch.tensor(val_batch_X)), dim=1)
print(classification_report(val_batch_labels-1, pred_val))

              precision    recall  f1-score   support

           0       0.49      0.33      0.39        85
           1       0.45      0.58      0.51        77
           2       0.51      0.54      0.52        63

    accuracy                           0.48       225
   macro avg       0.48      0.48      0.47       225
weighted avg       0.48      0.48      0.47       225



  input = module(input)


In [35]:
# sklearn_model = RandomForestClassifier(n_estimators=500, min_samples_leaf=5, min_samples_split=4)
sklearn_model = GradientBoostingClassifier(min_samples_leaf=5, min_samples_split=4)
# sklearn_model = SVC()
# sklearn_model = MLPClassifier(hidden_layer_sizes=[16, 16, 16, 16, 16, 16], random_state=1,max_iter=500)
sklearn_model.fit(train_batch_X, train_batch_labels)
train_batch_labels_pred = sklearn_model.predict(train_batch_X)
print(classification_report(train_batch_labels, train_batch_labels_pred))

val_batch_labels_pred = sklearn_model.predict(val_batch_X)
print(classification_report(val_batch_labels, val_batch_labels_pred))

              precision    recall  f1-score   support

           1       0.90      0.48      0.62       342
           2       0.61      0.78      0.69       340
           3       0.62      0.74      0.67       368

    accuracy                           0.67      1050
   macro avg       0.71      0.66      0.66      1050
weighted avg       0.71      0.67      0.66      1050

              precision    recall  f1-score   support

           1       0.51      0.28      0.36        85
           2       0.43      0.60      0.50        77
           3       0.51      0.59      0.55        63

    accuracy                           0.48       225
   macro avg       0.49      0.49      0.47       225
weighted avg       0.49      0.48      0.46       225

