In [2]:
cd ..

/home/adam/phd/recurrent-graph-autoencoder


In [4]:
import gc
import torch
import random
import numpy as np
from tqdm.auto import tqdm

from rga.data.diag_repr_graph_data_module import DiagonalRepresentationGraphDataModule
from rga.data.graph_loaders import RealGraphLoader, SyntheticGraphLoader
from rga.experiments.decorators import add_graphloader_args
from rga.models.autoencoder_components import GraphEncoder
from rga.models.edge_encoders import MemoryEdgeEncoder
from rga.util.load_model import *

In [5]:
class RealSaver(DiagonalRepresentationGraphDataModule):
    graphloader_class = RealGraphLoader

In [6]:
train_val_test_split = [0.7, 0.15, 0.15]
train_val_test_permutation_split = [1, 0, 0.0]
num_dataset_graph_permutations = 10

In [9]:
dataset = RealSaver(
    datasets_dir='/home/adam/phd/recurrent-graph-autoencoder/datasets',
    dataset_name='IMDB-MULTI',
    use_labels=True,
    max_graph_size=None,
    num_dataset_graph_permutations=1, 
    train_val_test_split=train_val_test_split, 
    train_val_test_permutation_split=train_val_test_permutation_split,
    # save_dataset_to_pickle=to_save_path+'/'+dataset_name+'/'+str(i)+'.pkl',
    bfs=True,
    deduplicate_train = False,
    deduplicate_val_test = False,
    batch_size=500000,
    batch_size_val=500000,
    batch_size_test=500000,
    workers=0,
    block_size=6,
    subgraph_scheduler_name='none',
    subgraph_scheduler_params={}
)

reading edges: 0it [00:00, ?it/s]

Statistic of set:  Full original dataset
             Dataset size : 1500
                   Labels : True
           Min node count : 7
       Average node count : 13.0
           Max node count : 89
           Min edge count : 12.0
       Average edge count : 65.94
           Max edge count : 1467.0
     Min filling fraction : 0.13
 Average filling fraction : 0.77
     Max filling fraction : 1.0
          Label "1" count : 500
          Label "2" count : 500
          Label "3" count : 500
----------------------------------------------------------------
Statistic of set:  Train dataset
             Dataset size : 1050
                   Labels : True
           Min node count : 7
       Average node count : 13.27
           Max node count : 78
           Min edge count : 12.0
       Average edge count : 68.94
           Max edge count : 982.0
     Min filling fraction : 0.13
 Average filling fraction : 0.77
     Max filling fraction : 1.0
          Label "1" count : 352
          Lab

preparing dataset train for autoencoder:   0%|          | 0/1050 [00:00<?, ?it/s]

preparing dataset val 0 for autoencoder:   0%|          | 0/225 [00:00<?, ?it/s]

preparing dataset test 0 for autoencoder:   0%|          | 0/225 [00:00<?, ?it/s]

In [10]:
dataset

<__main__.RealSaver at 0x7f26215a9760>

In [11]:
checkpoint_path = '/home/adam/phd/recurrent-graph-autoencoder/tb_logs/RecurrentGraphAutoencoder/IMDB-MULTI/version_1/checkpoints/epoch=224-step=9674-v1.ckpt'
hparams_path = '/home/adam/phd/recurrent-graph-autoencoder/tb_logs/RecurrentGraphAutoencoder/IMDB-MULTI/version_1/hparams.yaml'
hparams = load_hparams(hparams_path)
encoder = GraphEncoder(edge_encoder_class = MemoryEdgeEncoder, **hparams)
# (
#     edge_encoder_class = MemoryEdgeEncoder,
#     embedding_size = 104,
#     edge_size = 1,
#     block_size= 6,
#     loss_function = 'BCEWithLogits',
#     loss_weight = None,
#     learning_rate = '0.0001',
#     optimizer = 'AdamWAMSGrad',
#     lr_scheduler_name = 'NoSched',
#     lr_scheduler_params = {},
#     lr_scheduler_metric = 'loss/train_avg',
#     metrics = [],
#     encoder_hidden_layer_sizes=[1024, 768],
#     encoder_activation_function='ELU'
# )

checkpoint = torch.load(checkpoint_path)
encoder_checkpoint = {
    k.replace("encoder.edge_encoder.", "edge_encoder."): v
    for (k, v) in checkpoint["state_dict"].items()
    if "encoder" in k
}
encoder.load_state_dict(encoder_checkpoint)

<All keys matched successfully>

In [12]:
train_batch = next(iter(dataset.train_dataloader()))
train_batch_labels = train_batch[3]
print(len(train_batch_labels))

val_batch = next(iter(dataset.val_dataloader()[0]))
val_batch_labels = val_batch[3]
print(len(val_batch_labels))

test_batch = next(iter(dataset.test_dataloader()[0]))
test_batch_labels = test_batch[3]
print(len(test_batch_labels))


1050
225
225


In [13]:
train_batch_X = encoder(train_batch).detach().numpy()
val_batch_X = encoder(val_batch).detach().numpy()
test_batch_X = encoder(test_batch).detach().numpy()

In [14]:
train_batch_X.shape

(1050, 256)

In [15]:
from pytorch_lightning.core.lightning import LightningModule
from torch import nn
from rga.models.utils.layers import sequential_from_layer_sizes
from torch.optim import Adam
from torch.utils.data import DataLoader
from pytorch_lightning import Trainer

class EmbeddingAE(LightningModule):
    def __init__(self, embedding_size, layers):
        super().__init__()
        self.compressing_layer = torch.argmin(torch.tensor(layers))
        self.nn = sequential_from_layer_sizes(embedding_size, embedding_size, layers)
        self.loss = nn.MSELoss()

    def forward(self, x):
        return self.nn(x)

    def get_compressed_embeddings(self, x):
        return self.nn[:self.compressing_layer*2+1](x)

    def training_step(self, batch, batch_idx):
        loss = self.loss(batch, self(batch))
        self.log("loss/train", loss, on_step=True, on_epoch=False, prog_bar=True, logger=False)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.loss(batch, self(batch))
        self.log("loss/val", loss, on_step=False, on_epoch=True, prog_bar=True, logger=False)
        return loss
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=1e-3)

In [16]:
import pytorch_lightning as pl
from torch import nn
import torch



class VAE(pl.LightningModule):
    def __init__(self):
        super().__init__()

        # self.save_hyperparameters()

        # encoder, decoder
        self.encoder = sequential_from_layer_sizes(256, 32, [128, 64])
        self.decoder = sequential_from_layer_sizes(16, 256, [32, 64, 128])
        self.loss = nn.MSELoss()
        # distribution parameters
        self.fc_mu = nn.Linear(32, 16)
        self.fc_var = nn.Linear(32, 16)

        # for the gaussian likelihood
        self.log_scale = nn.Parameter(torch.Tensor([0.0]))

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-4)

    def gaussian_likelihood(self, x_hat, logscale, x):
        scale = torch.exp(logscale)
        mean = x_hat
        dist = torch.distributions.Normal(mean, scale)

        # measure prob of seeing image under p(x|z)
        log_pxz = dist.log_prob(x)
        return log_pxz.sum(dim=(1, 2, 3))

    def kl_divergence(self, z, mu, std):
        # --------------------------
        # Monte carlo KL divergence
        # --------------------------
        # 1. define the first two probabilities (in this case Normal for both)
        p = torch.distributions.Normal(torch.zeros_like(mu), torch.ones_like(std))
        q = torch.distributions.Normal(mu, std)

        # 2. get the probabilities from the equation
        log_qzx = q.log_prob(z)
        log_pz = p.log_prob(z)

        # kl
        kl = (log_qzx - log_pz)
        kl = kl.sum(-1)
        return kl

    def training_step(self, batch, batch_idx):
        x = batch

        # encode x to get the mu and variance parameters
        x_encoded = self.encoder(x)
        mu, log_var = self.fc_mu(x_encoded), self.fc_var(x_encoded)

        # sample z from q
        std = torch.exp(log_var / 2)
        q = torch.distributions.Normal(mu, std)
        z = q.rsample()

        # decoded
        x_hat = self.decoder(z)

        # reconstruction loss
        recon_loss = self.loss(x_hat, x)#self.gaussian_likelihood(x_hat, self.log_scale, x)

        # kl
        kl = self.kl_divergence(z, mu, std)

        # elbo
        elbo = (kl - recon_loss)
        elbo = elbo.mean()

        self.log_dict({
            'elbo': elbo,
            'kl': kl.mean(),
            'recon_loss': recon_loss.mean(),
            'reconstruction': recon_loss.mean(),
            'kl': kl.mean(),
        })

        return elbo

    def get_compressed_embeddings(self, batch):
        x_encoded = self.encoder(batch)
        return self.fc_mu(x_encoded)




ae = VAE()#EmbeddingAE(256, [128, 64, 32, 16, 32, 64, 128])
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

dataloaders_ae = {
    'train':DataLoader(train_batch_X, batch_size=32, num_workers=0),
    'val':DataLoader(val_batch_X, batch_size=32, num_workers=0),
    'test':DataLoader(test_batch_X, batch_size=32, num_workers=0)
}


trainer = Trainer(max_epochs=50, log_every_n_steps=5) #, callbacks=[EarlyStopping(monitor="loss/val")]
trainer.fit(ae, train_dataloaders=dataloaders_ae.get('train'), val_dataloaders=dataloaders_ae.get('val'))

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 43.2 K
1 | decoder | Sequential | 44.0 K
2 | loss    | MSELoss    | 0     
3 | fc_mu   | Linear     | 528   
4 | fc_var  | Linear     | 528   
---------------------------------------
88.3 K    Trainable params
0         Non-trainable params
88.3 K    Total params
0.353     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

In [17]:
train_batch_X_compressed = ae.get_compressed_embeddings(torch.tensor(train_batch_X)).detach().numpy()
val_batch_X_compressed = ae.get_compressed_embeddings(torch.tensor(val_batch_X)).detach().numpy()
test_batch_X_compressed = ae.get_compressed_embeddings(torch.tensor(test_batch_X)).detach().numpy()

In [18]:
train_batch_X_compressed.shape

(1050, 16)

In [19]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier

In [20]:
# model = RandomForestClassifier(n_estimators=500, min_samples_leaf=5, min_samples_split=4)
model = GradientBoostingClassifier(min_samples_leaf=5, min_samples_split=4)
# model = SVC()
# model = MLPClassifier(hidden_layer_sizes=[16, 16, 16, 16, 16, 16], random_state=1,max_iter=500)
model.fit(train_batch_X_compressed, train_batch_labels)
train_batch_labels_pred = model.predict(train_batch_X_compressed)
print(classification_report(train_batch_labels, train_batch_labels_pred))

val_batch_labels_pred = model.predict(val_batch_X_compressed)
print(classification_report(val_batch_labels, val_batch_labels_pred))

              precision    recall  f1-score   support

           1       0.66      0.34      0.44       352
           2       0.56      0.70      0.62       351
           3       0.53      0.66      0.59       347

    accuracy                           0.56      1050
   macro avg       0.58      0.57      0.55      1050
weighted avg       0.58      0.56      0.55      1050

              precision    recall  f1-score   support

           1       0.47      0.22      0.30        74
           2       0.46      0.56      0.51        80
           3       0.44      0.58      0.50        71

    accuracy                           0.45       225
   macro avg       0.46      0.45      0.43       225
weighted avg       0.46      0.45      0.44       225

