# HOW TO RUN THIS FILE
1. edit the ROOT_PATH variable to match the path from your google drive to the eva shared folder
2. make sure that the runtime type is set to GPU. the code requires CUDA to work. (you can set it to CPU only mode, but it's about 10x slower)

In [1]:
!nvidia-smi

Wed Dec  1 01:49:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import sys
# EDIT THIS VARIABLE:
ROOT_PATH = "/content/drive/MyDrive/CS245_Project_Shared/eva"
SRC_PATH = os.path.join(ROOT_PATH, "src")
sys.path.append(SRC_PATH)
from run_dbp15k import *

In [4]:
parser = argparse.ArgumentParser()
parser.add_argument("--file_dir", type=str, default="data/DBP15K/zh_en", required=False, help="input dataset file directory, ('data/DBP15K/zh_en', 'data/DWY100K/dbp_wd')")
parser.add_argument("--rate", type=float, default=0.3, help="training set rate")

parser.add_argument("--cuda", action="store_true", default=True, help="whether to use cuda or not")
parser.add_argument("--seed", type=int, default=2021, help="random seed")
parser.add_argument("--epochs", type=int, default=1000, help="number of epochs to train")
parser.add_argument("--check_point", type=int, default=100, help="check point")
parser.add_argument("--hidden_units", type=str, default="128,128,128", help="hidden units in each hidden layer(including in_dim and out_dim), splitted with comma")
parser.add_argument("--heads", type=str, default="2,2", help="heads in each gat layer, splitted with comma")
parser.add_argument("--instance_normalization", action="store_true", default=False, help="enable instance normalization")
parser.add_argument("--lr", type=float, default=0.005, help="initial learning rate")
parser.add_argument("--weight_decay", type=float, default=0, help="weight decay (L2 loss on parameters)")
parser.add_argument("--dropout", type=float, default=0.0, help="dropout rate for layers")
parser.add_argument("--attn_dropout", type=float, default=0.0, help="dropout rate for gat layers")
parser.add_argument("--dist", type=int, default=2, help="L1 distance or L2 distance. ('1', '2')")
parser.add_argument("--csls", action="store_true", default=False, help="use CSLS for inference")
parser.add_argument("--csls_k", type=int, default=10, help="top k for csls")
parser.add_argument("--il", action="store_true", default=False, help="Iterative learning?")
parser.add_argument("--semi_learn_step", type=int, default=10, help="If IL, what's the update step?")
parser.add_argument("--il_start", type=int, default=500, help="If Il, when to start?")
parser.add_argument("--bsize", type=int, default=7500, help="batch size")
parser.add_argument("--unsup", action="store_true", default=False)
parser.add_argument("--unsup_k", type=int, default=1000, help="|visual seed|")
#parser.add_argument("--long_tail_analysis", action="store_true", default=False)
parser.add_argument("--lta_split", type=int, default=0, help="split in {0,1,2,3,|splits|-1}")

args = argparse.Namespace(
    file_dir=os.path.join(ROOT_PATH, "data/DBP15K/fr_en"),
    rate=0.3,
    lr=0.0005,
    epochs=50, #1000 originally
    hiden_units="400,400,200",
    check_point=50,
    bsize=7500,
    il=False, # True originally
    il_start=500,
    semi_learn_step=5,
    csls=True,
    csls_k=3,
    seed=0,
)
args = parser.parse_args(args=[], namespace=args)
main(args, root_path=ROOT_PATH)

loading raw data...
70.69% entities have images
image feature shape: torch.Size([39654, 2048])
#left entity : 19661, #right entity: 19993
#left entity not in train set: 15161, #right entity not in train set: 15493
relation feature shape: torch.Size([39654, 1000])
attribute feature shape: torch.Size([39654, 1000])
-----dataset summary-----
dataset:	 /content/drive/MyDrive/CS245_Project_Shared/eva/data/DBP15K/fr_en
triple num:	 221720
entity num:	 39654
relation num:	 2111
train ill num:	 4500 	test ill num:	 10500
-------------------------
getting a sparse tensor r_adj...
GCN model details:
GCN(
  (gc1): GraphConvolution (128 -> 128)
  (gc2): GraphConvolution (128 -> 128)
)
optimiser details:
AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.0005
    weight_decay: 0.01
)
[start training...] 
[epoch 0] loss_all: -6.590077, time: 0.3505 s
[epoch 1] loss_all: -8.100548, time: 0.2335 s
[epoch 2] loss_all: -9.315067, time: 0.2292 s
[epoch 3] loss_a

(array([0.1588, 0.3899, 0.6066], dtype=float32),
 array([0.1528, 0.3938, 0.6022], dtype=float32),
 472.7920952380952,
 477.9702857142857,
 0.2349791641446913,
 0.23288773532230558)

# Loading Embeddings and Calculating Similarity matrix

In [7]:
# load the final embeddings after 50 epochs from google drive
# make sure you change root path and file_dir to the path from your google drive
final_embed = torch.load(os.path.join(ROOT_PATH, "final_embed_49.pt"))
ent2id_dict, ills, triples, r_hs, r_ts, ids = read_raw_data(args.file_dir, [1,2])
# split the dataset into the test set
test_ill_ = ills[int(len(ills) // 1 * args.rate):]
test_ill = np.array(test_ill_, dtype=np.int32)
# split the test set into elements in the left graph and right graph
test_left = torch.LongTensor(test_ill[:, 0].squeeze())
test_right = torch.LongTensor(test_ill[:, 1].squeeze())

# split the final embeddings into the two graphs (left and right)
final_embed_left = final_embed[test_left]
final_embed_right = final_embed[test_right]

loading raw data...


In [8]:
# this is a NxD array where N is the number of entities and D is the length of the embedding
final_embed_left.shape

torch.Size([10500, 528])

In [9]:
final_embed_right.shape

torch.Size([10500, 528])

# Using AutoEncoder 
Dimensionality reduction of embeddings from EVA using AutoEncoder.

In [10]:
import numpy as np
import torch
import argparse
from tqdm import tqdm
from sklearn.datasets import load_wine
from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.cluster import normalized_mutual_info_score
from torch import nn, optim

In [11]:
def calc_accuracy(embeddings):
    distance = pairwise_distances(embeddings[test_left], embeddings[test_right])
    print(distance.shape)
    distance = 1 - csls_sim(1 - distance, 3)
    test_left_np = test_left.cpu().numpy()
    test_right_np = test_right.cpu().numpy()
    acc_l2r = 0
    acc_r2l = 0
    for idx in range(test_left.shape[0]):
        values, indices = torch.sort(distance[idx, :], descending=False)
        rank = (indices == idx).nonzero().squeeze().item()
        if rank < 1:
            acc_l2r += 1
    for idx in range(test_right.shape[0]):
        _, indices = torch.sort(distance[:, idx], descending=False)
        rank = (indices == idx).nonzero().squeeze().item()
        if rank < 1:
            acc_r2l += 1
    acc_l2r = round(acc_l2r / test_left.size(0), 4)
    acc_r2l = round(acc_r2l / test_right.size(0), 4)
    return acc_l2r, acc_r2l
acc_l2r, acc_r2l = calc_accuracy(final_embed)
print(f"acc: {acc_l2r}, {acc_r2l}")
print(final_embed.shape)

torch.Size([10500, 10500])
acc: 0.2325, 0.231
torch.Size([39654, 528])


In [12]:
class autoencoder(nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(528, 128),
            nn.Sigmoid())
            #nn.Linear(128, 64))
            #nn.ReLU(True), 
            #nn.Linear(64, 12))
        self.decoder = nn.Sequential(
            #nn.Linear(12, 64),
            #nn.ReLU(True),
            #nn.Linear(64, 128),
            #nn.ReLU(True),
            nn.Linear(128, 528), nn.ReLU(True))
        self.encoder.register_forward_hook(self.save_latent())

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
    
    def save_latent(self):
        def hook(module, input, output):
            self.latent = output
        return hook


model = autoencoder().cuda()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
    model.parameters(), lr=0.001, weight_decay=1e-5)

num_epochs = 100
for epoch in range(num_epochs):
    x = final_embed.cuda()
    print(final_embed.shape)
    # ===================forward=====================
    output = model(x)
    loss = criterion(output, x)
    # ===================backward====================
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # ===================log========================
    print('epoch [{}/{}], loss:{:.4f}'
          .format(epoch + 1, num_epochs, loss.data))
    print(model.latent.shape)
    acc_l2r, acc_r2l = calc_accuracy(model.latent)
    print(f"acc: {acc_l2r}, {acc_r2l}")

torch.Size([39654, 528])
epoch [1/100], loss:0.0426
torch.Size([39654, 128])
torch.Size([10500, 10500])
acc: 0.1845, 0.1827
torch.Size([39654, 528])
epoch [2/100], loss:0.0298
torch.Size([39654, 128])
torch.Size([10500, 10500])
acc: 0.1867, 0.188
torch.Size([39654, 528])
epoch [3/100], loss:0.0203
torch.Size([39654, 128])
torch.Size([10500, 10500])
acc: 0.1856, 0.1863
torch.Size([39654, 528])
epoch [4/100], loss:0.0137
torch.Size([39654, 128])
torch.Size([10500, 10500])
acc: 0.1857, 0.1854
torch.Size([39654, 528])
epoch [5/100], loss:0.0092
torch.Size([39654, 128])
torch.Size([10500, 10500])
acc: 0.1802, 0.182
torch.Size([39654, 528])
epoch [6/100], loss:0.0062
torch.Size([39654, 128])
torch.Size([10500, 10500])
acc: 0.1729, 0.1782
torch.Size([39654, 528])
epoch [7/100], loss:0.0044
torch.Size([39654, 128])
torch.Size([10500, 10500])
acc: 0.1703, 0.1721
torch.Size([39654, 528])
epoch [8/100], loss:0.0032
torch.Size([39654, 128])
torch.Size([10500, 10500])
acc: 0.1635, 0.1678
torch.Size

# Clustering with GraphEncoder
## -- SCRAPPED FOR NOW -- 

In [None]:
# convert the pytorch tensors to numpy arrays
X = final_embed_left.cpu().detach().numpy()
Y = final_embed_right.cpu().detach().numpy()
print(type(X))
print(X.shape)
print(type(Y))
print(Y.shape)

<class 'numpy.ndarray'>
(10500, 528)
<class 'numpy.ndarray'>
(10500, 528)


In [None]:
# EDIT THIS VARIABLE
%cd /content/drive/MyDrive/CS245_Project_Shared/eva/src/
!ls

/content/drive/.shortcut-targets-by-id/1TNHdJkAXnDAdPZ4uzztu_RrLScIxYZuJ/CS245 Project Shared/eva/src
eva.py	   Load.py   models.py	  run_dbp15k.py  utils.py
layers.py  model.py  __pycache__  run_dwy15k.py


In [None]:
import numpy as np
import torch
import argparse
from tqdm import tqdm
from sklearn.datasets import load_wine
from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.cluster import normalized_mutual_info_score
from torch import nn, optim

from model import GraphEncoder

In [None]:
parser = argparse.ArgumentParser()
#parser.add_argument('--dataset', type=str, default='wine', help='Dataset to use')
parser.add_argument('-l', '--layers', nargs='+', type=int, default=[128, 64, 128], help='Sparsity Penalty Parameter')
parser.add_argument('-b', '--beta', type=float, default=0.01, help='Sparsity Penalty Parameter')
parser.add_argument('-p', '--rho', type=float, default=0.5, help='Prior rho')
parser.add_argument('-lr', type=float, default=0.01, help='Learning Rate')
parser.add_argument('-epoch', type=int, default=200, help='Number of Training Epochs')
parser.add_argument('-device', type=str, default='gpu', help='Train on GPU or CPU')

args = argparse.Namespace(
    l = [128, 64, 128],
    b = 0.01,
    p = 0.5,
    lr = 0.01,
    epoch = 7,
    device = 'gpu'
)
args = parser.parse_args(args=[], namespace=args)

device = torch.device('cuda' if args.device == 'gpu' else 'cpu')

# number of clusters = number of entities in X
k = Y.shape[0]

def main():

    # Obtain Similarity matrix
    S = cosine_similarity(X, Y)

    D = np.diag(1.0 / np.sqrt(S.sum(axis=1)))
    X_train = torch.tensor(D.dot(S).dot(D)).float().to(device)

    layers = [len(X_train)] + args.layers + [len(X_train)]

    model = GraphEncoder(layers, k).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    with tqdm(total=args.epoch) as tq:
        for epoch in range(1, args.epoch + 1):
            optimizer.zero_grad()
            X_hat = model(X_train)
            loss = model.loss(X_hat, X_train, args.beta, args.rho)

            #pred_labels = model.get_cluster()
            #print(pred_labels.shape)
            #print(type(pred_labels))

            #true_labels =  np.arange(1, Y.shape[0] + 1)

            #nmi = normalized_mutual_info_score(pred_labels, true_labels, average_method='arithmetic')

            loss.backward()
            optimizer.step()

            tq.set_postfix(loss='{:.3f}'.format(loss), nmi='{:.3f}'.format(nmi))
            tq.update()
        print(model.get_cluster())


if __name__ == '__main__':
    main()

  1%|          | 1/100 [11:03<18:14:10, 663.14s/it, loss=0.257, nmi=1.000]

(10500,)
<class 'numpy.ndarray'>


  2%|▏         | 2/100 [22:08<18:04:50, 664.18s/it, loss=0.142, nmi=1.000]

(10500,)
<class 'numpy.ndarray'>


  3%|▎         | 3/100 [33:08<17:51:21, 662.70s/it, loss=0.058, nmi=1.000]

(10500,)
<class 'numpy.ndarray'>


  4%|▍         | 4/100 [44:08<17:38:16, 661.42s/it, loss=0.018, nmi=1.000]

(10500,)
<class 'numpy.ndarray'>


  5%|▌         | 5/100 [53:39<16:35:54, 628.99s/it, loss=0.005, nmi=1.000]

(10500,)
<class 'numpy.ndarray'>


  6%|▌         | 6/100 [1:03:04<15:51:05, 607.08s/it, loss=0.002, nmi=1.000]

(10500,)
<class 'numpy.ndarray'>


  7%|▋         | 7/100 [1:12:28<15:19:06, 592.97s/it, loss=0.002, nmi=1.000]

(10500,)
<class 'numpy.ndarray'>


  8%|▊         | 8/100 [1:21:46<14:52:22, 581.99s/it, loss=0.002, nmi=1.000]

(10500,)
<class 'numpy.ndarray'>


  9%|▉         | 9/100 [1:31:10<14:33:44, 576.09s/it, loss=0.002, nmi=1.000]

(10500,)
<class 'numpy.ndarray'>


  9%|▉         | 9/100 [1:33:42<15:47:31, 624.74s/it, loss=0.002, nmi=1.000]


KeyboardInterrupt: ignored

# NEW APPROACH