# <strong>Knowledge Graph Embeddings</strong>

We generate embeddings for the knowledge graph created earlier.

To keep things simple, we use json files creathd to access the triples, rather then quering from neo4j api.

**Information** :

We are using different models to from *pykeen* library to generate embeddings from the knowledge graph triples.

In [1]:
# imports
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

import json
import pykeen
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import logging
import pandas as pd

torch.manual_seed(43)

# Set the logging level to ERROR
logging.getLogger("pykeen").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)
scaler = StandardScaler()

In [2]:
# check cuda availability
torch.cuda.is_available()

False

In [3]:
# set files path 
bankrupt_json_path = r'output\bankrupt'
healthy_json_path = r'output\healthy'

<hr>

Create **Entity2id** and **Relation2id** dict to effectivly number and convert any entitiy into numerical data

In [4]:
bankrupt_json_file = os.listdir(bankrupt_json_path)
healthy_json_file = os.listdir(healthy_json_path)

print(f'Found {bankrupt_json_file.__len__()} bankrupt company json files')
print(f'Found {healthy_json_file.__len__()} healthy company json files')

# initialize dictionaries for entities and relations to ids
entity2id = defaultdict(lambda: len(entity2id))
relation2id = defaultdict(lambda: len(relation2id))

Found 50 bankrupt company json files
Found 50 healthy company json files


<hr>

Function to train model for generating embeddings using different models like
<ul>
<li>TransE</li>
<li>RotatE (cann't use because of complex numbers)</li>
<li>TransH</li>
<li>ConvE</li>
<li>RGCN</li>
</ul>

In [11]:
##############################################################
# 16/12/24 | 11:15 AM | 
# Aggregation of embeddings is done by taking mean of all the embeddings
# each triples creates one embeddings, these embeddings are then averaged
#****IMPROVMENTS***###########################################
# 1. Add more models.
# 2. Proper test-train split
##############################################################
all_json_files = []

for file in bankrupt_json_file:
    all_json_files.append(os.path.join(bankrupt_json_path, file))

for file in healthy_json_file:
    all_json_files.append(os.path.join(healthy_json_path, file))

def train_model_once(files, model="TransE", embedding_dim=10, split_ratio=[0.8, 0.2]):
    """
    Train the model on a unified dataset from multiple files.
    Returns the trained model and entity/relation mappings.
    This function makes sure the model is trained only once.
    """
    all_triples = []

    # Combine triples from all files
    for json_file in files:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for relation in data["relations"]:
                entity2id[relation["source"]]
                relation2id[relation["relation"]]
                entity2id[relation["target"]]
                all_triples.append(
                    (relation["source"], relation["relation"], relation["target"])
                )

    # Create a unified triples factory
    triples_factory = TriplesFactory.from_labeled_triples(np.array(all_triples))

    # Split into training and testing
    try:
        training_factory, testing_factory = triples_factory.split(split_ratio)
    except Exception as e:
        training_factory = triples_factory
        testing_factory = triples_factory

    # Train the model
    result = pipeline(
        model=model,
        dataset=None,
        training=training_factory,
        testing=testing_factory,
        model_kwargs=dict(embedding_dim=embedding_dim),
    )

    return result.model, triples_factory

def generate_embeddings_for_file(json_file, model, triples_factory):
    """
    Generates embeddings for a single file using a pre-trained model.
    Input:
    - json_file: path to the JSON file
    - model: trained model
    - triples_factory: triples factory object
    Output:
    - company_embedding: aggregated embedding for the company
    """
    ent_emb = {}
    rel_emb = {}

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        triples = []
        for relation in data["relations"]:
            triples.append((relation["source"], relation["relation"], relation["target"]))

    # Extract entity and relation embeddings
    for i, content in enumerate(model.entity_representations[0]().detach().numpy()):
        ent_emb[triples_factory.entity_labeling.id_to_label[i]] = content

    for i, content in enumerate(model.relation_representations[0]().detach().numpy()):
        rel_emb[triples_factory.relation_labeling.id_to_label[i]] = content

    # Aggregate embeddings
    def aggregate():
        store = [
            np.concatenate([
                ent_emb[s],
                rel_emb[r],
                ent_emb[t]
            ]) for s, r, t in triples
        ]
        return np.mean(store, axis=0)

    company_embedding = aggregate()
    return company_embedding

In [None]:
##############################################################
# 5 Models used:
# TransE
# RotatE
# TransH
# ConvE
# RGCN
##############################################################
model_te, triple_factory_te = train_model_once(all_json_files)
model_re, triple_factory_re = train_model_once(all_json_files, model="RotatE")
model_th, triple_factory_th = train_model_once(all_json_files, model="TransH")
model_ce, triple_factory_ce = train_model_once(all_json_files, model="ConvE")
model_rgcn, triple_factory_rgcn = train_model_once(all_json_files, model="RGCN")
##############################################################

In [14]:
##############################################################
# 16/12/24 | 10:15 PM | 
# generating embeddings for all companies
# files are in order so the labels are 50 bankrupt [1] and 50 healthy 
##############################################################
all_company_data_te = []

for files in all_json_files:
    all_company_data_te.append(generate_embeddings_for_file(files, model_te, triple_factory_te))

label_te = [50*[1] + 50*[0]]

all_company_data_re = []

for files in all_json_files:
    all_company_data_re.append(generate_embeddings_for_file(files, model_re, triple_factory_re))

label_re = [50*[1] + 50*[0]]

all_company_data_th = []

for files in all_json_files:
    all_company_data_th.append(generate_embeddings_for_file(files, model_th, triple_factory_th))

label_th = [50*[1] + 50*[0]]

all_company_data_ce = []

for files in all_json_files:
    all_company_data_ce.append(generate_embeddings_for_file(files, model_ce, triple_factory_ce))

label_ce = [50*[1] + 50*[0]]

all_company_data_rgcn = []

for files in all_json_files:
    all_company_data_rgcn.append(generate_embeddings_for_file(files, model_rgcn, triple_factory_rgcn))

label_rgcn = [50*[1] + 50*[0]]

assert len(all_company_data_te) == bankrupt_json_file.__len__() + healthy_json_file.__len__()
assert len(all_company_data_re) == bankrupt_json_file.__len__() + healthy_json_file.__len__()
assert len(all_company_data_th) == bankrupt_json_file.__len__() + healthy_json_file.__len__()
assert len(all_company_data_ce) == bankrupt_json_file.__len__() + healthy_json_file.__len__()
assert len(all_company_data_rgcn) == bankrupt_json_file.__len__() + healthy_json_file.__len__()
##############################################################

In [34]:
##############################################################
# 16/12/24 | 10:15 PM | 
# Normalizing the embeddings
# create a databaset for each model
##############################################################
normalized_embedding_te = scaler.fit_transform(all_company_data_te)
normalized_embedding_th = scaler.fit_transform(all_company_data_th)
normalized_embedding_ce = scaler.fit_transform(all_company_data_ce)
normalized_embedding_rgcn = scaler.fit_transform(all_company_data_rgcn)

X_te = np.array(normalized_embedding_te)
y_te = np.array(label_te).flatten()

X_re = np.array(all_company_data_re)
y_re = np.array(label_re).flatten()

X_th = np.array(normalized_embedding_th)
y_th = np.array(label_th).flatten()

X_ce = np.array(normalized_embedding_ce)
y_ce = np.array(label_ce).flatten()

X_rgcn = np.array(normalized_embedding_rgcn)
y_rgcn = np.array(label_rgcn).flatten()

##############################################################

mapping_dict_te = {}
for file, x in zip(bankrupt_json_file, X_te[:50]):
    mapping_dict_te[file] = x

for file, x in zip(healthy_json_file, X_te[51:]):
    mapping_dict_te[file] = x


mapping_dict_re = {}
for file, x in zip(bankrupt_json_file, X_re[:50]):
    mapping_dict_re[file] = x

for file, x in zip(healthy_json_file, X_re[51:]):
    mapping_dict_re[file] = x

mapping_dict_th = {}
for file, x in zip(bankrupt_json_file, X_th[:50]):
    mapping_dict_th[file] = x

for file, x in zip(healthy_json_file, X_th[51:]):
    mapping_dict_th[file] = x

mapping_dict_ce = {}
for file, x in zip(bankrupt_json_file, X_ce[:50]):
    mapping_dict_ce[file] = x

for file, x in zip(healthy_json_file, X_ce[51:]):
    mapping_dict_ce[file] = x

mapping_dict_rgcn = {}
for file, x in zip(bankrupt_json_file, X_rgcn[:50]):
    mapping_dict_rgcn[file] = x

for file, x in zip(healthy_json_file, X_rgcn[51:]):
    mapping_dict_rgcn[file] = x
##############################################################

In [35]:
for (a, b, c, d, e) in zip(mapping_dict_te, mapping_dict_re, mapping_dict_th, mapping_dict_ce, mapping_dict_rgcn):
    print(f"TransE : Type {type(mapping_dict_te[a][0])} | Shape {mapping_dict_te[a].shape}")
    print(f"RotatE : Type {type(mapping_dict_re[b][0])} | Shape {mapping_dict_re[b].shape}")
    print(f"TransH : Type {type(mapping_dict_th[c][0])} | Shape {mapping_dict_th[c].shape}")
    print(f"ConvE : Type {type(mapping_dict_ce[d][0])} | Shape {mapping_dict_ce[d].shape}")
    print(f"RGCN : Type {type(mapping_dict_rgcn[e][0])} | Shape {mapping_dict_rgcn[e].shape}")
    break

TransE : Type <class 'numpy.float64'> | Shape (30,)
RotatE : Type <class 'numpy.complex64'> | Shape (30,)
TransH : Type <class 'numpy.float64'> | Shape (30,)
ConvE : Type <class 'numpy.float64'> | Shape (30,)
RGCN : Type <class 'numpy.float64'> | Shape (30,)


In [36]:
# run once
os.makedirs('output/embeddings', exist_ok=True)

def embedding_30(mapping_dict, model_name):
    for key, value in mapping_dict.items():
        mapping_dict[key] = value.tolist()

    with open(f'output/embeddings/{model_name}.json', 'w') as f:
        json.dump(mapping_dict, f)

    df = pd.DataFrame.from_dict(mapping_dict, orient='index', columns=[f"k{i}" for i in range(1, 31)])
    df.index.name = "path"
    df.to_csv(f'output/embeddings/{model_name}_30.csv')

embedding_30(mapping_dict_te, "TransE")
# embedding_30(mapping_dict_re, "RotatE") # RotatE cannot be saved due to complex numbers
embedding_30(mapping_dict_th, "TransH")
embedding_30(mapping_dict_ce, "ConvE")
embedding_30(mapping_dict_rgcn, "RGCN")

<hr>

## <strong>Classification Test</strong>

Direct classfication test by just using knowledge graph embedding models

In [50]:
##############################################
# classification using logistic regression for different models
##############################################
X_train_te, X_test_te, y_train_te, y_test_te = train_test_split(X_te, y_te, test_size=0.2, shuffle=True)
X_test_te, X_val_te, y_test_te, y_val_te = train_test_split(X_test_te, y_test_te, test_size=0.5, shuffle=True)
clf_te = LogisticRegression(random_state=0).fit(X_train_te, y_train_te)

y_pred_te = clf_te.predict(X_test_te)
print(f"TransE accuracy: {accuracy_score(y_test_te, y_pred_te)}")
print(classification_report(y_test_te, y_pred_te))

##############################################
X_train_th, X_test_th, y_train_th, y_test_th = train_test_split(X_th, y_th, test_size=0.2, shuffle=True)
X_test_th, X_val_th, y_test_th, y_val_th = train_test_split(X_test_th, y_test_th, test_size=0.5, shuffle=True)
clf_th = LogisticRegression(random_state=0).fit(X_train_th, y_train_th)

y_pred_th = clf_th.predict(X_test_th)
print(f"TransH accuracy: {accuracy_score(y_test_th, y_pred_th)}")
print(classification_report(y_test_th, y_pred_th))

##############################################
X_train_ce, X_test_ce, y_train_ce, y_test_ce = train_test_split(X_ce, y_ce, test_size=0.2, shuffle=True)
X_test_ce, X_val_ce, y_test_ce, y_val_ce = train_test_split(X_test_ce, y_test_ce, test_size=0.5, shuffle=True)
clf_ce = LogisticRegression(random_state=0).fit(X_train_ce, y_train_ce)

y_pred_ce = clf_ce.predict(X_test_ce)
print(f"ConvE accuracy: {accuracy_score(y_test_ce, y_pred_ce)}")
print(classification_report(y_test_ce, y_pred_ce))

##############################################
X_train_rgcn, X_test_rgcn, y_train_rgcn, y_test_rgcn = train_test_split(X_rgcn, y_rgcn, test_size=0.2, shuffle=True)
X_test_rgcn, X_val_rgcn, y_test_rgcn, y_val_rgcn = train_test_split(X_test_rgcn, y_test_rgcn, test_size=0.5, shuffle=True)
clf_rgcn = LogisticRegression(random_state=0).fit(X_train_rgcn, y_train_rgcn)

y_pred_rgcn = clf_rgcn.predict(X_test_rgcn)
print(f"RGCN accuracy: {accuracy_score(y_test_rgcn, y_pred_rgcn)}")
print(classification_report(y_test_rgcn, y_pred_rgcn))
##############################################

TransE accuracy: 0.3
              precision    recall  f1-score   support

           0       0.43      0.50      0.46         6
           1       0.00      0.00      0.00         4

    accuracy                           0.30        10
   macro avg       0.21      0.25      0.23        10
weighted avg       0.26      0.30      0.28        10

TransH accuracy: 0.8
              precision    recall  f1-score   support

           0       1.00      0.75      0.86         8
           1       0.50      1.00      0.67         2

    accuracy                           0.80        10
   macro avg       0.75      0.88      0.76        10
weighted avg       0.90      0.80      0.82        10

ConvE accuracy: 0.9
              precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       0.83      1.00      0.91         5

    accuracy                           0.90        10
   macro avg       0.92      0.90      0.90        10
weighted avg

In [51]:
os.makedirs('output/models', exist_ok=True)
import pickle
with open('output/models/TransE.pkl', 'wb') as f:
    pickle.dump(clf_te, f)
print("TransE model saved!")

with open('output/models/TransH.pkl', 'wb') as f:
    pickle.dump(clf_th, f)
print("TransH model saved!")

with open('output/models/ConvE.pkl', 'wb') as f:
    pickle.dump(clf_ce, f)
print("ConvE model saved!")

with open('output/models/RGCN.pkl', 'wb') as f:
    pickle.dump(clf_rgcn, f)
print("RGCN model saved!")

TransE model saved!
TransH model saved!
ConvE model saved!
RGCN model saved!


**ConvE** gave the best result for this run of 90% accuracy, 
**TransE** can also give better result if we try to run it multiple times

<hr>

<hr>

# **Neural Network**
We use Neural network for further refining the embeddings into 2 vectors in 10 dimensional vector space.

In [52]:
class EmbeddingModel(nn.Module):
    def __init__(self, input_dim=30, embedding_dim=10):
        super(EmbeddingModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 100),
            nn.ReLU(),
            nn.BatchNorm1d(100),

            nn.Linear(100, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),

            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),

            nn.Linear(32, embedding_dim),
            nn.ReLU(),

        )

        self.classifier = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        embeddings = self.model(x)
        output = torch.sigmoid(self.classifier(embeddings))
        return embeddings, output
    
model = EmbeddingModel()

In [53]:
######################################################
# 17/12/24 | 12:00 AM
# NN model for refining embeddings 
# now the model is loading properly
######################################################
def embeddings_10(model_name, X_te, y_te):
    X_train = torch.tensor(X_te, dtype=torch.float32)
    y_train = torch.tensor(y_te, dtype=torch.float32)

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    num_epochs = 50

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()

        embeddings, outputs = model(X_train)
        loss = criterion(outputs, y_train.view(-1, 1))
        loss.backward()
        optimizer.step()

        if(epoch + 1) % 10 == 0:
            print(f'Epoch: {epoch+1}, Loss: {loss.item():.4f}')

    model.eval()
    with torch.no_grad():
        X_train_embeddings, _ = model(X_train)

    torch.save(model.state_dict(), f"output/models/nn_{model_name}.pth")
    print("Model saved successfully")

    ######################################################
    mapping_dict_10 = {}

    for file, x in zip(bankrupt_json_file, X_train[:50]):
        mapping_dict_10[file], _ = model(x.unsqueeze(0))
    for file, x in zip(healthy_json_file, X_train[51:]):
        mapping_dict_10[file], _ = model(x.unsqueeze(0))

    for key, value in mapping_dict_10.items():
        mapping_dict_10[key] = value.tolist()[0]
        
    # convert into a csv file, easier to read
    df = pd.DataFrame.from_dict(mapping_dict_10, orient='index',columns=["k1", "k2", "k3", "k4", "k5", "k6", "k7", "k8", "k9", "k10"])
    df.index.name = "path"
    df.to_csv(f'output/embeddings/{model_name}_10.csv')
######################################################

In [55]:
embeddings_10("TransE", X_te, y_te)
embeddings_10("TransH", X_th, y_th)
embeddings_10("ConvE", X_ce, y_ce)
embeddings_10("RGCN", X_rgcn, y_rgcn)

Epoch: 10, Loss: 0.2180
Epoch: 20, Loss: 0.1666
Epoch: 30, Loss: 0.1246
Epoch: 40, Loss: 0.0921
Epoch: 50, Loss: 0.0676
Model saved successfully
Epoch: 10, Loss: 0.4031
Epoch: 20, Loss: 0.2255
Epoch: 30, Loss: 0.1502
Epoch: 40, Loss: 0.1045
Epoch: 50, Loss: 0.0735
Model saved successfully
Epoch: 10, Loss: 0.1911
Epoch: 20, Loss: 0.0995
Epoch: 30, Loss: 0.0588
Epoch: 40, Loss: 0.0371
Epoch: 50, Loss: 0.0249
Model saved successfully
Epoch: 10, Loss: 0.2568
Epoch: 20, Loss: 0.1049
Epoch: 30, Loss: 0.0614
Epoch: 40, Loss: 0.0400
Epoch: 50, Loss: 0.0274
Model saved successfully
