# <strong>Knowledge Graph Embeddings</strong>

We generate embedding for the Knowledge graph create earlier

To make things easier we refer to the json file created, rather then quering from neo4j api where our Knowledge graph is stored

<img src="Images\KnowledgeGraphEmbedding.png" width=600px>

above diagram show how we will use the Knowledge graph embeddings to our Model

#### Imports

In [1]:
# we need to make a list of triplets for getting embeddings
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import json
import pykeen
import os
import numpy as np
import torch
import logging

torch.manual_seed(43)

# Set the logging level to ERROR
logging.getLogger("pykeen").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)

<hr>

In [2]:
# set file paths
bankrupt_json_path = r'output\bankrupt'
healthy_json_path = r'output\healthy'

bankrupt_files_triplets = []
healthy_files_triplets = []

In [3]:
torch.cuda.is_available()

False

<hr>

##### Create **Entity2id** and **Relation2id** dict to effectivly number and convert any entitiy into numerical data

In [4]:
bankrupt_json_file = os.listdir(bankrupt_json_path)
healthy_json_file = os.listdir(healthy_json_path)

print(f'Found {bankrupt_json_file.__len__()} bankrupt company json files')
print(f'Found {healthy_json_file.__len__()} healthy company json files')

# initialize dictionaries for entities and relations to ids
entity2id = defaultdict(lambda: len(entity2id))
relation2id = defaultdict(lambda: len(relation2id))

Found 50 bankrupt company json files
Found 50 healthy company json files


This function will generate Embeddings by making use of pretrained TransE model for Knowledge graph Embeddings

In [5]:
def generateKGE(json_file, bankrupt):
    """
    Generates (30, 0) embeddings for each company using pykeen library
    """
    ent_emb = dict()
    rel_emb = dict()
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        triples = []
        for every_relation in data["relations"]:
            entity2id[every_relation["source"]]
            relation2id[every_relation["relation"]]
            entity2id[every_relation["target"]]
            triples.append((every_relation["source"], every_relation["relation"], every_relation["target"]))

        triples_factory = TriplesFactory.from_labeled_triples(np.array(triples))
        result = pipeline(
            model="TransE",
            dataset=None,
            training=triples_factory,
            testing=triples_factory,
            model_kwargs=dict(embedding_dim=10),
        )

        for i, content in enumerate(result.model.entity_representations[0]().detach().numpy()):
            ent_emb[triples_factory.entity_labeling.id_to_label[i]] = content
            
        for i, content in enumerate(result.model.relation_representations[0]().detach().numpy()):
            rel_emb[triples_factory.relation_labeling.id_to_label[i]] = content
        
        def aggregate():
                store = [np.concatenate(
                     [
                        ent_emb[s],
                        rel_emb[r],
                        ent_emb[t]
                     ]
                ) for s, r, t in triples]
                return np.mean(store, axis=0)
        company_embedding = aggregate()
        return [company_embedding, 1 if bankrupt else 0]

In [None]:
##############################################################
# 16/12/24 | 11:15 AM | 
# Aggregation of embeddings is done by taking mean of all the embeddings
# each triples creates one embeddings, these embeddings are then averaged
#****IMPROVMENTS***###########################################
# 1. Add more models.
# 2. Proper test-train split
##############################################################
import json
import numpy as np
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory

all_json_files = []

for file in bankrupt_json_file:
    all_json_files.append(os.path.join(bankrupt_json_path, file))

for file in healthy_json_file:
    all_json_files.append(os.path.join(healthy_json_path, file))

def train_model_once(files, model="TransE", embedding_dim=10, split_ratio=[0.8, 0.2]):
    """
    Train the model on a unified dataset from multiple files.
    Returns the trained model and entity/relation mappings.
    """
    all_triples = []

    # Combine triples from all files
    for json_file in files:
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
            for relation in data["relations"]:
                entity2id[relation["source"]]
                relation2id[relation["relation"]]
                entity2id[relation["target"]]
                all_triples.append(
                    (relation["source"], relation["relation"], relation["target"])
                )

    # Create a unified triples factory
    triples_factory = TriplesFactory.from_labeled_triples(np.array(all_triples))

    # Split into training and testing
    try:
        training_factory, testing_factory = triples_factory.split(split_ratio)
    except Exception as e:
        training_factory = triples_factory
        testing_factory = triples_factory

    # Train the model
    result = pipeline(
        model=model,
        dataset=None,
        training=training_factory,
        testing=testing_factory,
        model_kwargs=dict(embedding_dim=embedding_dim),
    )

    return result.model, triples_factory

def generate_embeddings_for_file(json_file, model, triples_factory):
    """
    Generates embeddings for a single file using a pre-trained model.
    """
    ent_emb = {}
    rel_emb = {}

    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        triples = []
        for relation in data["relations"]:
            triples.append((relation["source"], relation["relation"], relation["target"]))

    # Extract entity and relation embeddings
    for i, content in enumerate(model.entity_representations[0]().detach().numpy()):
        ent_emb[triples_factory.entity_labeling.id_to_label[i]] = content

    for i, content in enumerate(model.relation_representations[0]().detach().numpy()):
        rel_emb[triples_factory.relation_labeling.id_to_label[i]] = content

    # Aggregate embeddings
    def aggregate():
        store = [
            np.concatenate([
                ent_emb[s],
                rel_emb[r],
                ent_emb[t]
            ]) for s, r, t in triples
        ]
        return np.mean(store, axis=0)

    company_embedding = aggregate()
    return company_embedding

model_te, triple_factory_te = train_model_once(all_json_files)
model_re, triple_factory_re = train_model_once(all_json_files, model="RotatE")
##############################################################

In [None]:
all_company_data = []

for files in bankrupt_json_file:
    all_company_data.append(generateKGE(os.path.join(bankrupt_json_path, files), True))

for files in healthy_json_file:
    all_company_data.append(generateKGE(os.path.join(healthy_json_path, files), False))


In [7]:
assert len(all_company_data) == bankrupt_json_file.__len__() + healthy_json_file.__len__()

In [21]:
##############################################################
# 16/12/24 | 10:15 PM | 
# generating embeddings for all companies
# files are in order so the labels are 50 bankrupt [1] and 50 healthy 
##############################################################
all_company_data_te = []

for files in all_json_files:
    all_company_data_te.append(generate_embeddings_for_file(files, model_te, triple_factory_te))

label_te = [50*[1] + 50*[0]]

all_company_data_re = []

for files in all_json_files:
    all_company_data_re.append(generate_embeddings_for_file(files, model_re, triple_factory_re))

label_re = [50*[1] + 50*[0]]

assert len(all_company_data_te) == bankrupt_json_file.__len__() + healthy_json_file.__len__()
assert len(all_company_data_re) == bankrupt_json_file.__len__() + healthy_json_file.__len__()
##############################################################

In [187]:
##############################################################
# 16/12/24 | 10:15 PM | 
# use these embeddings as dataset for training NN (adding NN before LR)
##############################################################
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
normalized_embedding = scaler.fit_transform(all_company_data_te)

X_te = np.array(normalized_embedding)
y_te = np.array(label_te).flatten()

X_re = np.array(all_company_data_re)
y_re = np.array(label_re).flatten()
##############################################################

In [8]:
company_embeddings = []
labels = []

for company_data in all_company_data:
    company_embeddings.append(company_data[0])
    labels.append(company_data[1])

X = np.array(company_embeddings)
y = np.array(labels)

It is crucial to map the input file and its embedding to match up them at later stage

In [14]:
mapping_dict = {}
for file, x in zip(bankrupt_json_file, X[:50]):
    mapping_dict[file] = x

for file, x in zip(healthy_json_file, X[51:]):
    mapping_dict[file] = x

mapping_dict

{'ABGSHIP_2013_MDA.txt.json': array([-1.66938707e-01, -3.15224797e-01,  3.37416023e-01, -3.14057618e-01,
         3.16189617e-01,  2.27844998e-01,  1.56313017e-01, -2.66191512e-01,
        -3.03619355e-01, -1.74325749e-01, -8.55988637e-02,  3.27311426e-01,
         1.40656069e-01, -3.93350832e-02, -2.14042161e-02,  1.52362883e-01,
        -3.85820796e-03, -8.97391364e-02, -3.53682905e-01,  3.57527345e-01,
         1.03640854e-01, -2.76631027e-01, -1.98075965e-01, -7.37413168e-02,
         3.63426685e-01,  2.39481524e-04,  3.47433448e-01, -3.22540313e-01,
        -1.68941692e-01, -2.70882100e-01], dtype=float32),
 'ADHUNIK_2015_MDA.txt.json': array([-0.40160838,  0.20587353,  0.31203613,  0.21636431, -0.10305728,
        -0.0713592 , -0.0676132 , -0.0088659 , -0.38740286,  0.00284216,
        -0.08760046,  0.01366924, -0.2437886 , -0.07112139, -0.10143141,
        -0.1318889 , -0.01584608, -0.30141678,  0.26264104, -0.04377741,
        -0.17198718,  0.06371696,  0.16321352,  0.18380713,

In [188]:
#####################################################################
# 16/12/24 | 11:15 PM |
# Mapping dict to their models 
# looks like rotatE uses complex number too!
#####################################################################
mapping_dict_te = {}
for file, x in zip(bankrupt_json_file, X_te[:50]):
    mapping_dict_te[file] = x

for file, x in zip(healthy_json_file, X_te[51:]):
    mapping_dict_te[file] = x


mapping_dict_re = {}
for file, x in zip(bankrupt_json_file, X_re[:50]):
    mapping_dict_re[file] = x

for file, x in zip(healthy_json_file, X_re[51:]):
    mapping_dict_re[file] = x
#####################################################################

In [189]:
for i in mapping_dict_te.items():
    print(i, i[1].shape)
    break

for i in mapping_dict_re.items():
    print(i, i[1].shape)
    break

('ABGSHIP_2013_MDA.txt.json', array([ 1.08216675, -0.62631242, -1.40613583, -0.98554452,  0.28779908,
       -0.52509024, -0.29446993, -2.70252746,  0.16694267, -0.34034837,
        0.13769372,  1.10673211, -0.59274591,  0.9594875 , -0.05748412,
       -0.12140918, -0.41046009, -0.44901804,  0.90953592,  0.04188335,
       -0.18268933, -0.45762981,  0.47285803, -0.45696954, -2.05119398,
       -1.1461618 , -1.19786592, -0.88360102,  1.45715564, -0.5692656 ])) (30,)
('ABGSHIP_2013_MDA.txt.json', array([-0.4222414 +0.08157843j,  0.39360547+0.31602648j,
        0.13233416+0.30745557j,  0.15965329+0.12634692j,
       -0.27571213+0.04884817j,  0.02578425-0.0365822j ,
       -0.3123841 +0.20387165j,  0.2994062 +0.12600023j,
       -0.36826396-0.00249605j,  0.47186083-0.2703824j ,
        0.18554513-0.3617699j , -0.18594271+0.34662518j,
       -0.05745354+0.80597377j, -0.04694924-0.3019938j ,
        0.20540616-0.22287595j, -0.41733655+0.2526276j ,
        0.08741319-0.34406137j, -0.33876047-

In [None]:
# convert numpy array to list
for key, value in mapping_dict.items():
    mapping_dict[key] = value.tolist()

In [33]:
# save the embeddings to a json file
with open('output/embeddings.json', 'w') as f:
    json.dump(mapping_dict, f)


In [191]:
#####################################################################
# 16/12/24 | 11:15 PM |
# Mapping dict to their models 
# looks like rotatE uses complex number!
# PROBLEMS ##########################################################
# 1. RotatE embeddings are complex numbers
# 2. Connot convert complex numbers to json
# SOLUTION ##########################################################
# 1. Convert complex numbers to string and while using convert back using complex()
#####################################################################
for key, value in mapping_dict_te.items():
    mapping_dict_te[key] = value.tolist()

for key, value in mapping_dict_re.items():
    mapping_dict_re[key] = [str(cmplx) for cmplx in value.tolist()]

with open('output/embeddings_te.json', 'w') as f:
    json.dump(mapping_dict_te, f)

with open('output/embeddings_re.json', 'w') as f:
    json.dump(mapping_dict_re, f)
#####################################################################

In [192]:
########################################################
# 16/12/24 | 11:43 PM 
# Saving transE model with 30 embeddings
########################################################
# convert into a csv file, easier to read
import pandas as pd
df = pd.DataFrame.from_dict(mapping_dict_te, orient='index',columns=[f"k{i}" for i in range(1, 31)])
df.index.name = "path"
df.to_csv('output/embeddings_te_30.csv')
########################################################

### Classification test

In [26]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, y_train = X, y

In [22]:
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

In [29]:
y_pred = clf.predict(X_train)
print(f'Accuracy: {accuracy_score(y_train, y_pred)}')
print(classification_report(y_train, y_pred))

Accuracy: 0.7
              precision    recall  f1-score   support

           0       0.70      0.70      0.70        50
           1       0.70      0.70      0.70        50

    accuracy                           0.70       100
   macro avg       0.70      0.70      0.70       100
weighted avg       0.70      0.70      0.70       100



In [193]:
#####################################################################
# 16/12/24 | 11:15 PM |
# PROBLEMS ##########################################################
# 1. RotatE embeddings are complex numbers
# 2. Model cannot work with complex numbers
#####################################################################
X_train, X_test, y_train, y_test = train_test_split(X_te, y_te, test_size=0.2, random_state=42, shuffle=True)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, shuffle=True)
clf_te = LogisticRegression(random_state=0)
clf_te.fit(X_train, y_train)

# X_re_str = np.array([[str(x) for x in row] for row in X_re])
# clf_re = LogisticRegression(random_state=0)
# clf_re.fit(X_re_str, y_re)
#####################################################################

<hr>

In [194]:
y_pred = clf_te.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9
              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.86      1.00      0.92         6

    accuracy                           0.90        10
   macro avg       0.93      0.88      0.89        10
weighted avg       0.91      0.90      0.90        10



In [195]:
# saving the model 
import pickle
with open('output/logistic_regression_te.pkl', 'wb') as f:
    pickle.dump(clf_te, f)
print("Model saved successfully")    

Model saved successfully


In [196]:
with open('output/logistic_regression_te.pkl', 'rb') as f:
    clf_te = pickle.load(f)

for i in range(X_val.shape[0]):
    y_pred = clf_te.predict(X_val[i].reshape(1, -1))
    print(y_pred, y_val[i])

[0] 0
[0] 0
[1] 1
[1] 0
[0] 1
[1] 1
[1] 1
[1] 1
[1] 1
[0] 0


<hr>

#### **Neural Network**

created a neural network for binary classification task to classify KGE into 2 classes, bankrupt and healthy, by this we map the KGE even closer to embeddings of Bankrupt and Healthy company files

In [197]:
import torch
import torch.nn as nn
import torch.optim as optim

class EmbeddingModel(nn.Module):
    def __init__(self, input_dim=30, embedding_dim=10):
        super(EmbeddingModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 100),
            nn.ReLU(),
            nn.BatchNorm1d(100),

            nn.Linear(100, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),

            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),

            nn.Linear(32, embedding_dim),
            nn.ReLU(),

        )

        self.classifier = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        embeddings = self.model(x)
        output = torch.sigmoid(self.classifier(embeddings))
        return embeddings, output
    
model = EmbeddingModel()

##### Train the model to fit for binary classification

In [39]:
X_train = torch.tensor(X, dtype=torch.float32)
y_train = torch.tensor(y, dtype=torch.float32)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    embeddings, outputs = model(X_train)
    loss = criterion(outputs, y_train.view(-1, 1))
    loss.backward()
    optimizer.step()

    if(epoch + 1) % 10 == 0:
        print(f'Epoch: {epoch+1}, Loss: {loss.item():.4f}')

model.eval()
with torch.no_grad():
    X_train_embeddings, _ = model(X_train)


Epoch: 10, Loss: 0.1818
Epoch: 20, Loss: 0.1281
Epoch: 30, Loss: 0.0801
Epoch: 40, Loss: 0.0473
Epoch: 50, Loss: 0.0273


In [198]:
######################################################
# 17/12/24 | 12:00 AM
# NN model for refining embeddings 
# now the model is loading properly
######################################################
X_train = torch.tensor(X_te, dtype=torch.float32)
y_train = torch.tensor(y_te, dtype=torch.float32)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    embeddings, outputs = model(X_train)
    loss = criterion(outputs, y_train.view(-1, 1))
    loss.backward()
    optimizer.step()

    if(epoch + 1) % 10 == 0:
        print(f'Epoch: {epoch+1}, Loss: {loss.item():.4f}')

model.eval()
with torch.no_grad():
    X_train_embeddings, _ = model(X_train)

torch.save(model.state_dict(), "binary_classifier_nn.pth")
print("Model saved successfully")
######################################################

Epoch: 10, Loss: 0.5191
Epoch: 20, Loss: 0.4252
Epoch: 30, Loss: 0.3457
Epoch: 40, Loss: 0.2687
Epoch: 50, Loss: 0.1943
Model saved successfully


In [200]:
model = EmbeddingModel()
model.load_state_dict(torch.load("binary_classifier_nn.pth"))
model.eval()
with torch.no_grad():
    X_train_embeddings, _ = model(X_train)

  model.load_state_dict(torch.load("binary_classifier_nn.pth"))


In [215]:
######################################################
# 17/12/24 | 12:30 AM
# store the embeddings in a json file with shape (10,)
# scaling : NO : embeddings became 0 after scaling
######################################################
mapping_dict_te_10 = {}

for file, x in zip(bankrupt_json_file, X_train[:50]):
    mapping_dict_te_10[file], _ = model(x.unsqueeze(0))
for file, x in zip(healthy_json_file, X_train[51:]):
    mapping_dict_te_10[file], _ = model(x.unsqueeze(0))

for key, value in mapping_dict_te_10.items():
    mapping_dict_te_10[key] = value.tolist()[0]
    
# convert into a csv file, easier to read
import pandas as pd
df = pd.DataFrame.from_dict(mapping_dict_te_10, orient='index',columns=["k1", "k2", "k3", "k4", "k5", "k6", "k7", "k8", "k9", "k10"])
df.index.name = "path"
df.to_csv('output/embeddings_te_10.csv')
######################################################


In [69]:
mapping_dict = {}

for file, x in zip(bankrupt_json_file, X_train[:50]):
    mapping_dict[file], _ = model(x.unsqueeze(0))
for file, x in zip(healthy_json_file, X_train[51:]):
    mapping_dict[file], _ = model(x.unsqueeze(0))

for key, value in mapping_dict.items():
    mapping_dict[key] = value.tolist()[0]

Store the embeddings in json file or in csv file for easy retrival!

In [None]:
# convert into a csv file, easier to read
import pandas as pd
df = pd.DataFrame.from_dict(mapping_dict, orient='index',columns=["k1", "k2", "k3", "k4", "k5", "k6", "k7", "k8", "k9", "k10"])
df.index.name = "path"
df.to_csv('output/embeddings.csv')

In [None]:
# converting it into json file
with open('output/embeddings.json', 'w') as f:
    json.dump(mapping_dict, f)

<hr>
<hr>

##### Rough / Sample trials

In [103]:
# testing it on single file
# source, target, relation
one_file = []
with open(os.path.join(bankrupt_json_path, bankrupt_json_file[2]), 'r') as f:
    jsonfile = json.load(f)
    for every_relation in jsonfile['relations']:
        one_file.append((every_relation['source'], every_relation['relation'], every_relation['target']))

entity2id = defaultdict(lambda: len(entity2id))
relation2id = defaultdict(lambda: len(relation2id))

triplet_ids = [[(entity2id[s], relation2id[r], entity2id[o]) for s, r, o in one_file], 1]
print(one_file)
print(triplet_ids[0])
triple_factory = TriplesFactory.from_labeled_triples(np.array(one_file))
print(triple_factory)
print(triple_factory.entity_labeling)

result = pipeline(
    model="TransE",
    dataset=None,
    training=triple_factory,
    testing=triple_factory,
    model_kwargs=dict(embedding_dim=10)
)

INFO:pykeen.pipeline.api:Using device: None


[('Adhunik Metaliks Limited', 'PRODUCES', 'Special Alloy Steel'), ('Adhunik Metaliks Limited', 'PRODUCES', 'Ferro Alloys'), ('Adhunik Metaliks Limited', 'PRODUCES', 'Iron Billets'), ('Adhunik Metaliks Limited', 'PRODUCES', 'Rolled Products'), ('Global Economic Recovery', 'HAD_NEGATIVE_IMPACT_ON', 'Adhunik Metaliks Limited'), ('Indian Economy', 'HAD_NEGATIVE_IMPACT_ON', 'Adhunik Metaliks Limited'), ('Steel Industry', 'HAD_NEGATIVE_IMPACT_ON', 'Adhunik Metaliks Limited')]
[(0, 0, 1), (0, 0, 2), (0, 0, 3), (0, 0, 4), (5, 1, 0), (6, 1, 0), (7, 1, 0)]
TriplesFactory(num_entities=8, num_relations=2, create_inverse_triples=False, num_triples=7)
Labeling(label_to_id={'Adhunik Metaliks Limited': 0, 'Ferro Alloys': 1, 'Global Economic Recovery': 2, 'Indian Economy': 3, 'Iron Billets': 4, 'Rolled Products': 5, 'Special Alloy Steel': 6, 'Steel Industry': 7}, id_to_label={0: 'Adhunik Metaliks Limited', 1: 'Ferro Alloys', 2: 'Global Economic Recovery', 3: 'Indian Economy', 4: 'Iron Billets', 5: 'Rol

Training epochs on cpu:   0%|          | 0/5 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/1 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1 [00:00<?, ?batch/s]



Evaluating on cpu:   0%|          | 0.00/7.00 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.03s seconds


In [112]:
ent_emb = dict()
rel_emb = dict()
for i, content in enumerate(result.model.entity_representations[0]().detach().numpy()):
    ent_emb[triple_factory.entity_labeling.id_to_label[i]] = content
    
for i, content in enumerate(result.model.relation_representations[0]().detach().numpy()):
    rel_emb[triple_factory.relation_labeling.id_to_label[i]] = content

print(ent_emb)
print(rel_emb)

def func():
    k = [
        np.concatenate(
            [
                ent_emb[s],
                rel_emb[r],
                ent_emb[t]
            ]
        )
        for s, r, t in one_file
    ]
    kk = np.mean(k, axis=0)
    return kk
print(func().shape)

{'Adhunik Metaliks Limited': array([-0.38886335, -0.38222024,  0.4455946 , -0.19221817, -0.17458811,
       -0.28307518, -0.0807822 ,  0.41361225,  0.08520468, -0.41438782],
      dtype=float32), 'Ferro Alloys': array([ 0.24377456,  0.10277739,  0.37835982, -0.11915959, -0.55483526,
        0.0604708 ,  0.3830249 , -0.36510557, -0.42471564, -0.02762579],
      dtype=float32), 'Global Economic Recovery': array([-0.29396698, -0.21980724,  0.02687212, -0.5186562 ,  0.10426111,
       -0.17225906, -0.3152244 ,  0.21939981,  0.46066028,  0.44191346],
      dtype=float32), 'Indian Economy': array([ 0.1449898 ,  0.43746293, -0.06959216, -0.23102771,  0.4294105 ,
        0.16648461, -0.4694152 , -0.02244819,  0.25797275,  0.4794501 ],
      dtype=float32), 'Iron Billets': array([-0.1431634 ,  0.33696303, -0.37623212, -0.36582404,  0.12036691,
        0.4383562 ,  0.40429646, -0.19697198,  0.26959574, -0.33015525],
      dtype=float32), 'Rolled Products': array([-0.27646238,  0.08308879, -0.188

In [98]:
# use this to access the correct row from embedding matrix
triple_factory.entity_labeling.label_to_id['Adhunik Metaliks Limited']

0

In [None]:


def get_triplets(json_file, bankrupt):
    triples = []
    with open(json_file) as f:
        jsonfile = json.load(f)
        for every_relation in jsonfile['relations']:
            triples.append((every_relation['source'], every_relation['relation'], every_relation['target']))
    return [triples, 1 if bankrupt else 0]


def generate_kg_embeddings(triplets):
    training_data = [(str(s), str(o), str(r)) for s, r, o in triplets]
    triples_factory = TriplesFactory.from_labeled_triples(np.array(training_data))

    result = pipeline(
        model="TransE",
        dataset=None,
        training=triples_factory,
        testing=triples_factory,
        training_kwargs=dict(num_epochs=100),
        model_kwargs=dict(embedding_dim=10),
    )

    entity_embeddings = result.model.entity_representations[0]().detach().numpy()
    relation_embeddings = result.model.relation_representations[0]().detach().numpy()
    
    return entity_embeddings, relation_embeddings, result

def aggregate_comapny_embeddings(triplet_ids, entity_embeddings, relation_embeddings):
    # aggregate multiple embeddings into single triplet embedding

    triplet_embeddings = [
        np.concatenate(
            [
                entity_embeddings[head],
                relation_embeddings[relation],
                entity_embeddings[tail]
            ]
            )
            for head, relation, tail in triplet_ids
    ]

    company_embedding = np.mean(triplet_embeddings, axis=0)
    return company_embedding

In [114]:
# extract triplets from json files
all_company_data = []
for file in bankrupt_json_file:
    s, r, t = get_triplets(os.path.join(bankrupt_json_path, file), True)
    entity2id[s], relation2id[r], entity2id[t]
    all_company_data.append((s, r, t))

for file in healthy_json_file:
    s, r, t = get_triplets(os.path.join(healthy_json_path, file), False)
    entitiy2id[s], relation2id[r], entitiy2id[t]
    all_company_data.append((s, r, t))

assert len(all_company_data) == bankrupt_json_file.__len__() + healthy_json_file.__len__()

ValueError: not enough values to unpack (expected 3, got 2)

In [None]:
for company_data in all_company_data:
    company_triplets, label = company_data
    
    entity_emb, relation_emb = generate_kg_embeddings(company_triplets)

    triplets_id = [(entitiy2id(s), relation2id(r), entitiy2id(t)) for s, r, t, in company_triplets]

    

    

In [None]:
# Example company triplets (convert these for each company as needed)
triplets_for_company = triplet_ids  # Using the previously converted triplet IDs

# Generate embeddings
entity_emb, relation_emb = generate_kg_embeddings(triplets_for_company)

# Compute the final embedding for the company by averaging the triplet embeddings
company_embedding = aggregate_comapny_embeddings(triplets_for_company, entity_emb, relation_emb)


In [28]:
company_embedding.shape

(30,)