In [None]:
import os
import bz2
import pandas as pd
import numpy as np
import rdflib
from rdflib import Graph, URIRef, Literal, Namespace
from collections import Counter
import urllib.parse
import matplotlib.pyplot as plt
import seaborn as sns

import pykeen
from pykeen.datasets import Dataset
from pykeen.models import ConvE, DistMult
from pykeen.training import training_loop
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline
from pykeen.predict import predict_target

from sklearn.model_selection import train_test_split

In [2]:
ttl_files = [
    "data/hiwiki-20250601-anchor-text.ttl.bz2",
    "data/hiwiki-20250601-commons-page-links.ttl.bz2", 
    "data/hiwiki-20250601-disambiguations.ttl.bz2",
    "data/hiwiki-20250601-geo-coordinates-mappingbased.ttl.bz2",
    "data/hiwiki-20250601-homepages.ttl.bz2",
    "data/hiwiki-20250601-images.ttl.bz2",
    "data/hiwiki-20250601-instance-types-transitive.ttl.bz2",
    "data/hiwiki-20250601-instance-types.ttl.bz2",
    "data/hiwiki-20250601-mappingbased-literals.ttl.bz2",
    "data/hiwiki-20250601-mappingbased-objects-uncleaned.ttl.bz2",
    "data/hiwiki-20250601-redirects.ttl.bz2",
    "data/hiwiki-20250601-specific-mappingbased-properties.ttl.bz2",
    "data/hiwiki-20250601-topical-concepts.ttl.bz2"
]

In [None]:
all_triples = []
for file_path in ttl_files:
    print(f"Parsing {os.path.basename(file_path)}...")
    
    try:
        with bz2.open(file_path, 'rt', encoding='utf-8') as f:
            g = Graph()
            try:
                g.parse(f, format='turtle')
            except (ParserError, Exception) as e:
                print(f"  WARNING: Could not parse {os.path.basename(file_path)}. Error: {e}")
                continue

            for s, p, o in g:
                # clean subject and predicate which are always URIs
                # cleaning for this warning mostly: 
                # http://hi.wikipedia.org/wiki/चित्र:\nPreah_Khan_temple_at_Angkor,_Cambodia.jpg 
                # does not look like a valid URI, trying to serialize this will break.
                s_clean = str(s).replace('\n', '').replace('\r', '').strip()                
                p_clean = str(p).replace('\n', '').replace('\r', '').strip()

                if isinstance(o, URIRef):
                    o_clean = str(o).replace('\n', '').replace('\r', '').strip()                
                    all_triples.append((s_clean, p_clean, o_clean))
                
                elif isinstance(o, Literal):
                    # not including literals because it doesn't make sense to learn attributes like date of birth rather we want
                    # to learn symbolic links between 2 entities
                    # also theres a lot of issues with this one while parsing the graph like this
                    # 
                    continue
                    # print(f"literal: {o}")
                    # o_clean = str(o.value) 
                    # all_triples.append((s_clean, p_clean, o_clean))

    except Exception as e:
        print(f"Error processing file {file_path}: {e}")

print(f"total: {len(all_triples)} triples")

Parsing hiwiki-20250601-anchor-text.ttl.bz2...
Parsing hiwiki-20250601-commons-page-links.ttl.bz2...
Parsing hiwiki-20250601-disambiguations.ttl.bz2...
Parsing hiwiki-20250601-geo-coordinates-mappingbased.ttl.bz2...
Parsing hiwiki-20250601-homepages.ttl.bz2...
Parsing hiwiki-20250601-images.ttl.bz2...


http://commons.wikimedia.org/wiki/Special:FilePath/\nBhutan-Paro-Stadt-06-Zentrum-2015-gje.jpg does not look like a valid URI, trying to serialize this will break.
http://commons.wikimedia.org/wiki/Special:FilePath/\nBhutan-Paro-Stadt-06-Zentrum-2015-gje.jpg does not look like a valid URI, trying to serialize this will break.
http://commons.wikimedia.org/wiki/Special:FilePath/\nBhutan-Paro-Stadt-06-Zentrum-2015-gje.jpg?width=300 does not look like a valid URI, trying to serialize this will break.
http://commons.wikimedia.org/wiki/Special:FilePath/\nBhutan-Paro-Stadt-06-Zentrum-2015-gje.jpg does not look like a valid URI, trying to serialize this will break.
http://commons.wikimedia.org/wiki/Special:FilePath/\nBhutan-Paro-Stadt-06-Zentrum-2015-gje.jpg?width=300 does not look like a valid URI, trying to serialize this will break.
http://commons.wikimedia.org/wiki/Special:FilePath/\nBhutan-Paro-Stadt-06-Zentrum-2015-gje.jpg does not look like a valid URI, trying to serialize this will bre

Parsing hiwiki-20250601-instance-types-transitive.ttl.bz2...
Parsing hiwiki-20250601-instance-types.ttl.bz2...
Parsing hiwiki-20250601-mappingbased-literals.ttl.bz2...


Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_xsd_date at 0x10bd96d40>
Traceback (most recent call last):
  File "/Users/adityavenkatesh/Documents/Code/nef_new/neural-extraction-framework/GSoC25_H/.env/lib/python3.10/site-packages/rdflib/term.py", line 2163, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "/Users/adityavenkatesh/Documents/Code/nef_new/neural-extraction-framework/GSoC25_H/.env/lib/python3.10/site-packages/rdflib/xsd_datetime.py", line 593, in parse_xsd_date
    return parse_date(date_string if not minus else ("-" + date_string))
  File "/Users/adityavenkatesh/Documents/Code/nef_new/neural-extraction-framework/GSoC25_H/.env/lib/python3.10/site-packages/isodate/isodates.py", line 193, in parse_date
    raise ISO8601Error("Unrecognised ISO 8601 date format: %r" % datestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: '-0753-04-21'
Faile

Parsing hiwiki-20250601-mappingbased-objects-uncleaned.ttl.bz2...
Parsing hiwiki-20250601-redirects.ttl.bz2...
Parsing hiwiki-20250601-specific-mappingbased-properties.ttl.bz2...
Parsing hiwiki-20250601-topical-concepts.ttl.bz2...
total: 2039012 triples


In [4]:
# filter out disambiguation triplets
core_triples = [
    t for t in all_triples 
    if 'http://www.w3.org/2002/07/owl#sameAs' not in t[1] and 'http://dbpedia.org/ontology/wikiPageDisambiguates' not in t[1] and t[1].startswith('http://dbpedia.org/ontology/')
]

In [5]:
relation_counts = Counter(t[1] for t in core_triples)

for i, (relation, count) in enumerate(relation_counts.most_common(30)):
    print(f"{i+1}. {relation:<30} | Count: {count}")

1. http://dbpedia.org/ontology/language | Count: 108087
2. http://dbpedia.org/ontology/thumbnail | Count: 78098
3. http://dbpedia.org/ontology/wikiPageRedirects | Count: 77046
4. http://dbpedia.org/ontology/subdivision | Count: 49789
5. http://dbpedia.org/ontology/starring | Count: 32077
6. http://dbpedia.org/ontology/state | Count: 28398
7. http://dbpedia.org/ontology/district | Count: 25201
8. http://dbpedia.org/ontology/country | Count: 19176
9. http://dbpedia.org/ontology/birthPlace | Count: 14088
10. http://dbpedia.org/ontology/timeZone | Count: 11205
11. http://dbpedia.org/ontology/occupation | Count: 8277
12. http://dbpedia.org/ontology/termPeriod | Count: 7609
13. http://dbpedia.org/ontology/nationality | Count: 4206
14. http://dbpedia.org/ontology/deathPlace | Count: 3509
15. http://dbpedia.org/ontology/director | Count: 3279
16. http://dbpedia.org/ontology/politicalLeader | Count: 3214
17. http://dbpedia.org/ontology/producer | Count: 3178
18. http://dbpedia.org/ontology/type

In [6]:
least_common = relation_counts.most_common()[-30:]
for i, (relation, count) in enumerate(least_common):
    print(f"{i+1}. {relation:<30} | Count: {count}")

1. http://dbpedia.org/ontology/academicAdvisor | Count: 21
2. http://dbpedia.org/ontology/alongside | Count: 20
3. http://dbpedia.org/ontology/honours | Count: 19
4. http://dbpedia.org/ontology/recordedIn | Count: 17
5. http://dbpedia.org/ontology/nonFictionSubject | Count: 16
6. http://dbpedia.org/ontology/architect | Count: 16
7. http://dbpedia.org/ontology/administrativeCenter | Count: 15
8. http://dbpedia.org/ontology/viceChancellor | Count: 13
9. http://dbpedia.org/ontology/restingPlacePosition | Count: 11
10. http://dbpedia.org/ontology/minister | Count: 11
11. http://dbpedia.org/ontology/creatorOfDish | Count: 10
12. http://dbpedia.org/ontology/opponent | Count: 9
13. http://dbpedia.org/ontology/athletics | Count: 9
14. http://dbpedia.org/ontology/translator | Count: 9
15. http://dbpedia.org/ontology/coverArtist | Count: 8
16. http://dbpedia.org/ontology/sport | Count: 7
17. http://dbpedia.org/ontology/province | Count: 7
18. http://dbpedia.org/ontology/manager | Count: 7
19. ht

In [7]:
# Analyze Entity Degree (Connectivity)

entity_degrees = Counter()
for head, relation, tail in core_triples:
    entity_degrees[head] += 1
    entity_degrees[tail] += 1

print(f"\nTotal unique entities found in core_triples: {len(entity_degrees)}")



Total unique entities found in core_triples: 304395


In [8]:
degree_df = pd.DataFrame(entity_degrees.values(), columns=['degree'])
print(degree_df['degree'].describe())

count    304395.000000
mean          3.575361
std          84.226941
min           1.000000
25%           1.000000
50%           1.000000
75%           3.000000
max       28392.000000
Name: degree, dtype: float64


### metadata and structural noise we can remove
wikiPageRedirects (77046 count): This is not a semantic relationship. It's a structural artifact from Wikipedia that says "this page is a redirect to another". Keeping this will teach the model that many things are simply equivalent, which is not useful for predicting new facts. we can remove this safely. 


thumbnail (78098 count): This links an entity to its image URL. It has zero semantic value for predicting relationships like (Person, birthPlace, City). It's just noise


mainArticleForCategory (606 count): This is another structural link between a category page and its main article. Not a semantic fact. Remove.

language(108087 count): This almost always links an entity to a language entity (dbr:Hindi_language). technically it is semantic but it can create a massive, uninformative hub around the Hindi_language entity, biasing our model.




### high-frequency but potentially noisy relations
language (#2, 13,838 count): This links entities (like movies, books, people) to their language (e.g., http://dbpedia.org/resource/Hindi). This creates a massive "hub" where the "Hindi" entity is connected to thousands of things. While technically a fact, it doesn't help in predicting diverse relationships and can skew the model. We want to predict things about India, not that India's language is Hindi. For a more focused KGC task, it's best to remove this.


timeZone (#10, 1,418 count): Similar to language, this connects many locations to a few timezone entities (like "Asia/Kolkata"). This is also a candidate for removal to improve focus.




### semantic core relations we shold keep
starring, occupation, nationality, director, subdivision, state, district, country, birthPlace, deathPlace, location, residence. These are the facts we want to model.
starring, producer, director, writer, musicBy, genre: forms a strong sub-graph about media and films.
occupation, politician, party, almaMater: Great biographical and political relations.
All the others in the top 30 are generally good semantic relationships.

### rare relations 
generally model can't learn from sprase relations which only show up a couple of times. relations like Relations like bodyDiscovered (1), taoiseach (2), or mother (5) are too sparse.
My plan is to set a threshold and remove any relation that appears fewer than 50 or 100 times.  While we lose some specific facts, we can  create a denser, more learnable graph for the model.


## Entity Degree / connectivity analysis

count: 304395: we have ~305k unique entities in this core_triples set.
mean: 3.57: The average entity is connected to 3-4 other things. quite low..sparse graph. 
std: 84.22: degrees are not distributed evenly at all. some entities have a lot more connections than the avg. 

50% (median): 1.0: This is the most critical statistic. It means at least 50% of your entities have only ONE connection! These are "leaf" nodes. The model has no context for these entities and cannot learn a meaningful vector for them. They are pure noise for the training process.


max: 50000: This shows a massive "hub" entity, which is almost certainly the entity http://dbpedia.org/ontology/wikiPageRedirects is pointing to.


The Histogram: The "Zoomed-in View" plot confirms this visually. There is a gigantic bar at Degree = 1 with over 200,000 entities. This is the "long tail" of very sparsely connected nodes that we must prune.


The structure is dominated by a small number of massive "hubs" that connect to everything, while the vast majority of entities are on the periphery with very few connections.

### The Strategy: Pruning the Graph
Based on this analysis, my strategy is to perform a two-step pruning process to create a smaller, denser, and more semantically coherent graph.

Semantic Filtering: We will remove the relations from Category A and B using a "blacklist".


Frequency Pruning (K-Core Pruning): We will remove all relations that appear too infrequently and all entities that are not connected enough times.

when we split into train/val/test sets we should make sure that entities in our val/test also appear in train set. 

In [9]:
# remove triples as per blacklist
print(f"number of core triples before pruning: {len(core_triples)}")

RELATION_BLACKLIST = [
    'http://dbpedia.org/ontology/wikiPageRedirects',
    'http://dbpedia.org/ontology/thumbnail',
    'http://dbpedia.org/ontology/language',
    'http://dbpedia.org/ontology/timeZone',
    'http://dbpedia.org/ontology/restingPlacePosition',
    'http://dbpedia.org/ontology/mainArticleForCategory'
]
semantically_filtered_triples = [t for t in core_triples if t[1] not in RELATION_BLACKLIST]

print(f"number of triples after semantic filtering: {len(semantically_filtered_triples)}")

number of core triples before pruning: 544161
number of triples after semantic filtering: 269108


In [10]:
# prune graph as per required relation frequency and entity degree

MIN_RELATION_FREQUENCY = 50
MIN_ENTITY_DEGREE = 6

relation_counts = Counter(t[1] for t in semantically_filtered_triples)
valid_relations = {r for r, c in relation_counts.items() if c >= MIN_RELATION_FREQUENCY}
temp_triples = [t for t in semantically_filtered_triples if t[1] in valid_relations]

while True:
    entity_degrees = Counter()
    for h, _, t in temp_triples:
        entity_degrees[h] += 1
        entity_degrees[t] += 1
    valid_entities = {e for e, d in entity_degrees.items() if d >= MIN_ENTITY_DEGREE}
    pruned_triples = [t for t in temp_triples if t[0] in valid_entities and t[2] in valid_entities]
    if len(pruned_triples) == len(temp_triples):
        break
    temp_triples = pruned_triples

print(f"Final number of high-quality triples: {len(pruned_triples)}\n")

Final number of high-quality triples: 51607



In [11]:
# data prep for training
df = pd.DataFrame(pruned_triples, columns=['head', 'relation', 'tail'])
entities = pd.concat([df['head'], df['tail']]).unique()
relations = df['relation'].unique()
entity_to_id = {entity: i for i, entity in enumerate(entities)}
relation_to_id = {relation: i for i, relation in enumerate(relations)}
id_to_entity = {i: entity for entity, i in entity_to_id.items()}
id_to_relation = {i: rel for rel, i in relation_to_id.items()}


# triples_as_ids = np.array([(entity_to_id[h], relation_to_id[r], entity_to_id[t]) for h, r, t in df.itertuples(index=False)])
# train_ids, temp_ids = train_test_split(triples_as_ids, test_size=0.2, random_state=42)
# validation_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=42)


# this doesnt ensure that entities in our val/test also occur in our train set - hence the next cell fixes that
# tf_train = TriplesFactory(mapped_triples=train_ids, entity_to_id=entity_to_id, relation_to_id=relation_to_id, create_inverse_triples = True)
# tf_validation = TriplesFactory(mapped_triples=validation_ids, entity_to_id=entity_to_id, relation_to_id=relation_to_id, create_inverse_triples = True)
# tf_test = TriplesFactory(mapped_triples=test_ids, entity_to_id=entity_to_id, relation_to_id=relation_to_id, create_inverse_triples = True)

In [12]:
pruned_triples_array = np.array(pruned_triples, dtype=object)

all_tf = TriplesFactory.from_labeled_triples(
    triples=pruned_triples_array,
    create_inverse_triples=True # Let PyKEEN handle inverse triples for the full graph
)
tf_train, tf_validation, tf_test = all_tf.split(
    ratios=[0.8, 0.1, 0.1],
    random_state=42,
)

In [13]:
id_to_entity = all_tf.entity_id_to_label
id_to_relation = all_tf.relation_id_to_label

print(f"Number of entities in overall graph: {len(all_tf.entity_to_id)}")
print(f"Number of entities in training set: {len(tf_train.entity_to_id)}")
print(f"Number of entities in validation set: {len(tf_validation.entity_to_id)}")
print(f"Number of entities in test set: {len(tf_test.entity_to_id)}")

Number of entities in overall graph: 7584
Number of entities in training set: 7584
Number of entities in validation set: 7584
Number of entities in test set: 7584


## Training with TransE


In [14]:
# training on pruned graph
pipeline_result_transe = pipeline(
    training=tf_train,
    validation=tf_validation,
    testing=tf_test,
    model='transE',
    model_kwargs=dict(embedding_dim=100),
    training_kwargs=dict(num_epochs=100, batch_size=128),
    optimizer_kwargs=dict(lr=0.01), 
    random_seed=42,
    device='mps',
)
print("Training complete!")

INFO:pykeen.triples.triples_factory:Creating inverse triples.


Training epochs on mps:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

INFO:pykeen.triples.triples_factory:Creating inverse triples.


Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/646 [00:00<?, ?batch/s]



Evaluating on mps:0:   0%|          | 0.00/5.16k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.19s seconds


Training complete!


In [16]:
model = pipeline_result_transe.model
pred = predict_target(
    model=model,
    head="http://hi.dbpedia.org/resource/देश_प्रेमी_(1982_फ़िल्म)",
    relation="http://dbpedia.org/ontology/starring",
    triples_factory=tf_train,
)
# remove all targets which we know from training set
pred_filtered = pred.filter_triples(tf_train)
# pred_filtered.df.to_csv("एडोस_इंटरएक्टिव.csv")

pred_annotated = pred_filtered.add_membership_columns(validation=tf_validation, testing=tf_test)
pred_annotated.df[:10]

Unnamed: 0,tail_id,score,tail_label,in_validation,in_testing
6073,6073,-58.279877,http://hi.dbpedia.org/resource/लक्ष्मी_छाया,False,False
1420,1420,-58.299492,http://hi.dbpedia.org/resource/कुंदन_(हिन्दी_फ...,False,False
3279,3279,-58.30127,http://hi.dbpedia.org/resource/देवेन_वर्मा,False,False
2331,2331,-58.3134,http://hi.dbpedia.org/resource/जलाल_आग़ा,False,False
7155,7155,-58.328964,http://hi.dbpedia.org/resource/सुरेश_चटवाल,False,False
4975,4975,-58.361649,http://hi.dbpedia.org/resource/मदन_पुरी,False,False
3191,3191,-58.362354,http://hi.dbpedia.org/resource/दीपक_शिर्के,False,False
7260,7260,-58.385082,http://hi.dbpedia.org/resource/सोमी_अली,False,False
7158,7158,-58.418663,http://hi.dbpedia.org/resource/सुरैया,False,False
5249,5249,-58.421432,http://hi.dbpedia.org/resource/मीना_टी,False,False


In [17]:
results_df = pipeline_result_transe.metric_results.to_df()
test_results_df = results_df[
    (results_df['Side'] == 'tail') &
    (results_df['Rank_type'] == 'realistic')
]

print("\n--- Model Performance on the Test Set ---")
print(test_results_df[['Metric', 'Value']])


--- Model Performance on the Test Set ---
                                  Metric         Value
4          adjusted_arithmetic_mean_rank  1.665623e-01
13             median_absolute_deviation  2.683510e+02
22                 z_geometric_mean_rank  6.707749e+01
31           inverse_geometric_mean_rank  5.166937e-03
40                  arithmetic_mean_rank  6.312593e+02
49    adjusted_geometric_mean_rank_index  9.309739e-01
58                              variance  1.481916e+06
67                    harmonic_mean_rank  4.192870e+01
76                   inverse_median_rank  4.672897e-03
85            inverse_harmonic_mean_rank  2.385001e-02
94          inverse_arithmetic_mean_rank  1.584135e-03
103         z_inverse_harmonic_mean_rank  1.105881e+02
112                  geometric_mean_rank  1.935383e+02
121  adjusted_arithmetic_mean_rank_index  8.336576e-01
130                                count  5.161000e+03
139                   standard_deviation  1.217340e+03
148  adjusted_inverse_

## ConvE

In [18]:
# training on pruned graph
pipeline_result_conve = pipeline(
    training=tf_train,
    validation=tf_validation,
    testing=tf_test,
    model='convE',
    model_kwargs=dict(embedding_dim=150),
    training_kwargs=dict(num_epochs=200, batch_size=256),
    optimizer_kwargs=dict(lr=0.01), 
    random_seed=42,
    device='mps',
)
print("Training complete!")

INFO:pykeen.pipeline.api:Using device: mps
INFO:pykeen.nn.modules:Resolving None * None * None = 150.
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.triples.triples_factory:Creating inverse triples.


Training epochs on mps:0:   0%|          | 0/200 [00:00<?, ?epoch/s]

INFO:pykeen.triples.triples_factory:Creating inverse triples.
INFO:pykeen.training.training_loop:Dropping last (incomplete) batch each epoch (1/322 (0.31%) batches).


Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/322 [00:00<?, ?batch/s]



Evaluating on mps:0:   0%|          | 0.00/5.16k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.33s seconds


Training complete!


In [19]:
results_df = pipeline_result_conve.metric_results.to_df()
test_results_df = results_df[
    (results_df['Side'] == 'tail') &
    (results_df['Rank_type'] == 'realistic')
]

print("\n--- Model Performance on the Test Set ---")
print(test_results_df[['Metric', 'Value']])


--- Model Performance on the Test Set ---
                                  Metric         Value
4          adjusted_arithmetic_mean_rank  1.094974e+00
13             median_absolute_deviation  3.275068e+03
22                 z_geometric_mean_rank -1.275054e+00
31           inverse_geometric_mean_rank  3.521477e-04
40                  arithmetic_mean_rank  4.149873e+03
49    adjusted_geometric_mean_rank_index -1.769658e-02
58                              variance  6.032445e+06
67                    harmonic_mean_rank  8.278647e+02
76                   inverse_median_rank  2.259376e-04
85            inverse_harmonic_mean_rank  1.207927e-03
94          inverse_arithmetic_mean_rank  2.409712e-04
103         z_inverse_harmonic_mean_rank -2.297058e-01
112                  geometric_mean_rank  2.839718e+03
121  adjusted_arithmetic_mean_rank_index -9.499906e-02
130                                count  5.161000e+03
139                   standard_deviation  2.456104e+03
148  adjusted_inverse_

## Link Prediction Using Muril for Initial Embeddings

We'll try to use muril embeddings from this [paper](https://arxiv.org/abs/2103.10730#:~:text=LMs,MuRIL%20in%20handling%20transliterated%20data). Its also trained on data collected from the Common Crawl OSCAR corpus and Wikipedia.

We're taking the `[CLS]` token embedding from the MuRIL model when feeding in the entity name. these would be of dimension (1,768). We'd like to reduce this to 200 or 150 depending on our model. 
first ill use pca for decomposition then autoencoder(TODO) which should give better performance. 


In [20]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.decomposition import PCA
from pykeen.nn.init import PretrainedInitializer

In [None]:
model_name = "google/muril-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# 2. Generate MuRIL Embeddings for Entity Names
entity_embeddings_dict = {}
with torch.no_grad(): #Disable gradient calculation
    for entity, entity_id in all_tf.entity_to_id.items():
        #Tokenize entity name
        entity_name = urllib.parse.unquote(entity.split('/')[-1]) # Get the name from resource url
        # print(f"encoding entity: {entity_name}")
        # TODO: try by fetching the corresponding wiki article and encoding that
        encoded_input = tokenizer(entity_name, padding=True, truncation=True, return_tensors='pt')

        output = model(**encoded_input)

        #[CLS] embedding extraction
        cls_embedding = output.last_hidden_state[:,0,:]

        #Store the Embedding
        entity_embeddings_dict[entity_id] = cls_embedding.cpu().numpy()

print("All Done!")

All Done!


In [None]:
# prepare Entity Embedding Tensor for PCA
num_entities = tf_train.num_entities
embedding_dim_muril = 768  # MuRIL Embedding dimension
entity_embeddings_matrix = np.zeros((num_entities, embedding_dim_muril)) 

In [23]:
for entity_id, embedding in entity_embeddings_dict.items():
    entity_embeddings_matrix[entity_id] = embedding.flatten() #Store numpy array

In [24]:
# Apply PCA for Dimensionality Reduction
embedding_dim_reduced = 200  # Reduced embedding dimension
pca = PCA(n_components=embedding_dim_reduced)
reduced_entity_embeddings = pca.fit_transform(entity_embeddings_matrix)

In [25]:
entity_embeddings_tensor = torch.tensor(reduced_entity_embeddings, dtype=torch.float)

### TransE with Muril initial embeddings

In [26]:
pipeline_result_transe_muril = pipeline(
    training=tf_train,
    validation=tf_validation,
    testing=tf_test,
    model='TransE',
    model_kwargs=dict(embedding_dim=200, entity_initializer=PretrainedInitializer(tensor=entity_embeddings_tensor)),
    training_kwargs=dict(num_epochs=100, batch_size=256),
    optimizer_kwargs=dict(lr=0.01), 
    random_seed=42,
    device='mps'
)

INFO:pykeen.pipeline.api:Using device: mps
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.triples.triples_factory:Creating inverse triples.


Training epochs on mps:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

INFO:pykeen.triples.triples_factory:Creating inverse triples.


Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/323 [00:00<?, ?batch/s]



Evaluating on mps:0:   0%|          | 0.00/5.16k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.60s seconds


In [27]:
results_df = pipeline_result_transe_muril.metric_results.to_df()
test_results_df = results_df[
    (results_df['Side'] == 'tail') &
    (results_df['Rank_type'] == 'realistic')
]

print("\n--- Model Performance on the Test Set ---")
print(test_results_df[['Metric', 'Value']])


--- Model Performance on the Test Set ---
                                  Metric         Value
4          adjusted_arithmetic_mean_rank  1.841849e-01
13             median_absolute_deviation  3.780636e+02
22                 z_geometric_mean_rank  6.658794e+01
31           inverse_geometric_mean_rank  4.706084e-03
40                  arithmetic_mean_rank  6.980475e+02
49    adjusted_geometric_mean_rank_index  9.241793e-01
58                              variance  1.564711e+06
67                    harmonic_mean_rank  2.235836e+01
76                   inverse_median_rank  3.267974e-03
85            inverse_harmonic_mean_rank  4.472600e-02
94          inverse_arithmetic_mean_rank  1.432567e-03
103         z_inverse_harmonic_mean_rank  2.127621e+02
112                  geometric_mean_rank  2.124909e+02
121  adjusted_arithmetic_mean_rank_index  8.160304e-01
130                                count  5.161000e+03
139                   standard_deviation  1.250884e+03
148  adjusted_inverse_

## ConvE with muril

In [28]:
pipeline_result_conve_muril = pipeline(
    training=tf_train,
    validation=tf_validation,
    testing=tf_test,
    model='ConvE',
    model_kwargs=dict(embedding_dim=200, entity_initializer=PretrainedInitializer(tensor=entity_embeddings_tensor)),
    training_kwargs=dict(num_epochs=200, batch_size=128),
    optimizer_kwargs=dict(lr=0.001), 
    random_seed=42,
    device='mps',
    stopper='early'
)

INFO:pykeen.pipeline.api:Using device: mps
INFO:pykeen.nn.modules:Resolving None * None * None = 200.
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /Users/adityavenkatesh/.data/pykeen/checkpoints/best-model-weights-26507925-dd18-4d77-881e-d2b5bbbe611e.pt
INFO:pykeen.triples.triples_factory:Creating inverse triples.


Training epochs on mps:0:   0%|          | 0/200 [00:00<?, ?epoch/s]

INFO:pykeen.triples.triples_factory:Creating inverse triples.
INFO:pykeen.training.training_loop:Dropping last (incomplete) batch each epoch (1/645 (0.16%) batches).


Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.60s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.045824452625460185. Saved model weights to /Users/adityavenkatesh/.data/pykeen/checkpoints/best-model-weights-26507925-dd18-4d77-881e-d2b5bbbe611e.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.


Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.47s seconds


Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

Training batches on mps:0:   0%|          | 0.00/645 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.73s seconds
INFO:pykeen.stoppers.early_stopping:Stopping early at epoch 30. The best result 0.045824452625460185 occurred at epoch 10.
INFO:pykeen.stoppers.early_stopping:Re-loading weights from best epoch from /Users/adityavenkatesh/.data/pykeen/checkpoints/best-model-weights-26507925-dd18-4d77-881e-d2b5bbbe611e.pt


Evaluating on mps:0:   0%|          | 0.00/5.16k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.83s seconds


In [29]:
results_df = pipeline_result_conve_muril.metric_results.to_df()
test_results_df = results_df[
    (results_df['Side'] == 'tail') &
    (results_df['Rank_type'] == 'realistic')
]

print("\n--- Model Performance on the Test Set ---")
print(test_results_df[['Metric', 'Value']])


--- Model Performance on the Test Set ---
                                  Metric          Value
4          adjusted_arithmetic_mean_rank       0.120428
13             median_absolute_deviation     345.446320
22                 z_geometric_mean_rank      67.504543
31           inverse_geometric_mean_rank       0.005650
40                  arithmetic_mean_rank     456.413300
49    adjusted_geometric_mean_rank_index       0.936901
58                              variance  325885.875000
67                    harmonic_mean_rank      31.789898
76                   inverse_median_rank       0.003817
85            inverse_harmonic_mean_rank       0.031457
94          inverse_arithmetic_mean_rank       0.002191
103         z_inverse_harmonic_mean_rank     147.816927
112                  geometric_mean_rank     177.005676
121  adjusted_arithmetic_mean_rank_index       0.879804
130                                count    5161.000000
139                   standard_deviation     570.864136
148  

## Analysis
analysing latest results with MuRIL initial embeddings and discuss how to further increase performance.

### 1. Analysis of Latest Results with MIN_ENTITY_DEGREE = 10

Putting all four models' "realistic" tail prediction metrics side-by-side for a clear comparison:

| Model                       | `hits_at_1` | `hits_at_3` | `hits_at_5` | `hits_at_10` | `arithmetic_mean_rank` (Lower is better) | `inverse_harmonic_mean_rank` (Higher is better) |
| :-------------------------- | :---------- | :---------- | :---------- | :----------- | :--------------------------------------- | :---------------------------------------------- |
| **TransE (Default Init)**   | 0.0098      | 0.0353      | 0.0599      | 0.1129       | 147.49                                   | 0.0487                                          |
| **ConvE (Default Init)**    | 0.0049      | 0.0088      | 0.0118      | 0.0285       | 266.15                                   | 0.0185                                          |
| **TransE (MuRIL Init)**     | **0.0226**  | **0.0667**  | **0.0903**  | **0.1315**   | **137.47**                               | **0.0646**                                      |
| **ConvE (MuRIL Init)**      | 0.0147      | 0.0402      | 0.0579      | 0.0903       | 153.57                                   | 0.0446                                          |

**Key Observations:**

1.  **MuRIL Embeddings are Beneficial:** Both TransE and ConvE show an improvement when initialized with MuRIL embeddings (after PCA reduction).
    *   **TransE (MuRIL)** is now the **best performing model** across all `Hits@k` metrics, and has the lowest `arithmetic_mean_rank` and highest `inverse_harmonic_mean_rank`. This is a clear win for using MuRIL embeddings with TransE.
    *   **ConvE (MuRIL)** shows a very significant improvement over **ConvE (Default)** across all metrics. For instance, its Hits@10 jumped from 0.0285 to 0.0903. This indicates that MuRIL embeddings provide a much better starting point for ConvE, helping it learn more effectively.

2.  **TransE still outperforming ConvE:** Even with MuRIL initializations, TransE (MuRIL) still slightly outperforms ConvE (MuRIL) on this dataset and hyperparameter set. This is still a bit counter-intuitive given ConvE's theoretical capacity, reinforcing the idea that ConvE is more sensitive to its setup.

**Potential areas for refinement:**

*   **Richer Entity Representations:** Instead of just the entity name, we could try to embed a short textual description of the entity (first para from wikipedia) using MuRIL to get a more comprehensive semantic representation. 
*   **Contextualized Embeddings:** For entities that are ambiguous based on their name alone, providing more context might help. However, this is significantly more complex and often involves joint training or more sophisticated initializations beyond a simple `PretrainedInitializer`.
*   **Fine-tuning MuRIL:** In very advanced setups, the MuRIL model itself could be fine-tuned *during* the KGE training process, allowing the text embeddings to adapt further to the knowledge graph structure. we can explore ways to do this with inbuilt pykeen methods. 

### 3. What we can try next:


**Hyperparameter Tuning:**

*   **Systematic Tuning:** For a serious hyperparameter search, consider using tools like:
    *   **Grid Search:** Trying all combinations of a predefined set of hyperparameters (can be computationally expensive).
    *   **Random Search:** Randomly sampling hyperparameters from a distribution (often more efficient than grid search for complex models).
    *   **Bayesian Optimization (e.g., Optuna, Ray Tune):** More advanced methods that intelligently explore the hyperparameter space.

**Other KGE Models:**

*   Exploring other models might be beneficial like:
    *   **DistMult** 
    *   **RotatE** 
    *   **ComplEx**
    *   **GNNs** 



### Analysis of Data Pruning and Graph Characteristics

we changed the pruning thresholds to:
*   `MIN_RELATION_FREQUENCY = 50` (down from 100)
*   `MIN_ENTITY_DEGREE = 6` (down from 10)

This relaxation of the pruning criteria has significantly increased the size of our "high-quality triples" dataset:

*   **Previous `pruned_triples` count:** 10,183
*   **Current `pruned_triples` count:** 51,607 (a ~5x increase)

Similarly, the number of unique entities in this pruned graph has increased:
*   **Previous unique entities:** 1,091
*   **Current unique entities:** 7,584 (a ~7x increase)

**Impact on Graph Density:**
While our initial "Entity Degree / connectivity analysis" still reflects the `core_triples` (showing a median degree of 1, indicating a very sparse original graph with many leaf nodes), the new `MIN_ENTITY_DEGREE = 6` filter will enforce that every entity remaining in `pruned_triples` has at least 6 connections. This substantially increases the effective density and connectivity of the graph used for training, making it potentially more amenable to KGE models. The increased number of triples and entities means your models now have a much larger dataset to learn from.

### Model Performance on the New, Larger Dataset

Let's compare the "realistic" tail prediction metrics for all four model configurations on this updated dataset.

| Model                       | `hits_at_1` (Higher is better) | `hits_at_3` (Higher is better) | `hits_at_5` (Higher is better) | `hits_at_10` (Higher is better) | `arithmetic_mean_rank` (Lower is better) | `inverse_harmonic_mean_rank` (Higher is better) |
| :-------------------------- | :----------------------------- | :----------------------------- | :----------------------------- | :------------------------------ | :--------------------------------------- | :---------------------------------------------- |
| **TransE (Default Init)**   | 0.0050                         | 0.0143                         | 0.0211                         | 0.0382                          | 631.26                                   | 0.0238                                          |
| **ConvE (Default Init)**    | 0.0000                         | 0.0002                         | 0.0002                         | 0.0002                          | 4149.87                                  | 0.0012                                          |
| **TransE (MuRIL Init)**     | **0.0242**                     | **0.0420**                     | 0.0512                         | 0.0692                          | 698.05                                   | **0.0447**                                      |
| **ConvE (MuRIL Init - Tuned)** | 0.0047                         | 0.0219                         | **0.0436**                     | **0.0913**                      | **456.41**                               | 0.0315                                          |

**New Key Observations:**

1.  **Overall Performance Degradation (without MuRIL init):**
    *   **TransE (Default Init):** Its performance metrics (Hits@k, MR, MRR) have significantly *decreased* compared to its performance on the smaller, more stringently pruned dataset. For example, Hits@10 dropped from 0.1129 to 0.0382. This suggests that while the dataset is larger, the added triples, even with `MIN_ENTITY_DEGREE=6`, might introduce more noise or less coherent patterns that a basic TransE struggles with, especially without a strong initialization.
    *   **ConvE (Default Init):** Its performance is abysmal, with Hits@1 at 0.0. This indicates complete failure to learn anything meaningful in this setup, likely due to the combination of its complexity, default initialization, and perhaps an even greater sensitivity to the increased "noise" from lower pruning thresholds.

2.  **MuRIL Embeddings Remain Crucial and Improve Performance:**
    *   **TransE (MuRIL Init):** This model still shows the best `Hits@1`, `Hits@3`, and `inverse_harmonic_mean_rank`. It's a clear improvement over TransE with default initialization on this larger dataset.
    *   **ConvE (MuRIL Init - Tuned):** we've made hyperparameter changes for this run (`num_epochs=200`, `batch_size=128`, `lr=0.001`, and adding `stopper='early'`).
        *   This ConvE model performs dramatically better than the **ConvE (Default Init)** on this dataset. This strongly validates the importance of both MuRIL initialization and proper hyperparameter tuning for ConvE.
        *   It now outperforms `TransE (MuRIL Init)` on `Hits@10` (0.0913 vs 0.0692) and `arithmetic_mean_rank` (456.41 vs 698.05), which is a positive sign.
        *   The early stopping (stopping at epoch 30, with best at epoch 10) indicates that 200 epochs was too much, and the model started overfitting or oscillating after epoch 10. Early stopping correctly captured the best performance.

3.  **ConvE Catching Up (and Surpassing in some aspects) with Tuning and Pre-trained Embeddings:** With careful tuning and the powerful MuRIL initial embeddings, ConvE is now showing its potential and is competitive with TransE, surpassing it in higher Hits@k and mean rank. The previous runs likely suffered from a mismatch between model complexity and default hyperparameters on a very sparse graph.

**Overall Conclusion on Performance:**
The increased dataset size through relaxed pruning, while providing more data, also appears to have made the learning task harder for models without proper initialization or tuning. **The use of MuRIL embeddings is consistently beneficial, acting as a strong anchor for both TransE and ConvE, allowing them to learn more effectively from this Hindu Wikipedia knowledge graph.** Furthermore, the targeted hyperparameter tuning (especially `lr` and `batch_size`) along with early stopping significantly unlocked ConvE's capabilities, making it the top performer in terms of Hits@10 and Mean Rank on this larger dataset.

### What we can try next:

The results with MuRIL initial embeddings are promising! To further enhance performance and potentially fully leverage ConvE's capabilities, here's a refined strategy:

**1. Comprehensive Hyperparameter Tuning (Focus on ConvE):**

*   **For ConvE (MuRIL Init):** we will perform a systematic search
    *   **Learning Rate (`lr`):** Since 0.001 showed improvement, explore a narrower range around it: `[0.0005, 0.001, 0.002, 0.005]`.
    *   **Batch Size (`batch_size`):** we moved from 256 to 128 for ConvE (MuRIL Tuned). Continue exploring: `[64, 128, 256, 512]` to see the effect on generalization.
    *   **Embedding Dimension (`embedding_dim`):** we used 200. Test other values like `[150, 250, 300]`.
    *   **ConvE Specific Parameters:** These are critical for ConvE's architecture. PyKEEN often exposes them via `model_kwargs`.
        *   `num_filters`: The number of convolutional filters. Try `[32, 64, 128]`.
        *   `output_channels`: Related to the output of the convolutional layer.
        *   `kernel_size`: The size of the convolutional filters.
        *   `input_dropout`, `hidden_dropout`, `feature_map_dropout`: Dropout rates for regularization. Experiment with `[0.0, 0.1, 0.2, 0.3]`. Dropout can be very effective in preventing overfitting in neural models like ConvE.
    *   **Regularization (`regularizer`):** Even with dropout, explicit regularization (`'L2'` on embeddings) can sometimes help.
    *   **Early Stopping Parameters:** Fine-tune `patience` and `min_delta` within the `stopper='early'` settings for optimal stopping.

*   **Systematic Tuning Tools:**
    *   **Optuna or Ray Tune:** These libraries are highly recommended for automated hyperparameter optimization. They can efficiently search large parameter spaces and help you find the best combinations.

**2. Explore Other KGE Models with MuRIL Initial Embeddings:**

Since MuRIL embeddings proved beneficial for both TransE and ConvE, try applying them to other models that are known for strong performance and different inductive biases:

*   **RotatE:** Excels at modeling various relation patterns (symmetric, antisymmetric, inversion) by using complex embeddings and rotations.
*   **ComplEx:** Also uses complex embeddings and is good for asymmetric relations. It's often strong for link prediction.
*   **DistMult:** A simpler, more interpretable model that assumes symmetric relations. It might still be worth trying on this larger dataset if you haven't already with the new pruning.

**3. Advanced Entity Representations (If simple name is not enough):**

*   **Richer Entity Text:** As mentioned, if available, fetching and embedding a short descriptive text (like the first paragraph of a Wikipedia article for that entity) using MuRIL could provide even richer initial entity embeddings. This would be a more involved data pipeline step.

**4. Loss Function and Optimizer Tuning:**

*   **Loss Function (`loss` in `training_kwargs`):** While defaults are usually good, different loss functions (e.g., `BCEWithLogitsLoss` often used with ConvE, or `MarginRankingLoss` for TransE) can sometimes yield marginal gains.
*   **Optimizer (`optimizer` and `optimizer_kwargs`):** we're using Adam with `lr=0.01`. While `lr` is critical, we could experiment with other optimizers like AdamW or RMSprop if performance plateaus.
