In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter

file_path = '/kaggle/input/train-txt/train.txt'
# Whitespace-separated (head, relation, tail) triples; column names are supplied
# here, so the first line of the file is read as data.
df = pd.read_csv(file_path, sep=r'\s+', names=['head','relation','tail'])

relations = sorted(df['relation'].unique())
# Entities are every distinct value that appears as a head or as a tail.
n_entities = pd.Index(df['head']).union(df['tail']).nunique()
print("triples:", len(df), "| relations:", len(relations), "| people:", n_entities)

# Per-relation lookup tables:
#   edges[r]            -> set of (head, tail) pairs
#   head_to_tails[r][h] -> list of tails t with r(h, t)   (used for r(y,z))
#   tail_to_heads[r][t] -> list of heads h with r(h, t)   (used for r(x,y))
edges = {}
head_to_tails = {}
tail_to_heads = {}
for rel in relations:
    edges[rel] = set()
    head_to_tails[rel] = defaultdict(list)
    tail_to_heads[rel] = defaultdict(list)

# Single pass over the triples fills all three tables.
for h, r, t in zip(df['head'], df['relation'], df['tail']):
    edges[r].add((h, t))
    head_to_tails[r][h].append(t)
    tail_to_heads[r][t].append(h)

print("done indexing")

triples: 13821 | relations: 28 | people: 1316
done indexing


In [2]:
def compose_pair_counts(r1, r2):
    """
    Joins r1(x,y) with r2(y,z) on the shared middle entity y.

    Reads the module-level indexes tail_to_heads (for r1) and head_to_tails
    (for r2); raises KeyError if either relation is missing from them.

    Returns:
      witnesses[(x,z)] = how many distinct y link x to z through r1 then r2
      total            = number of (x,y,z) groundings of the body overall
    """
    heads_by_mid = tail_to_heads[r1]
    tails_by_mid = head_to_tails[r2]

    witnesses = Counter()
    total = 0

    # Only entities that are both a tail of r1 and a head of r2 can act as y.
    for mid in set(heads_by_mid.keys()) & set(tails_by_mid.keys()):
        sources = heads_by_mid[mid]
        targets = tails_by_mid[mid]
        if sources and targets:
            total += len(sources) * len(targets)
            # Every (x, z) combination through this y contributes one witness.
            witnesses.update((x, z) for x in sources for z in targets)
    return witnesses, total

rule_rows = []
cache = {}

# Walk every ordered body (r1, r2); bodies with no grounding are skipped and
# are therefore never stored in `cache`.
for r1, r2 in ((first, second) for first in relations for second in relations):
    pair_count, body_sup = compose_pair_counts(r1, r2)
    if body_sup != 0:
        cache[(r1, r2)] = (pair_count, body_sup)
        keyset = set(pair_count.keys())

        # Score the body against every candidate head relation r3.
        for r3 in relations:
            inter = keyset & edges[r3]
            if inter:
                distinct = len(inter)                       # distinct (x,z) pairs explained
                supp = sum(pair_count[p] for p in inter)    # grounding support
                conf = supp / body_sup
                rule_rows.append((r1, r2, r3, body_sup, supp, distinct, conf))

rule_columns = ['r1','r2','r3','body_support','support','distinct_xz','confidence']
rules_df = pd.DataFrame(rule_rows, columns=rule_columns)

print("Total candidate rules found:", len(rules_df))
print("\nTop rules by confidence then support (showing 25):")
top_rules = rules_df.sort_values(['confidence','support'], ascending=False).head(25)
display(top_rules)

Total candidate rules found: 628

Top rules by confidence then support (showing 25):


Unnamed: 0,r1,r2,r3,body_support,support,distinct_xz,confidence
559,sisterOf,granddaughterOf,granddaughterOf,772,772,476,1.0
289,grandfatherOf,sisterOf,grandfatherOf,747,747,459,1.0
309,grandmotherOf,sisterOf,grandmotherOf,747,747,459,1.0
111,brotherOf,granddaughterOf,grandsonOf,722,722,442,1.0
562,sisterOf,grandsonOf,granddaughterOf,722,722,508,1.0
276,grandfatherOf,brotherOf,grandfatherOf,673,673,463,1.0
296,grandmotherOf,brotherOf,grandmotherOf,673,673,463,1.0
114,brotherOf,grandsonOf,grandsonOf,624,624,418,1.0
460,nephewOf,granddaughterOf,greatGrandsonOf,608,608,360,1.0
272,grandfatherOf,auntOf,greatGrandfatherOf,562,562,338,1.0


In [3]:
# Keep rules that are (almost) never violated and have enough groundings behind them.
is_confident = rules_df['confidence'] >= 0.99
is_supported = rules_df['support'] >= 50
strong = rules_df[is_confident & is_supported].copy()
print("Strong rules (conf>=0.99, support>=50):", len(strong))
strong_by_support = strong.sort_values(['support'], ascending=False)
display(strong_by_support.head(30))

Strong rules (conf>=0.99, support>=50): 167


Unnamed: 0,r1,r2,r3,body_support,support,distinct_xz,confidence
559,sisterOf,granddaughterOf,granddaughterOf,772,772,476,1.0
309,grandmotherOf,sisterOf,grandmotherOf,747,747,459,1.0
289,grandfatherOf,sisterOf,grandfatherOf,747,747,459,1.0
562,sisterOf,grandsonOf,granddaughterOf,722,722,508,1.0
111,brotherOf,granddaughterOf,grandsonOf,722,722,442,1.0
296,grandmotherOf,brotherOf,grandmotherOf,673,673,463,1.0
276,grandfatherOf,brotherOf,grandfatherOf,673,673,463,1.0
114,brotherOf,grandsonOf,grandsonOf,624,624,418,1.0
460,nephewOf,granddaughterOf,greatGrandsonOf,608,608,360,1.0
292,grandmotherOf,auntOf,greatGrandmotherOf,562,562,338,1.0


In [4]:
def show_examples(r1, r2, r3, k=8):
    """
    Prints up to k groundings of the rule r1(X,Y) & r2(Y,Z) -> r3(X,Z).

    A grounding is an (x, y, z) row for which all three triples are present in
    the module-level frame `df`. The first k rows are shown as a table via
    display() and then printed one per line as explicit triples.
    """
    def pairs_of(rel, head_name, tail_name):
        # (head, tail) pairs of one relation, with the columns renamed for joining.
        subset = df[df['relation'] == rel][['head', 'tail']]
        return subset.rename(columns={'head': head_name, 'tail': tail_name})

    body_left = pairs_of(r1, 'x', 'y')
    body_right = pairs_of(r2, 'y', 'z')
    head_pairs = pairs_of(r3, 'x', 'z')

    # Join the two body atoms on y, then keep rows whose (x, z) is an r3 edge.
    grounded = (
        body_left
        .merge(body_right, on='y', how='inner')
        .merge(head_pairs, on=['x', 'z'], how='inner')
    )
    sample = grounded.head(k)

    print(f"Rule: {r1}(X,Y) & {r2}(Y,Z) -> {r3}(X,Z)")
    print("Examples (x,y,z):")
    display(sample)
    print("Example triples:")
    for _, row in sample.iterrows():
        print(f"({row['x']}, {r1}, {row['y']})  ({row['y']}, {r2}, {row['z']})  => ({row['x']}, {r3}, {row['z']})")

# Pick 5 rules to showcase (update these after you see Cell 2 output)

In [5]:
# Showcase one mined rule; swap in other (r1, r2, r3) triples from the rule table as needed.
show_examples(r1='sisterOf', r2='granddaughterOf', r3='granddaughterOf', k=6)

Rule: sisterOf(X,Y) & granddaughterOf(Y,Z) -> granddaughterOf(X,Z)
Examples (x,y,z):


Unnamed: 0,x,y,z
0,olivia0,selina10,ella19
1,olivia0,selina10,david20
2,olivia0,selina10,emma7
3,olivia0,selina10,moritz8
4,olivia0,isabella11,ella19
5,olivia0,isabella11,david20


Example triples:
(olivia0, sisterOf, selina10)  (selina10, granddaughterOf, ella19)  => (olivia0, granddaughterOf, ella19)
(olivia0, sisterOf, selina10)  (selina10, granddaughterOf, david20)  => (olivia0, granddaughterOf, david20)
(olivia0, sisterOf, selina10)  (selina10, granddaughterOf, emma7)  => (olivia0, granddaughterOf, emma7)
(olivia0, sisterOf, selina10)  (selina10, granddaughterOf, moritz8)  => (olivia0, granddaughterOf, moritz8)
(olivia0, sisterOf, isabella11)  (isabella11, granddaughterOf, ella19)  => (olivia0, granddaughterOf, ella19)
(olivia0, sisterOf, isabella11)  (isabella11, granddaughterOf, david20)  => (olivia0, granddaughterOf, david20)


In [6]:
# Second showcase: the brother/grandson counterpart of the rule shown above it in the rule table.
show_examples(r1='brotherOf', r2='grandsonOf', r3='grandsonOf', k=6)

Rule: brotherOf(X,Y) & grandsonOf(Y,Z) -> grandsonOf(X,Z)
Examples (x,y,z):


Unnamed: 0,x,y,z
0,oskar24,adam9,ella19
1,oskar24,adam9,david20
2,oskar24,adam9,emma7
3,oskar24,adam9,moritz8
4,nico4,elias6,katharina1
5,nico4,elias6,dominik2


Example triples:
(oskar24, brotherOf, adam9)  (adam9, grandsonOf, ella19)  => (oskar24, grandsonOf, ella19)
(oskar24, brotherOf, adam9)  (adam9, grandsonOf, david20)  => (oskar24, grandsonOf, david20)
(oskar24, brotherOf, adam9)  (adam9, grandsonOf, emma7)  => (oskar24, grandsonOf, emma7)
(oskar24, brotherOf, adam9)  (adam9, grandsonOf, moritz8)  => (oskar24, grandsonOf, moritz8)
(nico4, brotherOf, elias6)  (elias6, grandsonOf, katharina1)  => (nico4, grandsonOf, katharina1)
(nico4, brotherOf, elias6)  (elias6, grandsonOf, dominik2)  => (nico4, grandsonOf, dominik2)


In [7]:
def rule_2hop_conf(r1, r2, r3):
    """
    Scores the rule r1(X,Y) & r2(Y,Z) -> r3(X,Z) against the indexed triples.

    Reuses the (pair_count, body_support) entry stored in `cache` for (r1, r2)
    when there is one with non-zero support; otherwise the body is recomputed
    with compose_pair_counts (the recomputed result is not written to `cache`).

    Returns:
      (body_support, support, confidence)
        body_support = number of (x,y,z) groundings of the body
        support      = groundings whose (x,z) pair is also an r3 edge
        confidence   = support / body_support, always a float
                       (0.0 when the body has no groundings)
    Raises:
      KeyError if r3 is not a key of `edges`.
    """
    cached = cache.get((r1, r2))
    if cached is None or cached[1] == 0:
        # Cache miss (or an empty entry): fall back to a direct computation.
        cached = compose_pair_counts(r1, r2)
    pair_count, body_sup = cached

    head_edges = edges[r3]
    # Sum witness counts over the (x,z) pairs confirmed by r3; membership tests
    # avoid copying every key of pair_count into a temporary set.
    supp = sum(count for pair, count in pair_count.items() if pair in head_edges)
    # Return 0.0 rather than 0 so confidence has the same type on every path.
    conf = supp / body_sup if body_sup else 0.0
    return body_sup, supp, conf

# Spot-check: an aunt/uncle of someone's parent should be that person's great-aunt/great-uncle.
parent_relations = ('motherOf', 'fatherOf')
for r2 in parent_relations:
    aunt_stats = rule_2hop_conf('auntOf', r2, 'greatAuntOf')
    body, supp, conf = aunt_stats
    print(f"auntOf(X,Y) & {r2}(Y,Z) -> greatAuntOf(X,Z) | body={body} supp={supp} conf={conf:.4f}")

    uncle_stats = rule_2hop_conf('uncleOf', r2, 'greatUncleOf')
    body, supp, conf = uncle_stats
    print(f"uncleOf(X,Y) & {r2}(Y,Z) -> greatUncleOf(X,Z) | body={body} supp={supp} conf={conf:.4f}")

auntOf(X,Y) & motherOf(Y,Z) -> greatAuntOf(X,Z) | body=121 supp=121 conf=1.0000
uncleOf(X,Y) & motherOf(Y,Z) -> greatUncleOf(X,Z) | body=112 supp=112 conf=1.0000
auntOf(X,Y) & fatherOf(Y,Z) -> greatAuntOf(X,Z) | body=157 supp=157 conf=1.0000
uncleOf(X,Y) & fatherOf(Y,Z) -> greatUncleOf(X,Z) | body=101 supp=101 conf=1.0000


In [8]:
# Spot-check: a parent of someone's grandparent should be that person's great-grandparent.
grandparent_relations = ('grandmotherOf', 'grandfatherOf')
for r2 in grandparent_relations:
    mother_stats = rule_2hop_conf('motherOf', r2, 'greatGrandmotherOf')
    body, supp, conf = mother_stats
    print(f"motherOf(X,Y) & {r2}(Y,Z) -> greatGrandmotherOf(X,Z) | body={body} supp={supp} conf={conf:.4f}")

    father_stats = rule_2hop_conf('fatherOf', r2, 'greatGrandfatherOf')
    body, supp, conf = father_stats
    print(f"fatherOf(X,Y) & {r2}(Y,Z) -> greatGrandfatherOf(X,Z) | body={body} supp={supp} conf={conf:.4f}")

motherOf(X,Y) & grandmotherOf(Y,Z) -> greatGrandmotherOf(X,Z) | body=256 supp=256 conf=1.0000
fatherOf(X,Y) & grandmotherOf(Y,Z) -> greatGrandfatherOf(X,Z) | body=256 supp=256 conf=1.0000
motherOf(X,Y) & grandfatherOf(Y,Z) -> greatGrandmotherOf(X,Z) | body=287 supp=287 conf=1.0000
fatherOf(X,Y) & grandfatherOf(Y,Z) -> greatGrandfatherOf(X,Z) | body=287 supp=287 conf=1.0000
