## 🧐 ZeroRel data inspection

In [1]:
import json
# Load every ZeroRel record; each non-blank line of the file is parsed as one JSON document.
# Tip: to peek at a handful of records only, iterate over itertools.islice(f, 5) instead of f.
with open('data/zero_rel_all.jsonl', 'r', encoding='utf-8') as f:
    # Explicit UTF-8 so decoding does not depend on the platform's locale encoding
    # (the text contains non-ASCII characters such as typographic apostrophes).
    # Blank lines (e.g. a trailing newline at end of file) are skipped instead of
    # raising json.JSONDecodeError.
    data = [json.loads(line) for line in f if line.strip()]

In [11]:
# Tally how often each relation label occurs across all loaded ZeroRel records.

relationship_counts = {}

for record in data:
    for rel in record['relations']:
        label = rel['relation_text']
        # dict.get supplies 0 the first time a label is encountered
        relationship_counts[label] = relationship_counts.get(label, 0) + 1

# Most frequent labels first; the sort is stable, so labels with equal counts
# stay in first-seen order.
sorted_relationship_counts = sorted(
    relationship_counts.items(),
    key=lambda label_and_count: label_and_count[1],
    reverse=True,
)

print(f"Number of unique labels: {len(sorted_relationship_counts)}")
sorted_relationship_counts[:50]

Number of unique labels: 327703


[('no relation', 294612),
 ('location of', 200588),
 ('member of', 104258),
 ('location', 77447),
 ('location in', 75358),
 ('author of', 65698),
 ('located in', 64409),
 ('created by', 63424),
 ('[{', 49686),
 ('produced by', 45733),
 ('reported by', 43428),
 ('hosted by', 42880),
 ('part of', 42318),
 ('self reference', 40707),
 ('supports', 38431),
 ('worked for', 37266),
 ('collaborator', 35232),
 ('publisher', 32530),
 ('mentioned in', 31527),
 ('same person', 31480),
 ('produces', 31099),
 ('supported by', 29970),
 ('written by', 29527),
 ('played for', 29064),
 ('subject of', 28427),
 ('child of', 26222),
 ('lives in', 26180),
 ('owned by', 26030),
 ('funding', 25543),
 ('published in', 25415),
 ('belongs to', 24444),
 ('designed by', 24360),
 ('born in', 23576),
 ('educated at', 23371),
 ('performs at', 23014),
 ('published by', 22488),
 ('head of', 22475),
 ('hosts', 21387),
 ('owner of', 20800),
 ('founder', 20390),
 ('featured in', 20362),
 ('sponsor', 19959),
 ('caused by',

In [12]:
# Index of the record in `data` that the following cells inspect.
i = 4

In [13]:
# Top-level fields available on the selected record.
data[i].keys()

dict_keys(['ner', 'relations', 'tokenized_text'])

In [14]:
# Preview at most the first five entity annotations of the selected record.
data[i]['ner'][:5]

[[27, 31, 'DATE', 'a few months ago'],
 [138, 140, 'PERSON', 'Chicken Man'],
 [143, 144, 'ORDINAL', 'first']]

In [15]:
# Number of entity annotations on the selected record.
len(data[i]['ner'])

3

In [16]:
# Rebuild a readable string by joining the record's tokens with single spaces.
" ".join(data[i]['tokenized_text'])

'There ’s nothing sadder than a neglected animal . And believe it or not , it really hits them hard . This neglected parrot found himself abandoned a few months ago . \n He had passed through several foster families and they all returned him because he had some serious aggression issues . \n Over time , this adorable looking creature became so stressed and overwhelmed that he started to pluck out his feathers . \n Undeterred by his brutish behavior and slightly harrowing exterior , a kind - hearted woman decided to take him in . And surprisingly , his new mom managed to bond with him ! \n The little bird , which she lovingly named Chicken Man , has found a forever home with her and the pair could n’t be happier . \n Though Chicken Man was anxious at first , he started to warm to his new home and bonded with the family . Not only that , but his feathers started to grow back too ! \n Do n’t forget to watch this heartwarming friendship blossom between a parrot and its human in the video b

In [22]:
# Last three relations of the selected record labelled 'no relation'
# (an empty list when the record has none).
[r for r in data[i]['relations'] if r['relation_text'] == 'no relation'][-3:]

[]

In [23]:
# Find the ordered entity pairs of the selected record that have no entry in
# its relations. Self-pairs (an entity paired with itself) are expected here.

# Every (head span, tail span) combination that already has a relation,
# each span being the first two elements of the stored position.
seen_rels = set()
for rel in data[i]['relations']:
    head_pos = rel['head']['position']
    tail_pos = rel['tail']['position']
    seen_rels.add(((head_pos[0], head_pos[1]), (tail_pos[0], tail_pos[1])))

# Walk all ordered entity pairs and keep the ones without a relation.
exception = []
for ent1 in data[i]['ner']:
    for ent2 in data[i]['ner']:
        pair_key = ((ent1[0], ent1[1]), (ent2[0], ent2[1]))
        if pair_key in seen_rels:
            continue
        exception.append((ent1, ent2))

In [24]:
exception[:4]

[([27, 31, 'DATE', 'a few months ago'], [27, 31, 'DATE', 'a few months ago']),
 ([138, 140, 'PERSON', 'Chicken Man'], [138, 140, 'PERSON', 'Chicken Man']),
 ([143, 144, 'ORDINAL', 'first'], [143, 144, 'ORDINAL', 'first'])]

In [25]:
len(exception)

3

In [26]:
# Every ordered pair of entities (self-pairs included) is a candidate relation,
# so the number of candidates is the entity count squared.
n_entities = len(data[i]['ner'])
should_be_rels = n_entities * n_entities
print(f"{should_be_rels} entity-entity pairs")

# The value after "=>" is the number of relations actually stored on the record.
print(f"So should be {should_be_rels} (entity pairs) - {len(exception)} (self-pairs) => {len(data[i]['relations'])}")

9 entity-entity pairs
So should be 9 (entity pairs) - 3 (self-pairs) => 6
