## 🧐 ZeroRel data inspection

In [1]:
import json
from collections import Counter
# Load the full ZeroRel file: one JSON document per line (JSON Lines).
# The encoding is spelled out so the read does not depend on the platform's
# default locale, and blank lines (e.g. a trailing newline at end of file)
# are skipped because json.loads('') would raise.
with open('data/zero_rel_all.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [2]:
# relation labels in ZeroRel

# Tally how often each relation label occurs across all samples.
# Counter replaces the manual "if key in dict" bookkeeping; it is a dict
# subclass, so `relationship_counts[label]` lookups keep working.
relationship_counts = Counter(
    relation['relation_text']
    for item in data
    for relation in item['relations']
)

# (label, count) pairs, most frequent first. Labels with equal counts stay in
# first-seen order, which matches a stable sort on the count with reverse=True.
sorted_relationship_counts = relationship_counts.most_common()

print(f"Number of unique labels: {len(sorted_relationship_counts)}")
sorted_relationship_counts[:50]

Number of unique labels: 916367


[('no relation', 1038391),
 ('location of', 704818),
 ('member of', 386934),
 ('location', 268480),
 ('location in', 257950),
 ('located in', 239440),
 ('created by', 218583),
 ('author of', 213249),
 ('hosted by', 177776),
 ('part of', 151153),
 ('produced by', 147802),
 ('self reference', 145815),
 ('worked for', 140374),
 ('reported by', 137596),
 ('supports', 133059),
 ('mentioned in', 116558),
 ('collaborator', 113567),
 ('same person', 113015),
 ('played for', 106867),
 ('publisher', 105739),
 ('written by', 104687),
 ('produces', 99504),
 ('lives in', 98620),
 ('supported by', 97468),
 ('child of', 93260),
 ('educated at', 90800),
 ('designed by', 89883),
 ('subject of', 88617),
 ('owned by', 86224),
 ('born in', 85736),
 ('sponsor', 85414),
 ('hosts', 84673),
 ('belongs to', 84605),
 ('funding', 82576),
 ('head of', 81399),
 ('published in', 78730),
 ('featured in', 75916),
 ('published by', 73839),
 ('performs at', 73569),
 ('owner', 71872),
 ('founder', 70769),
 ('leads', 698

In [3]:
# index of the sample inspected in the following cells (they all read data[i])
i = 4

In [4]:
# top-level fields of the selected sample
data[i].keys()

dict_keys(['ner', 'relations', 'tokenized_text'])

In [5]:
# first five entity annotations of the sample; judging by the output each
# entry looks like [start_token, end_token, type, mention] with an exclusive
# end index — TODO confirm against the dataset documentation
data[i]['ner'][:5]

[[0, 3, 'DATE', '30 Jan 2024'],
 [4, 9, 'ORG', 'The Bangkok North Municipal Court'],
 [16, 19, 'ORG', 'Move Forward Party'],
 [20, 22, 'PERSON', 'Rukchanok Srinork'],
 [23, 24, 'CARDINAL', 'two']]

In [6]:
# number of entity annotations in the sample
len(data[i]['ner'])

18

In [7]:
# readable view of the sample: tokens joined with single spaces, so spacing
# around punctuation and newlines is not that of the original document
" ".join(data[i]['tokenized_text'])

'30 Jan 2024 \n The Bangkok North Municipal Court has dismissed a defamation lawsuit filed against Move Forward Party MP Rukchanok Srinork by two former Nation TV anchors on the ground that the two presenters are public figures and so can be criticized . \n 10 Aug 2022 \n The Supreme Court has dismissed a defamation lawsuit against former Voice TV reporter Suchanee Cloitre filed by the Thammakaset Company on the grounds that her report was criticism made in good faith . \n 11 Oct 2021 \n Art critic Pearamon Tulavardhana reports that she has received a court summons on a charge of defamation by publication filed against her by Chiang Mai University lecturer Pongsiri Kiddee , after she published an article on an exhibition organized by the Office of Contemporary Art and Culture ( OCAC ) .'

In [8]:
# last three relations of the sample that are labelled 'no relation'
[
    relation
    for relation in data[i]['relations']
    if relation['relation_text'] == 'no relation'
][-3:]

[{'head': {'mention': '30 Jan 2024', 'position': [0, 3], 'type': 'DATE'},
  'tail': {'mention': 'the Thammakaset Company',
   'position': [66, 69],
   'type': 'ORG'},
  'relation_text': 'no relation'},
 {'head': {'mention': '10 Aug 2022', 'position': [45, 48], 'type': 'DATE'},
  'tail': {'mention': 'Move Forward Party',
   'position': [16, 19],
   'type': 'ORG'},
  'relation_text': 'no relation'},
 {'head': {'mention': 'the Thammakaset Company',
   'position': [66, 69],
   'type': 'ORG'},
  'tail': {'mention': 'Nation TV', 'position': [25, 27], 'type': 'ORG'},
  'relation_text': 'no relation'}]

In [9]:
# Find the ordered entity pairs of sample `i` that have no entry in its
# `relations` list. Self-pairs (an entity paired with itself) are expected to
# end up here.

# (start, end) spans of head and tail for every annotated relation
seen_rels = set()
for rel in data[i]['relations']:
    head_pos, tail_pos = rel['head']['position'], rel['tail']['position']
    seen_rels.add(((head_pos[0], head_pos[1]), (tail_pos[0], tail_pos[1])))

# walk every ordered (ent1, ent2) combination and keep the unannotated ones
exception = []
entities = data[i]['ner']
for ent1 in entities:
    span1 = (ent1[0], ent1[1])
    for ent2 in entities:
        if (span1, (ent2[0], ent2[1])) in seen_rels:
            continue
        exception.append((ent1, ent2))

In [10]:
# peek at the first few entity pairs that have no relation entry
exception[:4]

[([0, 3, 'DATE', '30 Jan 2024'], [0, 3, 'DATE', '30 Jan 2024']),
 ([4, 9, 'ORG', 'The Bangkok North Municipal Court'],
  [4, 9, 'ORG', 'The Bangkok North Municipal Court']),
 ([16, 19, 'ORG', 'Move Forward Party'],
  [16, 19, 'ORG', 'Move Forward Party']),
 ([20, 22, 'PERSON', 'Rukchanok Srinork'],
  [20, 22, 'PERSON', 'Rukchanok Srinork'])]

In [11]:
# how many entity pairs are missing from the sample's relations
len(exception)

18

In [12]:
# Sanity check for sample `i`: the number of annotated relations should equal
# the number of ordered entity pairs minus the pairs collected in `exception`
# (the ones without a relation entry).
n_entities = len(data[i]['ner'])
should_be_rels = n_entities * n_entities  # all ordered pairs, self-pairs included
print(f"{should_be_rels} entity-entity pairs")

print(f"So should be {should_be_rels} (entity pairs) - {len(exception)} (self-pairs) => {len(data[i]['relations'])}")

# The sentence printed above only states the identity; verify it so a sample
# that breaks the assumption fails loudly instead of printing a misleading line.
expected_rels = should_be_rels - len(exception)
assert expected_rels == len(data[i]['relations']), (
    f"expected {expected_rels} relations, found {len(data[i]['relations'])}"
)

324 entity-entity pairs
So should be 324 (entity pairs) - 18 (self-pairs) => 306
