In [70]:
import json
data = []
with open("dev.jsonl", "r") as f:
    for line in f:
        data.append(json.loads(line))
print("Number of examples:", len(data))

Number of examples: 141


In [2]:
ner_tag = set([
    "PER",   # player
    "CLU",   # club
    "TME",   # time
    "NUM",   # number 
])

relation_tag = set([
    "COMP",  # (CLU, CLU) compete with
    "DEFE",  # (CLU, CLU) defeat / win over
    # "SCON",  # (CLU, NUM) score
    "SCOP",  # (CLU, PER) score player
    "SCOT",  # (PER, TME) score time
    "CARP",  # (CLU, PER) card player,
    "CART",  # (PER, TIME) card time
    "SUBP",  # (PER, PER) substitute players,
    "SUBT",  # (PER, TIME) substitute time
])

Entity format:

```
<entity id> <ner_tag> <start_pos> <end_pos> <text>
```

Ví dụ:
```
T3-7	GPE 522 531	community
```

In [71]:
class Entity():
    global_id = 1
    def __init__(self, ner_type, start_pos, end_pos, text):
        self.id = f"T-{Entity.global_id}"
        self.ner_type = ner_type
        self.start_pos = start_pos
        self.end_pos = end_pos
        self.text = text
        Entity.global_id += 1
        
    def __repr__(self):
        return f"{self.id}\t{self.ner_type} {self.start_pos} {self.end_pos}\t{self.text}"
        
class Relation():
    global_id = 1
    def __init__(self, relation_type, *args):
        self.id = f"R-{Relation.global_id}"
        self.relation_type = relation_type
        self.args = args
        Relation.global_id += 1
        
    def __repr__(self):
        res = f"{self.id}\t{self.relation_type}"
        for args_id, entity in enumerate(self.args):
            res += f" Arg{args_id + 1}:{entity.id}"
        return res

In [4]:
from bs4 import BeautifulSoup

def parse_paragraph(data):
    event_texts = {0: ""}
    event_paddings = {0: 0}
    event_ids = []
    e_id = None
    content = ""
    for html in data["html_annotation"]:
        for e in BeautifulSoup(html):
            try:
                e_id = int(e["event_id"])
                event_ids.append(e_id)
                event_texts[e_id] = ""
            except: pass
            if e_id is None:
                continue
            event_texts[e_id] += e.text.strip() + " "
    event_ids.sort()
    
    for i, event in enumerate(event_ids):
        cur_event_ids = event_ids[i]
        pre_event_ids = event_ids[i - 1] if i != 0 else 0
        event_paddings[cur_event_ids] = event_paddings[pre_event_ids] + len(event_texts[pre_event_ids])
        content += event_texts[cur_event_ids]
    
    event_texts["0"] = content
    return {
        "content": content,
        "event_texts": event_texts,
        "event_paddings": event_paddings,
    }

In [5]:
def is_match(team1, team2):
    team1 = team1.strip()
    team2 = team2.strip()
    return team1 in team2 or team2 in team1

In [42]:
def parse_entity_relation(data, event_texts, event_paddings):
    def parse_entity_other(ner_type, text, ref_event_ids):
        text = text.strip()
        for eid in ref_event_ids.split(","):
            pos = event_texts[int(eid)].lower().find(text.lower())
            if pos != -1:
                start_pos = event_paddings[int(eid)] + pos
                end_pos = start_pos + len(text)
                return Entity(ner_type, start_pos, end_pos, text)
            
        edited_texts = []
        if ner_type == "PER" and len(text.split()) != 1:
            edited_texts = text.split()
        elif ner_type == "TME" and "'" in text:
            text = text.replace("'", "")
            edited_texts = ["phút thứ " + text, "phút " + text] 
        for text in edited_texts:
            edited_ner = parse_entity_other(ner_type, text, ref_event_ids)
            if edited_ner.start_pos != edited_ner.end_pos:
                return edited_ner
            
        return Entity(ner_type, -1, -1, text)
    
    def parse_entity_score(ner_type, score1, score2, ref_event_ids):
        edited_scores = [score1 + "-" + score2, score1 + " - " + score2, score2 + "-" + score1, score2 + " - " + score1]
        for eid in ref_event_ids.split(","):
            for score in edited_scores:
                pos = event_texts[int(eid)].find(score)
                if pos != -1:
                    start_pos = event_paddings[int(eid)] + pos
                    end_pos = start_pos + len(score)
                    return Entity(ner_type, start_pos, end_pos, score)
        return Entity(ner_type, -1, -1, score1 + "-" + score2)
    
    
    
    id = data["train_id"]
    summary = data["match_summary"]
    team = summary["players"]
    score_board = summary["score_board"]
    score_list = summary["score_list"]
    card_list = summary["card_list"]
    subst_list = summary["substitution_list"]
    
    entities, relations = [], []
    
    # Yield teams' name
    team1_ner = parse_entity_other("CLU", team["team1"], team["ref_event_ids"])
    team2_ner = parse_entity_other("CLU", team["team2"], team["ref_event_ids"])
    entities.extend([team1_ner, team2_ner])
    relations.append(Relation("COMP", team1_ner, team2_ner))
    
    # Yield scores
    score_ner = parse_entity_score("SCO", score_board["score1"], score_board["score2"], score_board["ref_event_ids"])
    # score_ner = parse_entity_score("SCO", score_board["score1"], score_board["score2"], "0")
    entities.append(score_ner)
    if int(score_board["score1"]) > int(score_board["score2"]):
        relations.append(Relation("DEFE", team1_ner, team2_ner))
    elif int(score_board["score1"]) < int(score_board["score2"]):
        relations.append(Relation("DEFE", team2_ner, team1_ner))
    else:
        relations.append(Relation("DRAW", team1_ner, team2_ner))
    
    # Yield time
    for info in score_list:
        player = parse_entity_other("PER", info["player_name"], info["ref_event_ids"])
        time = parse_entity_other("TME", info["time"], info["ref_event_ids"])
        team = team1_ner if info["team"] == team1_ner.text else team2_ner
        entities.extend([player, time])
        relations.append(Relation("SCOP", team, player))
        relations.append(Relation("SCOT", player, time))
        
        
    for info in card_list:
        player = parse_entity_other("PER", info["player_name"], info["ref_event_ids"])
        time = parse_entity_other("TME", info["time"], info["ref_event_ids"])
        team = team1_ner if is_match(info["team"], team1_ner.text) else team2_ner
        entities.extend([player, time])
        relations.append(Relation("CARP", team, player))
        relations.append(Relation("CART", player, time))
        
    for info in subst_list:
        player_in = parse_entity_other("PER", info["player_in"], info["ref_event_ids"])
        player_out = parse_entity_other("PER", info["player_out"], info["ref_event_ids"])
        entities.extend([player_in, player_out, time])
        relations.append(Relation("SUBP", player_out, player_in))
        if "time" in info:
            time = parse_entity_other("TME", info["time"], info["ref_event_ids"])
            relations.append(Relation("SUBT", player_in, time))
        
    return entities, relations

In [72]:
sample_idx = 0
paragraph = parse_paragraph(data[sample_idx])
content, event_texts, event_paddings = paragraph["content"], paragraph["event_texts"], paragraph["event_paddings"]
entities, relations = parse_entity_relation(data[sample_idx], event_texts, event_paddings)

print(*entities, sep="\n")
print()
print(*relations, sep="\n")

T-1	CLU 119 130	Tây Ban Nha
T-2	CLU 151 161	Costa Rica
T-3	SCO 1966 1969	5-0
T-4	PER 510 520	Jordi Alba
T-7	TME -1 -1	phút 6
T-8	PER 804 817	Alvaro Morata
T-10	TME 795 802	phút 23
T-11	PER 1247 1258	David Silva
T-13	TME 1293 1300	phút 51
T-14	PER 1247 1258	David Silva
T-15	TME 1179 1179	
T-16	PER 1781 1795	Andres Iniesta
T-18	TME 1762 1769	phút 73

R-1	COMP Arg1:T-1 Arg2:T-2
R-2	DEFE Arg1:T-1 Arg2:T-2
R-3	SCOP Arg1:T-2 Arg2:T-4
R-4	SCOT Arg1:T-4 Arg2:T-7
R-5	SCOP Arg1:T-2 Arg2:T-8
R-6	SCOT Arg1:T-8 Arg2:T-10
R-7	SCOP Arg1:T-2 Arg2:T-11
R-8	SCOT Arg1:T-11 Arg2:T-13
R-9	SCOP Arg1:T-2 Arg2:T-14
R-10	SCOT Arg1:T-14 Arg2:T-15
R-11	SCOP Arg1:T-2 Arg2:T-16
R-12	SCOT Arg1:T-16 Arg2:T-18


In [74]:
!mkdir dev

In [75]:
from tqdm import tqdm

for x in tqdm(data):
    file_id = x["train_id"]
    para = parse_paragraph(x)
    entities, relations = parse_entity_relation(x, para["event_texts"], para["event_paddings"])
    
    with open(f"./dev/{file_id}.txt", "w") as f:
        f.write(content)
    with open(f"./dev/{file_id}.ann", "w") as f:
        for entity in entities:
            f.write(str(entity) + "\n")
        for relation in relations:
            f.write(str(relation) + "\n")

100%|██████████| 141/141 [00:02<00:00, 63.67it/s]


In [29]:
!ls

convert_to_ner_relation.ipynb  test.jsonl  train.jsonl
dev.jsonl		       train	   train_dev_split.ipynb


In [68]:
data[375]["match_summary"]

{'players': {'team1': 'Bayern Munich',
  'team2': 'Real Madrid',
  'ref_event_ids': '1'},
 'score_board': {'score1': '2', 'score2': '4', 'ref_event_ids': '7'},
 'score_list': [{'player_name': ' Lewandowski ',
   'time': '',
   'team': 'Bayern Munich',
   'ref_event_ids': '2'},
  {'player_name': 'Cristiano Ronaldo',
   'time': '',
   'team': 'Real Madrid',
   'ref_event_ids': '3'},
  {'player_name': 'Cristiano Ronaldo',
   'time': "105'",
   'team': 'Real Madrid',
   'ref_event_ids': '6'},
  {'player_name': 'Cristiano Ronaldo',
   'time': "109'",
   'team': 'Real Madrid',
   'ref_event_ids': '7'},
  {'player_name': 'Asensio',
   'time': "112'",
   'team': 'Real Madrid',
   'ref_event_ids': '7'}],
 'card_list': [{'player_name': 'Vidal ',
   'time': "84'",
   'team': 'Bayern Munich',
   'ref_event_ids': '5'}],
 'substitution_list': [{'player_in': 'Kimmich',
   'time': '84',
   'player_out': 'Lewandowski',
   'ref_event_ids': '5'}]}

In [52]:
data[346]["match_summary"]["card_list"] = []

In [67]:
data[375]["match_summary"]["score_list"][1]["ref_event_ids"] = "3"

In [62]:
para["event_texts"][3]

'Tuy nhiên Cristiano Ronaldo đã dội một gáo nước lạnh vào ý chí chiến đấu của nhà ĐKVĐ Bundesliga. Ngôi sao người Bồ Đào Nha đã thực hiện cú đánh đầu chuẩn xác trong vòng cấm để gỡ hòa 1-1 cho Real Madrid sau quả tạt của Casemiro. Ở tuổi 32, liệu có mấy cầu thủ làm được như CR7. '