In [12]:
import json
import itertools
data = []

data_type = "dev"
with open(f"{data_type}.jsonl", "r") as f:
    for line in f:
        data.append(json.loads(line))
print("Number of examples:", len(data))

Number of examples: 141


In [2]:
from vncorenlp import VnCoreNLP
segmentor = VnCoreNLP("../../../../VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg")

Entity format:

```
<entity id> <ner_tag> <start_pos> <end_pos> <text>
```

Ví dụ:
```
T3-7	GPE 522 531	community
```

In [3]:
class Entity():
    global_id = 1
    def __init__(self, ner_type, start_pos, end_pos, text):
        self.id = f"T-{Entity.global_id}"
        self.ner_type = ner_type
        self.start_pos = start_pos
        self.end_pos = end_pos
        self.text = text
        Entity.global_id += 1
        
    def __repr__(self):
        return f"{self.id}\t{self.ner_type} {self.start_pos} {self.end_pos}\t{self.text}"
        
class Relation():
    global_id = 1
    def __init__(self, relation_type, *args):
        self.id = f"R-{Relation.global_id}"
        self.relation_type = relation_type
        self.args = args
        Relation.global_id += 1
        
    def __repr__(self):
        res = f"{self.id}\t{self.relation_type}"
        for args_id, entity in enumerate(self.args):
            res += f" Arg{args_id + 1}:{entity.id}"
        return res

In [4]:
from bs4 import BeautifulSoup

def segmentize(text):
    text = segmentor.tokenize(text.strip())
    return " ".join(" ".join(x) for x in text)

def parse_paragraph(data):
    event_texts = {0: ""}
    event_paddings = {0: 0}
    event_ids = []
    e_id = None
    content = ""
    
    for html in data["html_annotation"]:
        for e in BeautifulSoup(html, "html.parser"):
            try:
                e_id = int(e["event_id"])
                event_ids.append(e_id)
                event_texts[e_id] = ""
            except: pass
            if e_id is None:
                continue
            event_texts[e_id] += segmentize(e.text.replace("-", " - ")) + " "
    event_ids.sort()
    
    for i, event in enumerate(event_ids):
        cur_event_ids = event_ids[i]
        pre_event_ids = event_ids[i - 1] if i != 0 else 0
        event_paddings[cur_event_ids] = event_paddings[pre_event_ids] + len(event_texts[pre_event_ids])
        content += event_texts[cur_event_ids]
    
    event_texts["0"] = content
    title = segmentize(data["original_doc"]["_source"]["description"])
    return {
        "title": title,
        "content": content,
        "event_texts": event_texts,
        "event_paddings": event_paddings,
    }

In [5]:
def is_match(team1, team2):
    team1 = team1.strip()
    team2 = team2.strip()
    return team1 in team2 or team2 in team1

In [6]:
def full_text_token(text, start, passage):
    end = start + len(text)
    while end < len(passage) and passage[end] != " ":
        end += 1
    while start > 0 and passage[start - 1] != " ":
        start -= 1
    text = passage[start : end]
    return start, text

In [7]:
def _parse_entities_other(ner_type, text, ref_event_ids):
    def parse_entity_other(text, eid):
        if text == "":
            return Entity(ner_type, -1, -1, text)
        
        text = segmentize(text)
        pos = event_texts[eid].lower().find(text.lower())
        if pos != -1:
            pos, text = full_text_token(text, pos, event_texts[eid])
            start_pos = pos + event_paddings[eid]
            end_pos = start_pos + len(text)
            return Entity(ner_type, start_pos, end_pos, text)

        edited_texts = []
        if ner_type == "PER" and len(text.split()) != 1:
            edited_texts = text.split()
        elif ner_type == "TME" and "'" in text:
            text = text.replace("'", "")
            edited_texts = ["phút thứ " + text, "phút " + text] 
        for text in edited_texts:
            edited_ner = parse_entity_other(text, eid)
            if edited_ner.start_pos != edited_ner.end_pos:
                return edited_ner

        return Entity(ner_type, -1, -1, text)

    res = []
    for eid in ref_event_ids.split(","):
        entity = parse_entity_other(text, int(eid))
        if entity.start_pos == entity.end_pos or not entity.text:
            continue
        res.append(entity)
    return res


def _parse_entities_score(ner_type, score1, score2, ref_event_ids):
    def parse_entity_single_score(eid, score, start, end):
        pos = event_texts[eid][start : end].find(score)
        if pos != -1:
            pos, score = full_text_token(score, start + pos, event_texts[eid])
            start_pos = event_paddings[eid] + pos
            end_pos = start_pos + len(score)
            
            return Entity(ner_type, start_pos, end_pos, score)
        return Entity(ner_type, -1, -1, "")
    
    def parse_entity_couple_score(eid, score1, score2):
        edited_scores = [score1 + " - " + score2, score2 + " - " + score1]      
        for score in edited_scores:
            score = segmentize(score)
            score_ner = parse_entity_single_score(eid, score, 0, -1)
            if score_ner.start_pos != score_ner.end_pos:
                return score_ner
        return Entity(ner_type, -1, -1, "")

    
    score1_entities, score2_entities = [], []
    for eid in ref_event_ids.split(","):
        eid = int(eid)
        entity = parse_entity_couple_score(eid, score1, score2)
        if entity.start_pos == entity.end_pos:
            score1_entity = parse_entity_single_score(eid, score1, 0, -1)
            score2_entity = parse_entity_single_score(eid, score2, 0, -1)
        else:
            entity.start_pos -= event_paddings[eid]
            entity.end_pos -= event_paddings[eid]
            score1_entity = parse_entity_single_score(eid, score1, entity.start_pos, entity.end_pos)
            score2_entity = parse_entity_single_score(eid, score2, entity.start_pos, entity.end_pos)
           
        if score1_entity.start_pos != score1_entity.end_pos:
            score1_entities.append(score1_entity)
        if score2_entity.start_pos != score2_entity.end_pos:
            score2_entities.append(score2_entity)
        
    return score1_entities, score2_entities


def _parse_relations(rel_type, args):
    res = []
    for entities in itertools.product(*args):
        res.append(Relation(rel_type, *entities))
    return res

In [8]:
def parse_entity_relation(data, event_texts, event_paddings):
    entities, relations = [], []
    id = data["train_id"]
    summary = data["match_summary"]
    team = summary["players"]
    score_board = summary["score_board"]
    score_list = summary["score_list"]
    card_list = summary["card_list"]
    subst_list = summary["substitution_list"]
    
    # Yield teams' name
    team1_entities = _parse_entities_other("CLU", team["team1"], team["ref_event_ids"])
    team2_entities = _parse_entities_other("CLU", team["team2"], team["ref_event_ids"])
    entities.extend(team1_entities + team2_entities)
    relations.extend(_parse_relations("COMP", [team1_entities, team2_entities]))
    relations.extend(_parse_relations("COMP", [team2_entities, team1_entities]))
    
    # Yield scores
    score1_entities, score2_entities = _parse_entities_score("SCO", score_board["score1"], score_board["score2"], score_board["ref_event_ids"])
    entities.extend(score1_entities + score2_entities)
    relations.extend(_parse_relations("SCOC", [team1_entities, score1_entities]))
    relations.extend(_parse_relations("SCOC", [team2_entities, score2_entities]))

    # Yield list
    for info in score_list:
        player_entities = _parse_entities_other("PSC", info["player_name"], info["ref_event_ids"])
        time_entities = _parse_entities_other("TSC", info["time"], info["ref_event_ids"])
        team_entities = _parse_entities_other("CLU", info["team"], info["ref_event_ids"])
        entities.extend(player_entities + time_entities + team_entities)
        relations.extend(_parse_relations("SCOP", [player_entities, team_entities]))
        relations.extend(_parse_relations("SCOT", [player_entities, time_entities]))       
        
    for info in card_list:
        player_entities = _parse_entities_other("PCA", info["player_name"], info["ref_event_ids"])
        time_entities = _parse_entities_other("TCA", info["time"], info["ref_event_ids"])
        team_entities = _parse_entities_other("CLU", info["team"], info["ref_event_ids"])
        entities.extend(player_entities + time_entities + team_entities)
        relations.extend(_parse_relations("CARP", [player_entities, team_entities]))
        relations.extend(_parse_relations("CART", [player_entities, time_entities]))   
        
    for info in subst_list:
        playerin_entities = _parse_entities_other("PSI", info["player_in"], info["ref_event_ids"])
        playerout_entities = _parse_entities_other("PSO", info["player_out"], info["ref_event_ids"])
        time_entities = [] if "time" not in info else _parse_entities_other("TSI", info["time"], info["ref_event_ids"])
        entities.extend(playerin_entities + playerout_entities + time_entities)
        relations.extend(_parse_relations("SUBP", [playerin_entities, playerout_entities]))
        relations.extend(_parse_relations("SUBT", [playerin_entities, time_entities]))
        
    return entities, relations

In [9]:
sample_idx = 120
paragraph = parse_paragraph(data[sample_idx])
content, event_texts, event_paddings = paragraph["content"], paragraph["event_texts"], paragraph["event_paddings"]
entities, relations = parse_entity_relation(data[sample_idx], event_texts, event_paddings)

In [10]:
print(*entities, sep="\n")
print()
print(*relations, sep="\n")

T-1	CLU 61 72	Bournemouth
T-2	CLU 21 28	Arsenal
T-4	SCO 46 47	3
T-7	SCO 3032 3033	3
T-5	SCO 46 47	3
T-8	SCO 3032 3033	3
T-9	PSC 1051 1066	Charlie_Daniels
T-10	TSC 963 965	19
T-11	CLU 789 800	Bournemouth
T-12	PSC 4165 4176	Ryan_Fraser
T-13	TSC 4241 4243	58
T-14	CLU 4222 4233	Bournemouth
T-15	PSC 2565 2579	Alexis_Sanchez
T-16	TSC 2503 2505	70
T-17	CLU 2640 2647	Arsenal
T-18	PSC 2782 2793	Lucas_Perez
T-19	TSC 2654 2664	5 phút sau
T-20	CLU 2701 2708	Arsenal
T-22	PSC 1343 1356	Callum_Wilson
T-23	TSC 1111 1121	2 phút sau
T-25	CLU 1243 1254	Bournemouth
T-26	CLU 1414 1425	Bournemouth
T-27	PSC 4368 4382	Olivier_Giroud
T-28	TSC 4354 4365	phút bù giờ
T-29	CLU 4275 4282	Arsenal
T-30	PCA 2932 2945	Simon_Francis
T-31	TCA 2920 2922	82
T-32	CLU 2950 2961	Bournemouth
T-33	PSI 2123 2134	Lucas_Perez
T-34	PSO 2148 2153	Iwobi

R-1	COMP Arg1:T-1 Arg2:T-2
R-2	COMP Arg1:T-2 Arg2:T-1
R-3	SCOC Arg1:T-1 Arg2:T-4
R-4	SCOC Arg1:T-1 Arg2:T-7
R-5	SCOC Arg1:T-2 Arg2:T-5
R-6	SCOC Arg1:T-2 Arg2:T-8
R-7	SCOP Arg1:T-9 Ar

In [13]:
from tqdm import tqdm
if data_type == "train":
    data[83]["match_summary"]["score_list"][1]["ref_event_ids"] = "4"
    data[361]["match_summary"]["score_list"][0]["ref_event_ids"] = "4"
    data[346]["match_summary"]["card_list"] = []
    data[375]["match_summary"]["score_list"][1]["ref_event_ids"] = "3"
    
for x in tqdm(data):
    file_id = x["train_id"]
    paragraph = parse_paragraph(x)
    content, event_texts, event_paddings = paragraph["content"], paragraph["event_texts"], paragraph["event_paddings"]
    entities, relations = parse_entity_relation(x, event_texts, event_paddings)
    
    with open(f"./{data_type}/{file_id}.txt", "w") as f:
        f.write(paragraph["content"])
    
    with open(f"./{data_type}/{file_id}.ann", "w") as f:
        for entity in entities:
            f.write(str(entity) + "\n")
        for relation in relations:
            f.write(str(relation) + "\n")

100%|██████████| 141/141 [00:17<00:00,  8.15it/s]


In [14]:
import json
import itertools
data = []

data_type = "test"
with open(f"{data_type}.jsonl", "r") as f:
    for line in f:
        data.append(json.loads(line))
print("Number of examples:", len(data))

Number of examples: 211


In [15]:
for x in tqdm(data):
    file_id = x["test_id"]
    content = " ".join(segmentize(body["text"].replace("-", " - ")) for body in x["original_doc"]["_source"]["body"])
    
    with open(f"./{data_type}/{file_id}.txt", "w") as f:
        f.write(content)
    with open(f"./{data_type}/{file_id}.ann", "w") as f:
        for entity in entities:
            f.write("")
        for relation in relations:
            f.write("")

100%|██████████| 211/211 [00:11<00:00, 18.78it/s]
