In [1]:
import matplotlib.pyplot as plt

from dataset.dataset import get_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
from tqdm import tqdm
import pathlib
from final.matcher import OIE_Match

In [2]:
nlp = spacy.load("pt_core_news_lg")
matcher = OIE_Match()

In [24]:
import json
from OIE.utils.contractions import transform_portuguese_contractions


def load_bio_datasets(path:str, length:int=None):
    pt = f'datasets/validated_splits/normal/TA4/{path}'
    dataset_pt = dict()
    with open(pt, "r", encoding="utf-8") as f_pt:
        dataset = {}
        actual_pos = 0
        raw_txt = f_pt.read()
        sentences = raw_txt.split("\n\n")
        sentences = [sent.split("\n") for sent in sentences]
        sentences = [[s.split("\t") for s in sent] for sent in sentences]
        if length is None:
            stop = len(sentences)
        else:
            stop = length
        i = 0
        for sent in sentences:
            if len(sent)>=3:
                full_sent = []
                arg0 = []
                rel = []
                arg1 = []
                for tk in sent:
                    if len(tk) == 10:
                        full_sent.append(tk[0])
                        if 'ARG0' in tk[-2]:
                            arg0.append(tk[0])
                        elif 'V' in tk[-2]:
                            rel.append(tk[0])
                        elif 'ARG1' in tk[-2]:
                            arg1.append(tk[0])
                full_sent = ' '.join(full_sent)
                arg0 = ' '.join(arg0)
                rel = ' '.join(rel)
                arg1 = ' '.join(arg1)
                dataset[i] = {
                    'sent': full_sent,
                    'arg1': arg0,
                    'rel': rel,
                    'arg2': arg1
                }
                i += 1

        for sentence in dataset:
            dataset_pt[actual_pos] = {"phrase": dataset[sentence]['sent'], "extractions": []}
            arg1 = dataset[sentence]["arg1"]
            arg2 = dataset[sentence]["arg2"]
            rel = dataset[sentence]["rel"]

            dataset_pt[actual_pos]["extractions"].append(
                {
                    "arg1": transform_portuguese_contractions(arg1),
                    "rel": transform_portuguese_contractions(rel),
                    "arg2": transform_portuguese_contractions(arg2),
                    "valid": 1,
                }
            )
            actual_pos += 1
            if actual_pos >= stop:
                break
        return dataset_pt

In [37]:
ds = load_bio_datasets('s2_alan_valid_corpus.txt')

In [38]:
len(ds)

936

In [8]:
text = 'ele'
doc = nlp(text)
for tk in doc:
    print(tk.pos_)

PRON


In [3]:
def save_data(dataset, name: str):
    dir = 'datasets/outputs/clean_dataset'
    path = pathlib.Path(dir)
    path.mkdir(parents=True, exist_ok=True)
    file_text = ""
    before_tag = '\tXX\t-\t-\t-\t-\t-\t*\t'
    for key in tqdm(dataset, desc='saving data'):
        ext = dataset[key]['ext']['splited']
        sent = dataset[key]['sent']['sent']
        match = matcher.match(sent=sent, 
                              arg1=ext['arg0'], 
                              rel=ext['rel'],
                              arg2=ext['arg1'])
        if match[-1]:
            for i,token in enumerate([token.text for token in nlp(sent)]):
                if i in range(match[0][0], match[0][1]+1):
                    if len(range(match[0][0], match[0][1]+1)) == 1:
                        file_text += token + before_tag + "S-ARG0" + "\n"
                    elif i == match[0][0]:
                        file_text += token + before_tag + "B-ARG0" + "\n"
                    elif i > match[0][0] and i<match[0][1]:
                        file_text += token + before_tag + "I-ARG0" + "\n"
                    elif i == match[0][1]:
                        file_text += token + before_tag + "E-ARG0" + "\n"

                elif i in range(match[1][0], match[1][1]+1):
                    if len(range(match[1][0], match[1][1] + 1)) == 1:
                        file_text += token + before_tag + "S-V" + "\n"
                    elif i == match[1][0]:
                        file_text += token + before_tag + "B-V" + "\n"
                    elif i > match[1][0] and i < match[1][1]:
                        file_text += token + before_tag + "I-V" + "\n"
                    elif i == match[1][1]:
                        file_text += token + before_tag + "E-V" + "\n"

                elif i in range(match[2][0], match[2][1]+1):
                    if len(range(match[2][0], match[2][1] + 1)) == 1:
                        file_text += token + before_tag + "S-ARG1" + "\n"
                    elif i == match[2][0]:
                        file_text += token + before_tag + "B-ARG1" + "\n"
                    elif i > match[2][0] and i < match[2][1]:
                        file_text += token + before_tag + "I-ARG1" + "\n"
                    elif i == match[2][1]:
                        file_text += token + before_tag + "E-ARG1" + "\n"
                else:
                    file_text += token + before_tag + "O" + "\n"
            file_text += "\n"
    with open(dir + "/" + name + ".txt", "a", encoding="utf-8") as f:
        try:
            txt_f = f.read()
        except:
            txt_f = ""
        txt_f += file_text
        f.write(txt_f)

In [4]:
def load_dataset(): 
    dataset = get_dataset()
    extractions = {}
    i = 0
    split = 0
    while split<=1:
        for triple in dataset[split]:
            sent = triple.phrase
            sent_counter = 0
            for token in nlp(sent):
                sent_counter += 1
                
            ext = triple.gold_extractions[0]
            merged = ext.arg0 + " " + ext.rel + " " + ext.arg1
            ext_counter = 0
            for token in nlp(merged):
                ext_counter += 1
                
            ext = {"length" : ext_counter,
                   "merged": merged,
                   "splited" : {
                       "arg0" : ext.arg0,
                       "rel" : ext.rel,
                       "arg1" : ext.arg1
                        }
                   }
            extractions[i] = {"sent" : {"sent" : sent,
                                        "length" : sent_counter
                                        },
                              "ext" : ext}
            i+=1
        split+=1
    return extractions

In [5]:
dataset = load_dataset()
dataset[0]

processando TA4: 100%|██████████| 102616/102616 [11:30<00:00, 148.64it/s]


{'sent': {'sent': 'A linha de a cintura foi colocada mais alta e as saias se tornaram mais longas .',
  'length': 17},
 'ext': {'length': 6,
  'merged': 'as saias se tornaram mais longas',
  'splited': {'arg0': 'as saias',
   'rel': 'se tornaram',
   'arg1': 'mais longas'}}}

In [7]:
len(dataset)

102615

In [8]:
clean_dataset = {}
trash = {}
i=0
j=0
for data in dataset:
    arg1 = dataset[data]['ext']['splited']['arg0']
    arg1_doc = nlp(arg1)
    for tk in arg1_doc:
        if tk.pos_ in ['NOUN', 'PROPN']:
            clean_dataset[i] = dataset[data]
            i+=1
            break
        if tk.text == arg1_doc[-1].text:
            trash[j] = dataset[data]
            j+=1

In [18]:
save_data(clean_dataset, 'TA4_corpus')

saving data: 100%|██████████| 97238/97238 [26:29<00:00, 61.17it/s]


In [10]:
len(clean_dataset)

97238

In [9]:
len(trash)

5458

In [17]:
print(clean_dataset[10])

{'sent': {'sent': 'No entanto , a ação de o governo , que pegou a gerência de a Jaguar de surpresa , pode frustrar o negócio minoritário de a GM obrigando-os a lutar por toda a Jaguar .', 'length': 35}, 'ext': {'length': 13, 'merged': 'a ação de o governo pode frustrar o negócio minoritário de a GM', 'splited': {'arg0': 'a ação de o governo', 'rel': 'pode frustrar o negócio minoritário de', 'arg1': 'a GM'}}}


In [7]:
same_string_extractions = {}
minimum_extractions = {}
small_extractions = {}
long_extractions = {}
small_sentences = {}

long_sentences = {}
long_sentece_small_extractions = {}

for key in dataset:
    inst = dataset[key]
    sent_len = inst["sent"]["length"]
    ext_len = inst["ext"]["length"]
    if sent_len-1 == ext_len or sent_len == ext_len:
        same_string_extractions[key] = inst
    elif ext_len > 8:
        long_extractions[key] = inst
    elif ext_len == 3:
        minimum_extractions[key] = inst
    elif ext_len > 3 and ext_len <=10:
        small_extractions[key] = inst
    

In [9]:
print("extractions with same sentence length: ",len(same_string_extractions))
print("extractions with one tag on arg0, one on rel and one on arg1: ", len(minimum_extractions))
print("small extractions, token counter <= 8: ", len(small_extractions))
print("long extractions, token counter > 8: ", len(long_extractions))

extractions with same sentence length:  16463
extractions with one tag on arg0, one on rel and one on arg1:  927
small extractions, token counter <= 8:  38553
long extractions, token counter > 8:  61959


In [10]:
save_data(small_extractions, 'small')

saving data: 100%|██████████| 38553/38553 [08:56<00:00, 71.83it/s]
