In [1]:
# Move the working directory up one level so that it is the project root (e.g. .../amr-parser).
# NOTE: `%cd ..` is relative, so re-running this cell moves up one more level each time.
%cd ..

d:\MasterDegree\AMR\amr-parser


In [2]:
import string
from collections import defaultdict
from importlib import reload

import nltk
import penman
import stanza
from tqdm import tqdm
import tensorflow as tf
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Re-execute the already-imported penman module (importlib.reload).
# NOTE(review): presumably here to pick up edits to penman.py without restarting the kernel - confirm it is still needed.
reload(penman)


<module 'penman' from 'D:\\miniconda3\\envs\\amr\\lib\\site-packages\\penman.py'>

In [3]:
from prediction.predict import predict_and_process, process_sentence

# from utils.amr import penman_to_dot
from utils.ner.entity_recognizer import get_entities, get_entities_tf
from utils.amr_parsing.io import AMRIO
from utils.amr_parsing.amr import AMR



[2021-12-21 08:53:16,124 INFO] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2021-12-21 08:53:16 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |

[2021-12-21 08:53:16,179 INFO] Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |

2021-12-21 08:53:16 INFO: Use device: cpu
[2021-12-21 08:53:16,179 INFO] Use device: cpu
2021-12-21 08:53:16 INFO: Loading: tokenize
[2021-12-21 08:53:16,189 INFO] Loading: tokenize
2021-12-21 08:53:16 INFO: Loading: pos
[2021-12-21 08:53:16,229 INFO] Loading: pos
2021-12-21 08:53:16 INFO: Loading: lemma
[2021-12-21 08:53:16,722 INFO] Loading: lemma
2021-12-21 08:53:16 INFO: Loading: depparse
[2021-12-21 08:53:16,769 INFO] 

In [4]:
# List the GPU devices TensorFlow reports; the result is only referenced by the commented-out line below.
physical_devices = tf.config.list_physical_devices('GPU') 
# tf.config.experimental.set_memory_growth(physical_devices[0], True)  # if cuBLAS refuses to work, run this before importing get_entities
# NOTE(review): get_entities is imported in an earlier cell, so enabling the line above here may be too late - confirm the intended cell order.

In [5]:
class FeatureAnnotator:
    """Compute per-token features (tokens, lemmas, POS tags, NER tags) for a sentence.

    The stanza pipeline supplies tokens and universal POS tags, the Sastrawi
    stemmer supplies lemmas, and the NER / POS back-ends are chosen from
    ``params`` (see ``__init__``).
    """

    # Not read or written by any method of this class; kept so existing
    # references to ``FeatureAnnotator.word_dict`` keep working.
    word_dict = {}

    def __init__(self, params):
        """Build the taggers selected by ``params``.

        Args:
            params: dict with the keys
                ``'ner_tagger'`` - ``'tf'`` selects ``get_entities_tf``; any
                other value selects ``get_entities``.
                ``'pos_tagger'`` - ``'nltk'`` loads an NLTK ``CRFTagger`` from
                the pretrained model file; any other value (e.g. ``'stanza'``)
                keeps the POS tags produced by the stanza pipeline.

        Raises:
            KeyError: if either key is missing from ``params``.
        """
        self.nlp = stanza.Pipeline(lang="id", use_gpu=True, verbose=False, tokenize_pretokenized=True)
        self.annotation = {}
        self.stemmer = StemmerFactory().create_stemmer()
        self.params = params

        if self.params["ner_tagger"] == 'tf':
            self.ner = get_entities_tf
        else:
            self.ner = get_entities

        if self.params['pos_tagger'] == 'nltk':
            self.pos_tagger = nltk.tag.CRFTagger()
            self.pos_tagger.set_model_file('pretrained/pos_tagger/all_indo_man_tag_corpus_model.crf.tagger')
        # For 'stanza' nothing extra is needed: annotate() already collects word.upos.

    def annotate(self, sentence):
        """Annotate one sentence and return its features.

        ASCII punctuation (``string.punctuation``) is removed from ``sentence``
        before any tagger sees it.

        Args:
            sentence: the raw sentence text.

        Returns:
            A ``defaultdict(list)`` (also stored on ``self.annotation``) with
            ``'sentence'`` (the punctuation-stripped text), ``'tokens'``,
            ``'lemmas'``, ``'pos_tags'`` and ``'ner_tags'`` (the raw output of
            the NER back-end).

        Raises:
            IndexError: if the NER back-end returns fewer tags than there are
                tokens.
        """
        self.annotation = defaultdict(list)
        sentence = sentence.translate(str.maketrans('', '', string.punctuation))
        self.annotation["sentence"] = sentence
        doc = self.nlp(sentence)

        # Assumes the NER back-end returns one flat list of string tags, one per token.
        ner_tags = self.ner(sentence)
        self.annotation['ner_tags'] = ner_tags

        # The document may hold several sentences while the NER tags form a
        # single flat list, so index the tags with a running offset instead of
        # the per-sentence position (which restarts at 0 for every sentence).
        token_idx = 0
        for sent in doc.sentences:
            for word in sent.words:
                self.annotation['tokens'].append(word.text)
                # Drop a chunk prefix such as "B-"/"I-" the same way
                # transform_ner_tags does, so "B-PER" and "PER" are both
                # recognised as a person tag.
                entity_type = ner_tags[token_idx].split("-")[-1]
                if entity_type in ('PER', 'ORG'):
                    # Person / organisation names are only lower-cased, never stemmed.
                    lemma = word.text.lower()
                else:
                    lemma = self.stemmer.stem(word.text)
                self.annotation['lemmas'].append(lemma)
                self.annotation['pos_tags'].append(word.upos)
                token_idx += 1

        if self.params['pos_tagger'] == 'nltk':
            # Replace the stanza POS tags with the CRF tagger's tags.
            self.annotation['pos_tags'] = [tag[1] for tag in self.pos_tagger.tag(self.annotation['tokens'])]

        return self.annotation

# Short NER tag (after any "X-" prefix is removed) -> long label name.
# Tags that are not listed here are rewritten to "O" by transform_ner_tags.
ner_labels_map = {
    "PER": "PERSON",
    "LOC": "LOCATION",
    "FAC": "LOCATION",
    "GPE": "LOCATION",
    "NOR": "ORGANIZATION",
    "ORG": "ORGANIZATION",
    "MON": "MONEY",
    "CRD": "NUMBER",
    "ORD": "ORDINAL",
    "PRC": "PERCENT",
    "DAT": "DATE",
    "TIM": "TIME",
}


def transform_ner_tags(annotation):
    """Rewrite ``annotation["ner_tags"]`` in place using ``ner_labels_map``.

    Only the text after the last "-" of each tag is looked up; anything that
    is not in the map becomes "O". Nothing is returned.
    """
    tags = annotation["ner_tags"]
    for position in range(len(tags)):
        short_tag = tags[position].rsplit("-", 1)[-1]
        tags[position] = ner_labels_map.get(short_tag, "O")

In [9]:
def dump_amr_features(amr_graph, annotation, f, index=None):
    """Wrap one annotation in an ``AMR`` record and write it with ``AMRIO.dump``.

    Args:
        amr_graph: graph to attach; the empty string means "leave the record's
            graph untouched".
        annotation: mapping with the keys ``"sentence"``, ``'tokens'``,
            ``'lemmas'``, ``'pos_tags'`` and ``'ner_tags'``.
        f: open file object handed to ``AMRIO.dump``.
        index: record id; it is only assigned when truthy, so ``None`` and
            ``0`` both leave the id untouched.
    """
    record = AMR()

    if amr_graph != "":
        record.graph = amr_graph
    if index:
        record.id = index

    record.sentence = annotation["sentence"].strip()
    # Copy the token-level features over under the same names.
    for field in ('tokens', 'lemmas', 'pos_tags', 'ner_tags'):
        setattr(record, field, annotation[field])

    AMRIO.dump([record], f)

def predict_sentence(sentence,annotator, f, index=0):
    """Annotate ``sentence``, predict its AMR graph and write the result to ``f``.

    Failures are reported on stdout instead of being raised:

    * if annotation fails, the sentence and the cause are printed and nothing
      is written;
    * if prediction or dumping fails, the error and the annotation are printed
      and the record is written again without a graph.

    Args:
        sentence: raw sentence text.
        annotator: object with an ``annotate(sentence)`` method.
        f: open file object the record is dumped to.
        index: record id passed to ``dump_amr_features``. With the default
            ``0`` no id is set, because ``dump_amr_features`` only assigns a
            truthy index.

    Returns:
        None.
    """
    try:
        annotation = annotator.annotate(sentence)
    except Exception as e:
        print(f"Problem with sentence : {sentence}")
        # Report the cause as well; previously the exception was discarded,
        # which made annotation failures impossible to diagnose.
        print(f"caused by {type(e).__name__}: {e}")
        return
    try:
        filtered_X = process_sentence(sentence)
        amr_graph = predict_and_process(filtered_X)
        dump_amr_features(amr_graph, annotation, f, index)
    except Exception as e:
        print("problem with", e)
        print(annotation)
        # Fall back to a record without a graph so the sentence still appears in the output.
        dump_amr_features("", annotation, f, index)

def prepare_sentence_for_parsing(filepath, params, start=1):
    """Annotate every line of ``filepath`` and dump the features with a placeholder graph.

    Each line is treated as one sentence and written with the graph
    ``"(t/temp)"`` and its 1-based line number as id.

    Args:
        filepath: text file with one sentence per line.
        params: options forwarded to ``FeatureAnnotator``.
        start: 1-based line number to start from; earlier lines are skipped.
            When ``start == 1`` the output is ``<filepath>.features``,
            otherwise ``<filepath>.<start>.features``.

    A sentence that cannot be annotated is reported on stdout and skipped; the
    remaining lines are still processed.
    """
    annotator = FeatureAnnotator(params)
    outfile = filepath if start == 1 else f"{filepath}.{start}"
    with open(outfile + '.features', 'w', encoding='utf-8') as f, \
            open(filepath, 'r', encoding='utf-8') as t:
        for index, sentence in enumerate(tqdm(t), 1):
            if index < start:
                continue
            try:
                annotation = annotator.annotate(sentence)
            except Exception as e:
                print(f"Problem with sentence : {sentence}")
                print(f"caused by {type(e).__name__}: {e}")
                # Skip only this sentence; returning here would silently drop
                # every remaining line of the file.
                continue
            dump_amr_features("(t/temp)", annotation, f, index)

def preprocess_sentence(filepath, params):
    """Run ``predict_sentence`` on every line of ``filepath``.

    Results are written to ``<filepath>.features.pred``; each line's 1-based
    line number is passed as its index.
    """
    feature_annotator = FeatureAnnotator(params)
    prediction_path = filepath + '.features.pred'
    with open(prediction_path, 'w', encoding='utf-8') as sink, \
            open(filepath, 'r', encoding='utf-8') as source:
        for line_number, line in enumerate(tqdm(source), start=1):
            predict_sentence(line, feature_annotator, sink, index=line_number)

def predict_from_amr(filepath, params):
    """Re-predict every record read from the AMR file ``filepath``.

    Each record's sentence goes through ``predict_sentence`` with the record's
    own id; output is written to ``<filepath>.features.pred``. A progress line
    is printed after every 100 records.
    """
    feature_annotator = FeatureAnnotator(params)
    prediction_path = filepath + '.features.pred'

    with open(prediction_path, 'w', encoding='utf-8') as sink:
        seen = 0
        for amr in AMRIO.read(filepath):
            seen += 1
            if not seen % 100:
                print('{} processed.'.format(seen))

            predict_sentence(amr.sentence, feature_annotator, sink, amr.id)

def preprocess_amr(filepath, params):
    """Annotate every record of the AMR file ``filepath`` and dump it with its own graph.

    For each record the sentence is annotated, the NER tags are rewritten by
    ``transform_ner_tags``, and the record is written to
    ``<filepath>.features`` under its original id. A progress line is printed
    after every 100 records.
    """
    feature_annotator = FeatureAnnotator(params)

    with open(filepath + '.features', 'w', encoding='utf-8') as sink:
        for seen, amr in enumerate(AMRIO.read(filepath), start=1):
            if seen % 100 == 0:
                print('{} processed.'.format(seen))
            features = feature_annotator.annotate(amr.sentence)
            transform_ner_tags(features)
            dump_amr_features(amr.graph, features, sink, index=amr.id)

In [7]:
# Smoke test: run the full predict_sentence path on a single sentence.
annotator=FeatureAnnotator({'pos_tagger': 'nltk', 'ner_tagger':'anago'})
# Open with an explicit UTF-8 encoding, like every other output file in this
# notebook, so the result does not depend on the platform's default encoding.
with open("data/amr_experiment/test_1.txt", "w", encoding="utf-8") as f:
    predict_sentence("Aku membuang Roti itu ke tempat sampah", annotator, f)

In [10]:
# Generate placeholder-graph feature files for the BPPT sentences.
# The dev split call is kept commented out below.
# prepare_sentence_for_parsing("data/bppt_id_en/split/dev_id.txt", {'pos_tagger': 'nltk', 'ner_tagger':'anago'})
# start=19214 skips the first 19213 lines; because start != 1 the output goes to
# "data/bppt_id_en/split/train_id.txt.19214.features".
prepare_sentence_for_parsing("data/bppt_id_en/split/train_id.txt", {'pos_tagger': 'nltk', 'ner_tagger':'anago'}, start=19214)

21839it [20:09, 18.06it/s]  


In [7]:
# Annotate the train / dev / test AMR files; each call writes "<input path>.features".
# Note that every call constructs its own FeatureAnnotator (see preprocess_amr).
preprocess_amr("data/amr_id_2.0/train.txt", {'pos_tagger': 'nltk', 'ner_tagger':'anago'})
preprocess_amr("data/amr_id_2.0/dev.txt", {'pos_tagger': 'nltk', 'ner_tagger':'anago'})
preprocess_amr("data/amr_id_2.0/test.txt", {'pos_tagger': 'nltk', 'ner_tagger':'anago'})

100 processed.
200 processed.
300 processed.
400 processed.
500 processed.
600 processed.
700 processed.
800 processed.
900 processed.
1000 processed.
1100 processed.
1200 processed.
1300 processed.
1400 processed.
1500 processed.
1600 processed.
1700 processed.
1800 processed.
1900 processed.
2000 processed.
2100 processed.
2200 processed.
2300 processed.
Exception raised : 
 (d / dan    :op1 (j / jadi            :ARG0 (h / hendra)            :ARG1 (a / alumni                     :mod (l / lulusan                             :mod (b / baik)))            :location (u / universitas                         :mod (i / islam                                 :location (m / malang)))            :time (t / tahun                     :mod (_ / 2013)))    :op2 (d2 / dagang             :ARG0 (a2 / ayah)             :ARG1 (s / siomay))) 358 (
(d / dan    :op1 (j / jadi            :ARG0 (h / hendra)            :ARG1 (a / alumni                     :mod (l / lulusan                             :mod (b

In [48]:
# Manual step-by-step run on one sentence, without the error handling of predict_sentence.
# 'ner_tagger': 'tf' makes FeatureAnnotator use get_entities_tf.
# `annotation` and `amr_graph` are read by the next cell.
sentence="Yasmin membuang sampah di tempat sampah"
annotator = FeatureAnnotator({'pos_tagger': 'nltk', 'ner_tagger':'tf'})
annotation = annotator.annotate(sentence)
filtered_X = process_sentence(sentence)
amr_graph = predict_and_process(filtered_X)
# amr_graph = predict_and_process(filtered_X)


In [53]:
# Write the graph and annotation produced by the previous cell to "error.amr" with id 1.
# The two commented-out lines below are leftover debugging statements.
# amr_graph._triples.pop(3) 
# amr_graph.triples()
with open("error.amr", 'w', encoding='utf-8') as f:
    dump_amr_features(amr_graph, annotation, f, 1)