In [1]:
# Make sure the working directory is the project root, e.g. /amr-parser.
# Note: `%cd ..` is relative, so run this cell only once per kernel session.
%cd ..
%pwd

c:\Users\bandi\MasterDegree\AMR\amr-parser


'c:\\Users\\bandi\\MasterDegree\\AMR\\amr-parser'

In [2]:
import string
from collections import defaultdict
from importlib import reload

import nltk
import penman
import stanza
from tqdm import tqdm
import tensorflow as tf
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

reload(penman)


<module 'penman' from 'D:\\miniconda3\\envs\\amr\\lib\\site-packages\\penman.py'>

In [3]:
from prediction.predict import predict_and_process, process_sentence

# from utils.amr import penman_to_dot
from utils.ner.entity_recognizer import get_entities, get_entities_tf
from utils.amr_parsing.io import AMRIO
from utils.amr_parsing.amr import AMR



[2021-11-03 11:52:37,848 INFO] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2021-11-03 11:52:37 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |

[2021-11-03 11:52:37,888 INFO] Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |

2021-11-03 11:52:37 INFO: Use device: cpu
[2021-11-03 11:52:37,890 INFO] Use device: cpu
2021-11-03 11:52:37 INFO: Loading: tokenize
[2021-11-03 11:52:37,892 INFO] Loading: tokenize
2021-11-03 11:52:37 INFO: Loading: pos
[2021-11-03 11:52:37,908 INFO] Loading: pos
2021-11-03 11:52:38 INFO: Loading: lemma
[2021-11-03 11:52:38,193 INFO] Loading: lemma
2021-11-03 11:52:38 INFO: Loading: depparse
[2021-11-03 11:52:38,247 INFO] 

In [4]:
# List the GPU devices that TensorFlow can see.
physical_devices = tf.config.list_physical_devices('GPU') 
# tf.config.experimental.set_memory_growth(physical_devices[0], True) # if cuBLAS refuses to work, run this before importing get_entities

In [5]:
class FeatureAnnotator:
    """Annotate a sentence with tokens, lemmas, POS tags and NER tags.

    Configuration is read from the ``params`` mapping:

    * ``params["ner_tagger"]`` -- ``'tf'`` selects ``get_entities_tf``; any
      other value, or a missing key, selects ``get_entities``.
    * ``params["pos_tagger"]`` -- ``'nltk'`` replaces the Stanza UPOS tags with
      the tags of the pretrained NLTK CRF tagger; any other value, or a
      missing key, keeps the Stanza tags.
    """

    # Not used by this class; kept so that existing references to
    # ``FeatureAnnotator.word_dict`` keep working.
    word_dict = {}

    # Tokens carrying one of these NER labels are lowercased instead of stemmed.
    _UNSTEMMED_NER_TAGS = ('PER', 'ORG')

    def __init__(self, params):
        """Build the Stanza pipeline, the Sastrawi stemmer and both taggers.

        Args:
            params: mapping of options (see the class docstring). ``None`` is
                treated as an empty mapping.
        """
        # NOTE(review): per the Stanza docs, tokenize_pretokenized=True makes
        # the pipeline split tokens on whitespace and sentences on newlines --
        # confirm against the installed Stanza version.
        self.nlp = stanza.Pipeline(lang="id", use_gpu=True, verbose=False, tokenize_pretokenized=True)

        factory = StemmerFactory()
        self.stemmer = factory.create_stemmer()
        self.params = params if params is not None else {}
        # ``.get`` so that a params dict without 'ner_tagger' falls back to the
        # default recognizer instead of raising KeyError.
        if self.params.get("ner_tagger") == 'tf':
            self.ner = get_entities_tf
        else:
            self.ner = get_entities

        self.pos_tagger = nltk.tag.CRFTagger()
        self.pos_tagger.set_model_file('pretrained/pos_tagger/all_indo_man_tag_corpus_model.crf.tagger')

    def annotate(self, sentence):
        """Return the feature annotation of ``sentence``.

        ASCII punctuation (``string.punctuation``) is removed from the
        sentence before any other processing.

        Args:
            sentence: the raw sentence string.

        Returns:
            A ``defaultdict(list)`` with the keys ``"sentence"`` (the
            punctuation-free string), ``"ner_tags"`` (the NER function's
            output), ``"tokens"``, ``"lemmas"`` and ``"pos_tags"``.

        Raises:
            IndexError: if the NER function returns fewer tags than the
                pipeline returns tokens.
        """
        annotation = defaultdict(list)
        sentence = sentence.translate(str.maketrans('', '', string.punctuation))
        annotation["sentence"] = sentence
        doc = self.nlp(sentence)

        # assumes self.ner returns one tag per whitespace token -- TODO confirm
        ner_tags = self.ner(sentence)
        annotation['ner_tags'] = ner_tags

        # The NER tags form one flat sequence for the whole input, so the
        # lookup index must keep running across sentence boundaries rather
        # than restart at 0 for every sentence.
        token_index = 0
        for sent in doc.sentences:
            for word in sent.words:
                annotation['tokens'].append(word.text)
                if ner_tags[token_index] in self._UNSTEMMED_NER_TAGS:
                    lemma = word.text.lower()
                else:
                    lemma = self.stemmer.stem(word.text)
                annotation['lemmas'].append(lemma)
                annotation['pos_tags'].append(word.upos)
                token_index += 1

        if self.params.get('pos_tagger') == 'nltk':
            # CRFTagger.tag yields (token, tag) pairs; keep only the tags.
            annotation['pos_tags'] = [tag[1] for tag in self.pos_tagger.tag(annotation['tokens'])]

        return annotation

In [24]:
def dump_amr_features(amr_graph, annotation, f, index=None):
    """Write one AMR record built from ``annotation`` to the open file ``f``.

    Args:
        amr_graph: graph to attach to the record; the empty string means
            "no graph" and leaves the record's graph attribute untouched.
        annotation: mapping providing "sentence", 'tokens', 'lemmas',
            'pos_tags' and 'ner_tags'.
        f: writable file object handed to ``AMRIO.dump``.
        index: record id; any falsy value (``None``, ``0``, ``""``) leaves
            the record's id untouched.
    """
    record = AMR()
    has_graph = amr_graph != ""
    if has_graph:
        record.graph = amr_graph
    if index:
        record.id = index
    record.sentence = annotation["sentence"]
    # Copy the per-token feature lists onto the record, in this order.
    for field in ('tokens', 'lemmas', 'pos_tags', 'ner_tags'):
        setattr(record, field, annotation[field])
    AMRIO.dump([record], f)

def predict_sentence(sentence, annotator, f, index=0):
    """Annotate ``sentence``, predict its AMR graph and dump the record to ``f``.

    Failures are reported on stdout rather than raised:

    * if annotation fails, the sentence and the error are printed and nothing
      is written;
    * if prediction or dumping fails, the error and the annotation are
      printed and a record without a graph is written instead.

    Args:
        sentence: raw sentence string.
        annotator: object exposing ``annotate(sentence)``.
        f: writable file object for the feature records.
        index: record id forwarded to ``dump_amr_features``.

    Returns:
        None.
    """
    try:
        annotation = annotator.annotate(sentence)
    except Exception as e:
        # Report the cause as well; previously the exception was discarded,
        # which made annotation failures impossible to diagnose.
        print(f"Problem with sentence : {sentence} ({type(e).__name__}: {e})")
        return
    try:
        filtered_X = process_sentence(sentence)
        amr_graph = predict_and_process(filtered_X)
        dump_amr_features(amr_graph, annotation, f, index)
    except Exception as e:
        print("problem with", e)
        print(annotation)
        # Fall back to a graph-less record so the sentence still appears in the output.
        # NOTE(review): this also runs when the dump above is what failed -- confirm
        # that a partially written record cannot precede this one.
        dump_amr_features("", annotation, f, index)

def preprocess_sentence(filepath, params):
    """Predict AMR features for a plain-text file holding one sentence per line.

    Records are written to ``filepath + '.features.pred'``. Each record id is
    the 1-based line number of its sentence in the input file.

    Args:
        filepath: path of the UTF-8 input file.
        params: options forwarded to ``FeatureAnnotator``.
    """
    annotator = FeatureAnnotator(params)
    # Open the input first so that a missing input file does not leave an
    # empty (or truncated) output file behind.
    with open(filepath, 'r', encoding='utf-8') as t, \
            open(filepath + '.features.pred', 'w', encoding='utf-8') as f:
        for index, line in enumerate(tqdm(t), 1):
            # Iterating a file yields lines with their trailing newline;
            # strip it so it does not end up in the dumped sentence.
            sentence = line.strip()
            if not sentence:
                # Blank lines carry no sentence to annotate or predict.
                continue
            predict_sentence(sentence, annotator, f, index=index)

def predict_from_amr(filepath, params):
    """Re-predict every sentence of an AMR file and dump the resulting features.

    Reads the records with ``AMRIO.read(filepath)``, runs ``predict_sentence``
    on each record's sentence (keeping the record's id) and writes the output
    to ``filepath + '.features.pred'``. A progress line is printed every 100
    records.

    Args:
        filepath: path of the AMR file to read.
        params: options forwarded to ``FeatureAnnotator``.
    """
    feature_annotator = FeatureAnnotator(params)
    output_path = filepath + '.features.pred'

    with open(output_path, 'w', encoding='utf-8') as f:
        processed = 0
        for amr in AMRIO.read(filepath):
            processed += 1
            if not processed % 100:
                print('{} processed.'.format(processed))

            predict_sentence(amr.sentence, feature_annotator, f, amr.id)

def preprocess_amr(filepath, params):
    """Annotate every record of an AMR file and dump it with its own graph.

    Reads the records with ``AMRIO.read(filepath)``, annotates each record's
    sentence and writes the record's existing graph together with the
    annotation to ``filepath + '.features'``. A progress line is printed every
    100 records.

    Args:
        filepath: path of the AMR file to read.
        params: options forwarded to ``FeatureAnnotator``.
    """
    feature_annotator = FeatureAnnotator(params)
    output_path = filepath + '.features'

    with open(output_path, 'w', encoding='utf-8') as f:
        processed = 0
        for amr in AMRIO.read(filepath):
            processed += 1
            if not processed % 100:
                print('{} processed.'.format(processed))

            features = feature_annotator.annotate(amr.sentence)
            dump_amr_features(amr.graph, features, f, index=amr.id)

In [26]:
# Driver cell: only the test-split prediction is active, writing
# "data/amr_id_v2/test.txt.features.pred"; the train/dev feature runs are disabled.
# preprocess_amr("data/amr_id_v2/train.txt", {'pos_tagger': 'nltk', 'ner_tagger':'tf'})
# preprocess_amr("data/amr_id_v2/dev.txt", {'pos_tagger': 'nltk', 'ner_tagger':'tf'})
predict_from_amr("data/amr_id_v2/test.txt", {'pos_tagger': 'nltk', 'ner_tagger':'tf'})

penman  ('t', [('t', 'instance', 'tiup'), ('i', 'instance', 'ilham'), ('b', 'instance', 'balon')], [('t', ':ARG0', 'i'), ('t', ':ARG1', 'b')])
penman  ('t', [('t', 'instance', 'tulis'), ('o', 'instance', 'obe'), ('p', 'instance', 'puisi')], [('t', ':ARG0', 'o'), ('t', ':ARG1', 'p')])
penman  ('k', [('k', 'instance', 'ketik'), ('s', 'instance', 'saya'), ('m', 'instance', 'makalah')], [('k', ':ARG0', 's'), ('k', ':ARG1', 'm')])
penman  ('a1', [('a1', 'instance', 'angga'), ('a2', 'instance', 'anak'), ('a3', 'instance', 'ajaib')], [('a1', ':mod', 'a2'), ('a2', ':mod', 'a3')])
penman  ('o', [('o', 'instance', 'orang'), ('i', 'instance', 'ibu'), ('d', 'instance', 'dosen')], [('o', ':ARG0', 'i'), ('o', ':ARG1', 'd')])
penman  ('r', [('r', 'instance', 'renang'), ('d', 'instance', 'dia')], [('r', ':ARG0', 'd')])
penman  ('c', [('c', 'instance', 'cari'), ('i', 'instance', 'ilham'), ('b', 'instance', 'buku'), ('h', 'instance', 'hilang')], [('c', ':ARG0', 'i'), ('c', ':ARG1', 'b'), ('b', ':mod', '

In [47]:
# Smoke test: annotate one sentence and dump it without a graph to "test.features".
sentence="ketua mpr zulkifli hasan menyesalkan kisruh yang terjadi antara pelaku sarana transportasi online dan tradisional ."
# 'ner_tagger' is given explicitly because FeatureAnnotator.__init__ reads that key;
# 'tf' matches the setting used by the other calls in this notebook.
annotator = FeatureAnnotator({"pos_tagger": "nltk", "ner_tagger": "tf"})
annotation = annotator.annotate(sentence)
# filtered_X = process_sentence(sentence)
# amr_graph = predict_and_process(filtered_X)
with open("test" + '.features', 'w', encoding='utf-8') as f:
    # Signature is dump_amr_features(amr_graph, annotation, f, index): pass an empty
    # graph and the record id 1 (the previous call had the arguments shifted by one).
    dump_amr_features("", annotation, f, index=1)
