In [2]:
import string
import nltk
import pandas as pd
from collections import defaultdict
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [3]:
# Build a single Sastrawi stemmer up front. SRLAnnotation.get_predicate()
# reads it through the module-level name `stemmer`.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [4]:
# Annotator names. Each name is appended to 'Anotasi ' to select that
# annotator's CSV column, and is substituted into the output file paths.
annotators = ['Furqon', 'Richard']

In [17]:
# Input CSV read with pandas in a later cell.
filepath = '../data/SRL/indonesia_srl_annotation.csv'
# Output path templates; the '{}' placeholder is filled with an annotator name
# via str.format() in get_output_filepath() and
# get_sentence_predicate_pair_filepath().
base_output_filepath = '../data/SRL/indonesia_srl_annotation_predicates_{}.txt'
base_sentence_predicate_pair_filepath = '../data/SRL/sentence_predicate_pair_{}.txt'

In [6]:
class SRLAnnotation:
    """A sentence paired with its space-separated SRL tag sequence.

    Attributes:
        sentence: the input sentence with every character of
            ``USELESS_TOKEN`` removed.
        words: ``sentence`` tokenized with ``nltk.tokenize.word_tokenize``.
        annotation: list of tags obtained by splitting the annotation string
            on single spaces.
    """

    # Characters removed from the sentence before tokenizing. The string is
    # iterated one character at a time, so this strips "(" and ")".
    USELESS_TOKEN = "()"
    # Tag whose position in ``annotation`` marks the predicate word.
    PREDICATE_TAG = 'B-V'
    # Sentinel returned by get_predicate() when no predicate can be located.
    NO_PREDICATE = "NOPRED"

    def __init__(self, sentence, annotation):
        """Clean and tokenize ``sentence`` and split ``annotation`` into tags.

        Args:
            sentence: raw sentence text.
            annotation: tags for the sentence, separated by single spaces.
        """
        cleaned_sentence = sentence
        for token in self.USELESS_TOKEN:
            cleaned_sentence = cleaned_sentence.replace(token, '')
        self.sentence = cleaned_sentence
        self.words = nltk.tokenize.word_tokenize(self.sentence)
        self.annotation = annotation.split(' ')

    def get_predicate(self):
        """Return the stemmed word at the position of the first 'B-V' tag.

        Returns:
            The stem produced by the module-level ``stemmer``, or "NOPRED"
            when the annotation has no 'B-V' tag or the tag's position lies
            beyond the end of ``words``.
        """
        try:
            predicate_index = self.annotation.index(self.PREDICATE_TAG)
            predicate_word = self.words[predicate_index]
        except (ValueError, IndexError):
            # ValueError: no 'B-V' tag. IndexError: more tags than tokens.
            # Only these two cases mean "no predicate"; any other error
            # (e.g. a missing or failing stemmer) must surface, not be
            # silently reported as NOPRED.
            return self.NO_PREDICATE
        return stemmer.stem(predicate_word)

In [18]:
def get_output_filepath(annotator):
    """Return ``base_output_filepath`` with its placeholder filled by ``annotator``."""
    output_filepath = base_output_filepath.format(annotator)
    return output_filepath

def get_sentence_predicate_pair_filepath(annotator):
    """Return ``base_sentence_predicate_pair_filepath`` filled in with ``annotator``."""
    pair_filepath = base_sentence_predicate_pair_filepath.format(annotator)
    return pair_filepath

In [8]:
# Load the annotation CSV. Later cells read its 'Kalimat' column and one
# 'Anotasi <annotator>' column per entry in `annotators`.
srl_annotations_df = pd.read_csv(filepath)

### Create SRLAnnotation objects from the CSV

In [9]:
# annotator name -> {"annotations": [...], later also "predicates": [...]}.
# The inner defaultdict has no default factory, so it acts like a plain dict.
srl_annotations = defaultdict(defaultdict)

for annotator in annotators:
    annotation_column = 'Anotasi ' + annotator
    # One SRLAnnotation per CSV row, in row order.
    srl_annotations[annotator]["annotations"] = [
        SRLAnnotation(row['Kalimat'], row[annotation_column])
        for _, row in srl_annotations_df.iterrows()
    ]

### Get predicates from SRLAnnotation object

In [10]:
# Show one cleaned sentence as a quick check. The annotator is selected
# explicitly rather than through a loop variable left over from another cell,
# so this cell does not depend on which loop happened to run last.
print(srl_annotations[annotators[-1]]["annotations"][2].sentence)

Di tempat cawagub petahana Djarot Saiful Hidayat, TPS 08 Setiabudi, Kuningan Timur, Jakarta Selatan, suara pasangan Ahok-Djarot berada di bawah suara pasangan calon Anies-Sandi.


In [11]:
for annotator in srl_annotations:
    # One predicate (or the "NOPRED" placeholder) per annotation, in order.
    predicates = [srl.get_predicate() for srl in srl_annotations[annotator]["annotations"]]
    # Drop None entries only; placeholders are kept so positions stay aligned
    # across annotators.
    filtered_predicates = [predicate for predicate in predicates if predicate is not None]
    srl_annotations[annotator]["predicates"] = filtered_predicates

#### Sanity Check

In [12]:
# For each annotator: name, first 100 predicates, the predicate at index 18,
# and the total number of predicates.
for annotator, annotator_record in srl_annotations.items():
    annotator_predicates = annotator_record["predicates"]
    print(annotator)
    print(annotator_predicates[:100])
    print(annotator_predicates[18])
    print(len(annotator_predicates))

Furqon
['NOPRED', 'hitung', 'pasang', 'unggul', 'dapat', 'NOPRED', 'jumlah', 'ikut', 'lolos', 'raih', 'oleh', 'kerja', 'tetap', '', '', 'tahu', 'menang', 'kantong', 'NOPRED', 'jadi', 'oleh', 'oleh', 'unggul', 'NOPRED', 'coblos', 'oleh', 'NOPRED', 'kalah', 'raih', 'NOPRED', 'menang', 'guna', 'raih', 'raih', 'ujar', 'pantau', 'menang', 'raih', 'ada', 'ada', 'klaim', 'klaim', 'kata', 'nyata', 'kata', 'aku', 'oleh', 'kata', 'sebut', 'kata', 'jamin', 'dapat', 'kata', 'urus', 'kata', 'raih', 'ada', 'demokat', 'klaim', 'tang', 'ujar', 'kata', 'tegas', 'ucap', 'kata', 'dapat', 'ucap', 'sampai', 'kata', 'usai', 'tarung', 'kalah', 'kalah', 'hana', 'NOPRED', 'dukung', 'dukung', 'kalah', '', 'pasang', 'dan', 'kholiq', 'kalah', 'muka', 'kata', 'resmi', 'menang', 'ujar', 'ujar', 'calon', 'ujar', 'lanjut', 'dapat', 'ujar', 'ujar', 'ingat', 'ujar', 'sebut', 'ujar', 'kata']
NOPRED
2112
Richard
['NOPRED', 'hitung', 'ada', 'unggul', 'dapat', 'NOPRED', 'jumlah', 'ikut', 'lolos', 'raih', 'oleh', 'kerja', '

#### Only get the agreed predicates (Furqon == Richard)

Also filter the agreed predicates: drop the `NOPRED` placeholder, which stands for sentences with no predicate.

In [13]:
# Pair the two annotators' predicates position by position.
predicates_pair = zip(
    srl_annotations[annotators[0]]["predicates"],
    srl_annotations[annotators[1]]["predicates"],
)
# Unique predicates on which both annotators gave the same value.
agreed_predicates = {first for first, second in predicates_pair if first == second}
# The same set without the 'NOPRED' placeholder.
filtered_agreed_predicates = list(
    filter(lambda agreed: agreed != 'NOPRED', agreed_predicates)
)

In [14]:
# Sizes before and after removing the 'NOPRED' placeholder.
agreed_count = len(agreed_predicates)
filtered_agreed_count = len(filtered_agreed_predicates)
print("Agreed predicates: ", agreed_count)
print("Filtered agreed predicates", filtered_agreed_count)
print("Deleted predicates", agreed_count - filtered_agreed_count)

Agreed predicates:  325
Filtered agreed predicates 324
Deleted predicates 1


### Analysis

It turns out that Richard's annotation is better than Furqon's. Therefore, we should use Richard's.

In [15]:
# Richard's predicates with the 'NOPRED' placeholder removed (duplicates kept).
richard_filtered_predicates = list(
    filter(
        lambda predicate: predicate != 'NOPRED',
        srl_annotations["Richard"]["predicates"],
    )
)
print(richard_filtered_predicates[:100])
len(richard_filtered_predicates)

['hitung', 'ada', 'unggul', 'dapat', 'jumlah', 'ikut', 'lolos', 'raih', 'oleh', 'kerja', 'tetap', 'unggul', 'kalah', 'tahu', 'menang', 'kantong', 'jadi', 'oleh', 'oleh', 'unggul', 'coblos', 'oleh', 'kalah', 'raih', 'menang', 'guna', 'raih', 'raih', 'ujar', 'pantau', 'menang', 'raih', 'ada', 'ada', 'klaim', 'klaim', 'kata', 'nyata', 'kata', 'anggap', 'kata', 'sebut', 'kata', 'jamin', 'dapat', 'catat', 'kata', 'raih', 'ada', 'ada', 'klaim', 'tang', 'ujar', 'kata', 'tegas', 'ucap', 'kata', 'dapat', 'ucap', 'sampai', 'kata', 'usai', 'tarung', 'kalah', 'kalah', 'kalah', 'dukung', 'dukung', 'kalah', 'kalah', 'kalah', 'kalah', '', 'kalah', 'muka', 'kata', 'resmi', 'menang', 'ujar', 'ujar', 'kata', 'ujar', 'lanjut', 'dapat', 'ujar', 'ujar', 'ingat', 'ujar', 'sebut', 'ujar', 'kata', 'aku', 'kata', 'duga', 'tegas', 'sikap', 'bahas', 'minta', 'ujar', 'dapat']


2029

Let's just use both anyway

In [16]:
for annotator in annotators:
    # Unique predicates for this annotator, without the 'NOPRED' placeholder.
    filtered_annotator_predicates = {
        predicate
        for predicate in srl_annotations[annotator]["predicates"]
        if predicate != 'NOPRED'
    }
    print("Writing {} predicates ".format(len(filtered_annotator_predicates)))
    # Mode 'w' already truncates the file and starts at offset 0, so no
    # seek()/truncate() is needed. The encoding is set explicitly so the
    # result does not depend on the platform default.
    with open(get_output_filepath(annotator), 'w', encoding='utf-8') as fp:
        # Sorted so the file is identical from run to run; iterating the set
        # directly follows hash order, which is not stable for strings.
        for predicate in sorted(filtered_annotator_predicates):
            # write(), not writelines(): the argument is one string, and
            # writelines() would iterate it character by character.
            fp.write(predicate + '\n')

Writing 364 predicates 
Writing 357 predicates 


### Create (predicate, sentence) pair

In [21]:
for annotator in srl_annotations:
    # One "sentence|predicate" line per annotation, in annotation order.
    # Mode 'w' already truncates the file and starts at offset 0, so no
    # seek()/truncate() is needed. The encoding is set explicitly so the
    # result does not depend on the platform default.
    with open(get_sentence_predicate_pair_filepath(annotator), 'w', encoding='utf-8') as fp:
        for annotation in srl_annotations[annotator]["annotations"]:
            # write(), not writelines(): the argument is one string, and
            # writelines() would iterate it character by character.
            fp.write("{}|{}\n".format(annotation.sentence, annotation.get_predicate()))