In [1]:
from libs.corp_df import *
from libs.Embeddings import *
from lexicons.mila import get_lex
import numpy as np
import pickle
from tqdm.auto import tqdm
import random
import os
import time
import pandas as pd

In [2]:
# Create the output and log directories up front. os.makedirs(..., exist_ok=True)
# does nothing when the directory already exists, so this cell can be re-run
# safely (the shell `mkdir` complains on re-runs and is not portable), and the
# 'logs' directory is guaranteed to exist before logInfo appends to it.
os.makedirs('corpora/articles_stage2', exist_ok=True)
os.makedirs('logs', exist_ok=True)

# Append-only log files written by logInfo
logname_embedding = 'logs/embedding_log.txt'
logname_gold = 'logs/gold_annotation.txt'
def logInfo(doc_id, num_toks, json_size, time_to_encode, gold_info):
    """Append one document's statistics and gold annotations to the log files.

    Parameters
    ----------
    doc_id
        Identifier of the document; written via ``str()``.
    num_toks
        Count reported on the ``# TOKS`` line.
    json_size
        Size reported on the ``JSON SIZE`` line (an ``mb`` suffix is appended).
    time_to_encode
        Duration reported on the ``ENCODING TIME`` line (an ``s`` suffix is
        appended).
    gold_info : str
        Text block of gold-standard annotations for the document.

    The statistics are appended to ``logname_embedding`` and the annotations
    to ``logname_gold``; both paths are module-level names.
    """
    # Explicit UTF-8: the gold annotations contain Hebrew lemmas, and the
    # platform default encoding is not guaranteed to be able to represent them.
    with open(logname_embedding, 'a', encoding='utf-8') as f:
        f.write('DOC ID: ' + str(doc_id) + '\n')
        f.write('# TOKS: ' + str(num_toks) + '\n')
        f.write('JSON SIZE: ' + str(json_size) + 'mb' + '\n')
        f.write('ENCODING TIME: ' + str(time_to_encode) + 's' + '\n')
        # blank line separating consecutive records
        f.write('\n')

    # The with-blocks close the files on exit, so no explicit close() is needed.
    with open(logname_gold, 'a', encoding='utf-8') as g:
        g.write('DOC ID: ' + str(doc_id) + '\n\n')
        g.write(gold_info + '\n\n')

In [3]:
# Load the verb lexicon. get_lex() returns the lexicon together with the
# undotted forms that were left out as ambiguous (i.e. multiple tokens with the
# same root or the same binyan).
vlex, ambiguous_undotted = get_lex()
print(f'# Ambiguous tokens omitted: {len(ambiguous_undotted)}')
print('Ambiguous tokens omitted:', ambiguous_undotted, sep='\n')

# get dictionaries for mapping
def getDictionary(cat_to):
    """Build a mapping from undotted form to the value in column ``cat_to``.

    Both columns are read from the module-level ``vlex`` via ``.to_list()``
    and paired positionally; when an undotted form occurs more than once, the
    last occurrence wins.
    """
    undotted_forms = vlex['undotted'].to_list()
    target_values = vlex[cat_to].to_list()
    return dict(zip(undotted_forms, target_values))

# Lookup tables keyed by undotted form
root_dict = getDictionary('root')
binyan_dict = getDictionary('binyan')


def get_root(i):
    """Return the root stored for undotted form ``i`` (KeyError if absent)."""
    return root_dict[i]


def get_binyan(i):
    """Return the binyan stored for undotted form ``i`` (KeyError if absent)."""
    return binyan_dict[i]

# POS tags treated as verbal -- VB: Verb | BN: Participle | BNT: Participle in construct state
verb_tagset = ['VB', 'BN', 'BNT']

# Running tallies of gold-standard lookups, updated by the annotation loop
successful_count = 0
unsuccessful_count = 0
unsuccessful_verbs = set()  # verb lemmas that weren't in the dictionary

  0%|          | 0/4831 [00:00<?, ?it/s]

# Ambiguous tokens omitted: 34
Ambiguous tokens omitted:
{'הושט', 'ניצל', 'צוחצח', 'חרש', 'התפלח', 'אוורר', 'הושב', 'פרש', 'חבק', 'שר', 'ניתק', 'בוסס', 'השיב', 'הודח', 'נתפרש', 'השביע', 'הונה', 'הוצל', 'רוקן', 'הסיח', 'התחבר', 'ניחם', 'ניבא', 'גבה', 'פילח', 'עייף', 'הדיח', 'הזיח', 'חולל', 'טח', 'כופף', 'סובב', 'לווה', 'צותת'}


In [4]:
# Stage-1 articles (raw, without embeddings), stored as a single pickle file.
# pickle.load can execute arbitrary code, so only point this at a trusted file.
articles_raw_dir = 'corpora/articles_stage1.pickle'

with open(articles_raw_dir, mode='rb') as pickle_file:
    raw_docs = pickle.load(pickle_file)

In [5]:
# Filter raw docs by size for space efficiency: keep only documents whose
# string form has fewer than MAX_DOC_WORDS whitespace-separated words.
MAX_DOC_WORDS = 500

# A list comprehension already produces a list; no extra list() copy is needed.
raw_docs_sampled = [d for d in raw_docs if len(str(d).split()) < MAX_DOC_WORDS]
print(len(raw_docs_sampled))

524


In [6]:
# Instantiate the AlephBERT model from the 'onlplab/alephbert-base' checkpoint.
# Embedding is not defined in this notebook (presumably it comes from the
# libs.Embeddings star-import -- TODO confirm). The resulting object is what
# gets passed to sent.encode_sent() in the embedding loop.
alephBERT = Embedding('onlplab/alephbert-base')

Some weights of the model checkpoint at onlplab/alephbert-base were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight

In [7]:
# Save directory for the per-document JSON files. The trailing slash matters:
# output file names are built by plain string concatenation onto this prefix.
savedir = 'corpora/articles_stage2/'

In [8]:
# new_docs = []

In [9]:
# Iterate through the documents: embed and lemmatize every sentence, attach
# gold-standard root/binyan labels to verb lemmas, then save each document as
# JSON and log its statistics.

# A token with more lemmas than this is reported as a possible error
MAX_LEMMAS_PER_TOKEN = 5

# Start the tallies from zero here so that re-running this cell does not
# double-count lemmas that were already tallied by a previous run.
unsuccessful_verbs = set()
unsuccessful_count = 0
successful_count = 0

for doc in tqdm(raw_docs_sampled):
    # one entry per lemma, joined into the gold-annotation log for this document
    goldstandard_log = []
    for pg in doc:
        for sent in pg:
            sent.encode_sent(alephBERT)
            # get lemmas
            sent.lemmatize_sent()

            for tok in sent:
                # check for errors: report tokens with an unusually long lemma list
                if len(tok.lemmas) > MAX_LEMMAS_PER_TOKEN:
                    print('WARNING: POSSIBLE ERROR')
                    print(sent)
                    print()
                    print(tok.raw_tok)
                    print()
                    print('Forms:')
                    for lem in tok:
                        print(lem.form)
                    print('\n\n')

                # NOTE(review): presumably a missing tokenizer_index means the
                # token could not be aligned with the tokenizer output -- confirm
                if tok.tokenizer_index is None:
                    print(tok.raw_tok)

                # append gold standard verb labels (roots and verbal templates)
                for lem in tok:
                    if lem.pos_tag not in verb_tagset:
                        # non-verbal lemma: no gold labels apply
                        lem.setShoresh(None)
                        lem.setBinyan(None)
                        goldstandard_log.append('NOT VERB')
                        continue

                    lemma = lem.lemma
                    if lemma in root_dict:
                        # look each label up once and reuse it for the log line
                        root = get_root(lemma)
                        binyan = get_binyan(lemma)
                        lem.setShoresh(root)
                        lem.setBinyan(binyan)
                        successful_count += 1
                        goldstandard_log.append(lemma + ' <|> ' + root + ' <|> ' + binyan)
                    else:
                        # verb lemma that is missing from the dictionary
                        lem.setShoresh(None)
                        lem.setBinyan(None)
                        unsuccessful_verbs.add(lemma)
                        unsuccessful_count += 1
                        goldstandard_log.append('NOT FOUND')

    # Save the annotated document as JSON.
    # NOTE(review): the timed span covers only doc.to_json(), yet logInfo
    # records it under 'ENCODING TIME' -- confirm this is the intended measure.
    fName = savedir + str(doc.doc_id) + '.json'
    start = time.time()
    doc.to_json(fileDir=fName)
    end = time.time()
    elapsed = end - start

    # get size info (bytes -> units of 1024**2 bytes)
    fsize = os.path.getsize(fName) / (1024 ** 2)
    logInfo(doc.doc_id,
            len(str(doc).split()),
            np.round(fsize, 1),
            np.round(elapsed, 2),
            '\n'.join(goldstandard_log))

  0%|          | 0/524 [00:00<?, ?it/s]

[CLS] והתשובה כנראה לא מצאה חן בעיניה והיא השתמשה בתשובתינו ובמראינו החיצוני , כהצדקה למעצר הילדות עד תום ההליכים ... בתום הדיון השני מצאנו לנכון להסביר ביתר פירוט מה דעתינו על חוקים וקדושתם ... [SEP]

ובמראינו

Forms:
ו
ב
ה
הראה
את
אנחנו





In [10]:
# Share of verb lemmas that received gold-standard labels, as a percentage
total_verbs = successful_count + unsuccessful_count
if total_verbs:
    success_rate = np.round(successful_count / total_verbs * 100, 2)
    print('Gold-standard tagging success rate for verbs: ' + str(success_rate) + '%')
else:
    # No verb lemma was tallied: avoid a ZeroDivisionError and say so explicitly
    success_rate = float('nan')
    print('Gold-standard tagging success rate for verbs: n/a (no verbs were tallied)')

Gold-standard tagging success rate for verbs: 99.92%
