In [4]:
import stanza
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [5]:
#first run only: uncomment the next line to
#download the Slovene stanza model
#stanza.download('sl')

In [6]:
#build the stanza pipeline for Slovene; the input text is supplied already
#tokenized, hence tokenize_pretokenized=True
nlp = stanza.Pipeline(
    lang='sl',
    tokenize_pretokenized=True,
)

2020-05-24 18:37:11 INFO: Loading these models for language: sl (Slovenian):
| Processor | Package |
-----------------------
| tokenize  | ssj     |
| pos       | ssj     |
| lemma     | ssj     |
| depparse  | ssj     |

2020-05-24 18:37:11 INFO: Use device: cpu
2020-05-24 18:37:11 INFO: Loading: tokenize
2020-05-24 18:37:11 INFO: Loading: pos
2020-05-24 18:37:12 INFO: Loading: lemma
2020-05-24 18:37:13 INFO: Loading: depparse
2020-05-24 18:37:14 INFO: Done loading processors!


In [7]:
#load the pickled document and entity tables
#NOTE: unpickling can execute arbitrary code, so only read pickle files that
#come from a trusted source
document_df = pd.read_pickle("../data/documents.pkl")
entity_df = pd.read_pickle("../data/entities.pkl")

In [8]:
#preprocess the document tokens
#return a lemma, POS tag and dependency head of the token
def get_tagged_doc(index):
    """Run the stanza pipeline on one document and flatten it to token tuples.

    Parameters
    ----------
    index
        Value matched against the ``Document`` column of ``document_df``; the
        ``Text`` entries of all matching rows are handed to ``nlp`` as a list.

    Returns
    -------
    list of tuple
        One ``(lemma, upos, head - 1)`` tuple per word, in the order the
        pipeline yields sentences and words. An empty list is returned when
        the lookup or the pipeline raises an ordinary exception.
    """
    try:
        doc = nlp(list(document_df[document_df["Document"] == index]["Text"]))
    except Exception:
        # Best-effort: a document that cannot be processed yields no words.
        # Only Exception is caught (not a bare except) so that a kernel
        # interrupt / KeyboardInterrupt still stops a long-running loop.
        return []

    # NOTE(review): head - 1 presumably converts stanza's 1-based head index to
    # a 0-based position (making the root -1), and that position is relative
    # to the word's own sentence - confirm this matches the token numbering
    # used by the callers.
    return [
        (word.lemma, word.upos, word.head - 1)
        for sentence in doc.sentences
        for word in sentence.words
    ]

In [1]:
#get the 1st level dependency context of a token
#allow only tokens with useful POS tags
def get_context(words, tokens):
    """Collect the distinct (lemma, POS) pairs of words whose head is in `tokens`.

    Parameters
    ----------
    words
        Sequence of ``(lemma, upos, head)`` tuples.
    tokens
        Iterable of hashable head positions to match against each word's
        third element.

    Returns
    -------
    list of tuple
        Unique ``(lemma, upos)`` pairs for words whose head is in `tokens` and
        whose POS tag is one of ADJ, ADV, NOUN, PROPN or VERB. Pairs are
        returned in order of first occurrence, so the result is identical from
        run to run (a plain ``set`` would order strings by randomised hash).
    """
    allowed_tags = {'ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB'}
    # Set lookup instead of scanning the token list once per word.
    head_ids = set(tokens)

    matches = (
        (word[0], word[1])
        for word in words
        if word[2] in head_ids and word[1] in allowed_tags
    )
    # dict.fromkeys drops duplicates while preserving insertion order.
    return list(dict.fromkeys(matches))

In [10]:
#build the context of each entity from its token positions in its document
#only the most recently parsed document is kept, so consecutive rows that
#belong to the same document share a single pipeline run
nid = None  #no document parsed yet; avoids parsing a hard-coded document up front
words = []
context = []
for row in entity_df.itertuples(index=False):
    if row.Document != nid:
        nid = row.Document
        words = get_tagged_doc(nid)

    context.append(get_context(words, row.Tokens))

In [11]:
#attach the per-entity context lists as a new column; `context` was filled by
#iterating entity_df in row order, so it lines up with the rows positionally
entity_df['Context'] = context

In [12]:
#write the entity table, including the Context column, back to disk as a pickle
pd.to_pickle(entity_df, "../data/entity_context.pkl")

In [13]:
#unpickled_df = pd.read_pickle("../data/entity_context.pkl")
#unpickled_df

Unnamed: 0,Document,Entity,Tokens,Type,Occurrence,Sentiment,Context
0,1,1,"[11, 12, 13, 15, 51, 52, 53, 54, 55, 57, 228, ...",PER,11,4,"[(Iztok, PROPN), (kolega, NOUN), (kmetijski, A..."
1,1,2,[49],LOC,1,3,[]
2,1,3,"[68, 301, 312, 322, 384, 428]",ORG,6,3,"[(EU, PROPN), (Francija, PROPN)]"
3,1,4,"[98, 105, 186, 429]",LOC,4,3,[]
4,1,5,"[70, 303, 310, 382, 436]",ORG,5,3,"[(Slovenija, PROPN), (Jarc, PROPN)]"
...,...,...,...,...,...,...,...
14567,9966,25,[355],LOC,1,3,"[(odpravljati, VERB)]"
14568,9966,26,[390],LOC,1,3,[]
14569,9966,27,[405],LOC,1,3,[]
14570,9966,28,[411],LOC,1,3,[]
