In [1]:
import os
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import lemmagen.lemmatizer
from lemmagen.lemmatizer import Lemmatizer

# Single shared lemmatizer, built with lemmagen's Slovene dictionary constant;
# the lemmatization cell further down calls lemmatizer.lemmatize on every token.
lemmatizer = Lemmatizer(dictionary=lemmagen.DICTIONARY_SLOVENE)

In [2]:
def get_neighborhood(index, distance, words=None):
    """Return the words surrounding position ``index``, excluding the word itself.

    Up to ``distance`` words before and up to ``distance`` words after the
    position are collected; the window is truncated at either end of the
    sequence.

    Parameters
    ----------
    index : int
        Position of the central word.
    distance : int
        Maximum number of words taken on each side.
    words : sequence, optional
        Sequence to slice. When omitted, the ``word`` column of the
        module-level ``df`` is used, which is how the loading cell calls it.

    Returns
    -------
    list
        Preceding words followed by following words, in document order.
    """
    if words is None:
        words = df.word.values

    # Clamp the left edge: a negative start would be interpreted as an offset
    # from the end of the sequence and silently drop the preceding context
    # for any word closer than `distance` to the beginning.
    start = max(index - distance, 0)
    before = words[start:index]
    after = words[index + 1:index + distance + 1]

    return list(before) + list(after)

In [3]:
# Read every annotation file in `folder`, collect one record per annotated
# entity (context tokens around each mention plus its sentiment), and leave
# the combined records in `df`.
folder = "SentiCoref_1.0"
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the resulting table reproducible between runs and machines.
files = sorted(os.listdir(folder))
neighborhood_size = 3  # words of context taken on each side of a mention
combined_entities = []
for file in files:
    # NOTE: `df` must keep this name inside the loop, get_neighborhood reads
    # the words of the file currently being processed from the global `df`.
    df = pd.read_csv(os.path.join(folder, file),
                     sep="\t",
                     skiprows=7,
                     header=None,
                     names=["token", "char", "word", "entity", "sentiment", "occurrence", "number", "NaN"],
                     quoting=3,
                     encoding="utf-8")

    # Extract the columns once per file instead of once per row.
    entity_col = df["entity"].values
    sentiment_col = df["sentiment"].values
    occurrence_col = df["occurrence"].values
    number_col = df["number"].values

    entities = []        # records of this file, in order of first mention
    entity_by_name = {}  # name -> record, avoids a linear search per mention

    for i in range(df.shape[0]):
        # A mention starts where the occurrence value differs from the
        # previous row. Row 0 has no previous row; comparing against
        # index -1 would wrap around to the last row of the file.
        starts_mention = i == 0 or occurrence_col[i] != occurrence_col[i - 1]
        #check if word is an entity
        if entity_col[i] != "_" and starts_mention:
            number = number_col[i]
            # entity id = file name stem + the text between [ ] in the number column
            name = file.split(".")[0] + "_" + number[number.find("[")+1:number.find("]")]
            entity = entity_col[i][:3]
            neighborhood = get_neighborhood(i, neighborhood_size)

            record = entity_by_name.get(name)
            if record is None:
                #create new entity entry; -1 marks "no sentiment seen yet"
                record = {"name": name,
                          "entity": entity,
                          "tokens": neighborhood,
                          "sentiment": -1}
                entity_by_name[name] = record
                entities.append(record)
            else:
                #entity already known: add tokens of this mention
                record["tokens"] += neighborhood

            #add sentiment to entity (first character of the annotation)
            if sentiment_col[i] != "_":
                record["sentiment"] = int(sentiment_col[i][0])

    combined_entities += entities

# Build the table once, after all files are processed. Passing the columns
# explicitly keeps the expected schema even when no entity was found.
df = pd.DataFrame(combined_entities, columns=["name", "entity", "tokens", "sentiment"])

In [4]:
#lemmatization, punctuation removal, lowercase
# Each row's token list becomes one space-separated string of lowercased
# lemmas; tokens that are not purely alphanumeric (punctuation etc.) are dropped.
# The column is assigned in one step: writing through `df.tokens[i] = ...` is
# chained assignment, which pandas may apply to a temporary copy instead of df.
df["tokens"] = [
    " ".join(
        lemmatizer.lemmatize(token).lower()
        for token in tokens
        # non-string tokens (e.g. missing values) are skipped instead of
        # raising inside str.isalnum
        if isinstance(token, str) and token.isalnum()
    )
    for tokens in df["tokens"]
]

In [5]:
# Keep only rows whose sentiment is not -1. The original author attributes
# the -1 values to a bug, so those rows are discarded here.
df = df[df['sentiment'].ne(-1)]

In [6]:
# Derive one record per entity with:
#   - one-hot entity type flags (LOC / ORG / PER),
#   - neutral:  1 when sentiment == 3, otherwise 0,
#   - positive: 1 when sentiment > 3, 0 when sentiment < 3, -1 for neutral rows.
binary_list = []
for tokens, entity, sentiment in zip(df.tokens.values, df.entity.values, df.sentiment.values):
    neutral = 1 if sentiment == 3 else 0
    positive = 1 if sentiment > 3 else (0 if sentiment < 3 else -1)

    data = {
        "tokens": tokens,
        "LOC": 0,
        "ORG": 0,
        "PER": 0,
        "sentiment": sentiment,
        "neutral": neutral,
        "positive": positive,
    }
    # switch on the flag matching this entity's type
    data[entity] = 1
    binary_list.append(data)

In [7]:
# Rebind df to a new frame built from the records in binary_list; from this
# point on df no longer refers to the per-entity frame used to build them.
df = pd.DataFrame(binary_list)

In [8]:
# Write the final table to disk without the index column.
# NOTE(review): the file name ends in .tsv, but to_csv is called without
# `sep`, so the content is comma-separated (pandas' default). Readers must
# use the default separator too, as the read-back below does. Confirm whether
# the extension or the separator is the intended one before changing either.
name = 'senticoref_cleaned.tsv'
df.to_csv(name, index=False)
# Sanity check: read the file back and display the first 10 rows.
aa = pd.read_csv(name)
aa.head(10)

Unnamed: 0,tokens,LOC,ORG,PER,sentiment,neutral,positive
0,komisija mora narediti kolega b komisija anali...,0,1,0,3,1,-1
1,današnji srečanje v povedati minister za,1,0,0,3,1,-1
2,po njun beseda in francija podoben podoben usm...,0,1,0,3,1,-1
3,beseda slovenija in podoben stališče glede usm...,0,1,0,3,1,-1
4,v čas predsedovanje prihodnji leto med možen u...,0,1,0,3,1,-1
5,ukrep eu jesti omeniti obnova zemlja dogajanje...,0,0,1,4,0,1
6,raven tuditi mena da slovenija in francija jes...,0,0,1,4,0,1
7,stelzero jesti že sin france rosti stelzero že...,0,0,1,3,1,-1
8,analiza dnk roda naj biti jesti domneven sin r...,0,0,1,3,1,-1
9,on oddati v p poročanje soden medicina v n pa,1,0,0,3,1,-1
