In [1]:
import os
import re
import pandas as pd
import numpy as np

import warnings
# Ignore every warning for the rest of the session. No category or module
# filter is given, so library deprecation notices are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
#path to SentiCoref dataset
folder = "../data/SentiCoref_1.0"
#os.listdir returns directory entries in arbitrary order; sort them so the
#documents are processed (and their rows written out) in the same order on
#every run and every platform
files = sorted(os.listdir(folder))

In [3]:
def get_entity_count(df):
    """Return the sorted, de-duplicated entity numbers found in ``df.number``.

    Every run of digits inside a cell counts as one entity number, so a single
    cell may contribute several numbers. Cells are scanned one at a time:
    gluing them together first would fuse a trailing digit of one cell with a
    leading digit of the next (``"1"`` followed by ``"2"`` read as ``12``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``number`` column.

    Returns
    -------
    list of int
        Unique entity numbers in ascending order; empty when no cell contains
        a digit.
    """
    entities = set()
    for value in df.number.unique():
        #non-string cells (e.g. NaN for a missing value) carry no entity number
        if isinstance(value, str):
            entities.update(int(s) for s in re.findall(r'\d+', value))
    return sorted(entities)

In [4]:
def build_initial_dict(df):
    """Collect per-entity features from one annotated document.

    Rows whose ``number`` cell is ``"_"`` are ignored. For every other row,
    each entity number found in ``number`` gets the row's token index appended
    (second ``-``-separated field of ``token``, minus one), and has its type,
    sentiment and occurrence overwritten with the values read from that row,
    so the last contributing row wins.

    When a row names several entities and a cell holds several
    ``|``-separated parts, the i-th part is paired with the i-th entity
    number; otherwise the whole cell is applied to every entity of the row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``token``, ``entity``, ``sentiment``,
        ``occurrence`` and ``number``.

    Returns
    -------
    dict
        ``{entity_number: [token_indices, entity_type, sentiment, occurrence]}``
        where every entity starts out as ``[[], "_", 0, 0]``.
    """
    #{key: [[tokens], entity_type, sentiment, occurrence]}
    d = dict((key, [[], "_", 0, 0]) for key in get_entity_count(df))

    for index, row in df[df.number != "_"].iterrows():
        #entity numbers, can be a single one or a few
        numbers = [int(s) for s in re.findall(r'\d+', row.number)]
        #token index of the word in document
        token_index = int(row.token.split("-")[1]) - 1

        #split the cells once per row instead of once per entity
        types = row.entity.split("|")
        sentiments = row.sentiment.split("|")
        occurrences = row.occurrence.split("|")
        several_entities = len(numbers) > 1

        for i, key in enumerate(numbers):
            #append token index
            d[key][0].append(token_index)

            #append entity type
            if several_entities and len(types) > 1:
                if i >= len(types):
                    #NOTE(review): this also skips the sentiment and occurrence
                    #of this entity for this row; kept as is so that existing
                    #results do not change - confirm that it is intended
                    continue
                d[key][1] = types[i][:3]
            elif row.entity != "_":
                d[key][1] = row.entity[:3]

            #append sentiment
            if several_entities and len(sentiments) > 1:
                if i >= len(sentiments):
                    #NOTE(review): also skips the occurrence below, see above
                    continue
                d[key][2] = int(sentiments[i][:1])
            elif row.sentiment != "_":
                d[key][2] = int(row.sentiment[:1])

            #append occurrence
            if several_entities and len(occurrences) > 1:
                #same bounds check as for types and sentiments: a row can name
                #more entities than it has occurrence parts
                if i < len(occurrences):
                    d[key][3] = int(occurrences[i].split("-")[-1])
            elif row.sentiment != "_" and row.occurrence != "_":
                #an occurrence is only recorded for rows that carry a
                #sentiment; an empty ("_") occurrence has no number to parse
                d[key][3] = int(row.occurrence.split("-")[-1])

    return d

In [5]:
def create_dataframe_from_dict(d, file_number):
    """Turn one document's entity dictionary into a feature table.

    Parameters
    ----------
    d : dict
        ``{entity: [tokens, type, sentiment, occurrence]}``; one output row is
        produced per key, in the dictionary's iteration order.
    file_number
        Value repeated in the ``Document`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Document``, ``Entity``, ``Tokens``, ``Type``, ``Occurrence``
        and ``Sentiment``.
    """
    entity_ids = list(d)
    features = {
        "Document": [file_number] * len(entity_ids),
        "Entity": entity_ids,
        "Tokens": [d[entity_id][0] for entity_id in entity_ids],
        "Type": [d[entity_id][1] for entity_id in entity_ids],
        #note the order: occurrence is stored at position 3, sentiment at 2
        "Occurrence": [d[entity_id][3] for entity_id in entity_ids],
        "Sentiment": [d[entity_id][2] for entity_id in entity_ids],
    }

    #lay the features out as rows first, then flip so each entity is a row
    by_feature = pd.DataFrame(list(features.values()), index=list(features))
    return by_feature.T

In [6]:
#PROCESSING SENTICOREF DATA
entity_columns = ["Document", "Entity", "Tokens", "Type", "Occurrence", "Sentiment"]
#per-document frames are collected here and concatenated once after the loop;
#DataFrame.append no longer exists in pandas 2 and growing a frame inside the
#loop copies all previous rows on every iteration
entity_frames = []
file_numbers = []
file_texts = []

for file in files:

    #read document, skip the 7 header lines, fill a data frame
    #quoting=3 is csv.QUOTE_NONE: quote characters in the text are kept as data
    document = pd.read_csv(os.path.join(folder, file),
                           sep="\t",
                           skiprows=7,
                           header=None,
                           names=["token", "char", "word", "entity", "sentiment", "occurrence", "number", "NaN"],
                           quoting=3,
                           encoding="utf-8")

    #save file number (the file name up to its first dot) and text
    file_number = int(file.split(".")[0])
    file_numbers.append(file_number)
    file_texts.append(list(document.word))

    #create a dataframe from a document and keep it for the corpus dataframe
    d = build_initial_dict(document)
    df = create_dataframe_from_dict(d, file_number)
    entity_frames.append(df)

#build the corpus dataframe with a fresh 0..n-1 index and save it to pickle
if entity_frames:
    entity_df = pd.concat(entity_frames, ignore_index=True)
else:
    #no input files: keep an empty frame that still has the expected columns
    entity_df = pd.DataFrame(columns=entity_columns)
entity_df.to_pickle("../data/entities.pkl")

In [7]:
#unpickled_df = pd.read_pickle("../data/entities.pkl")
#unpickled_df

Unnamed: 0,Document,Entity,Tokens,Type,Occurrence,Sentiment
0,1,1,"[11, 12, 13, 15, 51, 52, 53, 54, 55, 57, 228, ...",PER,11,4
1,1,2,[49],LOC,1,3
2,1,3,"[68, 301, 312, 322, 384, 428]",ORG,6,3
3,1,4,"[98, 105, 186, 429]",LOC,4,3
4,1,5,"[70, 303, 310, 382, 436]",ORG,5,3
...,...,...,...,...,...,...
14567,9966,25,[355],LOC,1,3
14568,9966,26,[390],LOC,1,3
14569,9966,27,[405],LOC,1,3
14570,9966,28,[411],LOC,1,3


In [8]:
#PROCESSING SENTINEWS DATA
#read the tab-separated document-level SentiNews table and keep only the rows
#whose nid is one of the SentiCoref file numbers
sentinews_df = (
    pd.read_csv("../data/SentiNews_document-level.txt", sep="\t")
    .loc[lambda frame: frame['nid'].isin(file_numbers)]
)

In [9]:
#create dataframe of documents with sentinews data
ids = list(sentinews_df.nid)
#second dot-separated part of the url
sources = [url.split(".")[1] for url in sentinews_df.main_url]
#length of the raw content string, in characters
lengths = [len(text) for text in sentinews_df.content]
#part of the date before the first "-"
years = [date.split("-")[0] for date in sentinews_df.date]
avg_sentiments = list(sentinews_df.avg_sentiment)
sd_sentiments = list(sentinews_df.sd_sentiment)
sentiments = list(sentinews_df.sentiment)
#pair every sentinews row with the text of the file that has the same number;
#looking the text up by id keeps rows and texts aligned even when
#sentinews_df is not sorted by nid or does not cover every file
text_by_id = dict(zip(file_numbers, file_texts))
texts = [text_by_id[nid] for nid in ids]

document_df = pd.DataFrame([ids, texts, lengths, sources, avg_sentiments, sd_sentiments, sentiments],
                        index=["Document", "Text", "Length", "Source", "Avg_sentiment", "Sd_sentiment", "Sentiment"]).T
document_df.to_pickle("../data/documents.pkl")

In [10]:
#unpickled_df = pd.read_pickle("../data/documents.pkl")
#unpickled_df

Unnamed: 0,Document,Text,Length,Source,Avg_sentiment,Sd_sentiment,Sentiment
0,1,"[Evropska, komisija, mora, narediti, analizo, ...",2939,24ur,3.5,0.707,neutral
1,20,"[Pojavljajo, se, namigi, ,, da, naj, bi, Deuts...",2191,24ur,3,0,neutral
2,32,"[Predstavniki, ruskega, plinskega, giganta, Ga...",2149,24ur,2.5,0.707,neutral
3,42,"[Slovenija, ima, z, 3,6, -, odstotno, stopnjo,...",1977,24ur,2.5,0.707,neutral
4,44,"[Nadzorniki, Darsa, so, razpravljali, o, izgra...",3026,24ur,2.5,0.707,neutral
...,...,...,...,...,...,...,...
832,10323,"[Čeprav, je, Gašpar, Gašpar, Mišič, član, PS, ...",3005,zurnal24,3,0,neutral
833,10369,"[SD, največ, podpore, ,, sledi, SDS, ., Desus,...",2304,zurnal24,3,0,neutral
834,10395,"[Direktor, ZD, Velenje, Jože, Zupančič, je, za...",3381,zurnal24,3,0,neutral
835,10405,"[To, je, razlog, za, pesimizem, pa, tudi, spod...",2285,zurnal24,3,0,neutral
