In [1]:
import pandas as pd
import math
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn import metrics

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library diagnostics (e.g. pandas
# chained-assignment or solver convergence warnings) are hidden as well —
# confirm that is intended.
warnings.filterwarnings('ignore')

In [3]:
# Classifiers compared in every experiment below; each one is re-fitted from
# scratch by get_metrics, so the instances can be shared between experiments.
models = [LogisticRegression(), LinearSVC(), MultinomialNB()]

# Shared TF-IDF vectorizer (sub-linear term frequency, L2-normalised rows).
# min_df=1 keeps every term, exactly as the former integer 0 did, but is also
# accepted by recent scikit-learn releases, whose parameter validation rejects
# an integer min_df below 1.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, norm='l2')

def get_metrics(df, label_name, model, test_size=0.2, random_state=None):
    """Train ``model`` on TF-IDF features of ``df['tokens']`` and report its scores.

    The rows are split into a train and a test part, the module-level ``tfidf``
    vectorizer is fitted on the training texts only, and the classifier is
    evaluated on the held-out texts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tokens`` column with the text and a ``label_name``
        column with the targets. The frame is not modified.
    label_name : str
        Name of the target column.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    test_size : float, default 0.2
        Share of the rows held out for evaluation.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``; pass an integer to make the
        split (and therefore the report) reproducible.

    Returns
    -------
    str
        The text produced by ``sklearn.metrics.classification_report``.
    """
    # Build the text features as a new Series instead of writing back into
    # ``df``: callers may pass a slice of another frame, and assigning a column
    # there is a chained assignment. Missing texts become empty documents
    # rather than the literal word "nan".
    features = df['tokens'].fillna('').astype('U')
    labels = df[label_name]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    # Fit the vocabulary on the training texts only so nothing leaks from the
    # test split; the test texts are merely transformed.
    X_train_vec = tfidf.fit_transform(X_train)
    model.fit(X_train_vec, y_train)

    X_test_vec = tfidf.transform(X_test)
    y_pred = model.predict(X_test_vec)

    return metrics.classification_report(y_test, y_pred)

In [4]:
# Load the cleaned dataset. The file carries a .tsv extension, but it is parsed
# with pandas' default comma separator, spelled out here for clarity.
name = 'senticoref_cleaned.tsv'
df = pd.read_csv(name, sep=',')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,tokens,LOC,ORG,PER,sentiment,neutral,positive
0,komisija mora narediti kolega b komisija anali...,0,1,0,3,1,-1
1,današnji srečanje v povedati minister za,1,0,0,3,1,-1
2,po njun beseda in francija podoben podoben usm...,0,1,0,3,1,-1
3,beseda slovenija in podoben stališče glede usm...,0,1,0,3,1,-1
4,v čas predsedovanje prihodnji leto med možen u...,0,1,0,3,1,-1


In [5]:
def _read_word_list(path):
    """Return the words stored one per line in the UTF-8 text file ``path``.

    Surrounding whitespace is stripped and blank lines are skipped, so the
    trailing newline of the file does not produce an empty entry and padded
    entries can still match whitespace-split tokens.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped = (line.strip() for line in handle.read().split("\n"))
        return [word for word in stripped if word]


# Sentiment lexicons used by the lexicon baseline: one list per polarity.
negative_words = _read_word_list("negative_words_Kadunc.txt")
positive_words = _read_word_list("positive_words_Kadunc.txt")

In [6]:
def _count_lexicon_hits(tokens, lexicon):
    """Count the whitespace-separated tokens of ``tokens`` that occur in ``lexicon``.

    Anything that is not a string (a missing value such as NaN or None) counts
    as zero hits instead of raising.
    """
    if not isinstance(tokens, str):
        return 0
    return sum(1 for token in tokens.split() if token in lexicon)


# Sets give constant-time membership tests; the word lists themselves are left
# untouched.
positive_lexicon = set(positive_words)
negative_lexicon = set(negative_words)

# Lexicon baseline: a row is predicted neutral (1) when it contains as many
# positive as negative lexicon words, and non-neutral (0) otherwise.
lexicon_results = []
for tokens in df.tokens.values:
    positive = _count_lexicon_hits(tokens, positive_lexicon)
    negative = _count_lexicon_hits(tokens, negative_lexicon)
    lexicon_results.append(1 if positive == negative else 0)

In [7]:
# Compare the lexicon baseline against the gold 'neutral' column.
neutral_counts = df.neutral.value_counts()
# Share of the most frequent class, i.e. the score of always predicting it.
majority_share = max(neutral_counts) / sum(neutral_counts)
gold_neutral = df.neutral.values

print("Lexicon results")
print("Majority:", majority_share)
print("Accuracy:", accuracy_score(gold_neutral, lexicon_results))
print("F1 score:", f1_score(gold_neutral, lexicon_results))

Lexicon results
Majority: 0.7695204122076893
Accuracy: 0.6427863654379706
F1 score: 0.7588467456017125


In [8]:
# Multiclass classification: predict the 'sentiment' column with every model.
for model in models:
    # The text before the first "(" of the estimator's string form is its name.
    model_name = str(model).partition("(")[0]
    print(model_name)
    report = get_metrics(df, 'sentiment', model)
    print(report)

LogisticRegression
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         5
           2       0.62      0.05      0.10       252
           3       0.77      0.99      0.87      1541
           4       0.50      0.01      0.03       216
           5       0.00      0.00      0.00         5

    accuracy                           0.77      2019
   macro avg       0.38      0.21      0.20      2019
weighted avg       0.72      0.77      0.68      2019

LinearSVC
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         4
           2       0.47      0.28      0.35       233
           3       0.81      0.94      0.87      1550
           4       0.44      0.15      0.23       228
           5       0.00      0.00      0.00         4

    accuracy                           0.77      2019
   macro avg       0.34      0.27      0.29      2019
weighted avg       0.72      0.77      0.73     

In [9]:
# Binary classification: predict the 'neutral' column with every model.
target_column = 'neutral'
for model in models:
    header = str(model).split("(", 1)[0]
    print(header)
    print(get_metrics(df, target_column, model))

LogisticRegression
              precision    recall  f1-score   support

           0       0.48      0.07      0.13       439
           1       0.79      0.98      0.87      1580

    accuracy                           0.78      2019
   macro avg       0.63      0.53      0.50      2019
weighted avg       0.72      0.78      0.71      2019

LinearSVC
              precision    recall  f1-score   support

           0       0.58      0.35      0.44       474
           1       0.82      0.92      0.87      1545

    accuracy                           0.79      2019
   macro avg       0.70      0.64      0.65      2019
weighted avg       0.77      0.79      0.77      2019

MultinomialNB
              precision    recall  f1-score   support

           0       0.67      0.03      0.07       461
           1       0.78      0.99      0.87      1558

    accuracy                           0.78      2019
   macro avg       0.72      0.51      0.47      2019
weighted avg       0.75      0.

In [10]:
# Binary classification for positive and negative entities.
# Rows whose 'positive' value is -1 are excluded. The explicit copy makes the
# subset an independent frame, so code that assigns columns to it (as
# get_metrics does) never writes into a slice of df.
pdf = df.loc[df['positive'] != -1].copy()

for model in models:
    print(str(model).split("(")[0])
    print(get_metrics(pdf, 'positive', model))

LogisticRegression
              precision    recall  f1-score   support

           0       0.73      0.83      0.78       244
           1       0.78      0.66      0.72       222

    accuracy                           0.75       466
   macro avg       0.76      0.75      0.75       466
weighted avg       0.75      0.75      0.75       466

LinearSVC
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       274
           1       0.64      0.68      0.66       192

    accuracy                           0.71       466
   macro avg       0.70      0.71      0.70       466
weighted avg       0.71      0.71      0.71       466

MultinomialNB
              precision    recall  f1-score   support

           0       0.66      0.90      0.76       241
           1       0.83      0.51      0.63       225

    accuracy                           0.71       466
   macro avg       0.75      0.71      0.70       466
weighted avg       0.74      0.