In [1]:
import numpy as np

In [16]:
import pandas as pd

In [32]:
import nltk

In [30]:
import re

In [3]:
import os

In [41]:
from sklearn.metrics import accuracy_score

In [26]:
class TokenizedDataFrame():
    """Tokenise, stop-word-filter and stem one text column of a DataFrame.

    Stemming relies on two module-level names, ``stemmer`` and ``parser``,
    which must be defined before ``apply_stemming`` or ``transform`` is
    called. ``prepare_text`` relies on the ``re`` and ``nltk`` modules.

    Attributes (declared through ``__slots__``):
        stopwords: container of tokens that ``prepare_text`` drops.
        stemmed_dict: ``None`` until stemming has run; afterwards a dict
            mapping every token seen during stemming to its stem, or to
            itself when no stem is available.
    """
    __slots__ = ['stopwords', 'stemmed_dict']

    def __init__(self, stopwords):
        """Store the stop words; the stem dictionary is built lazily."""
        self.stopwords = stopwords
        self.stemmed_dict = None

    def prepare_text(self, doc):
        """Turn a raw string into a list of lower-cased, non-stop-word tokens.

        Every character that is neither a word character nor whitespace is
        removed before lower-casing and tokenising.
        """
        doc = re.sub(r'[^\w\s]', '', doc)
        doc = doc.lower()
        tokens = nltk.word_tokenize(doc)
        return [word for word in tokens if word not in self.stopwords]

    def apply_stemming(self, df):
        """Build ``self.stemmed_dict`` from an iterable of token lists.

        Args:
            df: iterable whose items are token lists (``transform`` passes
                the already tokenised column).

        Every distinct token ends up as a key. Tokens for which the stemmer
        yields no stem, or which it does not return at all, map to
        themselves.
        """
        unique_words = list({word for doc in df for word in doc})

        self.stemmed_dict = dict()
        if not unique_words:
            # Nothing to stem; skip the stemmer call entirely.
            return

        # NOTE(review): each result item is indexed as
        # (original_word, iterable_of_stems) -- presumably the pyMorfologik
        # ListParser output format; confirm against the library.
        for item in stemmer.stem(unique_words, parser):
            original = item[0]
            try:
                stemmed = list(item[1])[0]
            except IndexError:
                # No stem available: keep the word itself. The previous code
                # reused `stemmed` here, which was either unbound or still
                # held the stem of the previously processed word.
                stemmed = original
            self.stemmed_dict[original] = stemmed

        # Words the stemmer did not return at all also map to themselves.
        for word in unique_words:
            self.stemmed_dict.setdefault(word, word)

    @staticmethod
    def remove_empty(df, col):
        """Return the rows of ``df`` whose ``col`` value has non-zero length."""
        return df.loc[df[col].apply(len) > 0, :]

    def transform(self, X, col, **kwargs):
        """Return a tokenised and stemmed copy of ``X``.

        Args:
            X: DataFrame holding the raw text; it is not modified.
            col: name of the text column to process.
            **kwargs: accepted for interface compatibility and ignored.

        Returns:
            A new DataFrame in which ``col`` holds lists of stemmed tokens.
            Rows left without any token are dropped and the index is reset
            to 0..n-1.

        Raises:
            KeyError: if ``col`` is not a column of ``X``.
        """
        X_local = X.copy()
        try:
            raw_column = X_local[col]
        except KeyError:
            raise KeyError("{} not present in dataframe".format(col))

        X_local[col] = raw_column.apply(self.prepare_text)
        # Resetting the index right after filtering yields a fresh frame, so
        # the column assignment below does not write into a slice.
        X_local = self.remove_empty(X_local, col).reset_index(drop=True)

        if not self.stemmed_dict:
            self.apply_stemming(X_local[col])

        stems = self.stemmed_dict
        # Fall back to the token itself when a cached dictionary has not seen
        # it; a bare .get would put None into the token list.
        X_local[col] = X_local[col].apply(
            lambda doc: [stems.get(word, word) for word in doc]
        )

        return X_local

In [27]:
# Location of the stop-word CSV, relative to the notebook's working directory.
STOPWORDS_PATH = 'data/polish_stopwords.csv'

In [34]:
from pyMorfologik import Morfologik
from pyMorfologik.parsing import ListParser

# Shared module-level instances: TokenizedDataFrame.apply_stemming looks up
# `stemmer` and `parser` as globals, so this cell has to run before any
# transform() / apply_stemming() call.
parser = ListParser()
stemmer = Morfologik()


In [28]:
# The CSV is read without a header row; only its first column is kept, as a
# plain Python list.
stopwords = list(pd.read_csv(STOPWORDS_PATH, engine='python', header=None).iloc[:, 0])

In [None]:
# Challenge set, semicolon-separated. Later cells read its 'token' column
# (passed to the tokeniser) and its 'sentiment' column (used as ground truth).
df = pd.read_csv('data/challenge_set_dlanas.csv', sep=';')

In [36]:
# Tokenise, stop-word-filter and stem the 'token' column. transform() drops
# rows that end up with no tokens and resets the index, so scoring_set can
# have fewer rows than df.
scoring_set = TokenizedDataFrame(stopwords).transform(df, 'token')

In [39]:
# Ground-truth labels, taken from the filtered frame rather than from df so
# they match the rows that survived tokenisation.
y_true =  scoring_set['sentiment']

In [42]:
def get_accuracy_score(pred, y_test):
    """Return the accuracy of ``pred`` measured against ``y_test``.

    Note the parameter order: predictions come first and the reference
    labels second, the reverse of ``accuracy_score``'s own signature.
    """
    return accuracy_score(y_true=y_test, y_pred=pred)

In [2]:
# evaluate

In [43]:
# Every entry of the 'wyniki' directory is later loaded as a predictions
# file. os.listdir returns the names in arbitrary order.
files = os.listdir('wyniki')

In [44]:
# Score every predictions file against the ground truth.
# names[i] is the file whose accuracy is results[i].
names = []
results = []
for file in files:
    pred = np.load(f"wyniki/{file}")
    # get_accuracy_score takes (pred, y_test) in that order. The arguments
    # used to be passed the other way round, which only went unnoticed
    # because accuracy is symmetric in its two inputs.
    results.append(get_accuracy_score(pred, y_true))
    names.append(file)
    

In [45]:
# Positions of `results` ordered from the highest score to the lowest.
order = np.flip(np.argsort(results))
# File names rearranged into that ranking.
final = np.asarray(names)[order]
for idx, name in enumerate(final):
    print(f"#{idx}: {name}")

#0: rafal_test_lstm.npy
#1: rafal_test.npy
