## Benchmark: Sentiment Analysis

In [2]:
from sentimental_hwglu.sa_naive import NaiveSA
from sentimental_hwglu.utils import loadIMDBdataset, tokenizer_porter
from sentimental_hwglu.words_statistics import WordStatistics
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the IMDB review dataset through the project helper; later cells read
# its `reviews` and `sentiment` columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = loadIMDBdataset(filename='/data/datasets/aclImdb_vclean.csv')

In [4]:
def test_naive(n_preview=5, n_tests=500):
    """Smoke-test the naive sentiment analyser against the loaded dataset.

    Prints the first ``n_preview`` reviews next to the predicted and the
    recorded sentiment, then prints the score computed on the first
    ``n_tests`` rows.

    Parameters
    ----------
    n_preview : int
        Number of leading reviews to print with their prediction.
        Clamped to the number of rows available in ``df``.
    n_tests : int
        Number of leading rows passed to ``NaiveSA.score``.
    """
    naive_sa = NaiveSA(verbose=False)
    # Clamp so a short frame does not fail on a missing row.
    # presumably `df` has a default 0..N-1 index, so `df.reviews[k]` is the k-th row — verify
    for k in range(min(n_preview, len(df))):
        print(df.reviews[k])
        print(naive_sa.predict(df.reviews[k]), "<=>", df.sentiment[k])
    print("Score of Naive SA", naive_sa.score(df.reviews[:n_tests], df.sentiment[:n_tests]))

In [5]:
import pandas as pd
def dummy():
    """Build and return a two-row toy frame with the dataset's column layout.

    The frame is bound to a distinct local name: assigning to ``df`` inside
    the function makes ``df`` local, so reading ``df.reviews`` before that
    assignment raises ``UnboundLocalError`` on every call.

    Returns
    -------
    pandas.DataFrame
        Columns ``reviews`` (str) and ``sentiment`` (int), two rows.
    """
    toy_df = pd.DataFrame({"reviews": ["Hello Hello This is good", "Hello this not good"], "sentiment": [0, 1]})
    return toy_df

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

class WordStatistics:
    """Term-frequency statistics over a collection of documents.

    A vectorizer class (anything constructible as ``vectorizer(tokenizer=...)``
    that offers ``fit_transform`` and ``get_feature_names_out``) turns the
    documents into a document-term matrix; the per-term column sums are kept
    so the most frequent terms can be queried or plotted.
    """
    # NOTE(review): the notebook also imports a `WordStatistics` from
    # `sentimental_hwglu.words_statistics`; this definition shadows that import.

    def __init__(self) -> None:
        self._vect = None  # vectorizer instance created by the last fit
        self._bag = None   # dense document-term matrix (result of .toarray())
        self._agg = None   # column sums of _bag: one total per term

    def _fit(self, data, vectorizer, tokenizer=None):
        """Vectorize ``data`` and refresh ``_vect``, ``_bag`` and ``_agg``."""
        self._vect = vectorizer(tokenizer=tokenizer)
        # NOTE(review): the matrix is densified here; with a large vocabulary
        # this can be memory-heavy.
        self._bag = self._vect.fit_transform(data).toarray()
        self._agg = self._bag.sum(axis=0)

    def words2Vect(self, X, vectorizer, tokenizer = None):
        """Fit the statistics on every document in ``X``."""
        self._fit(X, vectorizer, tokenizer)

    def _reshape(self, Z):
        """Reshape ``Z`` to length ``Z.shape[1]`` and transpose the result.

        The reshape only succeeds when ``Z`` holds exactly ``Z.shape[1]``
        elements. Not called by the other methods of this class.
        """
        return np.transpose(Z.reshape(Z.shape[1]))

    def words2VectFiltered(self, X, y, vectorizer, tokenizer = None, filter=None):
        """Fit the statistics on the documents of ``X`` selected through ``y``.

        Parameters
        ----------
        X : documents; must support ``X.loc[mask]`` when ``filter`` is given.
        y : labels aligned with ``X``; must support ``y.apply(filter)``.
        vectorizer : vectorizer class, instantiated as ``vectorizer(tokenizer=...)``.
        tokenizer : forwarded to the vectorizer constructor.
        filter : predicate applied to each label; rows where it is truthy are
            kept. ``None`` keeps every row (``y`` is then unused).
        """
        data = X.loc[y.apply(filter)] if filter is not None else X
        self._fit(data, vectorizer, tokenizer)

    def hist(self, sort=False):
        """Plot the per-term totals, optionally sorted ascending."""
        values = np.sort(self._agg) if sort else self._agg
        plt.plot(values)
        plt.show()

    def top(self, n=10):
        """Return the ``n`` most frequent terms, most frequent first.

        ``n=None`` or a negative ``n`` returns every term. A plain ``[:n]``
        slice with ``n=-1`` would drop the least frequent term, which is not
        what callers asking for "all" expect.
        """
        order = np.argsort(-self._agg)
        if n is not None and n >= 0:
            order = order[:n]
        names = self._vect.get_feature_names_out()
        return [names[k] for k in order]

    def getWords(self):
        """Return the vectorizer's feature names (the vocabulary terms)."""
        return self._vect.get_feature_names_out()

    def getAggregated(self):
        """Return the per-term totals computed by the last fit."""
        return self._agg


In [7]:
def dummy(n=100):
    """Plot term totals for label-1 reviews and return their top ten terms.

    Fits a ``WordStatistics`` on the rows among the first ``n`` of ``df``
    whose sentiment equals 1, shows the unsorted totals plot, and returns
    the ten most frequent terms.

    NOTE(review): an earlier cell also defines ``dummy``; this definition
    replaces it once the cell runs.

    Parameters
    ----------
    n : int
        Number of leading rows of ``df`` to consider.

    Returns
    -------
    list
        The ten most frequent terms, most frequent first.
    """
    sw = WordStatistics()
    X, y = df.reviews[0:n], df.sentiment[0:n]
    sw.words2VectFiltered(X, y, vectorizer=CountVectorizer, tokenizer=tokenizer_porter, filter=lambda x : x == 1)
    # hist() only draws and returns nothing, so there is no value to keep.
    sw.hist(False)
    return sw.top(n=10)

In [8]:
# Build one vocabulary and one term-total table per sentiment label (0, then 1),
# then split the vocabularies into label-exclusive and shared words.
# None selects every row; a -1 upper bound in `[0:n]` would exclude the last review.
n = None
sets = []
vocabularies = []
aggregations = []
# The loop variable is not called `filter`, so the builtin stays usable in later cells.
for k, label_filter in enumerate([lambda x : x == 0, lambda x : x == 1]):
    print("Processing ", k)
    sw = WordStatistics()
    X, y = df.reviews[0:n], df.sentiment[0:n]
    sw.words2VectFiltered(X, y, vectorizer=CountVectorizer, tokenizer=tokenizer_porter, filter=label_filter)
    # Take the complete vocabulary; `top(n=-1)` slices with [:-1] and loses one word.
    sets.append(set(sw.getWords()))
    vocabularies.append(sw._vect.vocabulary_)
    aggregations.append(sw.getAggregated())

set0 = sets[0].difference(sets[1])         # words seen only with label 0
set1 = sets[1].difference(sets[0])         # words seen only with label 1
set_common = sets[0].intersection(sets[1])  # words seen with both labels

print("set 0 - ", len(set0), ": ", set0)
print("set 1 - ", len(set1), ": ", set1)
print("set common - ", len(set_common), ": ", set_common)

Processing  0
Processing  1
set 0 -  19360 :  {'tosa', 'mombi', 'allway', 'pluh', 'bluescreen', 'ipswitch', 'baisez', 'bronwyn', 'refrenc', 'skunker', 'horrorvis', 'hrolfgar', 'dianet', 'cartland', 'gauteng', 'holliston', 'polnareff', 'apocado', '_plan', 'galerian', 'nastassja', 'saishu', 'hellspawn', 'upanishad', 'baghban', 'teigh', '362', 'housecoat', 'bekker', 'encyclopidi', 'completeist', 'incredu', 'teenkil', 'dsd', 'conceptionless', 'bezac', 'grito', 'mehbooba', 'dagher', 'moneyshot', 'doggish', 'malplacé', 'geneticel', 'schirippa', 'conpirici', 'swip', 'maby', 'palomo', 'meathook', 'ghostlik', 'widerberg', 'spm', '_absolut', 'culpa', 'pollen', 'kroona', 'coriveau', 'skagg', 'drivelesqu', 'nakano', 'dennehey', 'lithuanian', 'fusco', 'morgia', 'mehster', 'anesthesiologist', 'caio', 'horrormoviejourn', 'whhhyyyy', 'masterb', 'letisha', 'disaster', 'ballew', 'garroway', 'szalinski', 'bizzarr', 'mam', 'unprovock', 'trickiest', 'fleed', 'rigoberta', 'fuad', 'fertilis', 'dummest', 'vid

In [9]:
def fr_from_set(set_, index):
    """Return the distinct term totals of the words in ``set_``, largest first.

    Each word is looked up in ``vocabularies[index]`` to get its column, and
    that column is read from ``aggregations[index]``. Duplicate totals are
    collapsed before sorting in descending order.
    """
    distinct_totals = {
        aggregations[index][vocabularies[index][word]] for word in set_
    }
    return sorted(distinct_totals, reverse=True)
# frequencies_list = [fr_from_set(set0, 0), fr_from_set(set1, 1)]

In [10]:
# For each label-exclusive word set, record every word's total in a dict and
# write the words whose total exceeds 30 to the log file.
freq_maps_list = []
with open('/data/zibaldone/projects/ai/betchelorZhanna/python/logs/sets_diff.txt', 'w+') as f:
    sections = (
        ("--------------\nset 0\n--------------", set0, 0),
        ("--------------\nset 1\n--------------", set1, 1),
    )
    for banner, exclusive_words, idx in sections:
        freq_map = {}
        print(banner, file=f)
        for k in exclusive_words:
            aggregates = aggregations[idx][vocabularies[idx][k]]
            # Only frequent words are logged; all of them go into the map.
            if aggregates > 30:
                print(k, ": ", aggregates, file=f)
            freq_map[k] = aggregates
        freq_maps_list.append(freq_map)

In [18]:
class StatDiff:
    """Usage counts of one word under each of the two sentiment labels."""

    def __init__(self, word, use0, use1) -> None:
        # word: the term; use0 / use1: its totals for label 0 and label 1.
        self.word, self.use0, self.use1 = word, use0, use1

# Group the words shared by both labels by (label-1 total - label-0 total),
# keeping a printable line and a StatDiff record per word, then log the
# groups from the largest difference to the smallest.
print("--------------\nset 0 <=> set 1\n--------------")
diff_use = {}
diff_use_stats = {}
for k in set_common:
    use_1 = aggregations[1][vocabularies[1][k]]
    use_0 = aggregations[0][vocabularies[0][k]]
    total = use_1 + use_0
    s = k +  ": " + str(use_1) + " ({:.3f})".format(use_1 / total) + " <--> " + str(use_0) + " ({:.3f})".format(use_0 / total)
    delta = use_1 - use_0
    # setdefault creates the list on first sight of a difference; a bare
    # `except:` used for that purpose would also hide unrelated errors.
    diff_use.setdefault(delta, []).append(s)
    diff_use_stats.setdefault(delta, []).append(StatDiff(k, use_0, use_1))

a = sorted(diff_use, reverse=True)
with open('/data/zibaldone/projects/ai/betchelorZhanna/python/logs/diff_words.txt', 'w+') as f:
    print("--------------\nset 0 <=> set 1\n--------------", file=f)
    for k in a:
        print(k, ": ", file=f)
        for u in diff_use[k]:
            print("  => ", u, file=f)

--------------
set 0 <=> set 1
--------------


In [33]:
# Pick the shared words that one label accounts for at least 80% of the time
# (and more than 100 occurrences under that label); echo them, store their
# share in the matching dict and append them to the matching log file.
a = sorted(diff_use_stats.keys())
list_positive_words = {}
list_negtive_words = {}


def positive(z):
    return z.use1


def negative(z):
    return z.use0


# NOTE(review): the second file name is spelled 'nagative'; kept as is because
# other code may read the file under that name.
with open('/data/zibaldone/projects/ai/betchelorZhanna/python/logs/list_positive_words.txt', 'w+') as fpos, \
        open('/data/zibaldone/projects/ai/betchelorZhanna/python/logs/list_nagative_words.txt', 'w+') as fneg:
    targets = (
        ("NEGATIVE", negative, list_negtive_words, fneg),
        ("POSITIVE", positive, list_positive_words, fpos),
    )
    for name, func, selected, sink in targets:
        for k in a:
            for u in diff_use_stats[k]:
                value = func(u)
                perc = value / (u.use1 + u.use0)
                if not (perc >= 0.80 and value > 100):
                    continue
                # NOTE(review): `perc` is a fraction (e.g. 0.916) yet is printed
                # with a '%' sign; format kept unchanged — confirm intent.
                print("[{}]".format(name), u.word, ": ", value, ' -> {:.3f}%'.format(perc))
                selected[u.word] = perc
                print(u.word, ": {:.3f}%".format(perc), file=sink)

[NEGATIVE] worst :  4888  -> 0.916%
[NEGATIVE] wast :  4008  -> 0.914%
[NEGATIVE] aw :  3352  -> 0.902%
[NEGATIVE] terribl :  3271  -> 0.848%
[NEGATIVE] stupid :  3289  -> 0.838%
[NEGATIVE] horribl :  2545  -> 0.856%
[NEGATIVE] wors :  2479  -> 0.843%
[NEGATIVE] crap :  1803  -> 0.855%
[NEGATIVE] poorli :  1258  -> 0.904%
[NEGATIVE] lame :  1259  -> 0.883%
[NEGATIVE] dull :  1364  -> 0.828%
[NEGATIVE] suck :  1311  -> 0.823%
[NEGATIVE] mess :  1241  -> 0.805%
[NEGATIVE] badli :  1073  -> 0.836%
[NEGATIVE] laughabl :  916  -> 0.903%
[NEGATIVE] pointless :  899  -> 0.887%
[NEGATIVE] pathet :  917  -> 0.873%
[NEGATIVE] garbag :  824  -> 0.859%
[NEGATIVE] embarrass :  832  -> 0.838%
[NEGATIVE] redeem :  751  -> 0.853%
[NEGATIVE] insult :  702  -> 0.824%
[NEGATIVE] whatsoev :  565  -> 0.848%
[NEGATIVE] wooden :  545  -> 0.833%
[NEGATIVE] unfunni :  467  -> 0.923%
[NEGATIVE] seagal :  431  -> 0.921%
[NEGATIVE] crappi :  468  -> 0.851%
[NEGATIVE] rubbish :  461  -> 0.851%
[NEGATIVE] amateuris