## SentimentAnalysisModel

In this notebook we use the `sentimental_hwglu.sa_model.SentimentAnalysisModel` class to train different models in a standardize method.

In [1]:
%load_ext autoreload
%autoreload 2

In [14]:
# imports
import time

import sentimental_hwglu.sa_model as sam
import sentimental_hwglu.utils as util_sa
from sklearn.model_selection import train_test_split

from sentimental_hwglu.utils import Project


In [15]:
data_directory = "/home/zhanna/bachelorarbeit/zbb/data"

In [16]:
if data_directory is None:
    print("Project directory: ")
    data_directory = input()

In [17]:
project = Project(data_directory)
df_imdb = util_sa.loadIMDBdataset(filename=project.csv_filename_extened)
# df_imdb = df_imdb[:2000]
split_precentage_tests=0.25

In [18]:
df_imdb

Unnamed: 0,index,reviews,sentiment,emoticons,reviews_no_punctuation,length,words,sentences,positive_emoticons,negative_emoticons,stamm,stamm_no_punctuation,stamm_length,stamm_words,stamm_sentences
0,0,andreas arrives in a strange city . he doesn't...,1,,andreas arrives in a strange city he doesn't r...,1439,288,16,0,0,andrea arriv strang citi . rememb came got . o...,andrea arriv strang citi rememb came got order...,851,162,14
1,1,"carrot top's "" chairman of the board "" and his...",0,,carrot top's chairman of the board and his at&...,931,159,5,0,0,"carrot top' "" chairman board "" hi at&t commerc...",carrot top chairman board hi at&t commerci liv...,639,111,5
2,2,"the bad out takes from "" reign of fire "" strun...",0,,the bad out takes from reign of fire strung to...,187,42,3,0,0,"bad take "" reign fire "" strung togeth , withou...",bad take reign fire strung togeth without ani ...,131,29,3
3,3,i saw the the bourne ultimatum last summer wit...,1,,i saw the the bourne ultimatum last summer wit...,866,190,9,0,0,"saw bourn ultimatum last summer friend , , wow...",saw bourn ultimatum last summer friend wow alr...,557,120,8
4,4,this is possibly the worst film i've ever seen...,0,,this is possibly the worst film i've ever seen...,3018,602,26,0,0,thi possibl worst film i'v ever seen . fact ha...,thi possibl worst film i'v ever seen fact ha f...,1852,351,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,"today , being president's day , my wife and i ...",0,,today being president's day my wife and i had ...,2407,506,21,0,0,"today , president' day , wife "" notebook "" dvd...",today presidentday wife notebook dvd check loc...,1506,318,17
49996,49996,this is one of the worst movies i have ever se...,0,,this is one of the worst movies i have ever se...,204,38,3,0,0,thi one worst movi ever seen ! saw toronto fil...,thi one worst movi ever seen saw toronto film ...,135,25,3
49997,49997,this is exactly the type of film that frustrat...,0,,this is exactly the type of film that frustrat...,1278,265,14,0,0,"thi exactli type film frustrat . great cast , ...",thi exactli type film frustrat great cast grea...,807,163,11
49998,49998,""" algie , the miner "" is one bad and unfunny ...",0,,algie the miner is one bad and unfunny silent...,663,132,9,0,0,""" algi , miner "" one bad unfunni silent comedi...",algi miner one bad unfunni silent comedi time...,448,88,8


In [19]:
from sentimental_hwglu.sa_naive import NaiveSA
from sentimental_hwglu.sa_afinn import AFinnPipeline
from sentimental_hwglu.sa_afinn import VaderPipeline
from sentimental_hwglu.sa_logistic_regression import LogisticRegressionTfid
from sentimental_hwglu.sa_rnn import SA_LSTM_Pipeline

Let's load the data, split it into train and test and feed the model!

In [8]:
results = {}

In [None]:

models = [
    NaiveSA(verbose=True, weigth_added_words=0.0),
    AFinnPipeline(),
    VaderPipeline(),
    LogisticRegressionTfid(),
    SA_LSTM_Pipeline(max_words=1000, max_length=None, epochs=5),
]
X_train, X_test, y_train, y_test = train_test_split(df_imdb.reviews, df_imdb.sentiment, test_size=split_precentage_tests, random_state=42)
for model in models:
    print(" Running pipeline: " + str(model))
    st = time.time()
    r = sam.run_sentimental_analysis_pipeline(model, X_train, y_train, X_test, y_test)
    et = time.time()
    print(" precision = {}, recall = {}, F1 = {}".format(r.precision(), r.recall(), r.f1_score()))
    results[str(model)] = r
    print(' Execution time:', et - st, 'seconds')

In [None]:
for k, r in results.items():
    print("------------------------------------")
    print(k, ":")
    print("    precision: ", r.precision())
    print("    recall   : ", r.recall())
    print("    F1       : ", r.f1_score())

In [None]:
def __():
    models = [
        NaiveSA(verbose=True, weigth_added_words=0.0),
        # AFinnPipeline(),
        # VaderPipeline(),
        LogisticRegressionTfid(),
        SA_LSTM_Pipeline(max_words=1000, max_length=None, epochs=5),
    ]
    X_train, X_test, y_train, y_test = train_test_split(df_imdb.reviews, df_imdb.sentiment, test_size=split_precentage_tests, random_state=42)
    for model in models:
        print(" Running pipeline: " + str(model))
        st = time.time()
        r = sam.run_sa_cross_validation_pipeline(model, X_train, y_train, X_test, y_test)
        et = time.time()
        print(" precision = {}, recall = {}, F1 = {}".format(r.precision(), r.recall(), r.f1_score()))
        results[str(model)] = r
        print(' Execution time:', et - st, 'seconds')

#### Test to use GridSearchCV

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
X_train, X_test, y_train, y_test = train_test_split(df_imdb.reviews, df_imdb.sentiment, test_size=split_precentage_tests, random_state=42)
param_grid = [
    {"weigth_added_words": [0.0]},
    {"weigth_added_words": [0.5]},
    {"weigth_added_words": [1.0]},
    {"weigth_added_words": [2.0]},
]

pipeline = NaiveSA(verbose=True)
gs = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy')

In [22]:
gs.fit(X_train, y_train, verbose=True)

Traceback (most recent call last):
  File "/home/zhanna/bachelorarbeit/zbb/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/zhanna/bachelorarbeit/zbb/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/zhanna/bachelorarbeit/zbb/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/home/zhanna/bachelorarbeit/zbb/venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/home/zhanna/bachelorarbeit/zbb/venv/lib/python3.10/site-packages/sklearn/utils/_response.py", line 182, in _get_response_values
    classes = estimator.classes_
AttributeError: 'NaiveSA' object has no attribute 'classes_'



 running set_words  1 / 2
 tokenization for review  [################### ] 99.8%
 tokenization took  1.6386289596557617  sec.
 running set_words  2 / 2
 tokenization for review  [##################  ] 93.5%
 tokenization took  1.4227676391601562  sec.
 create common words
 function executed in 0.1048s
 create only negative words
 function executed in 0.0442s
 create only positive words
 function executed in 0.0497s
 function executed in 3.2856s
 running set_words  1 / 2
 tokenization for review  [################### ] 99.8%
 tokenization took  1.4570131301879883  sec.
 running set_words  2 / 2
 tokenization for review  [##################  ] 93.5%
 tokenization took  1.4087939262390137  sec.
 create common words
 function executed in 0.0964s
 create only negative words
 function executed in 0.0477s
 create only positive words
 function executed in 0.0739s
 function executed in 3.1018s
 running set_words  1 / 2
 tokenization for review  [################### ] 99.8%
 tokenization took  1



In [23]:
gs.best_estimator_

In [24]:
gs.best_params_

{'weigth_added_words': 0.0}