In [89]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
)
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Read the processed review dataset; the "review" column is the model input
# and the "sentiment" column is the label.
data = pd.read_csv("../IMDB-Dataset-GoogleTranslate-Processed2.csv")
review = data["review"]
sentiment = data["sentiment"]

# Hold out 20% of the rows for evaluation, using a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    review, sentiment, random_state=0, test_size=0.2
)

# Three classifiers, each placed behind its own default TF-IDF vectorizer.
nb_pipeline = Pipeline(
    steps=[
        ("vect", TfidfVectorizer()),
        ("clf", MultinomialNB(class_prior=None, fit_prior=True)),
    ]
)
svc_pipeline = Pipeline(
    steps=[
        ("vect", TfidfVectorizer()),
        ("clf", LinearSVC(dual="auto")),
    ]
)
logistic_regression_pipeline = Pipeline(
    steps=[
        ("vect", TfidfVectorizer()),
        ("clf", LogisticRegression(solver="saga")),
    ]
)

# Fit in a fixed order: naive Bayes, linear SVC, logistic regression.
for _pipeline in (nb_pipeline, svc_pipeline, logistic_regression_pipeline):
    _pipeline.fit(x_train, y_train)

# Predictions on the held-out reviews, numbered in the same order as above.
predict_1 = nb_pipeline.predict(x_test)
predict_2 = svc_pipeline.predict(x_test)
predict_3 = logistic_regression_pipeline.predict(x_test)

# One classification report per model, printed with four decimal places.
for _predicted in (predict_1, predict_2, predict_3):
    print(classification_report(y_test, _predicted, digits=4))

              precision    recall  f1-score   support

    negative     0.8448    0.8810    0.8625      4975
    positive     0.8770    0.8398    0.8580      5025

    accuracy                         0.8603     10000
   macro avg     0.8609    0.8604    0.8603     10000
weighted avg     0.8610    0.8603    0.8603     10000

              precision    recall  f1-score   support

    negative     0.8977    0.8907    0.8942      4975
    positive     0.8926    0.8995    0.8960      5025

    accuracy                         0.8951     10000
   macro avg     0.8951    0.8951    0.8951     10000
weighted avg     0.8951    0.8951    0.8951     10000

              precision    recall  f1-score   support

    negative     0.8963    0.8808    0.8885      4975
    positive     0.8840    0.8991    0.8915      5025

    accuracy                         0.8900     10000
   macro avg     0.8901    0.8900    0.8900     10000
weighted avg     0.8901    0.8900    0.8900     10000



In [141]:
def evaluate_score(text, threshold=1):
    """Print a coefficient-sum sentiment score for ``text``.

    The score is the sum of the logistic-regression coefficients of every
    vocabulary feature that appears among the space-separated tokens of
    ``text``. A matching feature is counted once, however often its token is
    repeated. Only raw coefficients are added up, so the score is a heuristic
    and not the pipeline's own prediction.

    Args:
        text: Input string; it is split on single spaces to obtain tokens.
        threshold: Scores greater than or equal to this value are reported as
            "Positive", lower scores as "Negative". Defaults to 1.

    Returns:
        None. A single line with the text, the label and the score is printed.
    """
    # Build the token set once instead of re-splitting the text per feature.
    tokens = set(text.split(" "))
    feature_names = logistic_regression_pipeline[0].get_feature_names_out()
    coefficients = logistic_regression_pipeline[1].coef_[0]

    # Iterate in feature order so the summation order is stable.
    score = sum(
        coef for feature, coef in zip(feature_names, coefficients) if feature in tokens
    )

    label = "Positive" if score >= threshold else "Negative"
    print("(%s) %s, score is %f" % (text, label, score))


# Score a few short inputs that mix words from both ends of the coefficient range.
for _sample in ("hræðilegur frábær", "slæmur vel besta", "lélegur vel"):
    evaluate_score(_sample)

(hræðilegur frábær) Positive, score is 1.122848
(slæmur vel besta) Positive, score is 4.489404
(lélegur vel) Negative, score is 0.106368


In [145]:
def get_most_important_features(vectorizer, model, n=5):
    """Collect the highest- and lowest-weighted words for each coefficient row.

    Args:
        vectorizer: Fitted vectorizer exposing ``vocabulary_`` (word -> column
            index).
        model: Fitted linear model exposing a 2-D ``coef_`` array.
        n: Number of words to keep at each end. Values <= 0 yield empty lists.

    Returns:
        dict mapping each row index of ``model.coef_`` to
        ``{"tops": [...], "bottom": [...]}``. Both lists hold
        ``(coefficient, word)`` tuples: ``tops`` has the ``n`` largest
        coefficients in ascending order, ``bottom`` has the ``n`` smallest
        coefficients in descending order.
    """
    # Treat a negative request like "nothing" rather than slicing oddly.
    n = max(n, 0)
    index_to_word = {index: word for word, index in vectorizer.vocabulary_.items()}

    classes = {}
    for class_index in range(model.coef_.shape[0]):
        word_importances = [
            (coef, index_to_word[column])
            for column, coef in enumerate(model.coef_[class_index])
        ]
        ranked = sorted(word_importances, key=lambda pair: pair[0], reverse=True)
        tops = sorted(ranked[:n], key=lambda pair: pair[0])
        # ranked[-0:] is the whole list, so n == 0 needs an explicit guard.
        bottom = ranked[-n:] if n > 0 else []
        classes[class_index] = {"tops": tops, "bottom": bottom}
    return classes


def show_most_informative_features(vectorizer, clf, n=20):
    """Print the classifier, then its ``n`` lowest and ``n`` highest weighted features.

    Features are paired with the first row of ``clf.coef_`` and sorted by
    coefficient (ties fall back to the feature name). Each printed row shows
    one entry from the low end on the left and one from the high end on the
    right.
    """
    print(clf)
    names = vectorizer.get_feature_names_out()
    ranked = sorted(zip(clf.coef_[0], names))

    lowest = ranked[:n]
    highest = ranked[::-1][:n]
    for low, high in zip(lowest, highest):
        print("\t%.3f\t%-15s\t\t%.4f\t%-15s" % (low[0], low[1], high[0], high[1]))


# Print the ten most negative and ten most positive logistic-regression features.
show_most_informative_features(
    vectorizer=logistic_regression_pipeline[0],
    clf=logistic_regression_pipeline[1],
    n=10,
)


def plot_important_words(top_scores, top_words, bottom_scores, bottom_words, name):
    """Draw a horizontal bar chart of the top words and save it to disk.

    The top (word, score) pairs are ordered by ascending score, printed, and
    plotted in subplot position 122. The figure is written to
    ``positive_lr.jpg`` and then shown. The bottom pairs are ordered by
    descending score but are not drawn, and ``name`` is not used.
    """
    ranked_top = sorted(zip(top_words, top_scores), key=lambda pair: pair[1])
    ranked_bottom = sorted(
        zip(bottom_words, bottom_scores), key=lambda pair: pair[1], reverse=True
    )

    top_words = [word for word, _ in ranked_top]
    top_scores = [score for _, score in ranked_top]

    # Ordered for completeness; only the top pairs are plotted below.
    bottom_words = [word for word, _ in ranked_bottom]
    bottom_scores = [score for _, score in ranked_bottom]

    print(top_words)
    print(top_scores)

    # Right-hand slot of a 1x2 grid; one bar per word, labelled on the y axis.
    plt.subplot(122)
    positions = np.arange(len(top_words))
    plt.barh(positions, top_scores, align="center")
    plt.yticks(positions, top_words)
    plt.xlabel("Score")
    plt.savefig("positive_lr.jpg", bbox_inches="tight")
    plt.show()


# Rank the logistic-regression coefficients and plot the ten strongest words.
# The pipeline is referenced by the name it is given where it is built
# (`logistic_regression_pipeline`) so this cell runs on a fresh kernel;
# `pipeline_3` is not defined in any visible cell.
importance = get_most_important_features(
    logistic_regression_pipeline[0], logistic_regression_pipeline[1], 10
)

# Entries are (coefficient, word) tuples; index 0 is the only coefficient row read here.
top_scores = [a[0] for a in importance[0]["tops"]]
top_words = [a[1] for a in importance[0]["tops"]]
bottom_scores = [a[0] for a in importance[0]["bottom"]]
bottom_words = [a[1] for a in importance[0]["bottom"]]


plot_important_words(top_scores, top_words, bottom_scores, bottom_words, "Sentiment")

LogisticRegression(solver='saga')
	-11.377	hræðilegur     		12.4996	frábær         
	-9.104	slæmur         		7.6903	vel            
	-8.970	versta         		5.9035	besta          
	-8.604	leiðinlegur    		5.7820	elska          
	-7.584	lélegur        		5.2837	frábærlega     
	-6.317	illa           		4.9943	skemmtilegur   
	-5.365	vonbrigði      		4.8503	fullkominn     
	-5.256	pirrandi       		4.7078	dásamlegur     
	-5.247	bara           		4.6765	njóta          
	-5.224	líta           		4.5580	hrífandi       
