In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
df = pd.read_csv('amazon_balanced.csv')
df["Sentiment"].value_counts()   # Unbalanced dataset

Sentiment
Positive    82037
Negative    82037
Name: count, dtype: int64

In [2]:
from tabulate import tabulate
def print_relevant_words(pipe):
    words = pipe['vectorizer'].get_feature_names_out()
    print("Vocabulary size", len(words))
    coefs = pipe['model'].coef_
    classes = pipe['model'].classes_

    top_n_words = 10
    sorted_coef_indexes = coefs.argsort(axis=1)

    negative_words = [(words[i], coefs[0, i]) for i in sorted_coef_indexes[0, :top_n_words]]
    positive_words = [(words[i], coefs[0, i]) for i in sorted_coef_indexes[0, -top_n_words:]]

    table = []
    for neg, pos in zip(negative_words, positive_words):
        table.append([f"{neg[0]} | relevance: {neg[1]:.2f}", f"{pos[0]} | relevance: {pos[1]:.2f}"])

    print(tabulate(table, headers=["Negative", "Positive"], tablefmt="grid"))

### Pre processing

In [3]:
X = df['Review']
label_mapping = {'Positive': 1, 'Negative': 0}
y = df['Sentiment'].map(label_mapping)   # 1 for positive, 0 for negative
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Normal tfidf vectorizer

In [4]:

pipe_tfidf = Pipeline([('vectorizer', TfidfVectorizer( stop_words="english")), ('model', LogisticRegression(max_iter=1000))])
# Print vocabulary size

pipe_tfidf.fit(X_train, y_train)  
print("Pipe score: ",pipe_tfidf.score(X_test, y_test))
y_pred = pipe_tfidf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print_relevant_words(pipe_tfidf)


Pipe score:  0.8968154807252781
Accuracy:  0.8968154807252781
Vocabulary size 131413
+-----------------------------------+------------------------------+
| Negative                          | Positive                     |
| worst | relevance: -9.98          | good | relevance: 7.13       |
+-----------------------------------+------------------------------+
| disappointed | relevance: -9.81   | awesome | relevance: 7.15    |
+-----------------------------------+------------------------------+
| terrible | relevance: -9.71       | wonderful | relevance: 7.46  |
+-----------------------------------+------------------------------+
| disappointing | relevance: -9.23  | love | relevance: 7.82       |
+-----------------------------------+------------------------------+
| awful | relevance: -8.74          | loves | relevance: 8.46      |
+-----------------------------------+------------------------------+
| horrible | relevance: -8.22       | perfect | relevance: 8.57    |
+-----------------

Normal CountVectorizer binary=True

In [5]:

pipe_count_binary = Pipeline([('vectorizer', CountVectorizer(binary=True, stop_words="english", min_df=20)), ('model', LogisticRegression(max_iter=1000))])
# Print vocabulary size

pipe_count_binary.fit(X_train, y_train)  
print("Pipe score: ",pipe_count_binary.score(X_test, y_test))
y_pred = pipe_count_binary.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print_relevant_words(pipe_count_binary)


Pipe score:  0.8896236477220784
Accuracy:  0.8896236477220784
Vocabulary size 10826
+----------------------------------+------------------------------+
| Negative                         | Positive                     |
| overrated | relevance: -3.07     | removes | relevance: 2.11    |
+----------------------------------+------------------------------+
| defeats | relevance: -3.02       | definitly | relevance: 2.13  |
+----------------------------------+------------------------------+
| yuck | relevance: -2.80          | addicting | relevance: 2.14  |
+----------------------------------+------------------------------+
| undrinkable | relevance: -2.58   | smoothest | relevance: 2.22  |
+----------------------------------+------------------------------+
| disappointing | relevance: -2.52 | unbeatable | relevance: 2.24 |
+----------------------------------+------------------------------+
| medicinelike | relevance: -2.47  | teens | relevance: 2.28      |
+-------------------------------

Normal CountVectorizer binary=False

In [6]:

pipe_count_binary_false = Pipeline([('vectorizer', CountVectorizer(binary=False, stop_words="english", min_df=100)), ('model', LogisticRegression(max_iter=1000))])
# Print vocabulary size

pipe_count_binary_false.fit(X_train, y_train)  
print("Pipe score: ",pipe_count_binary_false.score(X_test, y_test))
y_pred = pipe_count_binary_false.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print_relevant_words(pipe_count_binary_false)


Pipe score:  0.887368581441414
Accuracy:  0.887368581441414
Vocabulary size 4278
+----------------------------------+------------------------------+
| Negative                         | Positive                     |
| undrinkable | relevance: -2.81   | addicting | relevance: 1.69  |
+----------------------------------+------------------------------+
| yuck | relevance: -2.51          | whim | relevance: 1.72       |
+----------------------------------+------------------------------+
| defeats | relevance: -2.41       | awesome | relevance: 1.72    |
+----------------------------------+------------------------------+
| cancelled | relevance: -2.36     | intend | relevance: 1.80     |
+----------------------------------+------------------------------+
| disappointing | relevance: -2.21 | hooked | relevance: 1.81     |
+----------------------------------+------------------------------+
| worst | relevance: -2.17         | downside | relevance: 1.95   |
+----------------------------------

In [None]:
# Testing for arbitray text
text_positive = ["This product is unbelievably good"]
text_negative = ["This product is unbelievably bad"]  # For some reason this is classified as positive
prediction_positive = pipe_tfidf.predict(text_positive)
prediction_negative = pipe_tfidf.predict(text_negative)
print("Prediction positive: ", prediction_positive)
print("Prediction negative: ", prediction_negative)

## Trying with lemma

In [4]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
X_lemmatized = X.apply(lambda x: ' '.join([lemmatizer.lemmatize(p) for p in x.lower().split()]))

Lemma tfidf

In [None]:
X_train_lemmatized, X_test_lemmatized, y_train_lemmatized, y_test_lemmatized = train_test_split(X_lemmatized, y, test_size=0.2)
pipe_lemma_tfidf = Pipeline([('vectorizer', TfidfVectorizer( stop_words="english")), ('model', LogisticRegression(max_iter=1000))])
# Print vocabulary size

pipe_lemma_tfidf.fit(X_train_lemmatized, y_train_lemmatized)  
print("Pipe score: ",pipe_lemma_tfidf.score(X_test_lemmatized, y_test_lemmatized))
y_pred_lemmatized = pipe_lemma_tfidf.predict(X_test_lemmatized)
accuracy = accuracy_score(y_test_lemmatized, y_pred_lemmatized)
print("Accuracy: ", accuracy)
print_relevant_words(pipe_lemma_tfidf)

Lemma countVectorizer


In [44]:
import re

X_train_lemmatized, X_test_lemmatized, y_train_lemmatized, y_test_lemmatized = (
    train_test_split(X_lemmatized, y, test_size=0.2, random_state=0)
)
pipe_lemma_count_binary = Pipeline(
    [
        (
            "vectorizer",
            CountVectorizer(
                preprocessor=lambda x: re.sub(r"[^a-zA-Z\s]", "", x.lower()),
                binary=True,
                stop_words="english",
                min_df=20,
            ),
        ),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)
# Print vocabulary size

pipe_lemma_count_binary.fit(X_train_lemmatized, y_train_lemmatized)
print(
    "Pipe score: ", pipe_lemma_count_binary.score(X_test_lemmatized, y_test_lemmatized)
)
y_pred_lemmatized = pipe_lemma_count_binary.predict(X_test_lemmatized)
accuracy = accuracy_score(y_test_lemmatized, y_pred_lemmatized)
print("Accuracy: ", accuracy)
print_relevant_words(pipe_lemma_count_binary)

Pipe score:  0.8870028950175225
Accuracy:  0.8870028950175225
Vocabulary size 9612
+----------------------------------+--------------------------------+
| Negative                         | Positive                       |
| overrated | relevance: -3.25     | skeptical | relevance: 2.03    |
+----------------------------------+--------------------------------+
| yikes | relevance: -2.74         | hooked | relevance: 2.07       |
+----------------------------------+--------------------------------+
| weakest | relevance: -2.74       | yum | relevance: 2.08          |
+----------------------------------+--------------------------------+
| dissapointing | relevance: -2.72 | nutritionist | relevance: 2.10 |
+----------------------------------+--------------------------------+
| rediculous | relevance: -2.65    | delish | relevance: 2.17       |
+----------------------------------+--------------------------------+
| yuck | relevance: -2.65          | addicting | relevance: 2.20    |
+------

In [43]:
# Testing for arbitray text
text_positive = ["This product is unbelievably extraordinary"]
text_negative = ["This product is unbelievably bad"]  # For some reason this is classified as positive
prediction_positive = pipe_lemma_count_binary.predict(text_positive)
prediction_negative = pipe_lemma_count_binary.predict(text_negative)
print("Prediction positive: ", prediction_positive)
print("Prediction negative: ", prediction_negative)

Prediction positive:  [1]
Prediction negative:  [0]


# Trying with stems

In [28]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")
X_stemmed = X.apply(lambda x: ' '.join([stemmer.stem(p) for p in x.lower().split()]))

Stemmed with tfidf

In [None]:
X_train_stemmed, X_test_stemmed, y_train_stemmed, y_test_stemmed = train_test_split(X_stemmed, y, test_size=0.2)
pipe_stemmed_tfidf = Pipeline([('vectorizer', TfidfVectorizer( stop_words="english")), ('model', LogisticRegression(max_iter=1000))])
# Print vocabulary size

pipe_stemmed_tfidf.fit(X_train_stemmed, y_train_stemmed)  
print("Pipe score: ",pipe_stemmed_tfidf.score(X_test_stemmed, y_test_stemmed))
y_pred_stemmed = pipe_stemmed_tfidf.predict(X_test_stemmed)
accuracy = accuracy_score(y_test_stemmed, y_pred_stemmed)
print("Accuracy: ", accuracy)
print_relevant_words(pipe_stemmed_tfidf)

Stemmed with countVectorizer

In [None]:
X_train_stemmed, X_test_stemmed, y_train_stemmed, y_test_stemmed = train_test_split(X_stemmed, y, test_size=0.2)
pipe_stemmed_count = Pipeline([('vectorizer', CountVectorizer(binary=True, stop_words="english")), ('model', LogisticRegression(max_iter=1000))])
# Print vocabulary size

pipe_stemmed_count.fit(X_train_stemmed, y_train_stemmed)  
print("Pipe score: ",pipe_stemmed_count.score(X_test_stemmed, y_test_stemmed))
y_pred_stemmed = pipe_stemmed_count.predict(X_test_stemmed)
accuracy = accuracy_score(y_test_stemmed, y_pred_stemmed)
print("Accuracy: ", accuracy)
print_relevant_words(pipe_stemmed_count)