# 1. Choose a labeled text dataset used for classification

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
df = pd.read_csv('amazon_balanced.csv')
df["Sentiment"].value_counts()   # Unbalanced dataset

Sentiment
Positive    82037
Negative    82037
Name: count, dtype: int64

In [2]:
from tabulate import tabulate
def print_relevant_words(pipe):
    words = pipe['vectorizer'].get_feature_names_out()
    print("Vocabulary size", len(words))
    coefs = pipe['model'].coef_
    classes = pipe['model'].classes_

    top_n_words = 10
    sorted_coef_indexes = coefs.argsort(axis=1)

    negative_words = [(words[i], coefs[0, i]) for i in sorted_coef_indexes[0, :top_n_words]]
    positive_words = [(words[i], coefs[0, i]) for i in sorted_coef_indexes[0, -top_n_words:]]

    table = []
    for neg, pos in zip(negative_words, positive_words):
        table.append([f"{neg[0]} | relevance: {neg[1]:.2f}", f"{pos[0]} | relevance: {pos[1]:.2f}"])

    print(tabulate(table, headers=["Negative", "Positive"], tablefmt="grid"))

# 2. Define a classification pipeline

In [45]:
X = df['Review']
label_mapping = {'Positive': 1, 'Negative': 0}
y = df['Sentiment'].map(label_mapping)   # 1 for positive, 0 for negative
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [46]:

pipe_count_binary = Pipeline([('vectorizer', CountVectorizer(binary=True, stop_words="english", min_df=20)), ('model', LogisticRegression(max_iter=1000))])
# Print vocabulary size

pipe_count_binary.fit(X_train, y_train)  
print("Pipe score: ",pipe_count_binary.score(X_test, y_test))
y_pred = pipe_count_binary.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print_relevant_words(pipe_count_binary)


Pipe score:  0.8910559195489868
Accuracy:  0.8910559195489868
Vocabulary size 10786
+-----------------------------------+---------------------------------+
| Negative                          | Positive                        |
| overrated | relevance: -2.93      | drawback | relevance: 2.09      |
+-----------------------------------+---------------------------------+
| yuck | relevance: -2.78           | unbeatable | relevance: 2.11    |
+-----------------------------------+---------------------------------+
| weakest | relevance: -2.64        | jimmies | relevance: 2.11       |
+-----------------------------------+---------------------------------+
| worst | relevance: -2.53          | hydrated | relevance: 2.13      |
+-----------------------------------+---------------------------------+
| dissapointing | relevance: -2.47  | emptying | relevance: 2.20      |
+-----------------------------------+---------------------------------+
| blech | relevance: -2.43          | intend | relev

In [None]:
# Testing for arbitray text
text_positive = ["This product is unbelievably good"]
text_negative = ["This product is unbelievably bad"]  # For some reason this is classified as positive
prediction_positive = pipe_count_binary.predict(text_positive)
prediction_negative = pipe_count_binary.predict(text_negative)
print("Prediction positive: ", prediction_positive)
print("Prediction negative: ", prediction_negative)

## Lemmatized pipeline (selected as best)

In [4]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
X_lemmatized = X.apply(lambda x: ' '.join([lemmatizer.lemmatize(p) for p in x.lower().split()]))

In [44]:
import re

X_train_lemmatized, X_test_lemmatized, y_train_lemmatized, y_test_lemmatized = (
    train_test_split(X_lemmatized, y, test_size=0.2, random_state=0)
)
pipe_lemma_count_binary = Pipeline(
    [
        (
            "vectorizer",
            CountVectorizer(
                preprocessor=lambda x: re.sub(r"[^a-zA-Z\s]", "", x.lower()),
                binary=True,
                stop_words="english",
                min_df=20,
            ),
        ),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)
# Print vocabulary size

pipe_lemma_count_binary.fit(X_train_lemmatized, y_train_lemmatized)
print(
    "Pipe score: ", pipe_lemma_count_binary.score(X_test_lemmatized, y_test_lemmatized)
)
y_pred_lemmatized = pipe_lemma_count_binary.predict(X_test_lemmatized)
accuracy = accuracy_score(y_test_lemmatized, y_pred_lemmatized)
print("Accuracy: ", accuracy)
print_relevant_words(pipe_lemma_count_binary)

Pipe score:  0.8870028950175225
Accuracy:  0.8870028950175225
Vocabulary size 9612
+----------------------------------+--------------------------------+
| Negative                         | Positive                       |
| overrated | relevance: -3.25     | skeptical | relevance: 2.03    |
+----------------------------------+--------------------------------+
| yikes | relevance: -2.74         | hooked | relevance: 2.07       |
+----------------------------------+--------------------------------+
| weakest | relevance: -2.74       | yum | relevance: 2.08          |
+----------------------------------+--------------------------------+
| dissapointing | relevance: -2.72 | nutritionist | relevance: 2.10 |
+----------------------------------+--------------------------------+
| rediculous | relevance: -2.65    | delish | relevance: 2.17       |
+----------------------------------+--------------------------------+
| yuck | relevance: -2.65          | addicting | relevance: 2.20    |
+------

## Stemmed pipeline

In [47]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")
X_stemmed = X.apply(lambda x: ' '.join([stemmer.stem(p) for p in x.lower().split()]))

In [48]:
X_train_stemmed, X_test_stemmed, y_train_stemmed, y_test_stemmed = train_test_split(X_stemmed, y, test_size=0.2)
pipe_stemmed_count = Pipeline([('vectorizer', CountVectorizer(binary=True, stop_words="english", min_df=20)), ('model', LogisticRegression(max_iter=1000))])
# Print vocabulary size

pipe_stemmed_count.fit(X_train_stemmed, y_train_stemmed)  
print("Pipe score: ",pipe_stemmed_count.score(X_test_stemmed, y_test_stemmed))
y_pred_stemmed = pipe_stemmed_count.predict(X_test_stemmed)
accuracy = accuracy_score(y_test_stemmed, y_pred_stemmed)
print("Accuracy: ", accuracy)
print_relevant_words(pipe_stemmed_count)

Pipe score:  0.8837726649398141
Accuracy:  0.8837726649398141
Vocabulary size 7639
+-----------------------------+--------------------------------+
| Negative                    | Positive                       |
| weakest | relevance: -3.70  | leash | relevance: 2.00        |
+-----------------------------+--------------------------------+
| badso | relevance: -3.15    | rusk | relevance: 2.02         |
+-----------------------------+--------------------------------+
| putrid | relevance: -2.92   | snickerdoodl | relevance: 2.06 |
+-----------------------------+--------------------------------+
| yuck | relevance: -2.75     | preschool | relevance: 2.07    |
+-----------------------------+--------------------------------+
| boo | relevance: -2.73      | fairad | relevance: 2.08       |
+-----------------------------+--------------------------------+
| yike | relevance: -2.68     | deduct | relevance: 2.10       |
+-----------------------------+--------------------------------+
| ick |

### Selected Pipeline: CountVectorizer, Lemmatization and Logistic Regression