In [26]:
import pandas as pd

In [54]:
df = pd.read_csv("data/processed_tweets.csv", index_col=0).dropna(subset=["unprocessed_tweet"])

In [55]:
df["emotion"].value_counts() / len(df)

No emotion toward brand or product    0.592609
Positive emotion                      0.327541
Negative emotion                      0.062692
I can't tell                          0.017158
Name: emotion, dtype: float64

In [56]:
df_multi_dropped = df[~(df["emotion"] == "I can't tell")].copy()
df_multi_dropped.shape

(8936, 4)

In [57]:
df_multi_dropped["emotion"].value_counts()

No emotion toward brand or product    5388
Positive emotion                      2978
Negative emotion                       570
Name: emotion, dtype: int64

In [58]:
emotion_map = {
    "No emotion toward brand or product": 2,
    "Positive emotion": 1,
    "Negative emotion": 0,
}

df_multi_dropped["emotion_encoded"] = df_multi_dropped["emotion"].map(emotion_map)
df_multi_dropped["emotion_encoded"].value_counts()

2    5388
1    2978
0     570
Name: emotion_encoded, dtype: int64

In [59]:
from sklearn.model_selection import train_test_split

X = df_multi_dropped["processed_tweet"]
y = df_multi_dropped["emotion_encoded"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6255,), (2681,), (6255,), (2681,))

In [60]:
for dataset in [X_train, X_test,  y_train, y_test]:
    print(dataset.isnull().sum())

0
0
0
0


In [65]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

pipe_dt = Pipeline([("vectorizer", TfidfVectorizer()), ("dt", DecisionTreeClassifier(random_state=42))])
pipe_rf = Pipeline([("vectorizer", TfidfVectorizer()), ("rf", RandomForestClassifier(random_state=42))])
pipe_knn = Pipeline([("vectorizer", TfidfVectorizer()), ("knn", KNeighborsClassifier())])

pipes = [pipe_dt, pipe_rf, pipe_knn]
names = ["dt", "rf", "knn"]

for pipeline in tqdm(pipes):
    pipeline.fit(X_train, y_train)

100%|██████████| 3/3 [00:04<00:00,  1.37s/it]


In [67]:
preds = {pipe: None for pipe in names}

for name, pipe in tqdm(zip(names, pipes)):
    preds[name] = pipe.predict(X_test)

3it [00:00,  3.85it/s]


In [71]:
from sklearn.metrics import accuracy_score, classification_report

for name in preds.keys():
    print(name.upper())
    print(classification_report(y_test, preds[name]))
    print()

DT
              precision    recall  f1-score   support

           0       0.32      0.18      0.23       189
           1       0.48      0.48      0.48       880
           2       0.68      0.72      0.70      1612

    accuracy                           0.60      2681
   macro avg       0.50      0.46      0.47      2681
weighted avg       0.59      0.60      0.59      2681


RF
              precision    recall  f1-score   support

           0       0.71      0.15      0.25       189
           1       0.61      0.39      0.47       880
           2       0.68      0.88      0.76      1612

    accuracy                           0.67      2681
   macro avg       0.67      0.47      0.50      2681
weighted avg       0.66      0.67      0.63      2681


KNN
              precision    recall  f1-score   support

           0       0.29      0.11      0.16       189
           1       0.53      0.37      0.44       880
           2       0.67      0.83      0.74      1612

    accu