In [2]:
import os, sys

# Resolve the parent of the current working directory and make it importable,
# so the `src.*` imports below can be found when the notebook runs from a subfolder.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
already_on_path = project_root in sys.path
if not already_on_path:
    sys.path.append(project_root)

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, top_k_accuracy_score
from tqdm.notebook import tqdm
from src.model.KeywordBaseline import KeywordBaseline
from src.model.NBBaseline import NBBaseline
from src.utils import label_to_emoji, labels_to_emojis
from src.dataset import load_preprocessed_dataset
import numpy as np

# Baseline models
- Keyword baseline models
    - Bag-of-Words Weighted Classifier
    - Nearest Centroid Classifier
- Naive Bayes Classifier

In [4]:
# Only the first element of the loaded pair is used in this notebook.
train, _ = load_preprocessed_dataset()

# Texts are cast to a NumPy unicode array; labels are taken as stored.
X = train["TEXT"].values.astype("U")
y = train["Label"].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Loading preprocessed dataset from disk...


In [3]:
def top_k_accuracy(predict, y_true=None, n_classes=20, ks=(1, 3, 5)):
    """Compute top-k accuracy from per-sample score mappings.

    Parameters
    ----------
    predict : iterable of (label, scores) pairs
        One entry per evaluated sample. Only ``scores`` is read here; it must map
        an integer class label in ``[0, n_classes)`` to a numeric score.
    y_true : array-like, optional
        Ground-truth labels aligned with ``predict``. Defaults to the
        notebook-level ``y_test``.
    n_classes : int, default 20
        Number of classes, i.e. the number of columns of the score matrix.
    ks : iterable of int, default (1, 3, 5)
        Values of ``k`` to evaluate.

    Returns
    -------
    dict
        Maps ``"top_<k>_accuracy"`` to the accuracy for each requested ``k``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of true labels, or
        if a score mapping contains a label outside ``[0, n_classes)``.
    """
    if y_true is None:
        y_true = y_test

    predict = list(predict)
    if len(predict) != len(y_true):
        # A shorter prediction list would otherwise leave all-zero rows and be
        # scored silently as if the model had produced them.
        raise ValueError(
            f"Got {len(predict)} predictions for {len(y_true)} true labels"
        )

    # Labels missing from a sample's `scores` keep a score of 0.
    # NOTE(review): if a model emits negative scores (e.g. log-probabilities) it
    # must provide every label, otherwise the missing ones outrank real scores.
    y_scores = np.zeros((len(predict), n_classes))

    for i, (_, scores) in enumerate(predict):
        for label, score in scores.items():
            if not 0 <= label < n_classes:
                # Negative indices would silently wrap around to another class column.
                raise ValueError(f"Label {label!r} is outside [0, {n_classes})")
            y_scores[i, label] = score

    # Passing `labels` pins column j to class j even when some class is absent
    # from `y_true`; without it sklearn infers the classes from `y_true` and
    # rejects a score matrix with more columns than observed classes.
    class_labels = np.arange(n_classes)

    top_k_accs = {}
    for k in ks:
        acc = top_k_accuracy_score(y_true, y_scores, k=k, labels=class_labels)
        top_k_accs[f"top_{k}_accuracy"] = acc

    return top_k_accs

def save_results(filename, predictions):
    """Write the top-5 predicted labels for every test sample to a TSV file.

    The file is created under ``<project_root>/notebooks/results`` (the directory
    is created if needed) and overwritten if it already exists. Each row holds
    the text, the true label with its emoji, and the emojis of the five
    highest-scoring labels.

    Parameters
    ----------
    filename : str
        Name of the output file inside the results directory.
    predictions : iterable of (label, scores) pairs
        One entry per element of the notebook-level ``X_test``/``y_test``, in the
        same order. Only ``scores`` (a mapping from label to score) is read.
    """
    results_dir = os.path.join(project_root, "notebooks", "results")
    os.makedirs(results_dir, exist_ok=True)

    out_path = os.path.join(results_dir, filename)

    rows = tqdm(
        zip(X_test, y_test, predictions),
        # Name the file actually being written so the progress bar is not
        # labelled "Weights" for every model.
        desc=f"Writing {filename}",
        total=len(X_test),
    )

    with open(out_path, "w", encoding="utf8") as f:
        f.write("Text\tTrueLabel\tPredictedLabels(Top-5)\n")
        for text, true_label, (_, scores) in rows:
            # Tabs or line breaks inside the text would break the
            # one-record-per-line, tab-separated layout, so flatten them.
            flat_text = str(text).replace("\t", " ").replace("\r", " ").replace("\n", " ")
            top5 = sorted(scores, key=scores.get, reverse=True)[:5]
            f.write(
                f"{flat_text}\t{true_label} {label_to_emoji(true_label)}\t{labels_to_emojis(top5)}\n"
            )

## Keyword Baseline Models
- Bag-of-Words Weighted Classifier
- Nearest Centroid Classifier

In [4]:
# Fit the keyword baseline on the training split and list the keywords kept per label.
keyword_baseline = KeywordBaseline(X_train=X_train, y_train=y_train)
keyword_baseline.extract_keywords(top_n=15)
print("Keyword Mapping (Top-15):")
for label_id, label_keywords in keyword_baseline.keyword_mapping.items():
    print(f"Label {label_id} {label_to_emoji(label_id)}: {label_keywords}")

# Score every test text with both keyword strategies in a single pass.
predict_kw_weights = []
predict_kw_cosine = []

for sample in tqdm(X_test, desc="Predicting"):
    by_weights, by_cosine = (
        keyword_baseline.predict_weights(sample),
        keyword_baseline.predict_cosine(sample),
    )
    predict_kw_weights.append(by_weights)
    predict_kw_cosine.append(by_cosine)

# Element 0 of each prediction is what gets compared against the true label.
for strategy, kw_predictions in (("Weights", predict_kw_weights), ("Cosine", predict_kw_cosine)):
    print(f"\nClassification Report ({strategy}):")
    print(classification_report(y_test, [p[0] for p in kw_predictions], digits=4))


Keyword Mapping (Top-15):
Label 0 😜: ['new', 'like', 'just', 'fun', 'night', 'know', 'got', 'time', 'york', 'good', 'beach', 'day', 'university', 'love', 'tonight']
Label 1 📸: ['new', 'california', 'york', 'park', 'shot', 'like', 'photo', 'new york', 'day', 'angeles', 'city', 'just', 'shoot', 'tonight', 'thanks']
Label 2 😍: ['love', 'new', 'beautiful', 'york', 'today', 'day', 'best', 'amazing', 'night', 'just', 'park', 'new york', 'got', 'happy', 'favorite']
Label 3 😂: ['lol', 'like', 'just', 'don', 'got', 'new', 'time', 'today', 'look', 'know', 'day', 'good', 'right', 'texas', 'lmao']
Label 4 😉: ['just', 'new', 'know', 'like', 'york', 'll', 'good', 'got', 'today', 'night', 'day', 'don', 'time', 'great', 'thanks']
Label 5 🎄: ['christmas', 'merry', 'merry christmas', 'tree', 'christmas tree', 'holidays', 'eve', 'christmas eve', 'happy', 'season', 'holiday', 'time', 'family', 'tis', 'party']
Label 6 📷: ['york', 'california', 'new', 'park', 'beach', 'day', 'night', 's

Predicting:   0%|          | 0/14000 [00:00<?, ?it/s]


Classification Report (Weights):
              precision    recall  f1-score   support

           0     0.0385    0.1206    0.0584       282
           1     0.1041    0.1450    0.1212       531
           2     0.2352    0.1243    0.1626      1408
           3     0.3638    0.2616    0.3043      1384
           4     0.0442    0.0806    0.0571       372
           5     0.5941    0.5711    0.5823       387
           6     0.0857    0.1183    0.0994       431
           7     0.2951    0.3143    0.3044       875
           8     0.0995    0.1989    0.1326       377
           9     0.4517    0.1364    0.2096      3049
          10     0.0533    0.1493    0.0785       355
          11     0.3644    0.3301    0.3464       509
          12     0.2123    0.4568    0.2899       370
          13     0.1537    0.1988    0.1733       644
          14     0.1038    0.1245    0.1132       466
          15     0.1356    0.1099    0.1214       728
          16     0.1333    0.0954    0.1112    

In [5]:
# Evaluate both keyword scoring strategies first, then report them together.
top_k_accuracy_w = top_k_accuracy(predict_kw_weights)
top_k_accuracy_c = top_k_accuracy(predict_kw_cosine)

print("\nTop-K Accuracy (Weights):", top_k_accuracy_w)
print("Top-K Accuracy (Cosine):", top_k_accuracy_c)


Top-K Accuracy (Weights): {'top_1_accuracy': 0.18357142857142858, 'top_3_accuracy': 0.35892857142857143, 'top_5_accuracy': 0.49542857142857144}
Top-K Accuracy (Cosine): {'top_1_accuracy': 0.18685714285714286, 'top_3_accuracy': 0.37092857142857144, 'top_5_accuracy': 0.5096428571428572}


In [6]:
# Persist the predictions of each keyword strategy to its own results file.
weights_file, cosine_file = "keyword_baseline_weights.txt", "keyword_baseline_cosine.txt"

for out_name, kw_predictions in (
    (weights_file, predict_kw_weights),
    (cosine_file, predict_kw_cosine),
):
    save_results(out_name, kw_predictions)

Writing Weights Results:   0%|          | 0/14000 [00:00<?, ?it/s]

Writing Weights Results:   0%|          | 0/14000 [00:00<?, ?it/s]

## Naive Bayes (NB) Baseline Model

In [6]:
# Fit the Naive Bayes baseline on the training split.
nb_baseline = NBBaseline(X_train=X_train, y_train=y_train)
nb_baseline.train()
print("Naive Bayes Baseline training finished.")

# One prediction per test text, kept whole because the top-k evaluation below
# needs more than the predicted label.
y_pred_nb = []

for text in tqdm(X_test, desc="Predicting"):
    y_pred_nb.append(nb_baseline.predict(text))

# Element 0 of each prediction is the predicted label; extract it once.
y_pred_nb_labels = [r[0] for r in y_pred_nb]

print("Classification Report:")
print(classification_report(y_test, y_pred_nb_labels, digits=4))
# The full list of predicted labels is intentionally not printed: it is one
# entry per test sample and floods the cell output without adding information.

# Requires the helper cell that defines `top_k_accuracy` to have been run in this
# kernel session; otherwise this line raises NameError.
top_k_accuracy_nb = top_k_accuracy(y_pred_nb)

print("Top-K Accuracy (Naive Bayes):", top_k_accuracy_nb)

Naive Bayes Baseline training finished.


Predicting:   0%|          | 0/14000 [00:00<?, ?it/s]

Classification Report:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       282
           1     0.1854    0.1055    0.1345       531
           2     0.2249    0.2166    0.2207      1408
           3     0.3079    0.4552    0.3673      1384
           4     0.0441    0.0081    0.0136       372
           5     0.5646    0.6098    0.5863       387
           6     0.1053    0.0232    0.0380       431
           7     0.3330    0.3680    0.3496       875
           8     0.1000    0.0186    0.0313       377
           9     0.2988    0.6195    0.4031      3049
          10     0.1228    0.0197    0.0340       355
          11     0.4668    0.3733    0.4148       509
          12     0.2576    0.3676    0.3029       370
          13     0.2553    0.1693    0.2035       644
          14     0.1984    0.0536    0.0845       466
          15     0.1303    0.0508    0.0731       728
          16     0.2250    0.0767    0.1144       587
    

NameError: name 'top_k_accuracy' is not defined

In [8]:
# Persist the Naive Bayes predictions alongside the other baseline results.
save_results(filename="nb_baseline.txt", predictions=y_pred_nb)

Writing Weights Results:   0%|          | 0/14000 [00:00<?, ?it/s]