In [2]:
import os, sys

# Resolve the parent of the current working directory and register it on the
# import path so the `src` package imported below can be found.
# NOTE(review): assumes the kernel is started from a direct subdirectory of the
# repository root (e.g. notebooks/) -- confirm.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), "..")
)
# Guarded so that re-running this cell does not add the same entry twice.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, top_k_accuracy_score
from tqdm.notebook import tqdm
from src.model.KeywordBaseline import KeywordBaseline
from src.model.NBBaseline import NBBaseline
from src.utils import label_to_emoji, labels_to_emojis
from src.dataset import load_preprocessed_dataset
import numpy as np

# Baseline models
- Keyword baseline models
    - Bag-of-Words Weighted Classifier
    - Nearest Centroid Classifier
- Naive Bayes Classifier

In [None]:
# Only the first object returned by the loader is used here; the second one
# is discarded.
train, _ = load_preprocessed_dataset()

# Targets come from the "Label" column; inputs come from the "TEXT" column and
# are cast with astype("U") so every entry is a unicode string.
y = train["Label"].values
X = train["TEXT"].values.astype("U")

# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
def top_k_accuracy(predict, y_true=None, n_classes=20, ks=(1, 3, 5)):
    """Compute top-k accuracy for a sequence of per-sample predictions.

    Parameters
    ----------
    predict : iterable of (label, scores)
        One entry per evaluated sample. The first element of each entry is
        ignored here; ``scores`` is a mapping from integer class index to the
        score assigned to that class.
    y_true : array-like, optional
        True class indices aligned with ``predict``. Defaults to the
        notebook-level ``y_test``.
    n_classes : int, default 20
        Width of the score matrix; class indices are taken to be
        ``0 .. n_classes - 1``.
    ks : iterable of int, default (1, 3, 5)
        Values of k to evaluate.

    Returns
    -------
    dict
        ``{"top_<k>_accuracy": accuracy}`` with one entry per value in ``ks``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of true labels.
    """
    if y_true is None:
        y_true = y_test
    y_true = np.asarray(y_true)

    # Materialise once so that generators are accepted and the length is known.
    predictions = list(predict)
    if len(predictions) != len(y_true):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(y_true)} true labels."
        )

    y_scores = np.zeros((len(y_true), n_classes))
    provided = np.zeros(y_scores.shape, dtype=bool)

    for i, (_, scores) in enumerate(predictions):
        for label, score in scores.items():
            y_scores[i, label] = score
            provided[i, label] = True

    # A class that is absent from a score dict keeps the default 0. That is a
    # harmless floor for non-negative scores, but with negative scores it would
    # outrank every real class. In that case push the absent classes strictly
    # below the lowest score that was actually provided for the sample.
    for i in np.flatnonzero(~provided.all(axis=1)):
        given = provided[i]
        if given.any():
            lowest = y_scores[i, given].min()
            if lowest < 0:
                y_scores[i, ~given] = lowest - 1.0

    # Passing `labels` explicitly keeps the metric working when the evaluated
    # split does not contain every class; otherwise scikit-learn infers the
    # class set from y_true and rejects the column-count mismatch.
    class_labels = np.arange(n_classes)

    top_k_accs = {}
    for k in ks:
        acc = top_k_accuracy_score(y_true, y_scores, k=k, labels=class_labels)
        top_k_accs[f"top_{k}_accuracy"] = acc

    return top_k_accs

def save_results(filename, predictions, top_n=5):
    """Write test-split predictions to a tab-separated file.

    The file is created under ``<project_root>/notebooks/results`` (the
    directory is created if needed). It starts with a header row, followed by
    one row per sample containing the text, the true label together with its
    ``label_to_emoji`` rendering, and the ``labels_to_emojis`` rendering of the
    ``top_n`` highest-scoring labels.

    Parameters
    ----------
    filename : str
        Name of the output file inside the results directory.
    predictions : iterable of (label, scores)
        One entry per sample, aligned with the notebook-level ``X_test`` and
        ``y_test``. ``scores`` maps each label to its score; the first element
        of each entry is not used here.
    top_n : int, default 5
        Number of highest-scoring labels written per sample.
    """
    results_dir = os.path.join(project_root, "notebooks", "results")
    os.makedirs(results_dir, exist_ok=True)
    output_path = os.path.join(results_dir, filename)

    # A tab or line break inside a text would split the row across columns or
    # lines, so those characters are replaced by a single space.
    row_breakers = str.maketrans({"\t": " ", "\n": " ", "\r": " "})

    rows = zip(X_test, y_test, predictions)

    with open(output_path, "w", encoding="utf8") as f:
        f.write(f"Text\tTrueLabel\tPredictedLabels(Top-{top_n})\n")
        # The progress label names the file being written, so the bar is not
        # mislabelled when saving anything other than the weights results.
        for text, true_label, (_, scores) in tqdm(rows, desc=f"Writing {filename}", total=len(X_test)):
            top_labels = sorted(scores, key=scores.get, reverse=True)[:top_n]
            clean_text = str(text).translate(row_breakers)
            # Rows are written as they are produced instead of being buffered.
            f.write(
                f"{clean_text}\t{true_label} {label_to_emoji(true_label)}"
                f"\t{labels_to_emojis(top_labels)}\n"
            )

## Keyword Baseline Models
- Bag-of-Words Weighted Classifier
- Nearest Centroid Classifier

In [None]:
# Build the keyword baseline from the training split and extract the keywords
# (top_n=15) that populate `keyword_mapping`.
keyword_baseline = KeywordBaseline(X_train=X_train, y_train=y_train)
keyword_baseline.extract_keywords(top_n=15)

print("Keyword Mapping (Top-15):")
for label, keywords in keyword_baseline.keyword_mapping.items():
    print(f"Label {label} {label_to_emoji(label)}: {keywords}")

# Both scoring variants are evaluated in one pass over the test texts; each
# list ends up with one prediction per text, in test order.
predict_kw_weights, predict_kw_cosine = [], []

for text in tqdm(X_test, desc="Predicting"):
    pred_w, pred_c = (
        keyword_baseline.predict_weights(text),
        keyword_baseline.predict_cosine(text),
    )
    predict_kw_weights.append(pred_w)
    predict_kw_cosine.append(pred_c)

# The first element of each prediction is what gets compared against y_test.
print("\nClassification Report (Weights):")
print(
    classification_report(
        y_test,
        [prediction[0] for prediction in predict_kw_weights],
        digits=4,
    )
)
print("\nClassification Report (Cosine):")
print(
    classification_report(
        y_test,
        [prediction[0] for prediction in predict_kw_cosine],
        digits=4,
    )
)


In [None]:
# Top-k accuracy of the weighted keyword predictions.
top_k_accuracy_w = top_k_accuracy(predict=predict_kw_weights)
print("\nTop-K Accuracy (Weights):", top_k_accuracy_w)

# Top-k accuracy of the cosine keyword predictions.
top_k_accuracy_c = top_k_accuracy(predict=predict_kw_cosine)
print("Top-K Accuracy (Cosine):", top_k_accuracy_c)

In [None]:
# One output file per keyword scoring variant.
weights_file, cosine_file = (
    "keyword_baseline_weights.txt",
    "keyword_baseline_cosine.txt",
)

save_results(filename=weights_file, predictions=predict_kw_weights)
save_results(filename=cosine_file, predictions=predict_kw_cosine)

## Naive Bayes (NB) Baseline Model

In [None]:
# Train the Naive Bayes baseline on the training split.
nb_baseline = NBBaseline(X_train=X_train, y_train=y_train)
nb_baseline.train()
print("Naive Bayes Baseline training finished.")

# One prediction per test text, in test order.
y_pred_nb = []

for text in tqdm(X_test, desc="Predicting"):
    pred = nb_baseline.predict(text)
    y_pred_nb.append(pred)

# The first element of each prediction is what gets compared against y_test.
y_pred_nb_labels = [r[0] for r in y_pred_nb]

print("Classification Report:")
print(classification_report(y_test, y_pred_nb_labels, digits=4))
# Show only a short preview: printing one label per test sample floods the
# cell output and bloats the saved notebook.
print("First 20 predicted labels:", y_pred_nb_labels[:20])

top_k_accuracy_nb = top_k_accuracy(y_pred_nb)

print("Top-K Accuracy (Naive Bayes):", top_k_accuracy_nb)

In [None]:
# Persist the Naive Bayes predictions alongside the other baseline results.
save_results(
    filename="nb_baseline.txt",
    predictions=y_pred_nb,
)