In [12]:
import os, sys

# Resolve the parent of the current working directory and register it on
# sys.path (only once), presumably so the `src.*` imports below resolve when
# the notebook is run from a sub-directory of the project.
cwd_parent = os.path.join(os.getcwd(), "..")
project_root = os.path.abspath(cwd_parent)
if project_root not in sys.path:
    sys.path.append(project_root)

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, top_k_accuracy_score
from tqdm.notebook import tqdm
from src.model.KeywordBaseline import KeywordBaseline
from src.utils import label_to_emoji, labels_to_emojis
from src.dataset import load_preprocessed_dataset
import numpy as np

# Keyword baseline models

In [None]:
# Load the preprocessed data; only the first element of the returned pair is
# used in this notebook section.
train, _ = load_preprocessed_dataset()

# Inputs are the "TEXT" column cast to a NumPy unicode dtype; targets are the
# "Label" column as-is.
X = train["TEXT"].values.astype("U")
y = train["Label"].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the baseline from the training split and extract the top-15 keywords.
baseline = KeywordBaseline(X_train=X_train, y_train=y_train)
baseline.extract_keywords(top_n=15)

print("Keyword Mapping (Top-15):")
for label, keywords in baseline.keyword_mapping.items():
    print(f"Label {label} {label_to_emoji(label)}: {keywords}")

# Score every held-out text with both strategies in a single pass. Each stored
# result is indexed below with [0] to obtain the predicted label.
predict_weights, predict_cosine = [], []
for text in tqdm(X_test, desc="Predicting"):
    pred_w, pred_c = baseline.predict_weights(text), baseline.predict_cosine(text)
    predict_weights.append(pred_w)
    predict_cosine.append(pred_c)

# Per-class precision/recall/F1 for the first element of each prediction.
top1_weights = [result[0] for result in predict_weights]
print("\nClassification Report (Weights):")
print(classification_report(y_test, top1_weights, digits=4))

top1_cosine = [result[0] for result in predict_cosine]
print("\nClassification Report (Cosine):")
print(classification_report(y_test, top1_cosine, digits=4))


In [None]:
# Number of classes. The score dictionaries are used as column indices below,
# so labels are assumed to be the integers 0..N_CLASSES-1 -- TODO confirm
# against the dataset's label encoding.
N_CLASSES = 20
ALL_LABELS = np.arange(N_CLASSES)


def scores_to_matrix(predictions, n_classes=N_CLASSES):
    """Convert per-example score dicts into a dense score matrix.

    Parameters
    ----------
    predictions : sequence of (predicted_label, scores) pairs
        ``scores`` maps an integer label to its score for that example.
    n_classes : int
        Number of columns of the resulting matrix.

    Returns
    -------
    numpy.ndarray of shape (len(predictions), n_classes)
        Row ``i`` holds the scores of example ``i``; labels absent from the
        example's dict keep a score of 0.
    """
    matrix = np.zeros((len(predictions), n_classes))
    for row, (_, scores) in enumerate(predictions):
        for label, score in scores.items():
            matrix[row, label] = score
    return matrix


y_score_weights = scores_to_matrix(predict_weights)
y_score_cosine = scores_to_matrix(predict_cosine)

for k in [1, 3, 5]:
    # Pass `labels` explicitly: otherwise the classes are inferred from y_test,
    # and the call fails whenever a class is missing from the held-out split
    # (inferred class count != number of score columns).
    acc_w = top_k_accuracy_score(y_test, y_score_weights, k=k, labels=ALL_LABELS)
    acc_c = top_k_accuracy_score(y_test, y_score_cosine, k=k, labels=ALL_LABELS)
    print(f"Top-{k} Accuracy (Weights): {acc_w:.4f}")
    print(f"Top-{k} Accuracy (Cosine): {acc_c:.4f}")

In [None]:
# Save per-example results (text, true label, top-5 predicted labels) to
# tab-separated files under <project_root>/notebooks/results.
results_dir = os.path.join(project_root, "notebooks", "results")
os.makedirs(results_dir, exist_ok=True)
weights_file = os.path.join(results_dir, "keyword_baseline_weights.txt")
cosine_file = os.path.join(results_dir, "keyword_baseline_cosine.txt")


def write_top5_results(path, texts, true_labels, predictions, desc):
    """Write one tab-separated row per example to ``path``.

    Parameters
    ----------
    path : str
        Destination file; it is overwritten if it already exists.
    texts, true_labels : sequences
        Evaluated texts and their true labels, aligned with ``predictions``.
    predictions : sequence of (predicted_label, scores) pairs
        ``scores`` maps a label to its score; the five highest-scoring labels
        are written for each example.
    desc : str
        Caption of the progress bar.
    """
    # Explicit UTF-8: rows contain free text and the output of the emoji
    # helpers (presumably emoji characters), which a non-UTF-8 platform
    # default encoding may be unable to encode.
    with open(path, "w", encoding="utf-8") as f:
        f.write("Text\tTrueLabel\tPredictedLabels(Top-5)\n")
        rows = tqdm(zip(texts, true_labels, predictions), desc=desc, total=len(texts))
        for text, true_label, (_, scores) in rows:
            # Labels ordered by descending score, truncated to the best five.
            top5 = sorted(scores, key=scores.get, reverse=True)[:5]
            f.write(f"{text}\t{true_label} {label_to_emoji(true_label)}\t{labels_to_emojis(top5)}\n")


write_top5_results(weights_file, X_test, y_test, predict_weights, "Writing Weights Results")
write_top5_results(cosine_file, X_test, y_test, predict_cosine, "Writing Cosine Results")