In [1]:
import os, sys

# Put the parent of the current working directory on the import path so the
# `src.*` imports below resolve; skip it when the entry is already present so
# re-running this cell does not add duplicates.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from tqdm.notebook import tqdm
from src.model.KeywordBaseline import KeywordBaseline
from src.utils import label_to_emoji
from src.dataset import load_preprocessed_dataset

# Baseline models

In [None]:
# Experiment settings. Hoisted into named constants so that the printed header
# below can never drift from the value actually passed to extract_keywords().
TOP_N_KEYWORDS = 15
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Only the first value returned by the loader is used in this cell.
train, _ = load_preprocessed_dataset()

# "U" casts the text column to a NumPy unicode-string array.
X = train["TEXT"].values.astype("U")
y = train["Label"].values

# Fixed random_state keeps the hold-out split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

baseline = KeywordBaseline(X_train=X_train, y_train=y_train)
baseline.extract_keywords(top_n=TOP_N_KEYWORDS)
print(f"Keyword Mapping (Top-{TOP_N_KEYWORDS}):")
for label, keywords in baseline.keyword_mapping.items():
    print(f"Label {label} {label_to_emoji(label)}: {keywords}")

predict_weights = []
predict_cosine = []

# Run both prediction strategies on every held-out text in a single pass so
# one progress bar covers the whole evaluation.
for text in tqdm(X_test, desc="Predicting"):
    pred_w = baseline.predict_weights(text)
    pred_c = baseline.predict_cosine(text)
    predict_weights.append(pred_w)
    predict_cosine.append(pred_c)

# Each stored prediction is indexable and its first element is what gets
# scored against y_test -- presumably the predicted label; confirm against
# KeywordBaseline.
for strategy, predictions in (("Weights", predict_weights), ("Cosine", predict_cosine)):
    print(f"\nClassification Report ({strategy}):")
    print(classification_report(y_test, [r[0] for r in predictions], digits=4))