# In [None]:
# 05_bert_bio_pol_training

# Imports
import inspect
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))
print("Project root:", PROJECT_ROOT)

import numpy as np
import pandas as pd
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)

from sklearn.metrics import accuracy_score
from seqeval.metrics import classification_report, f1_score

from src.config import PROCESSED_DIR, ASPECT_MODEL_DIR


# Load BIO-POL datasets
# The three splits are read from PROCESSED_DIR in train / val / test order.
train_df, val_df, test_df = (
    pd.read_parquet(PROCESSED_DIR / split_file)
    for split_file in (
        "bio_pol_train.parquet",
        "bio_pol_val.parquet",
        "bio_pol_test.parquet",
    )
)

# Quick sanity check: number of rows per split.
print(*map(len, (train_df, val_df, test_df)))


# Define the tag set
# "O" plus a B-/I- pair for each of POS, NEG and NEU (presumably positive /
# negative / neutral polarity). A tag's position in the list is its class id.
label_list = [
    "O",
    "B-POS",
    "I-POS",
    "B-NEG",
    "I-NEG",
    "B-NEU",
    "I-NEU",
]

# Forward (id -> tag) and inverse (tag -> id) lookups, both derived from the
# list order so they can never disagree.
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in enumerate(label_list)}


# Load tokenizer & model
# Both are initialised from the same pretrained checkpoint name.
MODEL_NAME = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Token-classification model with one output class per tag; the id <-> tag
# maps are handed over so they travel with the model.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=len(label_list), label2id=label2id, id2label=id2label
)

# Build a torch Dataset wrapper for the Hugging Face Trainer
class ABSA_Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a dataframe of already-encoded sequences.

    The dataframe must provide an ``input_ids`` and a ``labels`` column, each
    cell holding one sequence of integers. If an ``attention_mask`` column is
    also present it is passed through with every item; otherwise items carry
    only ``input_ids`` and ``labels``, exactly as before.

    Args:
        df: pandas DataFrame with the columns described above.

    Raises:
        KeyError: if ``input_ids`` or ``labels`` is missing from ``df``.
    """

    def __init__(self, df):
        self.input_ids = df["input_ids"].tolist()
        self.labels = df["labels"].tolist()
        # Optional: keep a stored mask instead of silently dropping it. Without
        # it, rows that were pre-padded to a common length would be treated as
        # all real tokens once batched.
        self.attention_mask = (
            df["attention_mask"].tolist() if "attention_mask" in df.columns else None
        )

    def __len__(self):
        """Return the number of sequences."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of ``torch.long`` tensors."""
        item = {
            "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }
        if self.attention_mask is not None:
            item["attention_mask"] = torch.tensor(
                self.attention_mask[idx], dtype=torch.long
            )
        return item

# Wrap every split dataframe in the dataset class (train / val / test order)
train_dataset, val_dataset, test_dataset = map(
    ABSA_Dataset, (train_df, val_df, test_df)
)


# Data collator
# Batches individual examples for token classification using the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Metric: F1 score
def compute_metrics(p):
    """Compute the seqeval F1 score for one evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores and is arg-maxed over axis 2; ``labels`` holds the gold
            class ids, with ``-100`` marking positions to skip.

    Returns:
        A dict with the single key ``"f1"``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept_preds = []
        kept_golds = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            # -100 marks a position that must not be scored.
            if gold_id == -100:
                continue
            kept_preds.append(id2label[pred_id])
            kept_golds.append(id2label[gold_id])
        true_predictions.append(kept_preds)
        true_labels.append(kept_golds)

    return {"f1": f1_score(true_labels, true_predictions)}


# Training arguments
# The keyword selecting the evaluation schedule was renamed from
# `evaluation_strategy` to `eval_strategy` in newer transformers releases.
# Use whichever name the installed version accepts so this cell runs on both.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=str(ASPECT_MODEL_DIR),
    # Evaluate and checkpoint once per epoch; the two schedules must match for
    # load_best_model_at_end to work.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    # Restore the checkpoint with the best validation "f1" (the key returned
    # by compute_metrics) once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none",
    **{_eval_strategy_key: "epoch"},
)


# Trainer
# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Pick the name the installed version
# exposes so the cell works on either side of the rename.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # scored every epoch via compute_metrics
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

# Model training
trainer.train()

# Evaluation
# Score the trained model on the test split.
test_results = trainer.predict(test_dataset)

labels = test_results.label_ids
preds = np.argmax(test_results.predictions, axis=2)

# For every sequence keep only the positions whose gold id is not -100, then
# translate the remaining class ids into tag strings.
true_preds = [
    [id2label[tag_id] for tag_id in pred_row[gold_row != -100]]
    for pred_row, gold_row in zip(preds, labels)
]

true_labels = [
    [id2label[tag_id] for tag_id in gold_row[gold_row != -100]]
    for _, gold_row in zip(preds, labels)
]

# Per-class precision / recall / F1 table from seqeval.
print(classification_report(true_labels, true_preds))


# Save model
# Write the trainer's current model and the tokenizer into ASPECT_MODEL_DIR,
# the same directory that was passed to TrainingArguments as output_dir.
# NOTE(review): with load_best_model_at_end=True this should be the best
# checkpoint by validation f1 rather than the last epoch — confirm.
trainer.save_model(ASPECT_MODEL_DIR)
tokenizer.save_pretrained(ASPECT_MODEL_DIR)

print("Model saved to:", ASPECT_MODEL_DIR)



    
    