# In [None]:
# 03_bio_tagging_and_bert_preparation

import sys
from pathlib import Path

# The project root is taken to be the parent of the current working directory;
# it is put on sys.path so the `src` package can be imported below.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    # Guarded so that re-running this cell does not stack duplicate entries.
    sys.path.append(str(PROJECT_ROOT))
print("Project root:", PROJECT_ROOT)

import pandas as pd
from tqdm.notebook import tqdm
from transformers import BertTokenizerFast

from src.config import PROCESSED_DIR


# 2. Load data from notebook 02
input_path = PROCESSED_DIR.joinpath("semeval_with_dep_aspects.parquet")
df = pd.read_parquet(input_path)

# Report what was read, then preview the first rows
print("Loaded:", input_path)
print("Rows:", df.shape[0])
df.head()

# Load the tokenizer; the fast implementation is required because the
# tagging step below asks for character offset mappings.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Define BIO-POL label map: "O" plus a B-/I- pair for each polarity
label_list = [
    "O",
    "B-POS", "I-POS",
    "B-NEG", "I-NEG",
    "B-NEU", "I-NEU",
]

# Position in label_list is the integer id; build id -> label first,
# then invert it to get label -> id.
id_to_label = dict(enumerate(label_list))
label_to_id = {tag: idx for idx, tag in id_to_label.items()}

label_to_id

# BIO-POL creation
def create_bio_pol(sentence, dep_aspects, tokenizer):
    """
    Create BIO-POL labels aligned to BERT tokens.

    The sentence is tokenized with character offsets. For each aspect, the
    first case-insensitive occurrence of its term is located in the sentence
    and every token lying fully inside that character span is tagged
    ``B-<POL>`` (first token) or ``I-<POL>`` (following tokens). When spans
    overlap, later aspects overwrite earlier ones.

    Args:
        sentence: Text to tokenize and tag.
        dep_aspects: Iterable of aspect mappings with a ``"term"`` entry and
            an optional ``"polarity"`` entry (``"positive"``, ``"negative"``
            or ``"neutral"``; anything else is treated as neutral). ``None``
            means "no aspects"; entries without a usable term are skipped.
        tokenizer: Callable tokenizer accepting ``return_offsets_mapping``,
            ``truncation`` and ``max_length`` and returning a mapping with
            ``"input_ids"`` and ``"offset_mapping"``.

    Returns:
        ``(input_ids, labels)``, two sequences of equal length. ``labels``
        uses ids from the module-level ``label_to_id``: ``-100`` for tokens
        with an empty offset span, the ``"O"`` id for all other tokens
        outside an aspect, and the ``B-``/``I-`` ids for aspect tokens.
    """
    # Function-scope import keeps this block self-contained.
    import re

    encoding = tokenizer(
        sentence,
        return_offsets_mapping=True,
        truncation=True,
        max_length=128,
    )
    offsets = encoding["offset_mapping"]

    # Only tokens without any source text (empty offset span, which is how
    # fast tokenizers report special tokens such as [CLS]/[SEP]) are masked
    # with -100. Every real token defaults to "O"; otherwise the "O" class
    # would never appear in the training labels.
    outside_id = label_to_id["O"]
    labels = [outside_id if e > s else -100 for s, e in offsets]

    polarity_map = {
        "positive": "POS",
        "negative": "NEG",
        "neutral": "NEU",
    }

    # `is not None` rather than truthiness: after a parquet round trip the
    # aspects may arrive as an array, whose truth value is ambiguous.
    aspects = dep_aspects if dep_aspects is not None else ()

    for asp in aspects:
        term = asp.get("term") if asp is not None else None
        if not term:
            # Missing, None or empty term: nothing to locate.
            continue

        polarity = polarity_map.get(asp.get("polarity"), "NEU")

        # Search the ORIGINAL string case-insensitively. Lower-casing the
        # sentence first can change its length for some Unicode characters,
        # which would shift the span relative to the tokenizer's offsets.
        match = re.search(re.escape(term), sentence, flags=re.IGNORECASE)
        if match is None:
            continue
        start, end = match.span()

        first = True
        for i, (s, e) in enumerate(offsets):
            # Token must be non-empty and lie completely inside the term.
            if s >= start and e <= end and e > s:
                prefix = "B" if first else "I"
                labels[i] = label_to_id[f"{prefix}-{polarity}"]
                first = False

    return encoding["input_ids"], labels

# Apply tagging to dataset: one output record per input row
records = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    sentence = row["sentence_clean"]
    input_ids, labels = create_bio_pol(sentence, row["dep_aspects"], tokenizer)

    # Keep the row id and cleaned sentence next to the token ids and labels
    records.append(
        dict(id=row["id"], sentence=sentence, input_ids=input_ids, labels=labels)
    )

bio_df = pd.DataFrame(records)
bio_df.head()


# train-test-validation split
from sklearn.model_selection import train_test_split

# Hold out 10% of all rows for testing, then 11% of the remainder for validation.
train_df, test_df = train_test_split(bio_df, test_size=0.10, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.11, random_state=42)

# Report the size of each split
for caption, frame in (("Train:", train_df), ("Val:", val_df), ("Test:", test_df)):
    print(caption, len(frame))


# Save each split as its own parquet file, without the DataFrame index
for frame, filename in (
    (train_df, "bio_pol_train.parquet"),
    (val_df, "bio_pol_val.parquet"),
    (test_df, "bio_pol_test.parquet"),
):
    frame.to_parquet(PROCESSED_DIR / filename, index=False)

print("BIO-POL datasets saved.")
