In [2]:
# Task 2: Load and inspect the labeled dataset in CoNLL format
import pandas as pd

# Path to the labeled dataset in CoNLL format.
# This is a placeholder path for the example; point it at the real file.
conll_file = "labeled_data.conll"

# Parse the CoNLL data: one token per line with whitespace-separated columns,
# and a blank line between sentences. Each parsed sentence is stored as a
# (tokens, labels) tuple of two parallel lists.
sentences = []
current_tokens = []
current_labels = []

# 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading byte-order mark,
# which would otherwise end up attached to the first token of the file.
with open(conll_file, 'r', encoding='utf-8-sig') as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:  # empty line indicates end of a sentence
            if current_tokens:
                sentences.append((current_tokens, current_labels))
            current_tokens, current_labels = [], []
            continue

        # The token is the first column and the label is the last one, so
        # two-column files parse exactly as before and files carrying extra
        # middle columns no longer fail on tuple unpacking.
        fields = line.split()
        if len(fields) < 2:
            raise ValueError(
                f"{conll_file}:{line_no}: expected 'TOKEN LABEL', got {line!r}"
            )
        current_tokens.append(fields[0])
        current_labels.append(fields[-1])
    # Add the last sentence if file doesn't end with a blank line
    if current_tokens:
        sentences.append((current_tokens, current_labels))

print(f"Total labeled sentences: {len(sentences)}")
if sentences:
    print("Example labeled sentence:")
    example_tokens, example_labels = sentences[0]
    for token, label in zip(example_tokens, example_labels):
        print(f"{token}\t{label}")
else:
    # Nothing to show; avoid an IndexError on sentences[0].
    print(f"No labeled sentences found in {conll_file!r}")


Total labeled sentences: 1
Example labeled sentence:
ዶቭ	B-PRODUCT
ቦዲ	I-PRODUCT
ሎሽን	I-PRODUCT
ዋጋ፦	O
200	B-PRICE
ብር	I-PRICE
አድራሻ፦	O
አዲስ	B-LOCATION
አበባ	I-LOCATION


In [4]:
# Task 3: Prepare data for XLM-R model fine-tuning
!pip install transformers -q  # Install Hugging Face Transformers if not already installed
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

# Define the model checkpoint and load its tokenizer
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define unique labels in the dataset and the two lookup tables between
# label strings and integer ids (ids follow the order of `labels`)
labels = ["O", "B-PRODUCT", "I-PRODUCT", "B-PRICE", "I-PRICE", "B-LOCATION", "I-LOCATION"]
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Split sentences into train and validation sets (80/20 split, in file order).
# int(0.8 * n) is 0 for n < 2, which would leave the training split empty and
# make the following cells fail with an unrelated-looking IndexError, so stop
# here with an explicit message instead.
if len(sentences) < 2:
    raise ValueError(
        f"Need at least 2 labeled sentences for a train/validation split, got {len(sentences)}"
    )
split_idx = int(0.8 * len(sentences))
train_sentences = sentences[:split_idx]
val_sentences = sentences[split_idx:]

# Function to tokenize and align labels for a list of sentences
def encode_tags(token_list, label_list, tokenizer, label_map=None, max_length=128):
    """Tokenize pre-split sentences and align word-level labels to sub-word pieces.

    Args:
        token_list: List of sentences, each a list of word strings.
        label_list: List of label-string lists, parallel to ``token_list``
            (one label per word).
        tokenizer: Callable tokenizer whose result supports
            ``word_ids(batch_index=...)`` and item access by key.
        label_map: Mapping from label string to integer id. Defaults to the
            module-level ``label2id``.
        max_length: Maximum encoded sequence length; longer inputs are truncated.

    Returns:
        The tokenizer's encodings with an added ``"labels"`` entry: one list of
        ids per sentence, where the first piece of every word carries that
        word's label id and every other position (special/padding pieces and
        continuation pieces of a word) carries -100.

    Raises:
        ValueError: If no sentences are given, or if tokens and labels do not
            line up (different number of sentences, or a sentence whose word
            count differs from its label count).
        KeyError: If a label is missing from ``label_map``.
    """
    if label_map is None:
        label_map = label2id

    # Validate up front: a label list that is longer than its sentence would
    # otherwise be accepted silently, and a shorter one would fail mid-loop.
    if len(token_list) != len(label_list):
        raise ValueError(
            f"Got {len(token_list)} sentences but {len(label_list)} label sequences"
        )
    if not token_list:
        raise ValueError("encode_tags() needs at least one sentence")
    for sent_idx, (words, tags) in enumerate(zip(token_list, label_list)):
        if len(words) != len(tags):
            raise ValueError(
                f"Sentence {sent_idx}: {len(words)} tokens but {len(tags)} labels"
            )

    # Offsets are not requested: alignment only needs word_ids().
    encodings = tokenizer(token_list, is_split_into_words=True,
                          padding=True, truncation=True, max_length=max_length)

    aligned_labels = []
    for i in range(len(encodings["input_ids"])):
        # word_ids maps each piece to the index of the word it came from
        # (None for pieces that belong to no word, e.g. special tokens)
        word_ids = encodings.word_ids(batch_index=i)
        labels_i = label_list[i]
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None or word_id == previous_word_id:
                # No source word, or a continuation piece of the previous word
                label_ids.append(-100)
            else:
                # First piece of a new word carries the word's label
                label_ids.append(label_map[labels_i[word_id]])
            previous_word_id = word_id
        aligned_labels.append(label_ids)

    encodings["labels"] = aligned_labels
    return encodings

# Unzip each (tokens, labels) pair into parallel word and tag lists, per split
train_tokens = [words for words, _ in train_sentences]
train_tags = [tags for _, tags in train_sentences]
val_tokens = [words for words, _ in val_sentences]
val_tags = [tags for _, tags in val_sentences]

# Tokenize both splits and attach the aligned label ids
train_encodings = encode_tags(train_tokens, train_tags, tokenizer)
val_encodings = encode_tags(val_tokens, val_tags, tokenizer)

# Create torch Dataset objects
class NERDataset(torch.utils.data.Dataset):
    """Torch dataset view over a mapping of field name -> per-sample sequences."""

    def __init__(self, encodings):
        # Kept as given; every field is indexed by sample position on access.
        self.encodings = encodings

    def __len__(self):
        # The number of samples is the number of encoded input-id sequences.
        input_ids = self.encodings["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Build one sample: each field's idx-th entry converted to a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

# Wrap the encoded splits as torch datasets
train_dataset = NERDataset(train_encodings)
val_dataset = NERDataset(val_encodings)

print(f"Prepared train samples: {len(train_dataset)}, validation samples: {len(val_dataset)}")
# Print an example to verify tokenization and alignment
example_idx = 0
if len(train_dataset) > example_idx:
    tokens_example = train_tokens[example_idx]
    labels_example = train_tags[example_idx]
    example_label_ids = train_encodings['labels'][example_idx]
    print("Original tokens:", tokens_example)
    print("Original labels:", labels_example)
    # Show the pieces of the very encoding the labels were aligned to, so this
    # list has one entry per entry of the two label lists printed below.
    # Re-tokenizing " ".join(tokens) is a separate tokenization of a different
    # input and does not line up with them.
    print("Tokenized pieces:", tokenizer.convert_ids_to_tokens(train_encodings['input_ids'][example_idx]))
    print("Aligned label IDs:", example_label_ids)
    print("Aligned labels:", [id2label[label_id] if label_id != -100 else "-" for label_id in example_label_ids])
else:
    # Nothing to preview; avoid an IndexError on train_tokens[example_idx].
    print("No training samples available to display.")


ERROR: Invalid requirement: '#'


ModuleNotFoundError: Could not import module 'Trainer'. Are this object's requirements defined correctly?