<a href="https://colab.research.google.com/github/beza328/EthioMart-Centralizing-Telegram-E-Commerce-/blob/Task3/Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# Load the labeled CoNLL dataset
file_path = "/content/labeled_data.conll"  # Use your actual file path

# Function to parse the CoNLL format
def load_conll_data(file_path):
    sentences = []
    labels = []
    sentence = []
    label = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == "":  # If empty line, it signals the end of a sentence
                sentences.append(sentence)
                labels.append(label)
                sentence = []
                label = []
            else:
                token, tag = line.split()  # Assuming each line has format "word tag"
                sentence.append(token)
                label.append(tag)

    return sentences, labels

# Load and parse the dataset
sentences, labels = load_conll_data(file_path)

In [23]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_sentences, test_sentences, train_labels, test_labels = train_test_split(sentences, labels, test_size=0.2, random_state=42)

print(f"Training set size: {len(train_sentences)}")
print(f"Test set size: {len(test_sentences)}")

Training set size: 159
Test set size: 40


In [24]:
from transformers import AutoTokenizer

# Load the tokenizer for XLM-Roberta (or another model)
model_name = "xlm-roberta-base"  # Replace with "bert-tiny-amharic" or another model if desired
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [25]:
# Define a mapping from label names to numerical IDs
label_to_id = {
    "B-Product": 0,
    "I-Product": 1,
    "B-LOC": 2,
    "I-LOC": 3,
    "B-PRICE": 4,
    "I-PRICE": 5,
    "O": 6  # O is for tokens that are outside any named entity
}

# Convert labels from strings to numerical IDs
def convert_labels_to_ids(labels):
    labels_ids = []
    for label in labels:
        labels_ids.append([label_to_id[l] for l in label])
    return labels_ids

# Apply the conversion to the training and testing labels
train_labels_ids = convert_labels_to_ids(train_labels)
test_labels_ids = convert_labels_to_ids(test_labels)

In [34]:
def tokenize_and_align_labels(sentences, labels):
    tokenized_inputs = tokenizer(
        sentences,
        truncation=True,
        is_split_into_words=True,
        padding=True,
        max_length=128 # Add max_length to ensure consistent input shapes
    )

    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore special tokens
            elif word_idx != previous_word_idx:
                # Check if word_idx is within the bounds of the labels
                # Handles cases where the tokenizer creates more subwords than labels
                label_ids.append(label[word_idx] if word_idx < len(label) else -100)
            else:
                label_ids.append(-100)  # Ignore subword tokens
            previous_word_idx = word_idx
        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [35]:

# Use the updated function to tokenize and align the labels
train_tokenized = tokenize_and_align_labels(train_sentences, train_labels_ids)
test_tokenized = tokenize_and_align_labels(test_sentences, test_labels_ids)

In [36]:
import torch
from torch.utils.data import Dataset

class NERDataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data["input_ids"]
        self.attention_mask = tokenized_data["attention_mask"]
        self.labels = tokenized_data["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.attention_mask[idx], dtype=torch.long),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)  # Ensure labels are integers
        }

# Convert the tokenized data into datasets
train_dataset = NERDataset(train_tokenized)
test_dataset = NERDataset(test_tokenized)

In [42]:
!pip install transformers datasets
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer # Changed import
from transformers import AutoTokenizer

# ... (rest of your existing code) ...

# Load the XLM-Roberta model for token classification # Changed model type
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label_to_id)
)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)




Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.960697
2,No log,0.688765
3,No log,0.586506


TrainOutput(global_step=30, training_loss=1.0331026713053386, metrics={'train_runtime': 46.482, 'train_samples_per_second': 10.262, 'train_steps_per_second': 0.645, 'total_flos': 31161046802688.0, 'train_loss': 1.0331026713053386, 'epoch': 3.0})

In [45]:
trainer.evaluate()

{'eval_loss': 0.5865055322647095,
 'eval_runtime': 0.3484,
 'eval_samples_per_second': 114.807,
 'eval_steps_per_second': 8.61,
 'epoch': 3.0}

In [47]:
model.save_pretrained("./fine_tuned_ner_model")
tokenizer.save_pretrained(".fine_tuned_ner_model")

('.fine_tuned_ner_model/tokenizer_config.json',
 '.fine_tuned_ner_model/special_tokens_map.json',
 '.fine_tuned_ner_model/sentencepiece.bpe.model',
 '.fine_tuned_ner_model/added_tokens.json',
 '.fine_tuned_ner_model/tokenizer.json')