Mount the Drive

In [None]:
# Mount Google Drive at /content/drive so that files stored in Drive can be accessed by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Install Necessary Libraries

In [None]:
pip install transformers datasets seqeval scikit-learn torch


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none

Data Loading

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the CoNLL file
def load_conll_data(filepath):
    """Read a CoNLL-style file into a list of sentences.

    Each non-blank line holds whitespace-separated columns; the first column is
    taken as the token and the last column as its tag, so both two-column files
    and wider CoNLL layouts are accepted. Blank (whitespace-only) lines separate
    sentences; consecutive blank lines do not produce empty sentences.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences, each a list of ``(token, label)`` tuples.

    Raises:
        ValueError: If a non-blank line has fewer than two columns.
    """
    sentences = []
    sentence = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:  # Blank line -> sentence boundary
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if len(fields) < 2:
                # Same exception type the tuple-unpack used to raise, but now it
                # tells the user where the malformed row is.
                raise ValueError(
                    f"{filepath}:{line_number}: expected 'token label', got {line.strip()!r}"
                )
            sentence.append((fields[0], fields[-1]))
    # The file may not end with a blank line; flush the sentence in progress.
    if sentence:
        sentences.append(sentence)
    return sentences

# Example usage:
# The path is relative to the current working directory.
# NOTE(review): presumably relies on the kernel running from /content, where Drive is mounted — confirm.
filepath = 'drive/MyDrive/Colab Notebooks/labeled_data.conll'
sentences = load_conll_data(filepath)

# Split into training and testing data
# 80/20 split over whole sentences; random_state pins the shuffle so the split is repeatable.
train_data, test_data = train_test_split(sentences, test_size=0.2, random_state=42)


In [None]:
# Sanity check: dumps every parsed sentence as a list of (token, label) tuples.
print(sentences)

[[('ለመላው', 'O'), ('የክርስትና', 'O'), ('እምነት', 'O'), ('ተከታዮች', 'O'), ('በሙሉ', 'O'), ('እንኳን', 'O'), ('ለብርሃነ', 'O'), ('መስቀሉ', 'O'), ('በሰላም', 'O'), ('አደረሳችሁ', 'O'), ('አደረሰን', 'O'), ('በዓሉ', 'O'), ('የሰላም', 'O'), ('የጤና', 'O'), ('የፍቅር', 'O'), ('የደስታ', 'O'), ('እና', 'O'), ('የመተሳሰብ', 'O'), ('በዓል', 'O'), ('ይሁንልን', 'O'), ('መልካም', 'O'), ('የመስቀል', 'O'), ('በዓል', 'O')], [('የሞተ', 'B-Product'), ('ቆዳን', 'I-Product'), ('እንዲሁም', 'O'), ('ቆሻሻን', 'I-Product'), ('ለማፅዳት', 'I-Product'), ('ተመራጭ', 'O'), ('ዋጋ', 'B-PRICE'), ('200', 'I-PRICE'), ('ብር', 'B-PRICE'), ('ውስን', 'O'), ('ፍሬ', 'O'), ('ነው', 'O'), ('ያለው', 'O'), ('አድራሻ', 'B-LOC'), ('ቁ', 'I-LOC'), ('1', 'I-LOC'), ('መገናኛ', 'I-LOC'), ('መሰረት', 'I-LOC'), ('ደፋር', 'I-LOC'), ('ሞል', 'I-LOC'), ('ሁለተኛ', 'I-LOC'), ('ፎቅ', 'I-LOC'), ('ቢሮ', 'I-LOC'), ('ቁ', 'I-LOC'), ('05', 'I-LOC'), ('06', 'I-LOC'), ('ቁ', 'I-LOC'), ('2', 'I-LOC'), ('ፒያሳ', 'I-LOC'), ('ጊዮርጊስ', 'I-LOC'), ('አደባባይ', 'I-LOC'), ('ራመት_ታቦር_ኦዳ_ህንፃ', 'I-LOC'), ('1ኛ', 'I-LOC'), ('ፎቅ', 'I-LOC'), ('ሱቅ', 'I-LOC'), ('ቁ', 'I-LOC'), 

In [None]:
# Sanity check: dumps the full training portion of the split.
print(train_data)

[[('በረፍት', 'O'), ('ቀንዎ', 'O'), ('ሱቅ', 'B-LOC'), ('ላይ', 'O'), ('መስተናገድ', 'O'), ('ለምትፈልጉ', 'O'), ('ውድ', 'O'), ('ደንበኞቻችን', 'O'), ('ልዮ', 'B-Product'), ('የዋዜማ', 'O'), ('ገበያ', 'O'), ('ከቅናሽ', 'O'), ('ጋር', 'O'), ('አዘጋጅተን', 'O'), ('እንጠብቅዎታለን', 'O'), ('ነገ', 'O'), ('ከጠዋቱ', 'O'), ('4', 'O'), ('30', 'O'), ('_', 'O'), ('ቀኑ', 'O'), ('11', 'O'), ('00', 'O'), ('ድረስ', 'O'), ('ሱቃችን', 'O'), ('ክፍት', 'O'), ('ሁኖ', 'O'), ('ይጠብቅዎታል', 'O'), ('ሱቅ', 'I-LOC'), ('መተው', 'O'), ('መግዛት', 'O'), ('ላልቻላችሁ', 'O'), ('በሞተረኞች', 'O'), ('ያሉበት', 'O'), ('እናደርሳለን', 'O'), ('ዘወትር', 'O'), ('ሰኞ', 'O'), ('_ቅዳሜ', 'O'), ('ከጠዋቱ', 'O'), ('2', 'I-LOC'), ('30', 'O'), ('እስከ', 'O'), ('ምሽቱ', 'O'), ('2', 'I-LOC'), ('00', 'O'), ('ድረስ', 'O'), ('ክፍት', 'O'), ('መሆኑን', 'O'), ('እንገልፃለን', 'O'), ('አድራሻ', 'I-LOC'), ('ቁ', 'I-LOC'), ('1', 'I-LOC'), ('መገናኛ', 'I-LOC'), ('መሰረት', 'I-LOC'), ('ደፋር', 'I-LOC'), ('ሞል', 'I-LOC'), ('ሁለተኛ', 'I-LOC'), ('ፎቅ', 'I-LOC'), ('ቢሮ', 'I-LOC'), ('ቁ', 'I-LOC'), ('05', 'I-LOC'), ('06', 'I-LOC'), ('ቁ', 'I-LOC'), ('2', 'I-LOC'), ('ፒያ

In [None]:
# Sanity check: dumps the full held-out portion of the split.
print(test_data)

[[('ለማንኛውም', 'O'), ('ዓይነት', 'O'), ('የውሃ', 'O'), ('ጀሪካን', 'O'), ('የሚሆን', 'O'), ('ቱቦው', 'O'), ('1', 'B-LOC'), ('2', 'I-LOC'), ('የሚረዝም', 'O'), ('መጠጫ', 'O'), ('ብርጭቆ', 'O'), ('በራሱ', 'O'), ('ላይ', 'O'), ('ማስቀመጥ', 'O'), ('ይቻላል', 'O'), ('-', 'O'), ('4', 'O'), ('መጥነው', 'O'), ('መቅዳት', 'O'), ('የሚችሉበት', 'O'), ('100', 'O'), ('200', 'O'), ('500', 'O'), ('700', 'O'), ('1000', 'O'), ('ዋጋ', 'B-PRICE'), ('1200', 'I-PRICE'), ('ብር', 'B-PRICE'), ('አድራሻ', 'I-LOC'), ('ቁ', 'I-LOC'), ('1', 'I-LOC'), ('መገናኛ', 'I-LOC'), ('መሰረት', 'I-LOC'), ('ደፋር', 'I-LOC'), ('ሞል', 'I-LOC'), ('ሁለተኛ', 'I-LOC'), ('ፎቅ', 'I-LOC'), ('ቢሮ', 'I-LOC'), ('ቁ', 'I-LOC'), ('05', 'I-LOC'), ('06', 'I-LOC'), ('ቁ', 'I-LOC'), ('2', 'I-LOC'), ('ፒያሳ', 'I-LOC'), ('ጊዮርጊስ', 'I-LOC'), ('አደባባይ', 'I-LOC'), ('ራመት_ታቦር_ኦዳ_ህንፃ', 'I-LOC'), ('1ኛ', 'I-LOC'), ('ፎቅ', 'I-LOC'), ('ሱቅ', 'I-LOC'), ('ቁ', 'I-LOC'), ('1', 'I-LOC'), ('-107', 'O'), ('0902660722', 'O'), ('0928460606', 'O'), ('ፒያሳ', 'I-LOC'), ('ቅርንጫፍ', 'O'), ('0941337070', 'O'), ('በ', 'O'), ('ለማዘዝ', 'O'), ('ይጠ

In [None]:
# Short key -> checkpoint identifier passed to from_pretrained() for each candidate model.
# The short keys are reused to index the tokenizer, tokenized dataset and model dicts below.
model_names = {
    "xlm-roberta": "xlm-roberta-base",
    "distilbert": "distilbert-base-multilingual-cased",
    "mbert": "bert-base-multilingual-cased"
}


Preprocessing

In [None]:
from datasets import Dataset, load_dataset,DatasetDict

# Tag inventory; a tag's position in this list is its integer class id.
label_list = ['O', 'B-Product', 'I-Product', 'B-PRICE', 'I-PRICE', 'B-LOC', 'I-LOC']
# id -> tag, then the inverse tag -> id lookup derived from it.
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

# Function to map labels to integers
def encode_labels(labels):
    """Translate tag strings into their integer ids using ``label2id``.

    Returns a new list with one id per input tag, in the same order.
    A tag that is missing from ``label2id`` raises ``KeyError``.
    """
    return list(map(label2id.__getitem__, labels))
def convert_to_dataset(data):
    """Turn sentences of (token, label) pairs into a ``Dataset``.

    ``data`` is iterated once; for every sentence the first element of each
    pair goes into the ``tokens`` column and the second element, converted by
    ``encode_labels``, goes into the ``labels`` column.
    """
    columns = {"tokens": [], "labels": []}
    for sentence in data:
        columns["tokens"].append([pair[0] for pair in sentence])
        columns["labels"].append(encode_labels([pair[1] for pair in sentence]))
    return Dataset.from_dict(columns)

# Convert both halves of the split into Dataset objects with "tokens" and "labels" columns.
train_dataset = convert_to_dataset(train_data)
test_dataset = convert_to_dataset(test_data)

# Create DatasetDict for train and test
# NOTE(review): this variable shares its name with the `datasets` package; a later
# `import datasets` would rebind it.
datasets = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})




In [None]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict

# `model` only mirrors the key -> checkpoint-name mapping (strings, not loaded weights).
model = dict(model_names)
# One tokenizer per checkpoint, keyed by the same short names as `model_names`.
tokenizer = {
    key: AutoTokenizer.from_pretrained(model_name)
    for key, model_name in model_names.items()
}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
# Show the short-key -> checkpoint-name mapping copied from model_names.
print(model)

{'xlm-roberta': 'xlm-roberta-base', 'distilbert': 'distilbert-base-multilingual-cased', 'mbert': 'bert-base-multilingual-cased'}


In [None]:
# Inspect the tokenizer object that was loaded for the 'xlm-roberta' key.
print(tokenizer['xlm-roberta'])

XLMRobertaTokenizerFast(name_or_path='xlm-roberta-base', vocab_size=250002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	250001: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}


In [None]:
def tokenize_and_align_labels(datasets,tokenizer):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Args:
        datasets: Batch mapping with a "tokens" entry (list of word lists) and a
            "labels" entry (list of per-word label id lists).
        tokenizer: Callable tokenizer whose result exposes
            ``word_ids(batch_index=...)`` and supports item assignment.

    Returns:
        The tokenizer output with an added "labels" entry. For every position
        the label is the word's label when the position is the first sub-token
        of a word, and -100 for special tokens (word id ``None``) and for the
        remaining sub-tokens of a word.
    """
    encoded = tokenizer(
        datasets["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    def _align(word_ids, word_labels):
        # Walk the sub-token -> word mapping once, remembering the previous entry
        # so that only the first sub-token of each word receives a real label.
        aligned = []
        last_seen = None
        for current in word_ids:
            starts_new_word = current is not None and current != last_seen
            aligned.append(word_labels[current] if starts_new_word else -100)
            last_seen = current
        return aligned

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(datasets["labels"])
    ]
    return encoded
# Tokenize the datasets

# Each checkpoint has its own vocabulary, so the DatasetDict is tokenized once per
# tokenizer and stored under the same short key.
tokenized_datasets = {
    key: datasets.map(
        tokenize_and_align_labels,
        batched=True,
        fn_kwargs={'tokenizer': tokenizer[key]},
    )
    for key in model_names
}



Map:   0%|          | 0/130 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

In [None]:
# Summarise the tokenized train/test splits (column names and row counts) for every model key.
print(tokenized_datasets)

{'xlm-roberta': DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 130
    })
    test: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 33
    })
}), 'distilbert': DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 130
    })
    test: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 33
    })
}), 'mbert': DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 130
    })
    test: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 33
    })
})}


In [None]:
# Summarise only the XLM-RoBERTa training split.
print(tokenized_datasets['xlm-roberta']['train'])

Dataset({
    features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 130
})


In [None]:
# Spot-check one encoded example. tokenize_and_align_labels emits one label per entry of
# word_ids(), so the two lengths printed below are expected to match.
print("Sample tokenized input with labels:", tokenized_datasets['xlm-roberta']['train'][0])
print("Length of tokens:", len(tokenized_datasets['xlm-roberta']['train'][0]["input_ids"]))
print("Length of labels:", len(tokenized_datasets['xlm-roberta']['train'][0]["labels"]))

Sample tokenized input with labels: {'tokens': ['በረፍት', 'ቀንዎ', 'ሱቅ', 'ላይ', 'መስተናገድ', 'ለምትፈልጉ', 'ውድ', 'ደንበኞቻችን', 'ልዮ', 'የዋዜማ', 'ገበያ', 'ከቅናሽ', 'ጋር', 'አዘጋጅተን', 'እንጠብቅዎታለን', 'ነገ', 'ከጠዋቱ', '4', '30', '_', 'ቀኑ', '11', '00', 'ድረስ', 'ሱቃችን', 'ክፍት', 'ሁኖ', 'ይጠብቅዎታል', 'ሱቅ', 'መተው', 'መግዛት', 'ላልቻላችሁ', 'በሞተረኞች', 'ያሉበት', 'እናደርሳለን', 'ዘወትር', 'ሰኞ', '_ቅዳሜ', 'ከጠዋቱ', '2', '30', 'እስከ', 'ምሽቱ', '2', '00', 'ድረስ', 'ክፍት', 'መሆኑን', 'እንገልፃለን', 'አድራሻ', 'ቁ', '1', 'መገናኛ', 'መሰረት', 'ደፋር', 'ሞል', 'ሁለተኛ', 'ፎቅ', 'ቢሮ', 'ቁ', '05', '06', 'ቁ', '2', 'ፒያሳ', 'ጊዮርጊስ', 'አደባባይ', 'ራመት_ታቦር_ኦዳ_ህንፃ', '1ኛ', 'ፎቅ', 'ሱቅ', 'ቁ', '1', '-107', '0902660722', '0928460606', 'ፒያሳ', 'ቅርንጫፍ', '0941337070', 'በ', 'ለማዘዝ', 'ይጠቀሙ', 'ለተጨማሪ', 'ማብራሪያ', 'የቴሌግራም', 'ገፃችን'], 'labels': [-100, 0, -100, -100, 0, -100, 5, -100, -100, 0, 0, -100, -100, 0, -100, -100, -100, -100, 0, 0, -100, -100, -100, -100, 1, -100, 0, -100, -100, 0, -100, 0, -100, -100, 0, 0, -100, 0, -100, -100, -100, -100, 0, 0, -100, -100, -100, 0, 0, 0, 0, -100, 0, 0, 0, 0, -100, -100, -100, 0, -1

In [None]:
from transformers import XLMRobertaForTokenClassification, DistilBertForTokenClassification, BertForTokenClassification
from transformers import Trainer, TrainingArguments
# One token-classification model per candidate, keyed like `model_names`.
# Checkpoint names come from `model_names` so they are defined in a single place.
# id2label/label2id are stored in each model's config, so a saved model carries the real
# tag names instead of the generic LABEL_0..LABEL_n placeholders.
models = {}
models['xlm-roberta'] = XLMRobertaForTokenClassification.from_pretrained(
    model_names["xlm-roberta"], num_labels=len(label_list), id2label=id2label, label2id=label2id)
models["distilbert"] = DistilBertForTokenClassification.from_pretrained(
    model_names["distilbert"], num_labels=len(label_list), id2label=id2label, label2id=label2id)
models["mbert"] = BertForTokenClassification.from_pretrained(
    model_names["mbert"], num_labels=len(label_list), id2label=id2label, label2id=label2id)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import XLMRobertaForTokenClassification, DistilBertForTokenClassification, BertForTokenClassification
from transformers import Trainer, TrainingArguments
# model = XLMRobertaForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

In [None]:
# Print the module tree of the DistilBERT token-classification model.
print(models['distilbert'])

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
   

In [None]:
from transformers import Trainer, TrainingArguments
from seqeval.metrics import classification_report


def fine_tune_model(model, model_name):
  """Fine-tune ``model`` on its tokenized training split and return the ``Trainer``.

  Args:
    model: Token-classification model to train; it is passed to ``Trainer`` unchanged.
    model_name: Key used to look up ``tokenized_datasets[model_name]`` and
      ``tokenizer[model_name]``; it also names the output and logging sub-directories.

  Returns:
    The ``Trainer`` instance after ``train()`` has completed.
  """
  training_args = TrainingArguments(
      output_dir=f"./results/{model_name}",
      evaluation_strategy="epoch",  # evaluate on the test split after every epoch
      logging_strategy="steps",
      logging_steps=3,
      learning_rate=2e-5,
      per_device_train_batch_size=8,
      per_device_eval_batch_size=8,
      num_train_epochs=20,
      weight_decay=0.01,
      # Per-model log directory, mirroring output_dir, so logs from the different
      # models do not end up mixed together in one folder.
      logging_dir=f"./logs/{model_name}",
  )
  # Define a Trainer
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_datasets[model_name]['train'],
      eval_dataset=tokenized_datasets[model_name]['test'],
      tokenizer=tokenizer[model_name],
  )

  # Train the model
  trainer.train()
  return trainer



In [None]:
# Fine-tune XLM-RoBERTa; the returned Trainer is used again in the evaluation cell.
roberta_trainer = fine_tune_model(models['xlm-roberta'], 'xlm-roberta')



Epoch,Training Loss,Validation Loss
1,0.568,0.419027
2,0.2954,0.255978
3,0.1582,0.166147
4,0.0938,0.127588
5,0.0942,0.11337
6,0.1532,0.100519
7,0.0498,0.077112
8,0.0318,0.057459
9,0.0122,0.057553
10,0.0334,0.052684


Model Training

In [None]:
# Fine-tune multilingual DistilBERT; the returned Trainer is used again in the evaluation cell.
distilbert_trainer = fine_tune_model(models['distilbert'], 'distilbert')



Epoch,Training Loss,Validation Loss
1,0.8997,0.724547
2,0.378,0.326516
3,0.2608,0.251419
4,0.1865,0.203872
5,0.1751,0.219297
6,0.2589,0.177367
7,0.1439,0.182158
8,0.1137,0.170466
9,0.0855,0.161392
10,0.1278,0.159013


In [None]:
# Fine-tune multilingual BERT; the returned Trainer is used again in the evaluation cell.
mbert_trainer = fine_tune_model(models['mbert'], 'mbert')



Evaluation

In [None]:
import numpy as np
from seqeval.metrics import classification_report

# Trainer for each fine-tuned model, keyed like tokenized_datasets so the matching test split is used.
pred_data = {'xlm-roberta':roberta_trainer,'distilbert':distilbert_trainer,'mbert':mbert_trainer}
# Make predictions on the test set
for model_name,trainer in pred_data.items():
  predictions, labels, _ = trainer.predict(tokenized_datasets[model_name]["test"])
  # Reduce the last axis (one score per label) to the index of the highest-scoring label.
  predictions = np.argmax(predictions, axis=2)

  # Convert predictions and labels to original label format
  # Positions labelled -100 (special tokens and non-first sub-tokens) are dropped from both
  # sequences, leaving one gold tag and one predicted tag per labelled position.
  true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
  predicted_labels = [[label_list[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]

  # Evaluate using seqeval
  report = classification_report(true_labels, predicted_labels)
  print(f'{model_name} evaluation \n{report}\n\n')


Save The Model

In [None]:
# Key (as used in `models` / `tokenizer`) of the model to save; set by hand.
best_model_name = 'xlm-roberta'

In [None]:
# Save the selected model and its tokenizer into the same Drive folder, so that folder can
# later be passed to from_pretrained() for both.
models[best_model_name].save_pretrained("drive/MyDrive/Colab Notebooks/finetuned-model")
tokenizer[best_model_name].save_pretrained("drive/MyDrive/Colab Notebooks/finetuned-model")

Inference on New Data

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, XLMRobertaForTokenClassification
import torch
import numpy as np

# Load the saved model and tokenizer.
# AutoModelForTokenClassification reads the architecture from the saved config, so this cell
# keeps working when a model other than XLM-RoBERTa is saved as the best model.
model = AutoModelForTokenClassification.from_pretrained("drive/MyDrive/Colab Notebooks/finetuned-model")
tokenizer = AutoTokenizer.from_pretrained("drive/MyDrive/Colab Notebooks/finetuned-model")

# New sentences to test
new_sentences = ["""ዋጋ 200 ብር አድራሻ መገናኛ መሰረት ደፋር ሞል ውስን ፍሬ ነው ያለው ልጆች እየተዝናኑ የሚማሩበት
ለፅሁፍ እና ለስዕል መለማመጃ የሚሆን
የወረቀት እና እርሳስ ወጪን የሚቆጥብ
ብናኝ ወይም አይን የሚወጋ ብርሀን ስለሌለው በምቾት እና በደህንነት ለረጅም ሰአት የሚጠቀሙበት"""]

# Split every sentence into whitespace-separated words once. Training tokenized pre-split
# words (is_split_into_words=True), and the report below prints one label per whitespace
# word, so the same word units are handed to the tokenizer here; word_ids() then indexes
# directly into these lists instead of depending on the tokenizer's own pre-tokenization.
sentence_words = [sentence.split() for sentence in new_sentences]

# Tokenize the pre-split words
tokenized_inputs = tokenizer(
    sentence_words,
    padding=True,
    truncation=True,
    return_tensors="pt",
    is_split_into_words=True
)

# Convert ids back to tokens for debugging
tokens = tokenizer.convert_ids_to_tokens(tokenized_inputs["input_ids"][0])

model.eval()

# Disable gradient calculations
with torch.no_grad():
    outputs = model(**tokenized_inputs)

# Get the predictions
logits = outputs.logits
predictions = torch.argmax(logits, dim=2)

# Define the label list (ensure it matches the one used during training)
label_list = ['O', 'B-Product', 'I-Product', 'B-PRICE', 'I-PRICE', 'B-LOC', 'I-LOC']

# Mapping predictions back to words: each word takes the label predicted for its first sub-token.
predicted_labels = []
for i, pred in enumerate(predictions):
    word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to original words
    current_word_labels = []

    prev_word_id = None
    for idx, word_id in enumerate(word_ids):
        # Special and padding tokens have no word id and carry no label.
        if word_id is None:
            continue
        # First sub-token of a new word: record its predicted label.
        if word_id != prev_word_id:
            current_word_labels.append(label_list[pred[idx].item()])
        prev_word_id = word_id

    predicted_labels.append(current_word_labels)

# Print the predicted labels for each word in the sentence.
# Words cut off by truncation have no prediction; zip() stops at the shorter sequence.
for sentence, words, labels in zip(new_sentences, sentence_words, predicted_labels):
    print(f"Sentence: {sentence}")
    print("Word-Level Labels:")
    for word, label in zip(words, labels):
        print(f"{word}: {label}")


In [None]:
# !pip install shap

In [None]:
# !pip install shap
# !pip install lime

In [None]:
# # Import necessary libraries
# import numpy as np
# import pandas as pd
# import shap
# import lime
# import lime.lime_text
# import torch
# from transformers import AutoTokenizer, AutoModelForTokenClassification
# from sklearn.metrics import classification_report

# # Load your fine-tuned model and tokenizer
# model_name = "drive/MyDrive/Colab Notebooks/finetuned-model"  # Update this with your model path
# tokenizer = AutoTokenizer.from_pretrained("drive/MyDrive/Colab Notebooks/finetuned-model")
# model = AutoModelForTokenClassification.from_pretrained(model_name)



In [None]:
# # Function to preprocess text for the model
# def preprocess_text(text):
#     encoding = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
#     return encoding

# # Function to get model predictions
# def get_predictions(text):
#     inputs = preprocess_text(text)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=2)
#     return predictions

# # Sample data: Amharic Telegram messages
# sample_messages = [
#     "ዋጋ 1000 ብር የተመዘገበ ቅድሚ ነው።",  # Price Example
#     "በአዲስ አበባ ከባር በተቀመጡ መደብ ገንዘብ ይቅርታ",  # Location Example
#     "እባክህ የተለያዩ ምርቶች አለኝ።",  # General Product Example
# ]

# # Get predictions for the sample messages
# predictions = [get_predictions(msg) for msg in sample_messages]

# # Convert predictions to human-readable format
# def decode_predictions(predictions, text):
#     tokens = tokenizer.tokenize(text)
#     labels = [model.config.id2label[p.item()] for p in predictions[0]]
#     return list(zip(tokens, labels))

# decoded_predictions = [decode_predictions(pred, msg) for pred, msg in zip(predictions, sample_messages)]
# print("Decoded Predictions:", decoded_predictions)

# # SHAP Explanation
# # Use the first message for SHAP analysis
# explainer = shap.Explainer(model)
# shap_values = explainer([preprocess_text(sample_messages[0])["input_ids"]])

# # Plot SHAP values
# shap.initjs()
# shap.plots.text(shap_values)

# # LIME Explanation
# # Initialize the LIME text explainer
# lime_explainer = lime.lime_text.LimeTextExplainer(class_names=model.config.id2label.values(),
#                                                     tokenizer=tokenizer.tokenize)

# # Function to explain predictions using LIME
# def explain_with_lime(text):
#     exp = lime_explainer.explain_instance(text, get_predictions, num_features=10)
#     return exp.as_pyplot_figure()

# # Get LIME explanation for the first sample message
# lime_explanation_figure = explain_with_lime(sample_messages[0])
# lime_explanation_figure.show()

# # Optionally save the LIME figure if needed
# # lime_explanation_figure.savefig("lime_explanation.png")

# # Note: Ensure you have SHAP and LIME installed:
# # pip install shap lime


In [None]:
# print(tokens_filtered)