In [15]:
import pandas as pd

def parse_conll(filepath):
    sentences = []
    with open(filepath, encoding='utf-8') as f:
        sentence = []
        for line in f:
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                parts = line.split()
                if len(parts) >= 2:
                    token, label = parts[0], parts[-1]
                    sentence.append((token, label))
        if sentence:  # Catch any trailing sentence
            sentences.append(sentence)
    return sentences

# ✅ 1. Load the data
conll_data = parse_conll("../Labeled_data.con11") 

# ✅ 2. Flatten it into token-label rows
flat_data = []
for sentence in conll_data:
    for token, label in sentence:
        flat_data.append({"token": token, "label": label})

# ✅ 3. Create a DataFrame
df = pd.DataFrame(flat_data)

# ✅ 4. Save as CSV
df.to_csv("../data/labeled_data.csv", index=False, encoding="utf-8")

print("✅ Saved labeled data to data/labeled_data.csv")


✅ Saved labeled data to data/labeled_data.csv


In [16]:
from datasets import Dataset
import pandas as pd

# Load the CSV into a pandas DataFrame
df = pd.read_csv("../data/labeled_data.csv", encoding="utf-8")

# Group tokens and labels by sentence if needed
# For now, we'll assume the data is flat — i.e., a long list of token/label rows

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

# Peek at it
print(dataset)
dataset[:5]


Dataset({
    features: ['token', 'label'],
    num_rows: 30
})


{'token': ['40,41,42,43', 'Price', '3700', 'birr', '📌አድራሻ-ሜክሲኮ'],
 'label': ['B-PRICE', 'I-PRICE', 'I-PRICE', 'I-PRICE', 'B-LOC']}

In [29]:
import pandas as pd

# Load CSV
df = pd.read_csv("../data/labeled_data.csv")  # Adjust path

sentences = []
sentence = []
for _, row in df.iterrows():
    token = str(row["token"]).strip()
    label = row["label"].strip()

    if token == "" or pd.isna(token):
        if sentence:
            sentences.append(sentence)
            sentence = []
    else:
        sentence.append((token, label))

# Append last sentence if not empty
if sentence:
    sentences.append(sentence)

# Convert to spaCy format
train_data = []

for sent in sentences:
    text = ""
    entities = []
    offset = 0

    for token, label in sent:
        token = str(token)
        start = len(text)
        text += token + " "
        end = len(text) - 1  # subtract last space

        if label != "O":
            entity_type = label.split("-")[1]
            entities.append((start, end, entity_type))

    train_data.append((text.strip(), {"entities": entities}))

# Print a few examples
for ex in train_data[:3]:
    print(ex)

# Optional: save to .py for training
import json
with open("spacy_train_data.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)

print(f"✅ Converted {len(train_data)} sentences into spaCy format.")


('40,41,42,43 Price 3700 birr 📌አድራሻ-ሜክሲኮ ጀርባ መዚድ ፕላዛ EthioBrand Reebok Winterized 40,41,42,43,44 Price 3600 birr 📌አድራሻ-ሜክሲኮ ጀርባ መዚድ ፕላዛ EthioBrand ℹ️ℹ️ℹ️ Ice cube Bottle ice cube maker pitcher ice cubes', {'entities': [(0, 11, 'PRICE'), (12, 17, 'PRICE'), (18, 22, 'PRICE'), (23, 27, 'PRICE'), (28, 38, 'LOC'), (39, 42, 'Product'), (43, 46, 'Product'), (47, 50, 'Product'), (51, 61, 'Product'), (62, 68, 'Product'), (69, 79, 'Product'), (80, 94, 'PRICE'), (95, 100, 'PRICE'), (101, 105, 'PRICE'), (106, 110, 'PRICE'), (111, 121, 'LOC'), (122, 125, 'Product'), (126, 129, 'Product'), (130, 133, 'Product'), (134, 144, 'Product'), (145, 151, 'Product'), (152, 155, 'Product'), (156, 160, 'Product'), (161, 167, 'Product'), (168, 171, 'Product'), (172, 176, 'Product'), (177, 182, 'Product'), (183, 190, 'Product'), (191, 194, 'Product'), (195, 200, 'Product')]})
✅ Converted 1 sentences into spaCy format.


In [30]:
import spacy
from spacy.training.example import Example
import random
import json

# Load blank Amharic-compatible model (multilingual or English as fallback)
nlp = spacy.blank("xx")  # xx = multilingual

# Create NER pipe
ner = nlp.add_pipe("ner")
labels = set()

# Load training data
with open("spacy_train_data.json", "r", encoding="utf-8") as f:
    TRAIN_DATA = json.load(f)

# Add labels
for text, ann in TRAIN_DATA:
    for ent in ann["entities"]:
        labels.add(ent[2])

for label in labels:
    ner.add_label(label)

# Training
optimizer = nlp.initialize()
for i in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, ann in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, ann)
        nlp.update([example], losses=losses, drop=0.35)
    print(f"Epoch {i+1}: Losses {losses}")

# Save model
nlp.to_disk("amharic_ner_model")
print("✅ Model trained and saved to amharic_ner_model/")


Epoch 1: Losses {'ner': np.float32(33.428574)}
Epoch 2: Losses {'ner': np.float32(32.80156)}
Epoch 3: Losses {'ner': np.float32(32.115204)}
Epoch 4: Losses {'ner': np.float32(31.490658)}
Epoch 5: Losses {'ner': np.float32(30.671091)}
Epoch 6: Losses {'ner': np.float32(29.570364)}
Epoch 7: Losses {'ner': np.float32(28.702402)}
Epoch 8: Losses {'ner': np.float32(27.67277)}
Epoch 9: Losses {'ner': np.float32(26.696175)}
Epoch 10: Losses {'ner': np.float32(25.662355)}
✅ Model trained and saved to amharic_ner_model/


In [31]:
import spacy

# Load your trained model
nlp = spacy.load("amharic_ner_model")

# Example Amharic message
text = "40,41,42,43 Price 3700 birr 📌አድራሻ-ሜክሲኮ ጀርባ ፕላዛ Ice cube"

# Run NER prediction
doc = nlp(text)

# Show entities
for ent in doc.ents:
    print(ent.text, "->", ent.label_)


40,41,42,43 -> Product
Price -> Product
3700 -> Product
birr -> Product
📌 -> Product
አድራሻ -> Product
- -> Product
ሜክሲኮ -> Product
ጀርባ -> Product
ፕላዛ -> Product
Ice -> Product
cube -> Product


In [33]:
import pandas as pd

# Load your CSV
df = pd.read_csv("../data/labeled_data.csv")

# Initialize
data = []
sentence = []
labels = []
current_index = 0

for idx, row in df.iterrows():
    token = str(row['token']).strip()
    label = str(row['label']).strip()

    if pd.isna(token) or pd.isna(label):
        continue

    if label == 'O':
        sentence.append(token)
        current_index += len(token) + 1  # +1 for space
        continue

    # Detect sentence breaks (you can improve this logic)
    if token in [".", "!", "?"] and sentence:
        joined = " ".join(sentence)
        data.append((joined, {"entities": labels}))
        sentence = []
        labels = []
        current_index = 0
        continue

    sentence.append(token)
    start = current_index
    end = start + len(token)
    labels.append((start, end, label.split("-")[-1]))  # Use PRICE, LOC, Product
    current_index = end + 1  # +1 for space

# Final one
if sentence:
    joined = " ".join(sentence)
    data.append((joined, {"entities": labels}))

print(f"✅ Converted {len(data)} sentences into spaCy format.")


✅ Converted 1 sentences into spaCy format.


In [34]:
import spacy
from spacy.training.example import Example
from spacy.util import minibatch
import random

# Load blank model
nlp = spacy.blank("xx")  # multilingual

# Add NER pipeline
ner = nlp.add_pipe("ner")
for _, ann in data:
    for ent in ann["entities"]:
        ner.add_label(ent[2])

# Train the model
nlp.begin_training()
for epoch in range(10):
    random.shuffle(data)
    losses = {}
    batches = minibatch(data, size=2)
    for batch in batches:
        examples = []
        for text, ann in batch:
            doc = nlp.make_doc(text)
            examples.append(Example.from_dict(doc, ann))
        nlp.update(examples, losses=losses)
    print(f"Epoch {epoch+1}: Losses", losses)

# Save model
nlp.to_disk("amharic_ner_model")
print("✅ Model trained and saved to amharic_ner_model/")


Epoch 1: Losses {'ner': np.float32(33.428574)}
Epoch 2: Losses {'ner': np.float32(32.810528)}
Epoch 3: Losses {'ner': np.float32(31.664017)}
Epoch 4: Losses {'ner': np.float32(29.919365)}
Epoch 5: Losses {'ner': np.float32(27.517132)}
Epoch 6: Losses {'ner': np.float32(24.655592)}
Epoch 7: Losses {'ner': np.float32(21.719816)}
Epoch 8: Losses {'ner': np.float32(19.156385)}
Epoch 9: Losses {'ner': np.float32(16.41469)}
Epoch 10: Losses {'ner': np.float32(50.462124)}
✅ Model trained and saved to amharic_ner_model/


In [35]:
import spacy
nlp = spacy.load("amharic_ner_model")

doc = nlp("Ice cube maker pitcher 3700 birr 📌አድራሻ-ሜክሲኮ")
for ent in doc.ents:
    print(ent.text, ent.label_)


Ice Product
cube Product
maker Product
pitcher Product
3700 Product
birr Product
📌አድራሻ LOC


In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForTokenClassification, Trainer, TrainingArguments
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base")
# Fine-tune with Trainer

In [None]:
from transformers import DistilBertTokenizer, DistilBertForTokenClassification
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased")
# Fine-tune similarly

In [None]:
from transformers import BertTokenizer, BertForTokenClassification
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased")
# Fine-tune similarly

In [None]:
from sklearn.metrics import classification_report
# Generate predictions and evaluate

In [36]:
import shap
explainer = shap.Explainer(model)
shap_values = explainer(data)
shap.summary_plot(shap_values, features=data)

NameError: name 'model' is not defined

In [None]:
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(text_instance, model.predict)
exp.show_in_notebook()