In [None]:
!pip install transformers datasets accelerate peft evaluate seqeval

- Hugging Face Authentication

In [None]:
from huggingface_hub import notebook_login
# Authenticate this notebook session with the Hugging Face Hub before any
# model or tokenizer is requested from it.
notebook_login()

- Read and parse the CoNLL-labeled file into a dict

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every data and output path used by the
# later cells lives under this mount point.
drive.mount('/content/drive')

# Path of the CoNLL-labeled data file that parse_conll_file() reads below
labeled_data_path = "/content/drive/MyDrive/Telegram_Scraper_Data/my_labeled_data_conll.txt"

# parse labeled data to a dict
def parse_conll_file(file_path):
    """Parse a CoNLL-style labeled file into parallel token / tag lists.

    Each non-blank line holds one token and its tag; a blank line ends the
    current message (sentence). A line with exactly one tab is split on that
    tab, so a token may itself contain spaces. Any other line is split on
    arbitrary whitespace, taking the first column as the token and the last
    column as the tag, so space-separated and multi-column CoNLL files are
    accepted as well. Lines with fewer than two columns cannot be labeled and
    are skipped; a warning reports how many were skipped.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file (a leading BOM is tolerated).

    Returns:
        dict with two keys, "tokens" and "ner_tags"; each maps to a list with
        one inner list per message, and the inner lists are index-aligned.
    """
    import warnings

    data = {"tokens": [], "ner_tags": []}
    current_tokens = []
    current_tags = []
    skipped_lines = 0

    # utf-8-sig decodes plain UTF-8 identically but drops a leading BOM, which
    # would otherwise become part of the first token.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if not line:  # Blank line indicates new message
                if current_tokens:
                    data["tokens"].append(current_tokens)
                    data["ner_tags"].append(current_tags)
                current_tokens = []
                current_tags = []
                continue

            parts = line.split('\t')
            if len(parts) != 2:
                # Not a plain "token<TAB>tag" line: fall back to whitespace columns.
                parts = line.split()
            if len(parts) < 2:
                skipped_lines += 1
                continue

            current_tokens.append(parts[0])
            current_tags.append(parts[-1])

    # The file may not end with a blank line; flush the last message.
    if current_tokens:
        data["tokens"].append(current_tokens)
        data["ner_tags"].append(current_tags)

    if skipped_lines:
        warnings.warn(
            f"parse_conll_file: skipped {skipped_lines} line(s) without a token/tag pair in {file_path}"
        )
    return data

# Parsed corpus: {"tokens": [...], "ner_tags": [...]} with one inner list per message
raw_data_dict = parse_conll_file(labeled_data_path)

- Create a Dataset object

In [None]:
from datasets import Dataset
# Wrap the parsed dict (columns "tokens" and "ner_tags") in a Dataset
dataset = Dataset.from_dict(raw_data_dict)

# Split into train and validation sets 80/20; the fixed seed keeps the split
# identical across notebook runs
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Report the split sizes as a quick sanity check
print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

- Numerical IDs for labels

In [None]:
# Collect every distinct tag that occurs anywhere in the corpus, in sorted order
unique_labels = sorted({tag for message_tags in raw_data_dict['ner_tags'] for tag in message_tags})

# Derive both lookup directions from that single sorted order so they stay inverse to each other
id2label = dict(enumerate(unique_labels))
label2id = {tag: idx for idx, tag in id2label.items()}

print(f"Unique labels found: {unique_labels}")
print(f"Label to ID mapping: {label2id}")

- Tokenization

In [None]:
from transformers import AutoTokenizer

# Checkpoint used for both the tokenizer and the model; the commented-out names
# are alternatives that can be swapped in here.
# MODEL_NAME = "xlm-roberta-base"
MODEL_NAME = "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"
# MODEL_NAME = "rasyosef/bert-tiny-amharic"
# Tokenizer matching MODEL_NAME; later cells reuse it for label alignment,
# batch collation and saving.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level NER tags to sub-word tokens.

    Args:
        examples: Batch with "tokens" (list of word lists) and "ner_tags"
            (list of tag-string lists, index-aligned with "tokens").

    Returns:
        The tokenizer output with an added "labels" entry. For each example,
        the first sub-word token of every word carries the word's label id
        (looked up in the global label2id); special tokens and the remaining
        sub-word tokens of a word carry -100 so they are ignored by the loss.
    """
    encodings = tokenizer(
        examples["tokens"],
        truncation=True,  # cut sequences that are too long for the model
        is_split_into_words=True,  # the input is already a list of words
    )

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for word in encodings.word_ids(batch_index=batch_idx):
            # word is None for special tokens; it repeats for the later pieces of one word.
            starts_new_word = word is not None and word != last_word
            row.append(label2id[word_tags[word]] if starts_new_word else -100)
            last_word = word
        aligned_labels.append(row)

    encodings["labels"] = aligned_labels
    return encodings

# Apply the tokenization and alignment to both splits; batched=True hands
# tokenize_and_align_labels a batch of examples per call
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)

# Show the first training example before and after alignment as a visual check
print("\nExample of tokenization and label alignment for first train sample:")
sample_tokens = tokenized_train_dataset[0]["tokens"]
sample_ner_tags = tokenized_train_dataset[0]["ner_tags"] # Original word-level labels
sample_input_ids = tokenized_train_dataset[0]["input_ids"]
sample_labels = tokenized_train_dataset[0]["labels"] # Aligned token-level label ids

print("Original tokens:", sample_tokens)
print("Original NER tags:", sample_ner_tags)
print("Tokenized input IDs:", sample_input_ids)
print("Decoded tokens:", tokenizer.convert_ids_to_tokens(sample_input_ids))
print("Aligned labels (IDs):", sample_labels)
# Map the aligned ids back to label strings; -100 (ignored positions) is shown as "IGN"
aligned_label_strings = [id2label[lid] if lid != -100 else "IGN" for lid in sample_labels]
print("Aligned labels (strings):", aligned_label_strings)

- Model Setup with PEFT (LoRA)

In [None]:
from transformers import AutoModelForTokenClassification
from peft import LoraConfig, get_peft_model, TaskType

# Load the base pre-trained model for token classification, sized and labeled
# with the mappings built from this dataset.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, # Same checkpoint name the tokenizer was loaded from
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True # Allow loading although the checkpoint's head size differs; the head is re-initialized
)
print(f"Model {MODEL_NAME} loaded successfully for fine-tuning, with mismatched head re-initialized.")

# Define LoRA configuration
# r: LoRA attention dimension (common values 8, 16, 32, 64)
# lora_alpha: Scaling factor (common values 16, 32)
# target_modules: Which layers to inject LoRA adapters into. Common for BERT/RoBERTa are 'query', 'value'
# lora_dropout: Dropout probability for LoRA layers
# bias: 'none' is common for LoRA
# task_type: Important for PEFT to know what kind of task (TOKEN_CLS for NER)
lora_config = LoraConfig(
    r=16, # LoRA attention dimension
    lora_alpha=32, # A scaling factor
    target_modules=["query", "value"], # Layers to apply LoRA. Often also include "key", "dense", "classifier" depending on model
    lora_dropout=0.1, # Dropout probability
    bias="none", # Bias type for LoRA layers
    task_type=TaskType.TOKEN_CLS # Indicate that this is a token classification task
)

# Wrap the base model with the LoRA adapters described by lora_config
peft_model = get_peft_model(model, lora_config)

# Print trainable parameters to see the dramatic reduction
print("\nPEFT Model Trainable Parameters:")
peft_model.print_trainable_parameters()

- Training Setup

In [None]:
from transformers import TrainingArguments, DataCollatorForTokenClassification
import evaluate # Hugging Face's evaluate library
import numpy as np # For numerical operations

# Data Collator (handles padding batches); built from the same tokenizer used for the datasets
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Load the seqeval metric that compute_metrics() uses for precision/recall/F1/accuracy
seqeval_metric = evaluate.load("seqeval")

# Function to compute metrics for NER
def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores for NER.

    Args:
        p: Pair (predictions, labels). predictions holds per-token logits and is
            reduced with argmax over axis 2; labels holds label ids where -100
            marks positions to ignore.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy", taken from the
        overall_* entries of the seqeval result.
    """
    logits, gold_ids = p
    # Logits -> predicted label id per token
    predicted_ids = np.argmax(logits, axis=2)

    # Gold label strings, with ignored (-100) positions removed
    true_labels = [
        [id2label[gold] for gold in gold_row if gold != -100]
        for gold_row in gold_ids
    ]

    # Predicted label strings at exactly the positions kept above
    true_predictions = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [id2label[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append(kept)

    results = seqeval_metric.compute(predictions=true_predictions, references=true_labels)

    # Expose only the overall scores under short names; "f1" is the name the
    # training arguments select the best model by.
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

# Training Arguments: checkpoints go to Drive, evaluation and saving both happen
# once per epoch, and the checkpoint with the best "f1" is restored at the end.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Telegram_Scraper_Data/ner_peft_model_results",
    learning_rate=2e-5, # Common learning rate for fine-tuning
    num_train_epochs=5, # Number of training epochs (adjust based on validation performance)
    per_device_train_batch_size=8, # Batch size for training (adjust based on GPU memory)
    per_device_eval_batch_size=8, # Batch size for evaluation
    weight_decay=0.01, # Regularization
    eval_strategy="epoch", # Evaluate at the end of each epoch
    save_strategy="epoch", # Save checkpoint at the end of each epoch
    load_best_model_at_end=True, # Load the best model based on evaluation metric at the end
    metric_for_best_model="f1", # Metric to use for best model selection; key returned by compute_metrics
    report_to="none", # You can set this to "tensorboard" or "wandb" for better logging
    logging_dir='./logs', # Directory for logs
    logging_steps=100, # Log training progress every N steps
)

- Fine-tuning with the Trainer API

In [None]:
from transformers import Trainer

# Initialize the Trainer with the LoRA-wrapped model, both tokenized splits,
# the padding collator and the seqeval-based metric function.
# NOTE(review): newer transformers releases may prefer `processing_class=` over
# `tokenizer=` here - confirm against the installed version before changing it.
trainer = Trainer(
    model=peft_model, # Your PEFT model
    args=training_args, # Training arguments
    train_dataset=tokenized_train_dataset, # Tokenized training data
    eval_dataset=tokenized_eval_dataset, # Tokenized evaluation data
    data_collator=data_collator, # Data collator for padding
    tokenizer=tokenizer, # Tokenizer (used by collator and for logging)
    compute_metrics=compute_metrics, # Function to compute evaluation metrics
)

# Start training
print("\nStarting PEFT fine-tuning...")
trainer.train()
print("Fine-tuning complete.")

- Evaluation

In [None]:
# Final evaluation pass on the eval split handed to the Trainer; the result dict is printed as-is
print("\nEvaluating the fine-tuned model on the evaluation set...")
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

- Save the model to Google Drive

In [None]:
# Save the fine-tuned PEFT model
# This saves only the LoRA adapters, which are very small.
# The base model weights are not saved here, as they are frozen.
output_model_dir = "/content/drive/MyDrive/Telegram_Scraper_Data/my_fine_tuned_ner_model"
peft_model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir) # Save the tokenizer alongside

print(f"\nFine-tuned PEFT model saved to: {output_model_dir}")

# To load this model later for inference, you'd load the base model,
# then load the PEFT adapters, and then merge them.
# NOTE(review): the training cell loads the base model with id2label, label2id and
# ignore_mismatched_sizes=True; the example below passes only num_labels, so it
# probably needs the same arguments - confirm before using it.
# Example for loading later:
# from peft import PeftModel, PeftConfig
# config = PeftConfig.from_pretrained(output_model_dir)
# base_model = AutoModelForTokenClassification.from_pretrained(config.base_model_name_or_path, num_labels=len(unique_labels))
# loaded_peft_model = PeftModel.from_pretrained(base_model, output_model_dir)
# merged_model = loaded_peft_model.merge_and_unload() # Merge adapters for inference
# merged_model.save_pretrained("./merged_ner_model") # Save the full merged model

- Model Interpretability using SHAP

In [None]:
print("\n--- Task 5: Model Interpretability Setup ---")

# 1. Install necessary interpretability libraries
!pip install shap lime

# 2. Import Libraries
import shap

# Assuming peft_model and tokenizer are still in memory from training.
# If you restart Colab, you'd need to load them from Drive:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForTokenClassification, AutoTokenizer

output_model_dir = "/content/drive/MyDrive/Telegram_Scraper_Data/my_fine_tuned_ner_model__afroxml" # Your saved model path
config = PeftConfig.from_pretrained(output_model_dir)
base_model = AutoModelForTokenClassification.from_pretrained(config.base_model_name_or_path, num_labels=len(unique_labels), ignore_mismatched_sizes=True) # Added ignore_mismatched_sizes=True
peft_model_loaded = PeftModel.from_pretrained(base_model, output_model_dir)
tokenizer_loaded = AutoTokenizer.from_pretrained(output_model_dir)
model_for_interpret = peft_model_loaded # Use this if loaded, else use peft_model from training

print("SHAP and LIME are powerful tools for model interpretability.")
print("For a Token Classification model, they help explain the influence of each token on the entity prediction.")
print("Conceptual setup would involve training an explainer (e.g., shap.Explainer) with a suitable prediction function.")
print("Due to time constraints, in-depth interpretability analysis is noted as future work.")

# Initialize a tokenizer and a dummy input for conceptual SHAP setup
# Replace 'your_cleaned_message_example' with an actual cleaned message from your data
sample_text_for_interpret = "አዲስ ስልክ በ2000 ብር ብቻ። ቦታ ልደታ"
sample_tokens_for_interpret = tokenizer.tokenize(sample_text_for_interpret) # Use your actual tokenizer

print(f"\nConceptual SHAP setup for Token Classification:")
print(f"Sample text: '{sample_text_for_interpret}'")
print(f"Sample tokens: {sample_tokens_for_interpret}")

try:
    # SHAP requires a model output function and a masker.
    # For a deep learning model, especially for token classification,
    # this setup is usually more involved than a single line.
    # This is purely illustrative of the *start* of the process.

    # Example for a specific layer's output (very basic illustration for understanding)
    # This might not yield meaningful interpretable results directly for NER,
    # but it shows the conceptual connection.

    # You need a function that maps inputs to model outputs
    # For a real scenario, you'd define a custom explainer or use text-specific SHAP explainers
    # like TextExplainer which might require specific model wrappers.
    # Due to complexity for this deadline, this is primarily conceptual explanation in report.
    print("Due to the complexity of integrating SHAP/LIME directly with HuggingFace TokenClassification models for meaningful outputs under time constraints,",)
    print("the practical implementation is deferred. However, conceptual understanding is key for the report:")
    print("- SHAP helps quantify each token's contribution to a specific entity prediction.")
    print("- LIME explains individual predictions by approximating the model locally.")
    print("These would be crucial for debugging, understanding biases, and building trust in the model's NER outputs.")

except Exception as e:
    print(f"Could not fully set up interpretability tools (as expected for deadline-based conceptual demo): {e}")

- FinTech Vendor Scorecard for Micro-Lending (Simplified Demonstration)

In [None]:
# --- Task 6: FinTech Vendor Scorecard for Micro-Lending (Simplified Demonstration) ---
# Goal: Demonstrate calculation of vendor metrics and lending score
# using metadata and SIMULATED NER extractions (since current model performance is low).

import pandas as pd
import os
from datetime import datetime # Import datetime for date calculations

print("\n--- Task 6: FinTech Vendor Scorecard Demonstration ---")


def compute_posting_frequency_per_week(post_dates):
    """Return the average number of posts per week over the period the posts span.

    Args:
        post_dates: Sized collection (e.g. a pandas Series) of datetime values,
            one entry per post.

    Returns:
        0 when there are no posts; 1 when there is a single post (no time span
        can be measured, so one post within a week is assumed); otherwise
        number_of_posts / inclusive_day_span * 7.
    """
    post_count = len(post_dates)
    if post_count == 0:
        return 0
    if post_count == 1:
        return 1
    dates = pd.Series(post_dates)
    # +1 makes the span inclusive of both end days, so it is never zero.
    total_days_scraped = (dates.max() - dates.min()).days + 1
    return post_count / total_days_scraped * 7


def compute_lending_score(posts_per_week, average_views, average_price_etb,
                          weight_freq=0.4, weight_views=0.3, weight_price=0.3,
                          views_scale=100, price_scale=100):
    """Combine activity, reach and price level into one conceptual lending score.

    Score = posts_per_week * weight_freq
          + (average_views / views_scale) * weight_views
          + (average_price_etb / price_scale) * weight_price

    Views and prices are divided by their scale first so that large raw numbers
    do not dominate the posting frequency. Higher frequency, views and average
    price all raise the score.
    """
    scaled_avg_views = average_views / views_scale
    scaled_avg_price = average_price_etb / price_scale
    return (posts_per_week * weight_freq) + \
           (scaled_avg_views * weight_views) + \
           (scaled_avg_price * weight_price)


# 1. Load Raw Scraped Data
# Adjust path as needed for your setup (Colab Drive or local)
raw_data_path = "/content/drive/MyDrive/Telegram_Scraper_Data/output_data/telegram_data.csv" # Your main scraped CSV

try:
    df_raw_scraped = pd.read_csv(raw_data_path)
    # Ensure 'Message' column is string type and fill NaNs
    df_raw_scraped['Message'] = df_raw_scraped['Message'].fillna('').astype(str)
    # Convert 'Date' column to datetime objects for calculations
    df_raw_scraped['Date'] = pd.to_datetime(df_raw_scraped['Date'])
    print(f"Raw scraped data loaded. Total posts: {len(df_raw_scraped)}")
except FileNotFoundError:
    print(f"Error: Raw scraped data file not found at '{raw_data_path}'. Please ensure it exists.")
    df_raw_scraped = pd.DataFrame() # Create empty DataFrame to avoid further errors

if not df_raw_scraped.empty:
    # 2. Select ONE Vendor/Channel for Demonstration
    # IMPORTANT: CHANGE THIS to one of your actual scraped channels (e.g., @qnashcom, @Shegeronlinestore)
    target_vendor_username = "@marakibrand"

    # Filter posts for the target vendor
    vendor_posts = df_raw_scraped[df_raw_scraped['Username'] == target_vendor_username].copy()

    # Sort by date to make frequency calculation meaningful
    vendor_posts = vendor_posts.sort_values(by='Date').reset_index(drop=True)

    if not vendor_posts.empty:
        print(f"\nDemonstrating Vendor Scorecard for: {target_vendor_username}")
        print(f"Number of posts available for this vendor: {len(vendor_posts)}")

        # 3. Calculate Key Vendor Metrics:

        # --- A. Activity & Consistency: Posting Frequency ---
        posting_frequency_per_week = compute_posting_frequency_per_week(vendor_posts['Date'])
        if len(vendor_posts) == 1:
            print("Note: Only 1 post found for this vendor, posting frequency is simplified.")

        # --- B. Market Reach & Engagement: Average Views per Post & Top Post ---
        # The scraper did not capture 'Views' by default. Real values are used when the
        # CSV has a 'Views' column with at least one number; otherwise they are SIMULATED.
        has_views_column = 'Views' in vendor_posts.columns
        # Coerce to numbers so blank or non-numeric cells become NaN instead of
        # breaking mean()/idxmax().
        actual_views = (
            pd.to_numeric(vendor_posts['Views'], errors='coerce')
            if has_views_column
            else pd.Series(dtype='float64')
        )
        views_are_simulated = not actual_views.notna().any()

        if not views_are_simulated:
            average_views_per_post = actual_views.mean()
            top_performing_post_views = actual_views.max()
            top_post_row = vendor_posts.loc[actual_views.idxmax()]
            top_performing_post_text = top_post_row['Message']
            print("\nUsing actual 'Views' data from CSV (assuming scraper update).")
        else:
            if has_views_column:
                print("\nWARNING: 'Views' column contains no numeric values. Simulating 'Average Views per Post' for demonstration.")
            else:
                print("\nWARNING: 'Views' column not found in data. Simulating 'Average Views per Post' for demonstration.")
            # Manually pick a few posts from your `vendor_posts` DataFrame and roughly estimate their views.
            # You can inspect the posts directly in the raw data file.
            simulated_views_list = [500, 750, 1200, 300, 900, 600, 1500] # Example simulated views for several posts
            average_views_per_post = sum(simulated_views_list) / len(simulated_views_list)
            top_performing_post_views = max(simulated_views_list)
            # You would manually pick the message for the top post, here's a placeholder
            top_performing_post_text = "Simulated: (Post with highest views - manually verify this in your raw data)"

        # --- C. Business Profile: Average Price Point (from SIMULATED NER) ---
        # As your NER model is not yet highly accurate, we are SIMULATING extracted prices here.
        # In a real production system, this would come from the output of your fine-tuned, accurate NER model.

        # Go to your `telegram_data.csv`, filter for your target_vendor_username,
        # and manually identify a few prices from their messages.
        simulated_prices_etb = [1500, 3000, 850, 4500, 1000, 2800, 5000] # Manually chosen example prices; replace with prices from the target vendor's posts
        average_price_point_etb = sum(simulated_prices_etb) / len(simulated_prices_etb) if simulated_prices_etb else 0
        print(f"Using {len(simulated_prices_etb)} manually identified prices for Average Price Point demonstration.")

        # --- D. Top Performing Post's Product & Price (from SIMULATED NER) ---
        # This would also come from the NER model's extraction on the top-viewed post.
        top_post_product_simulated = "Simulated: Product A (from highest view post)"
        top_post_price_simulated = "Simulated: 2500 ETB"

        # --- E. Create a Final "Lending Score" ---
        # Conceptual weights; adjust them to reflect EthioMart's priorities.
        weight_freq = 0.4
        weight_views = 0.3
        weight_price = 0.3

        lending_score = compute_lending_score(
            posting_frequency_per_week,
            average_views_per_post,
            average_price_point_etb,
            weight_freq=weight_freq,
            weight_views=weight_views,
            weight_price=weight_price,
        )

        # 4. Present Summary Table for Report
        # Label the view metrics by where they really came from, so actual data
        # is not reported as simulated.
        views_label = "Simulated" if views_are_simulated else "Actual"
        average_views_heading = f"Average Views/Post ({views_label}):"
        top_views_heading = f"Top Post Views ({views_label}):"

        print("\n--- Vendor Scorecard Summary for Report ---")
        print(f"Vendor: {target_vendor_username}")
        print("-" * 40)
        print(f"Posting Frequency (posts/week):   {posting_frequency_per_week:.2f}")
        print(f"{average_views_heading:<34}{average_views_per_post:.2f}")
        print(f"Average Price Point (ETB, Sim.):  {average_price_point_etb:.2f}")
        print(f"{top_views_heading:<34}{top_performing_post_views:.0f}")
        print(f"Top Post Product (Simulated NER): {top_post_product_simulated}")
        print(f"Top Post Price (Simulated NER):   {top_post_price_simulated}")
        print(f"Calculated Lending Score:         {lending_score:.2f}")
        print("-" * 40)

        print("\nNote: This demonstration utilizes manually identified (simulated) NER extractions and conceptualized metrics where raw data was unavailable (e.g., 'Views'), due to project constraints.")
        print("In a production system, these metrics would be calculated automatically using outputs from a highly accurate NER model and comprehensive metadata scraping.")

    else:
        print(f"No posts found for vendor: {target_vendor_username}. Check username or if data was scraped.")
else:
    print("Could not proceed with Vendor Scorecard: Raw scraped data not loaded or is empty.")