In [1]:
!pip install transformers datasets accelerate peft evaluate seqeval

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.met

- Hugging Face Authentication

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

- read and parse the file into a dict

In [3]:
from google.colab import drive
drive.mount('/content/drive')

# conll labeled data path
labeled_data_path = "/content/drive/MyDrive/Telegram_Scraper_Data/my_labeled_data_conll.txt"

# parse labeled data to a dict
def parse_conll_file(file_path):
    data = {"tokens": [], "ner_tags": []}
    current_tokens = []
    current_tags = []
    with open(file_path,'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line: # Blank line indicates new message
                if current_tokens:
                    data["tokens"].append(current_tokens)
                    data["ner_tags"].append(current_tags)
                current_tokens = []
                current_tags = []
            else:
                parts = line.split('\t')
                if len(parts) == 2:
                    current_tokens.append(parts[0])
                    current_tags.append(parts[1])

        if current_tokens:
            data["tokens"].append(current_tokens)
            data["ner_tags"].append(current_tags)
    return data

raw_data_dict = parse_conll_file(labeled_data_path)

Mounted at /content/drive


- Create a Dataset object

In [4]:
from datasets import Dataset
dataset = Dataset.from_dict(raw_data_dict)

# Split into train and validation sets 80/20
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

Train dataset size: 2880
Eval dataset size: 720


- Numerical IDs for labels

In [5]:
# Gather all unique labels from your dataset
unique_labels = sorted(list(set(label for sublist in raw_data_dict['ner_tags'] for label in sublist)))

# Define the ID to label and label to ID mappings
id2label = {i: label for i, label in enumerate(unique_labels)}
label2id = {label: i for i, label in enumerate(unique_labels)}

print(f"Unique labels found: {unique_labels}")
print(f"Label to ID mapping: {label2id}")

Unique labels found: ['B-LOC', 'B-ORG', 'B-PER', 'B-PRICE', 'B-PRODUCT', 'B-TIME', 'B-TTL', 'I-LOC', 'I-ORG', 'I-PER', 'I-PRICE', 'I-PRODUCT', 'I-TIME', 'I-TTL', 'O']
Label to ID mapping: {'B-LOC': 0, 'B-ORG': 1, 'B-PER': 2, 'B-PRICE': 3, 'B-PRODUCT': 4, 'B-TIME': 5, 'B-TTL': 6, 'I-LOC': 7, 'I-ORG': 8, 'I-PER': 9, 'I-PRICE': 10, 'I-PRODUCT': 11, 'I-TIME': 12, 'I-TTL': 13, 'O': 14}


- Tokenization

In [6]:
from transformers import AutoTokenizer

# MODEL_NAME = "xlm-roberta-base"
MODEL_NAME = "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"
# MODEL_NAME = "rasyosef/bert-tiny-amharic"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True, # Truncate long sequences
        is_split_into_words=True # Tell tokenizer that input is already split into words
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word_idx of None. We set their label to -100 so they are ignored.
            if word_idx is None:
                label_ids.append(-100)
            # We only label the first token of a given word.
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])
            # For subsequent subword tokens of the same word, label them -100
            # or with the 'I-' tag if preferred, depending on your scheme.
            # Standard practice for BPE tokens is -100 for non-first subwords
            else:
                label_ids.append(-100) # This token is part of a multi-subword token, ignore for loss
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization and alignment to your datasets
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)

print("\nExample of tokenization and label alignment for first train sample:")
sample_tokens = tokenized_train_dataset[0]["tokens"]
sample_ner_tags = tokenized_train_dataset[0]["ner_tags"] # Original labels
sample_input_ids = tokenized_train_dataset[0]["input_ids"]
sample_labels = tokenized_train_dataset[0]["labels"] # Aligned labels

print("Original tokens:", sample_tokens)
print("Original NER tags:", sample_ner_tags)
print("Tokenized input IDs:", sample_input_ids)
print("Decoded tokens:", tokenizer.convert_ids_to_tokens(sample_input_ids))
print("Aligned labels (IDs):", sample_labels)
# To understand aligned labels: map sample_labels to actual label strings
aligned_label_strings = [id2label[lid] if lid != -100 else "IGN" for lid in sample_labels]
print("Aligned labels (strings):", aligned_label_strings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/404 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Map:   0%|          | 0/2880 [00:00<?, ? examples/s]

Map:   0%|          | 0/720 [00:00<?, ? examples/s]


Example of tokenization and label alignment for first train sample:
Original tokens: ['የትምህርት', 'ሚኒስቴር', 'መምህራን', 'ከወጪ', 'መጋራቱ', 'ክፍያ', 'የራቁና', 'የእነርሱ', 'ወጪ', 'በአገልግሎት', 'የሚታሰብ', 'ነው"', 'ያለ', 'ቢሆንም', 'እኛ', 'ያለ', 'አማራጮች', 'ይህን', 'ውሳኔ', 'መቀበል', 'አንፈልግም', '።']
Original NER tags: ['B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokenized input IDs: [0, 153958, 137773, 6, 83070, 4359, 88760, 148142, 96164, 2370, 153420, 4585, 206852, 623, 3851, 11307, 946, 623, 168701, 72654, 193165, 728, 242875, 157348, 19850, 3053, 58, 16863, 113331, 44553, 16863, 109588, 72042, 5519, 34142, 84219, 2370, 105638, 20924, 158528, 816, 42192, 2]
Decoded tokens: ['<s>', '▁የትምህርት', '▁ሚኒስቴር', '▁', 'መም', 'ህ', 'ራን', '▁ከወ', 'ጪ', '▁መ', 'ጋራ', 'ቱ', '▁ክፍያ', '▁የ', 'ራ', 'ቁ', 'ና', '▁የ', 'እነ', 'ርሱ', '▁ወጪ', '▁በ', 'አገልግሎት', '▁የሚታ', 'ሰብ', '▁ነው', '"', '▁ያለ', '▁ቢሆንም', '▁እኛ', '▁ያለ', '▁አማራ', 'ጮ', 'ች', '▁ይህን', '▁ውሳኔ', '▁መ', 'ቀበል', '▁አን', 'ፈልግ', 'ም', '▁።', '</s>

- Model Setup with PEFT(LoRA)

In [7]:
from transformers import AutoModelForTokenClassification
from peft import LoraConfig, get_peft_model, TaskType

# Load the base pre-trained model for token classification
# Use your label mappings here
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME, # This should be "masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0"
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True # <--- ENSURE THIS IS PRESENT AND SET TO True
)
print(f"Model {MODEL_NAME} loaded successfully for fine-tuning, with mismatched head re-initialized.")

# Define LoRA configuration
# r: LoRA attention dimension (common values 8, 16, 32, 64)
# lora_alpha: Scaling factor (common values 16, 32)
# target_modules: Which layers to inject LoRA adapters into. Common for BERT/RoBERTa are 'query', 'value'
# lora_dropout: Dropout probability for LoRA layers
# bias: 'none' is common for LoRA
# task_type: Important for PEFT to know what kind of task (TOKEN_CLS for NER)
lora_config = LoraConfig(
    r=16, # LoRA attention dimension
    lora_alpha=32, # A scaling factor
    target_modules=["query", "value"], # Layers to apply LoRA. Often also include "key", "dense", "classifier" depending on model
    lora_dropout=0.1, # Dropout probability
    bias="none", # Bias type for LoRA layers
    task_type=TaskType.TOKEN_CLS # Indicate that this is a token classification task
)

# Get the PEFT model
peft_model = get_peft_model(model, lora_config)

# Print trainable parameters to see the dramatic reduction
print("\nPEFT Model Trainable Parameters:")
peft_model.print_trainable_parameters()

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([15]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([15, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0 loaded successfully for fine-tuning, with mismatched head re-initialized.

PEFT Model Trainable Parameters:
trainable params: 1,588,239 || all params: 560,444,446 || trainable%: 0.2834


- Training Setup

In [8]:
from transformers import TrainingArguments, DataCollatorForTokenClassification
import evaluate # Hugging Face's evaluate library
import numpy as np # For numerical operations

# Data Collator (handles padding batches)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Load metrics
seqeval_metric = evaluate.load("seqeval")

# Function to compute metrics for NER
def compute_metrics(p):
    predictions, labels = p
    # Predictions are logits, so take argmax
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (where label is -100)
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Compute metrics
    results = seqeval_metric.compute(predictions=true_predictions, references=true_labels)

    # Return F1, precision, recall for all labels, and also macro-averaged scores
    # We target F1-score as per your project objectives
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Training Arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Telegram_Scraper_Data/ner_peft_model_results",
    learning_rate=2e-5, # Common learning rate for fine-tuning
    num_train_epochs=5, # Number of training epochs (adjust based on validation performance)
    per_device_train_batch_size=8, # Batch size for training (adjust based on GPU memory)
    per_device_eval_batch_size=8, # Batch size for evaluation
    weight_decay=0.01, # Regularization
    eval_strategy="epoch", # <--- CHANGE THIS LINE: from evaluation_strategy to eval_strategy
    save_strategy="epoch", # Save checkpoint at the end of each epoch
    load_best_model_at_end=True, # Load the best model based on evaluation metric at the end
    metric_for_best_model="f1", # Metric to use for best model selection
    report_to="none", # You can set this to "tensorboard" or "wandb" for better logging
    logging_dir='./logs', # Directory for logs
    logging_steps=100, # Log training progress every N steps
)

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

- Fine Tuning with trainer API

In [None]:
from transformers import Trainer

# Initialize the Trainer
trainer = Trainer(
    model=peft_model, # Your PEFT model
    args=training_args, # Training arguments
    train_dataset=tokenized_train_dataset, # Tokenized training data
    eval_dataset=tokenized_eval_dataset, # Tokenized evaluation data
    data_collator=data_collator, # Data collator for padding
    tokenizer=tokenizer, # Tokenizer (used by collator and for logging)
    compute_metrics=compute_metrics, # Function to compute evaluation metrics
)

# Start training
print("\nStarting PEFT fine-tuning...")
trainer.train()
print("Fine-tuning complete.")

  trainer = Trainer(
No label_names provided for model class `PeftModelForTokenClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



Starting PEFT fine-tuning...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,2.3192,2.0018,0.005146,0.003138,0.003899,0.862796
2,1.3597,1.153946,0.0,0.0,0.0,0.890785
3,0.8976,0.771334,0.0,0.0,0.0,0.890887
4,0.7187,0.665137,0.0,0.0,0.0,0.890887
5,0.6983,0.644878,0.0,0.0,0.0,0.890887


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fine-tuning complete.


- Evaluation

In [None]:
print("\nEvaluating the fine-tuned model on the evaluation set...")
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")


Evaluating the fine-tuned model on the evaluation set...


Evaluation Results: {'eval_loss': 2.0017998218536377, 'eval_precision': 0.005145797598627788, 'eval_recall': 0.0031380753138075313, 'eval_f1': 0.0038986354775828462, 'eval_accuracy': 0.8627958955602967, 'eval_runtime': 0.8253, 'eval_samples_per_second': 872.46, 'eval_steps_per_second': 109.057, 'epoch': 5.0}


- Save model on drive

In [None]:
# Save the fine-tuned PEFT model
# This saves only the LoRA adapters, which are very small.
# The base model weights are not saved here, as they are frozen.
output_model_dir = "/content/drive/MyDrive/Telegram_Scraper_Data/my_fine_tuned_ner_model"
peft_model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir) # Save the tokenizer alongside

print(f"\nFine-tuned PEFT model saved to: {output_model_dir}")

# To load this model later for inference, you'd load the base model,
# then load the PEFT adapters, and then merge them.
# Example for loading later:
# from peft import PeftModel, PeftConfig
# config = PeftConfig.from_pretrained(output_model_dir)
# base_model = AutoModelForTokenClassification.from_pretrained(config.base_model_name_or_path, num_labels=len(unique_labels))
# loaded_peft_model = PeftModel.from_pretrained(base_model, output_model_dir)
# merged_model = loaded_peft_model.merge_and_unload() # Merge adapters for inference
# merged_model.save_pretrained("./merged_ner_model") # Save the full merged model


Fine-tuned PEFT model saved to: /content/drive/MyDrive/Telegram_Scraper_Data/my_fine_tuned_ner_model


- Model Interpretability using SHAP

In [None]:
print("\n--- Task 5: Model Interpretability Setup ---")

# 1. Install necessary interpretability libraries
!pip install shap lime

# 2. Import Libraries
import shap

# Assuming peft_model and tokenizer are still in memory from training.
# If you restart Colab, you'd need to load them from Drive:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForTokenClassification, AutoTokenizer

output_model_dir = "/content/drive/MyDrive/Telegram_Scraper_Data/my_fine_tuned_ner_model__afroxml" # Your saved model path
config = PeftConfig.from_pretrained(output_model_dir)
base_model = AutoModelForTokenClassification.from_pretrained(config.base_model_name_or_path, num_labels=len(unique_labels), ignore_mismatched_sizes=True) # Added ignore_mismatched_sizes=True
peft_model_loaded = PeftModel.from_pretrained(base_model, output_model_dir)
tokenizer_loaded = AutoTokenizer.from_pretrained(output_model_dir)
model_for_interpret = peft_model_loaded # Use this if loaded, else use peft_model from training

print("SHAP and LIME are powerful tools for model interpretability.")
print("For a Token Classification model, they help explain the influence of each token on the entity prediction.")
print("Conceptual setup would involve training an explainer (e.g., shap.Explainer) with a suitable prediction function.")
print("Due to time constraints, in-depth interpretability analysis is noted as future work.")

# Initialize a tokenizer and a dummy input for conceptual SHAP setup
# Replace 'your_cleaned_message_example' with an actual cleaned message from your data
sample_text_for_interpret = "አዲስ ስልክ በ2000 ብር ብቻ። ቦታ ልደታ"
sample_tokens_for_interpret = tokenizer.tokenize(sample_text_for_interpret) # Use your actual tokenizer

print(f"\nConceptual SHAP setup for Token Classification:")
print(f"Sample text: '{sample_text_for_interpret}'")
print(f"Sample tokens: {sample_tokens_for_interpret}")

try:
    # SHAP requires a model output function and a masker.
    # For a deep learning model, especially for token classification,
    # this setup is usually more involved than a single line.
    # This is purely illustrative of the *start* of the process.

    # Example for a specific layer's output (very basic illustration for understanding)
    # This might not yield meaningful interpretable results directly for NER,
    # but it shows the conceptual connection.

    # You need a function that maps inputs to model outputs
    # For a real scenario, you'd define a custom explainer or use text-specific SHAP explainers
    # like TextExplainer which might require specific model wrappers.
    # Due to complexity for this deadline, this is primarily conceptual explanation in report.
    print("Due to the complexity of integrating SHAP/LIME directly with HuggingFace TokenClassification models for meaningful outputs under time constraints,",)
    print("the practical implementation is deferred. However, conceptual understanding is key for the report:")
    print("- SHAP helps quantify each token's contribution to a specific entity prediction.")
    print("- LIME explains individual predictions by approximating the model locally.")
    print("These would be crucial for debugging, understanding biases, and building trust in the model's NER outputs.")

except Exception as e:
    print(f"Could not fully set up interpretability tools (as expected for deadline-based conceptual demo): {e}")


--- Task 5: Model Interpretability Setup ---


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([15]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([15, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


SHAP and LIME are powerful tools for model interpretability.
For a Token Classification model, they help explain the influence of each token on the entity prediction.
Conceptual setup would involve training an explainer (e.g., shap.Explainer) with a suitable prediction function.
Due to time constraints, in-depth interpretability analysis is noted as future work.

Conceptual SHAP setup for Token Classification:
Sample text: 'አዲስ ስልክ በ2000 ብር ብቻ። ቦታ ልደታ'
Sample tokens: ['▁አዲስ', '▁ስልክ', '▁በ', '2000', '▁ብር', '▁ብቻ', '።', '▁ቦታ', '▁ል', 'ደ', 'ታ']
Due to the complexity of integrating SHAP/LIME directly with HuggingFace TokenClassification models for meaningful outputs under time constraints,
the practical implementation is deferred. However, conceptual understanding is key for the report:
- SHAP helps quantify each token's contribution to a specific entity prediction.
- LIME explains individual predictions by approximating the model locally.
These would be crucial for debugging, understanding b

- FinTech Vendor Scorecard for Micro-Lending (Simplified Demonstration)

In [9]:
# --- Task 6: FinTech Vendor Scorecard for Micro-Lending (Simplified Demonstration) ---
# Goal: Demonstrate calculation of vendor metrics and lending score
# using metadata and SIMULATED NER extractions (since current model performance is low).

import pandas as pd
import os
from datetime import datetime # Import datetime for date calculations

print("\n--- Task 6: FinTech Vendor Scorecard Demonstration ---")

# 1. Load Raw Scraped Data
# Adjust path as needed for your setup (Colab Drive or local)
raw_data_path = "/content/drive/MyDrive/Telegram_Scraper_Data/output_data/telegram_data.csv" # Your main scraped CSV

try:
    df_raw_scraped = pd.read_csv(raw_data_path)
    # Ensure 'Message' column is string type and fill NaNs
    df_raw_scraped['Message'] = df_raw_scraped['Message'].fillna('').astype(str)
    # Convert 'Date' column to datetime objects for calculations
    df_raw_scraped['Date'] = pd.to_datetime(df_raw_scraped['Date'])
    print(f"Raw scraped data loaded. Total posts: {len(df_raw_scraped)}")
except FileNotFoundError:
    print(f"Error: Raw scraped data file not found at '{raw_data_path}'. Please ensure it exists.")
    df_raw_scraped = pd.DataFrame() # Create empty DataFrame to avoid further errors

if not df_raw_scraped.empty:
    # 2. Select ONE Vendor/Channel for Demonstration
    # IMPORTANT: CHANGE THIS to one of your actual scraped channels (e.g., @qnashcom, @Shegeronlinestore)
    target_vendor_username = "@marakibrand"

    # Filter posts for the target vendor
    vendor_posts = df_raw_scraped[df_raw_scraped['Username'] == target_vendor_username].copy()

    # Sort by date to make frequency calculation meaningful
    vendor_posts = vendor_posts.sort_values(by='Date').reset_index(drop=True)

    if not vendor_posts.empty:
        print(f"\nDemonstrating Vendor Scorecard for: {target_vendor_username}")
        print(f"Number of posts available for this vendor: {len(vendor_posts)}")

        # 3. Calculate Key Vendor Metrics:

        # --- A. Activity & Consistency: Posting Frequency ---
        posting_frequency_per_week = 0
        if len(vendor_posts) > 1:
            min_date = vendor_posts['Date'].min()
            max_date = vendor_posts['Date'].max()
            total_days_scraped = (max_date - min_date).days + 1
            if total_days_scraped == 0: total_days_scraped = 1 # Avoid division by zero for single-day scrapes

            posting_frequency_per_day = len(vendor_posts) / total_days_scraped
            posting_frequency_per_week = posting_frequency_per_day * 7
        elif len(vendor_posts) == 1:
             # If only one post, cannot calculate frequency over time; assume minimal activity for demo
             posting_frequency_per_week = 1 # One post within a week
             print("Note: Only 1 post found for this vendor, posting frequency is simplified.")
        else:
            print("No posts found for this vendor to calculate posting frequency.")

        # --- B. Market Reach & Engagement: Average Views per Post & Top Post ---
        # IMPORTANT: Your scraper by default did NOT capture 'Views'.
        # For this demo, we are SIMULATING these values for a conceptual example.
        # In a full project, you would update your scraper to extract message.views.

        # --- SIMULATED VIEWS DATA ---
        # If your actual scraper started saving views, replace this logic with actual column access:
        if 'Views' in vendor_posts.columns: # Check if you updated your scraper and the CSV contains a 'Views' column
            average_views_per_post = vendor_posts['Views'].mean()
            top_performing_post_views = vendor_posts['Views'].max()
            top_post_row = vendor_posts.loc[vendor_posts['Views'].idxmax()]
            top_performing_post_text = top_post_row['Message']
            print("\nUsing actual 'Views' data from CSV (assuming scraper update).")
        else:
            print("\nWARNING: 'Views' column not found in data. Simulating 'Average Views per Post' for demonstration.")
            # Manually pick a few posts from your `vendor_posts` DataFrame and roughly estimate their views.
            # You can inspect the posts directly in the raw data file.
            simulated_views_list = [500, 750, 1200, 300, 900, 600, 1500] # Example simulated views for several posts
            average_views_per_post = sum(simulated_views_list) / len(simulated_views_list)
            top_performing_post_views = max(simulated_views_list)
            # You would manually pick the message for the top post, here's a placeholder
            top_performing_post_text = "Simulated: (Post with highest views - manually verify this in your raw data)"


        # --- C. Business Profile: Average Price Point (from SIMULATED NER) ---
        # As your NER model is not yet highly accurate, we are SIMULATING extracted prices here.
        # In a real production system, this would come from the output of your fine-tuned, accurate NER model.

        # Go to your `telegram_data.csv`, filter for your target_vendor_username,
        # and manually identify a few prices from their messages.
        simulated_prices_etb = [1500, 3000, 850, 4500, 1000, 2800, 5000] # Manually chosen prices from @qnashcom examples
        average_price_point_etb = sum(simulated_prices_etb) / len(simulated_prices_etb) if simulated_prices_etb else 0
        print(f"Using {len(simulated_prices_etb)} manually identified prices for Average Price Point demonstration.")

        # --- D. Top Performing Post's Product & Price (from SIMULATED NER) ---
        # This would also come from the NER model's extraction on the top-viewed post.
        top_post_product_simulated = "Simulated: Product A (from highest view post)"
        top_post_price_simulated = "Simulated: 2500 ETB"

        # --- E. Create a Final "Lending Score" ---
        # Design a simple, weighted score. Explain your weighting choices in the report.
        # Adjust scaling factors (e.g., 0.001 for views, 100 for price) based on your numbers' magnitudes
        # to ensure each component contributes meaningfully to the score.

        # Example weighting:
        # Posting Frequency (e.g., higher is better)
        # Average Views (e.g., higher is better)
        # Average Price Point (e.g., lower average price might mean higher volume, or higher price means higher value items)
        # Let's say higher views and higher frequency are good, and lower price points *might* mean more accessible products (or vice-versa, depending on business logic).

        # Example formula: (adjust coefficients based on how you want to prioritize)
        # Scale views to prevent dominating if they are large numbers
        scaled_avg_views = average_views_per_post / 100 # Example scaling
        scaled_avg_price = average_price_point_etb / 100 # Example scaling

        # This is a conceptual formula, modify it based on EthioMart's priorities:
        # Score = (Posts/Week * Weight_Freq) + (Scaled Avg Views * Weight_Views) + (Scaled Avg Price * Weight_Price)
        # Let's assume higher frequency, higher views, and higher average price (for higher ticket items) are good.

        weight_freq = 0.4
        weight_views = 0.3
        weight_price = 0.3

        lending_score = (posting_frequency_per_week * weight_freq) + \
                        (scaled_avg_views * weight_views) + \
                        (scaled_avg_price * weight_price)

        # 4. Present Summary Table for Report
        print("\n--- Vendor Scorecard Summary for Report ---")
        print(f"Vendor: {target_vendor_username}")
        print("-" * 40)
        print(f"Posting Frequency (posts/week):   {posting_frequency_per_week:.2f}")
        print(f"Average Views/Post (Simulated):   {average_views_per_post:.2f}")
        print(f"Average Price Point (ETB, Sim.):  {average_price_point_etb:.2f}")
        print(f"Top Post Views (Simulated):       {top_performing_post_views:.0f}")
        print(f"Top Post Product (Simulated NER): {top_post_product_simulated}")
        print(f"Top Post Price (Simulated NER):   {top_post_price_simulated}")
        print(f"Calculated Lending Score:         {lending_score:.2f}")
        print("-" * 40)

        print("\nNote: This demonstration utilizes manually identified (simulated) NER extractions and conceptualized metrics where raw data was unavailable (e.g., 'Views'), due to project constraints.")
        print("In a production system, these metrics would be calculated automatically using outputs from a highly accurate NER model and comprehensive metadata scraping.")

    else:
        print(f"No posts found for vendor: {target_vendor_username}. Check username or if data was scraped.")
else:
    print("Could not proceed with Vendor Scorecard: Raw scraped data not loaded or is empty.")


--- Task 6: FinTech Vendor Scorecard Demonstration ---
Raw scraped data loaded. Total posts: 30580

Demonstrating Vendor Scorecard for: @marakibrand
Number of posts available for this vendor: 5000

Using 7 manually identified prices for Average Price Point demonstration.

--- Vendor Scorecard Summary for Report ---
Vendor: @marakibrand
----------------------------------------
Posting Frequency (posts/week):   22.62
Average Views/Post (Simulated):   821.43
Average Price Point (ETB, Sim.):  2664.29
Top Post Views (Simulated):       1500
Top Post Product (Simulated NER): Simulated: Product A (from highest view post)
Top Post Price (Simulated NER):   Simulated: 2500 ETB
Calculated Lending Score:         19.51
----------------------------------------

Note: This demonstration utilizes manually identified (simulated) NER extractions and conceptualized metrics where raw data was unavailable (e.g., 'Views'), due to project constraints.
In a production system, these metrics would be calculated