# Task 1: Data Ingestion and Preprocessing
## Amharic E-commerce NER Project

This notebook implements data collection from Ethiopian Telegram e-commerce channels and preprocessing for NER tasks.

### Objectives:
- Connect to relevant Telegram channels
- Implement message ingestion system
- Preprocess Amharic text data
- Clean and structure data into unified format
- Store preprocessed data for analysis

## 1. Setup and Imports

In [1]:
import asyncio
import os
import logging
import json
import pandas as pd
import sys
from pathlib import Path
from dotenv import load_dotenv
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Add src to path for imports
sys.path.append(str(Path.cwd().parent / 'src'))

# Import custom modules
from data_collection.telegram_scraper import TelegramScraper
from preprocessing.amharic_processor import AmharicTextProcessor
from preprocessing.data_unifier import DataUnifier

print("✅ All imports successful!")
print(f"📅 Execution started at: {datetime.now()}")

✅ All imports successful!
📅 Execution started at: 2025-06-23 20:06:30.605528


## 2. Logging Setup

In [11]:
def setup_notebook_logging():
    """Setup logging for notebook execution"""
    # Create logs directory
    logs_dir = Path("../logs")
    logs_dir.mkdir(exist_ok=True)
    
    # Clear any existing handlers
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    
    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('../logs/task1_notebook.log', encoding='utf-8'),
            logging.StreamHandler()
        ]
    )
    
    return logging.getLogger('task1_notebook')

logger = setup_notebook_logging()
logger.info("Task 1 notebook logging initialized")
print("📝 Logging setup complete")

2025-06-23 19:40:01,103 - task1_notebook - INFO - Task 1 notebook logging initialized


📝 Logging setup complete


## 3. Configuration and Environment Setup

In [12]:
# Load environment variables
load_dotenv()

# Check API credentials
api_id = os.getenv('TELEGRAM_API_ID')
api_hash = os.getenv('TELEGRAM_API_HASH')

if not api_id or not api_hash:
    print("❌ Error: Telegram API credentials not found!")
    print("Please set TELEGRAM_API_ID and TELEGRAM_API_HASH in your .env file")
else:
    print("✅ Telegram API credentials loaded successfully")
    print(f"API ID: {api_id[:3]}...{api_id[-3:]}")
    
# Create necessary directories
directories = [
    "../data/raw/telegram_scrapes",
    "../data/processed",
    "../logs"
]

for directory in directories:
    Path(directory).mkdir(parents=True, exist_ok=True)
    
print("📁 Directory structure created")

✅ Telegram API credentials loaded successfully
API ID: 279...109
📁 Directory structure created


## 4. Channel Configuration

In [13]:
# Display target channels configuration
config_path = "../config/channels.yaml"

if Path(config_path).exists():
    import yaml
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    
    print("🎯 Target Telegram Channels:")
    for i, channel in enumerate(config['telegram_channels'], 1):
        description = channel.get('description', 'No description')
        print(f"  {i}. {channel['username']} - {description}")
    
    print(f"\n⚙️ Scraping Configuration:")
    scraping_config = config['scraping_config']
    for key, value in scraping_config.items():
        print(f"  • {key}: {value}")
else:
    print("⚠️ Warning: channels.yaml not found. Using default configuration.")

🎯 Target Telegram Channels:
  1. ZemenExpress - No description
  2. nevacomputer - No description
  3. meneshayeofficial - No description
  4. ethio_brand_collection - No description
  5. Leyueqa - No description
  6. Shewabrand - No description
  7. helloomarketethiopia - No description
  8. modernshoppingcenter - No description
  9. qnashcom - No description
  10. Fashiontera - No description
  11. kuruwear - No description
  12. gebeyaadama - No description
  13. forfreemarket - No description
  14. classybrands - No description
  15. marakibrand - No description
  16. aradabrand2 - No description
  17. @marakisat2 - No description
  18. belaclassic - No description
  19. AwasMart - No description
  20. qnashcom - No description

⚙️ Scraping Configuration:
  • max_messages_per_channel: 1000
  • date_range_days: 60
  • include_media: True
  • retry_attempts: 3
  • delay_between_requests: 2


## 5. Data Ingestion - Telegram Scraping

In [None]:
async def run_telegram_scraping():
    """Execute Telegram scraping process"""
    logger.info("Starting Telegram data scraping")
    
    try:
        # Initialize scraper
        scraper = TelegramScraper(api_id, api_hash, '../config/channels.yaml')
        print("🔌 Connecting to Telegram...")
        
        # Start session
        await scraper.start_session()
        print("✅ Connected to Telegram successfully")
        
        # Scrape all channels
        print("🚀 Starting data collection from channels...")
        all_data = await scraper.scrape_all_channels()
        
        # Display results
        total_messages = 0
        print("\n📊 Scraping Results:")
        for channel, messages in all_data.items():
            message_count = len(messages)
            total_messages += message_count
            print(f"  • {channel}: {message_count} messages")
        
        print(f"\n🎉 Total messages collected: {total_messages}")
        
        # Close session
        await scraper.close_session()
        print("🔐 Telegram session closed")
        
        return all_data
        
    except Exception as e:
        logger.error(f"Error during scraping: {str(e)}")
        print(f"❌ Scraping failed: {str(e)}")
        return None

# Run the scraping
scraping_results = await run_telegram_scraping()

2025-06-23 19:40:28,559 - task1_notebook - INFO - Starting Telegram data scraping
2025-06-23 19:40:28,584 - telethon.network.mtprotosender - INFO - Connecting to 149.154.167.51:443/TcpFull...
2025-06-23 19:40:28,730 - telethon.network.mtprotosender - INFO - Connection to 149.154.167.51:443/TcpFull complete!


🔌 Connecting to Telegram...


## 6. Data Analysis and Validation

In [13]:
if scraping_results:
    # Analyze scraped data
    print("🔍 Analyzing scraped data...")
    
    # Combine all messages for analysis
    all_messages = []
    for channel, messages in scraping_results.items():
        all_messages.extend(messages)
    
    # Create DataFrame for analysis
    df_raw = pd.DataFrame(all_messages)
    
    print(f"\n📈 Raw Data Statistics:")
    print(f"  • Total messages: {len(df_raw)}")
    print(f"  • Unique channels: {df_raw['channel_username'].nunique()}")
    print(f"  • Messages with text: {df_raw['text'].notna().sum()}")
    print(f"  • Messages with media: {df_raw['has_media'].sum()}")
    print(f"  • Date range: {df_raw['date'].min()} to {df_raw['date'].max()}")
    
    # Channel distribution
    print(f"\n📊 Messages per channel:")
    channel_counts = df_raw['channel_username'].value_counts()
    for channel, count in channel_counts.items():
        print(f"  • {channel}: {count} messages")
    
    # Sample messages
    print(f"\n📝 Sample messages:")
    sample_messages = df_raw['text'].dropna().head(3)
    for i, message in enumerate(sample_messages, 1):
        print(f"  {i}. {message[:100]}...")
else:
    print("❌ No data to analyze - scraping may have failed")

NameError: name 'scraping_results' is not defined

## 7. Data Preprocessing and Unification

In [5]:
def run_data_unification():
    """Execute data unification and preprocessing"""
    # Use logger if available, otherwise fallback to print
    log = logger if 'logger' in globals() else None
    if log:
        log.info("Starting data unification and preprocessing")
    else:
        print("INFO: Starting data unification and preprocessing")
    
    try:
        print("🔄 Starting data unification and preprocessing...")
        
        # Initialize data unifier
        unifier = DataUnifier("../data/raw/telegram_scrapes")
        
        # Create unified dataset
        unified_df = unifier.create_unified_dataset()
        
        print(f"\n✅ Data unification completed!")
        print(f"📊 Unified dataset statistics:")
        print(f"  • Total processed messages: {len(unified_df)}")
        print(f"  • Amharic messages: {unified_df['is_amharic'].sum()}")
        print(f"  • Messages with price hints: {unified_df['has_price_hints'].sum()}")
        print(f"  • Messages with location hints: {unified_df['has_location_hints'].sum()}")
        print(f"  • Messages with product hints: {unified_df['has_product_hints'].sum()}")
        print(f"  • Average token count: {unified_df['token_count'].mean():.2f}")
        
        return unified_df
        
    except Exception as e:
        if log:
            log.error(f"Error during data unification: {str(e)}")
        print(f"❌ Data unification failed: {str(e)}")
        return None

# Run data unification
unified_dataset = run_data_unification()

INFO: Starting data unification and preprocessing
🔄 Starting data unification and preprocessing...

✅ Data unification completed!
📊 Unified dataset statistics:
  • Total processed messages: 1403
  • Amharic messages: 502
  • Messages with price hints: 951
  • Messages with location hints: 745
  • Messages with product hints: 77
  • Average token count: 70.09


## 8. Amharic Text Processing Analysis

In [6]:
if unified_dataset is not None:
    # Analyze Amharic text processing results
    print("🔤 Analyzing Amharic text processing...")
    
    # Initialize text processor for demonstration
    processor = AmharicTextProcessor()
    
    # Sample text analysis
    amharic_messages = unified_dataset[unified_dataset['is_amharic'] == True]
    
    if len(amharic_messages) > 0:
        print(f"\n📝 Sample Amharic text processing:")
        
        sample_text = amharic_messages['cleaned_text'].iloc[0]
        print(f"\nOriginal text: {sample_text}")
        
        # Tokenization
        tokens = processor.tokenize_amharic(sample_text)
        print(f"Tokens ({len(tokens)}): {tokens[:10]}...")
        
        # Entity hints
        entity_hints = processor.extract_entities_hints(sample_text)
        print(f"\nEntity hints found:")
        for entity_type, hints in entity_hints.items():
            if hints:
                print(f"  • {entity_type}: {hints}")
    
    # Entity distribution analysis
    print(f"\n📈 Entity hints distribution:")
    entity_stats = {
        'Price hints': unified_dataset['has_price_hints'].sum(),
        'Location hints': unified_dataset['has_location_hints'].sum(),
        'Product hints': unified_dataset['has_product_hints'].sum()
    }
    
    for entity_type, count in entity_stats.items():
        percentage = (count / len(unified_dataset)) * 100
        print(f"  • {entity_type}: {count} messages ({percentage:.1f}%)")

🔤 Analyzing Amharic text processing...

📝 Sample Amharic text processing:

Original text: MAGIC REMOTE ሸጠን ጨርሰናል! ክፍለሀገር ያላችሁ ደንበኞቻችንም ይሄን እቃ ብላችሁ ብር አታስገቡ.
Tokens (14): ['MAGIC', 'REMOTE', 'ሸጠን', 'ጨርሰናል', '!', 'ክፍለሀገር', 'ያላችሁ', 'ደንበኞቻችንም', 'ይሄን', 'እቃ']...

Entity hints found:

📈 Entity hints distribution:
  • Price hints: 951 messages (67.8%)
  • Location hints: 745 messages (53.1%)
  • Product hints: 77 messages (5.5%)


## 9. Data Quality Assessment

In [7]:
if unified_dataset is not None:
    print("🔍 Assessing data quality...")
    
    # Quality metrics
    quality_metrics = {
        'Total messages': len(unified_dataset),
        'Amharic messages': unified_dataset['is_amharic'].sum(),
        'Non-empty messages': unified_dataset['cleaned_text'].notna().sum(),
        'Messages with entities': len(unified_dataset[
            unified_dataset['has_price_hints'] | 
            unified_dataset['has_location_hints'] | 
            unified_dataset['has_product_hints']
        ]),
        'Average message length': unified_dataset['token_count'].mean(),
        'Min message length': unified_dataset['token_count'].min(),
        'Max message length': unified_dataset['token_count'].max()
    }
    
    print("\n📊 Data Quality Metrics:")
    for metric, value in quality_metrics.items():
        if isinstance(value, float):
            print(f"  • {metric}: {value:.2f}")
        else:
            print(f"  • {metric}: {value}")
    
    # Recommendations
    print("\n💡 Quality Assessment:")
    amharic_ratio = unified_dataset['is_amharic'].sum() / len(unified_dataset)
    entity_ratio = quality_metrics['Messages with entities'] / len(unified_dataset)
    
    if amharic_ratio > 0.7:
        print(f"  ✅ Good Amharic content ratio: {amharic_ratio:.1%}")
    else:
        print(f"  ⚠️ Low Amharic content ratio: {amharic_ratio:.1%}")
    
    if entity_ratio > 0.3:
        print(f"  ✅ Good entity coverage: {entity_ratio:.1%}")
    else:
        print(f"  ⚠️ Low entity coverage: {entity_ratio:.1%}")
    
    if quality_metrics['Average message length'] > 5:
        print(f"  ✅ Good average message length")
    else:
        print(f"  ⚠️ Short average message length")

🔍 Assessing data quality...

📊 Data Quality Metrics:
  • Total messages: 1403
  • Amharic messages: 502
  • Non-empty messages: 1403
  • Messages with entities: 1034
  • Average message length: 70.09
  • Min message length: 3
  • Max message length: 274

💡 Quality Assessment:
  ⚠️ Low Amharic content ratio: 35.8%
  ✅ Good entity coverage: 73.7%
  ✅ Good average message length


## 10. Save Results and Summary

In [12]:
# Generate final summary
task1_summary = {
    'execution_time': datetime.now().isoformat(),
    'status': 'completed' if unified_dataset is not None else 'failed',
    'data_collected': len(unified_dataset) if unified_dataset is not None else 0,
    #'channels_processed': len(scraping_results) if scraping_results else 0,
    'files_created': [
        '../data/raw/telegram_scrapes/',
        '../data/processed/unified_dataset.csv',
        '../data/processed/unified_dataset.json',
        '../data/processed/dataset_summary.json'
    ],
    'next_steps': [
        'Review data quality',
        'Proceed to Task 2: Data Labeling',
        'Prepare CoNLL format dataset'
    ]
}

# Save task summary
summary_path = Path("../logs/task1_summary.json")
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(task1_summary, f, ensure_ascii=False, indent=2)

print("\n🎉 Task 1 Execution Summary:")
print(f"  • Status: {task1_summary['status'].upper()}")
print(f"  • Messages processed: {task1_summary['data_collected']}")
#print(f"  • Channels processed: {task1_summary['channels_processed']}")
print(f"  • Summary saved to: {summary_path}")

if unified_dataset is not None:
    print("\n✅ Task 1 completed successfully!")
    print("📂 Output files:")
    for file_path in task1_summary['files_created']:
        print(f"  • {file_path}")
    print("\n🚀 Ready for Task 2: Data Labeling")
else:
    print("\n❌ Task 1 completed with errors")
    print("Please check the logs for details")

print("Task 1 notebook execution completed")


🎉 Task 1 Execution Summary:
  • Status: COMPLETED
  • Messages processed: 1403
  • Summary saved to: ..\logs\task1_summary.json

✅ Task 1 completed successfully!
📂 Output files:
  • ../data/raw/telegram_scrapes/
  • ../data/processed/unified_dataset.csv
  • ../data/processed/unified_dataset.json
  • ../data/processed/dataset_summary.json

🚀 Ready for Task 2: Data Labeling
Task 1 notebook execution completed
