In [None]:
# Setup
import sqlite3
import json
import regex
from pathlib import Path
from typing import Dict, List, Any

# Initialize Amharic processor (copy from notebook 02)
class AmharicProcessor:
    """Simplified Amharic text processor for e-commerce data.

    Provides whitespace normalisation and keyword/regex based extraction of
    prices, locations and products from mixed Amharic/English text.
    """

    def __init__(self):
        # Each pattern has exactly one capture group (the numeric amount), so
        # ``findall`` yields plain strings.
        self.price_patterns = [
            # Amount followed by a currency marker, e.g. "2500 ብር", "150 birr".
            regex.compile(r'(\d+(?:,\d{3})*(?:\.\d{2})?)\s*(?:ብር|birr|ETB)', regex.IGNORECASE),
            # Amount preceded by a price word/preposition, e.g. "ዋጋ 2500", "በ 150".
            regex.compile(r'(?:ዋጋ|ዋጋው|በ)\s*(\d+(?:,\d{3})*(?:\.\d{2})?)', regex.IGNORECASE),
        ]
        self.location_keywords = ['አዲስ አበባ', 'አዲስ ዓባባ', 'ቦሌ', 'ገርጂ', 'ንግሥት', 'ማርካቶ', 'ፒያሳ']
        self.product_keywords = ['ቦርሳ', 'ሞባይል', 'ፎን', 'ልብስ', 'ሻምፖ', 'cream', 'lotion', 'bottle']

    def clean_text(self, text: str) -> str:
        """Collapse every run of whitespace to one space and trim the ends.

        Returns an empty string when ``text`` is falsy (``None`` or ``""``).
        """
        if not text:
            return ""
        return regex.sub(r'\s+', ' ', text).strip()

    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Extract prices, locations and products found in ``text``.

        Args:
            text: The message text to scan; a falsy value yields empty lists.

        Returns:
            A dict with the keys ``'prices'``, ``'locations'`` and
            ``'products'``. Each value is a duplicate-free list in a stable
            order: prices in match order (pattern by pattern), locations and
            products in the order of the configured keyword lists.
        """
        entities: Dict[str, List[str]] = {'prices': [], 'locations': [], 'products': []}

        if not text:
            return entities

        # Extract prices
        for price_pattern in self.price_patterns:
            entities['prices'].extend(price_pattern.findall(text))

        # Extract locations (case-sensitive substring match)
        entities['locations'] = [
            location for location in self.location_keywords if location in text
        ]

        # Extract products (case-insensitive); lower-case the text only once.
        lowered_text = text.lower()
        entities['products'] = [
            keyword for keyword in self.product_keywords
            if keyword.lower() in lowered_text
        ]

        # Remove duplicates while keeping first-seen order. A set would make
        # the order (and any JSON serialised from it) vary between runs.
        for key in entities:
            entities[key] = list(dict.fromkeys(entities[key]))

        return entities

# Shared processor instance; the message-processing cell calls its
# clean_text() and extract_entities() methods.
processor = AmharicProcessor()
print("Data processing setup complete")


In [None]:
# Load Sample Scraped Data
# A small hand-written fixture, keyed by channel handle, that stands in for
# real scraper output when no `scraped_messages` variable exists yet.

sample_scraped_messages = {
    "@ShegerOnlineStore": [
        dict(
            id=123,
            channel='@ShegerOnlineStore',
            channel_title='Sheger Online Shopping',
            text='የሴቶች ቦርሳ ዋጋ 2500 ብር በአዲስ አበባ',
            date='2024-01-01T10:00:00',
            views=150,
            has_media=False,
        ),
        dict(
            id=124,
            channel='@ShegerOnlineStore',
            channel_title='Sheger Online Shopping',
            text='ሞባይል ፎን 15000 ብር delivery ከነ ቦሌ',
            date='2024-01-01T11:00:00',
            views=200,
            has_media=True,
        ),
    ],
    "@ethio_commerce": [
        dict(
            id=456,
            channel='@ethio_commerce',
            channel_title='Ethio Commerce',
            text='Baby bottle በ 150 birr ገርጂ ላይ',
            date='2024-01-01T12:00:00',
            views=75,
            has_media=False,
        ),
    ],
}

# Prefer a `scraped_messages` variable that is already defined in this
# namespace; otherwise fall back to the demo fixture above.
if "scraped_messages" in globals():
    print("Using scraped data from previous notebook")
else:
    scraped_messages = sample_scraped_messages
    print("Using sample data for processing demo")

print(f"Processing data from {len(scraped_messages)} channels")


In [None]:
# Process Messages and Extract Entities
# Every message with non-empty text is cleaned, scanned for entities and
# stored as one record in `processed_results`, grouped by channel.
processed_results = {}

print("Processing messages for entity extraction:")
print("=" * 50)

for channel, messages in scraped_messages.items():
    print(f"\nProcessing {channel} ({len(messages)} messages):")
    channel_processed = []

    for i, message in enumerate(messages, start=1):
        # Messages without text (empty or None) are skipped entirely.
        if not message['text']:
            continue

        print(f"\n  {i}. Message {message['id']}:")
        print(f"     Original: {message['text'][:70]}...")

        # Normalise whitespace first, then extract from the cleaned text.
        cleaned_text = processor.clean_text(message['text'])
        entities = processor.extract_entities(cleaned_text)

        processed_data = dict(
            message_id=message['id'],
            channel=channel,
            channel_title=message['channel_title'],
            original_text=message['text'],
            cleaned_text=cleaned_text,
            date=message['date'],
            views=message['views'],
            entities=entities,
        )
        channel_processed.append(processed_data)

        print(f"     Products: {entities['products']}")
        print(f"     Prices: {entities['prices']}")
        print(f"     Locations: {entities['locations']}")

    processed_results[channel] = channel_processed
    print(f"\nProcessed {len(channel_processed)} messages from {channel}")

print(f"\nProcessing complete for {len(processed_results)} channels")


In [None]:
# Save to Database
# Persists every record in `processed_results` to a SQLite file, with the
# extracted entities stored as a JSON string, then reports the row count.
db_path = Path("data/processed_messages.db")
db_path.parent.mkdir(parents=True, exist_ok=True)

# Flatten the per-channel records into parameter tuples for a bulk insert.
rows = [
    (
        msg['message_id'],
        msg['channel'],
        msg['channel_title'],
        msg['original_text'],
        msg['cleaned_text'],
        msg['date'],
        msg['views'],
        json.dumps(msg['entities']),
    )
    for messages in processed_results.values()
    for msg in messages
]

# `with conn:` on a sqlite3 connection only scopes a transaction (commit on
# success, rollback on error); it does NOT close the connection. So one
# connection is opened for the whole cell and closed explicitly in `finally`.
conn = sqlite3.connect(db_path)
try:
    with conn:
        # Initialize database
        conn.execute("""
            CREATE TABLE IF NOT EXISTS processed_messages (
                id INTEGER,
                channel TEXT,
                channel_title TEXT,
                original_text TEXT,
                cleaned_text TEXT,
                date TEXT,
                views INTEGER,
                entities TEXT,
                PRIMARY KEY (id, channel)
            )
        """)

        # Save all processed data in a single transaction. OR REPLACE keeps
        # the cell re-runnable: an existing (id, channel) row is overwritten.
        conn.executemany("""
            INSERT OR REPLACE INTO processed_messages 
            (id, channel, channel_title, original_text, cleaned_text, date, views, entities)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """, rows)

    total_saved = len(rows)
    print(f"Saved {total_saved} processed messages to database: {db_path}")

    # Verify data was saved
    count = conn.execute("SELECT COUNT(*) FROM processed_messages").fetchone()[0]
    print(f"Database contains {count} processed messages")
finally:
    conn.close()
