In [None]:
# Setup
import sys
import regex
from pathlib import Path
from typing import List, Dict, Any

# Make the local "src" directory (relative to the current working directory)
# importable from this notebook.
SRC_DIR = str(Path.cwd() / "src")

# Guard the insert so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

print("Setting up Amharic text processor...")


In [None]:
# Create Amharic Text Processor
class AmharicProcessor:
    """Simplified Amharic text processor for e-commerce data.

    Offers whitespace normalization (``clean_text``) and regex/keyword based
    extraction of prices, locations and products (``extract_entities``) from
    mixed Amharic/English text.
    """

    def __init__(self):
        # Runs of characters in the range U+1200-U+137F (Amharic script).
        self.amharic_pattern = regex.compile(r'[\u1200-\u137F]+')

        # Price patterns (Ethiopian Birr). Each pattern has exactly one
        # capture group -- the numeric amount -- so findall() yields strings.
        self.price_patterns = [
            # Amount followed by a currency marker, e.g. "2500 ብር", "150 birr".
            regex.compile(r'(\d+(?:,\d{3})*(?:\.\d{2})?)\s*(?:ብር|birr|ETB)', regex.IGNORECASE),
            # Amount preceded by a price cue, e.g. "ዋጋ 2500", "በ 150".
            regex.compile(r'(?:ዋጋ|ዋጋው|በ)\s*(\d+(?:,\d{3})*(?:\.\d{2})?)', regex.IGNORECASE),
        ]

        # Location keywords (matched as case-sensitive substrings).
        self.location_keywords = [
            'አዲስ አበባ', 'አዲስ ዓባባ', 'ቦሌ', 'ገርጂ', 'ንግሥት', 'ማርካቶ', 'ፒያሳ'
        ]

        # Product keywords (matched as case-insensitive substrings).
        self.product_keywords = [
            'ቦርሳ', 'ሞባይል', 'ፎን', 'ልብስ', 'ሻምፖ', 'cream', 'lotion', 'bottle'
        ]

    def clean_text(self, text: str) -> str:
        """Collapse whitespace runs to single spaces and trim both ends.

        Args:
            text: Raw input text. Falsy values (``None``, ``""``) are accepted.

        Returns:
            The normalized text, or ``""`` when ``text`` is falsy.
        """
        if not text:
            return ""
        # Any run of whitespace (spaces, tabs, newlines) becomes one space.
        return regex.sub(r'\s+', ' ', text).strip()

    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Extract price, location and product mentions from ``text``.

        Args:
            text: Text to scan. Falsy values yield empty result lists.

        Returns:
            A dict with the keys ``'prices'``, ``'locations'`` and
            ``'products'``. Each value is a duplicate-free list in a
            deterministic order: prices in order of first match (pattern by
            pattern), locations and products in keyword-list order.
        """
        entities: Dict[str, List[str]] = {'prices': [], 'locations': [], 'products': []}

        if not text:
            return entities

        # Prices: collect the captured amount from every pattern.
        for price_pattern in self.price_patterns:
            entities['prices'].extend(price_pattern.findall(text))

        # Locations: plain substring test against the original text.
        entities['locations'].extend(
            location for location in self.location_keywords if location in text
        )

        # Products: case-insensitive substring test; lower-case the text once
        # instead of once per keyword.
        lowered_text = text.lower()
        entities['products'].extend(
            keyword for keyword in self.product_keywords
            if keyword.lower() in lowered_text
        )

        # De-duplicate while keeping first-seen order. A set would drop the
        # order and make the output vary between interpreter runs (string
        # hashing is randomized), which breaks reproducible notebook output.
        return {key: list(dict.fromkeys(values)) for key, values in entities.items()}

# Instantiate the processor once; the sample-text loop below calls its
# clean_text() and extract_entities() methods.
processor = AmharicProcessor()
print("Amharic processor created successfully")


In [None]:
# Demo: run the processor over sample Amharic e-commerce posts
sample_texts = [
    "የሴቶች ቦርሳ ዋጋ 2500 ብር በአዲስ አበባ",
    "Baby bottle በ 150 birr ገርጂ ላይ",
    "ሞባይል ፎን 15000 ብር delivery ከነ ቦሌ",
    "እቃዎች በ 500 ETB free delivery አዲስ ዓባባ ውስጥ",
    "የህፃናት ልብስ ዋጋው 800 ብር ንግሥት @username 09123456789",
]

print("Testing Amharic text processing:")
print("=" * 50)

for i, text in enumerate(sample_texts, start=1):
    # Show the raw sample first, numbered from 1.
    print(f"\n{i}. Original: {text}")

    # Whitespace-normalized version of the sample.
    cleaned = processor.clean_text(text)
    print(f"   Cleaned: {cleaned}")

    # Entities are extracted from the cleaned text, not the raw one.
    entities = processor.extract_entities(cleaned)
    print(f"   Products: {entities['products']}")
    print(f"   Prices: {entities['prices']}")
    print(f"   Locations: {entities['locations']}")

    # Visual separator between samples.
    print("   " + "-" * 40)
