In [1]:
# 1. Setup and Configuration
import sys
import sqlite3
import json
from pathlib import Path
from datetime import datetime

print(" Task 2: CoNLL Format Labeling for Amharic NER")
print("=" * 60)
print("Setting up CoNLL labeling system...")

# Add project root to path
project_root = Path.cwd().parent if 'notebook' in str(Path.cwd()) else Path.cwd()
sys.path.insert(0, str(project_root))

print(f" Project root: {project_root}")
print("Ready to start CoNLL labeling!")


 Task 2: CoNLL Format Labeling for Amharic NER
Setting up CoNLL labeling system...
 Project root: /home/btd/Documents/KAIM/amharic-e-commerce-data-extractor
Ready to start CoNLL labeling!


In [2]:
# 2. Load and Preview Data
print(" Loading data for CoNLL labeling...")

# Try to load from existing databases
data_sources = [
    project_root / "data/processed/amharic_ecommerce.db",
    project_root / "data/historical_messages.db", 
    project_root / "data/demo.db"
]

messages = []
for db_path in data_sources:
    if db_path.exists():
        print(f" Found database: {db_path}")
        try:
            conn = sqlite3.connect(db_path)
            cursor = conn.cursor()
            
            # Check available tables
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = [row[0] for row in cursor.fetchall()]
            print(f"   Available tables: {tables}")
            
            # Try to load messages from any available table
            for table in ['processed_messages', 'historical_messages', 'messages']:
                if table in tables:
                    cursor.execute(f"SELECT * FROM {table} LIMIT 50")
                    rows = cursor.fetchall()
                    if rows:
                        cursor.execute(f"PRAGMA table_info({table})")
                        columns = [row[1] for row in cursor.fetchall()]
                        messages = [dict(zip(columns, row)) for row in rows]
                        print(f"   Loaded {len(messages)} messages from {table}")
                        break
            
            conn.close()
            if messages:
                break
                
        except Exception as e:
            print(f"   Error loading from {db_path}: {e}")

print(f"\n Data Summary:")
print(f"   Messages loaded: {len(messages)}")
if messages:
    sample_msg = messages[0]
    print(f"   Sample fields: {list(sample_msg.keys())}")
    text_field = None
    for field in ['text', 'original_text', 'message']:
        if field in sample_msg:
            text_field = field
            break
    
    if text_field:
        print(f"   Text field: '{text_field}'")
        print(f"   Sample text: {sample_msg[text_field][:100]}...")
    else:
        print("   No text field found")


 Loading data for CoNLL labeling...
 Found database: /home/btd/Documents/KAIM/amharic-e-commerce-data-extractor/data/processed/amharic_ecommerce.db
   Available tables: []
 Found database: /home/btd/Documents/KAIM/amharic-e-commerce-data-extractor/data/historical_messages.db
   Available tables: ['historical_messages', 'collection_stats']
   Loaded 50 messages from historical_messages

 Data Summary:
   Messages loaded: 50
   Sample fields: ['id', 'channel', 'channel_title', 'text', 'date', 'year', 'month', 'views', 'has_media', 'collection_timestamp']
   Text field: 'text'
   Sample text: cream በጣም ጥሩ! ዋጋ 1200 ብር። መርካቶ ውስጥ ይገኛል።...


In [3]:
# 3. Run CoNLL Labeling System
print(" Running CoNLL labeling process...")

# Run the labeling script
import subprocess
result = subprocess.run(['python3', 'task2_conll_labeling.py'], 
                       capture_output=True, text=True, cwd=project_root)

print("STDOUT:")
print(result.stdout)

if result.stderr:
    print("STDERR:")
    print(result.stderr)

print(f"Return code: {result.returncode}")

# Check if files were created
output_dir = project_root / "data/conll_labeled"
conll_file = output_dir / "amharic_ecommerce_conll.txt"
stats_file = output_dir / "labeling_statistics.json"

print(f"\n Output Files:")
print(f"   CoNLL file exists: {conll_file.exists()}")
print(f"   Stats file exists: {stats_file.exists()}")

if stats_file.exists():
    with open(stats_file, 'r', encoding='utf-8') as f:
        stats = json.load(f)
    print(f"\n Labeling Statistics:")
    print(f"   Messages processed: {stats['total_messages_processed']}")
    print(f"   Total tokens: {stats['total_tokens']:,}")
    print(f"   Total entities: {stats['total_entities']}")
    print(f"   Entity distribution: {stats['entity_counts']}")


 Running CoNLL labeling process...
STDOUT:

STDERR:
python3: can't open file '/home/btd/Documents/KAIM/amharic-e-commerce-data-extractor/task2_conll_labeling.py': [Errno 2] No such file or directory

Return code: 2

 Output Files:
   CoNLL file exists: True
   Stats file exists: True

 Labeling Statistics:
   Messages processed: 50
   Total tokens: 515
   Total entities: 187
   Entity distribution: {'PRODUCT': 47, 'LOC': 40, 'PRICE': 100}


In [4]:
# 4. Validate CoNLL Format
print(" Validating CoNLL format...")

# Run the validation script
result = subprocess.run(['python3', 'test_conll_format.py'], 
                       capture_output=True, text=True, cwd=project_root)

print("STDOUT:")
print(result.stdout)

if result.stderr:
    print("STDERR:")
    print(result.stderr)

print(f"Return code: {result.returncode}")

# Load validation report
validation_file = output_dir / "validation_report.json"
if validation_file.exists():
    with open(validation_file, 'r', encoding='utf-8') as f:
        validation_report = json.load(f)
    
    print(f"\n Validation Summary:")
    checks = validation_report['requirements_check']
    print(f"   Message count (30-50): {'' if checks['message_count_ok'] else ''}")
    print(f"   CoNLL format valid: {'' if checks['format_valid'] else ''}")
    print(f"   All entity types found: {'' if checks['all_entity_types_found'] else '❌'}")
    
    if all(checks.values()):
        print("\n Task 2 PASSED: Ready for NER training!")
    else:
        print("\n Task 2 ISSUES: Some requirements need attention")


 Validating CoNLL format...
STDOUT:

STDERR:
python3: can't open file '/home/btd/Documents/KAIM/amharic-e-commerce-data-extractor/test_conll_format.py': [Errno 2] No such file or directory

Return code: 2

 Validation Summary:
   Message count (30-50): 
   CoNLL format valid: 
   All entity types found: 

 Task 2 PASSED: Ready for NER training!


In [5]:
# 5. Preview CoNLL Output
print(" Previewing CoNLL format output...")

if conll_file.exists():
    with open(conll_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    print(f" File: {conll_file}")
    print(f" Total lines: {len(lines)}")
    
    print(f"\n First 50 lines:")
    print("=" * 50)
    for i, line in enumerate(lines[:50]):
        print(f"{i+1:2d}: {line.rstrip()}")
    
    print(f"\n..." + "." * 40)
    print(f" Full file contains {len(lines)} lines")
    
    # Count entity examples
    entity_lines = [line for line in lines if '\t' in line and not line.startswith('#')]
    total_tokens = len(entity_lines)
    entity_tokens = len([line for line in entity_lines if not line.strip().endswith('\tO')])
    
    print(f"\n Token Statistics:")
    print(f"   Total tokens: {total_tokens}")
    print(f"   Entity tokens: {entity_tokens}")
    print(f"   Entity coverage: {(entity_tokens/total_tokens)*100:.1f}%")
else:
    print(f" CoNLL file not found: {conll_file}")


 Previewing CoNLL format output...
 File: /home/btd/Documents/KAIM/amharic-e-commerce-data-extractor/data/conll_labeled/amharic_ecommerce_conll.txt
 Total lines: 664

 First 50 lines:
 1: # Message 1: cream በጣም ጥሩ! ዋጋ 1200 ብር። መርካቶ ውስጥ ይገኛል።
 2: # Channel: @ShegerOnlineStore
 3: cream	B-PRODUCT
 4: በጣም	O
 5: ጥሩ	O
 6: !	O
 7: ዋጋ	B-PRICE
 8: 1200	B-PRICE
 9: ብር።	I-PRICE
10: መርካቶ	B-LOC
11: ውስጥ	O
12: ይገኛል።	O
13: 
14: # Message 2: ጫማ በጣም ጥሩ! ዋጋ 1800 ብር። ቢሸፍቱ ውስጥ ይገኛል።
15: # Channel: @ethio_commerce
16: ጫማ	B-PRODUCT
17: በጣም	O
18: ጥሩ	O
19: !	O
20: ዋጋ	B-PRICE
21: 1800	B-PRICE
22: ብር።	I-PRICE
23: ቢሸፍቱ	O
24: ውስጥ	O
25: ይገኛል።	O
26: 
27: # Message 3: ሳምንጣ በጣም ጥሩ! ዋጋ 2500 ብር። አዲስ አበባ ውስጥ ይገኛል።
28: # Channel: @addis_market
29: ሳምንጣ	O
30: በጣም	O
31: ጥሩ	O
32: !	O
33: ዋጋ	B-PRICE
34: 2500	B-PRICE
35: ብር።	I-PRICE
36: አዲስ	B-LOC
37: አበባ	I-LOC
38: ውስጥ	O
39: ይገኛል።	O
40: 
41: # Message 4: ቦርሳ በጣም ጥሩ! ዋጋ 5000 ብር። ቦሌ ውስጥ ይገኛል።
42: # Channel: @ethiopia_shopping
43: ቦርሳ	B-PRODUCT
44: በጣም	O
45: ጥሩ	O
46: !	O
47: ዋጋ	B