In [7]:
import pandas as pd
import random
import os
from pathlib import Path

class SampleSelector:
    """Pick a channel-balanced subset of cleaned messages for manual labeling.

    Reads the processed CSV at ``processed_path`` and writes the chosen rows
    to ``samples_path``. The input must provide the ``amharic_text`` and
    ``channel`` columns; every other column is carried through untouched.
    """

    def __init__(self):
        """Initialize the SampleSelector with file paths"""
        # Set up paths using pathlib for cross-platform compatibility
        self.base_dir = Path('../../data')
        self.processed_path = self.base_dir / 'processed/cleaned_messages.csv'
        self.samples_path = self.base_dir / 'labeled/samples_for_labeling.csv'

        # Create labeled directory if it doesn't exist
        (self.base_dir / 'labeled').mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _stratified_sample(df, sample_size):
        """Return ``sample_size`` distinct rows of ``df`` balanced across channels.

        Args:
            df (pandas.DataFrame): Non-empty frame with a ``channel`` column and
                a unique index.
            sample_size (int): Number of rows wanted; must not exceed ``len(df)``.

        Returns:
            pandas.DataFrame: The selected rows, at most ``sample_size`` of them.
        """
        channels = df['channel'].unique()
        per_channel = max(1, sample_size // len(channels))

        # A channel smaller than the quota contributes everything it has
        # instead of being dropped from the stratified pass.
        picked = []
        for channel in channels:
            channel_df = df[df['channel'] == channel]
            quota = min(per_channel, len(channel_df))
            picked.append(channel_df.sample(quota, random_state=42))
        selected = pd.concat(picked)

        # Top up from rows not chosen yet. Exclusion is done on index labels,
        # the same key space the selection came from, so no row can repeat.
        remaining = sample_size - len(selected)
        if remaining > 0:
            leftover = df.drop(index=selected.index)
            selected = pd.concat(
                [selected, leftover.sample(remaining, random_state=42)]
            )

        # With more channels than requested samples the one-per-channel pass
        # overshoots; trim so the result matches the request.
        return selected.head(sample_size)

    def select_samples(self, sample_size=50):
        """
        Select random samples from processed data for manual labeling

        Only messages whose ``amharic_text`` is longer than 20 characters are
        eligible. The selection is written to ``samples_path`` and the same
        rows are returned, so the file, the printed channel distribution and
        the return value always agree.

        Args:
            sample_size (int): Number of samples to select (default: 50)

        Returns:
            list: Selected samples as dictionaries, or None if error occurs
                (missing input file, no eligible messages, invalid
                ``sample_size`` or any other failure; a message is printed).
        """
        try:
            if sample_size < 0:
                raise ValueError("sample_size must be non-negative")

            # Load processed data
            df = pd.read_csv(self.processed_path)

            # Filter messages with sufficient Amharic text
            df = df[df['amharic_text'].str.len() > 20].copy()

            if df.empty:
                # Fail with a clear message rather than a division by zero
                # on the channel count further down.
                print(f"❌ Error: No messages with sufficient Amharic text found in {self.processed_path}")
                return None

            if len(df) < sample_size:
                print(f"Warning: Only {len(df)} samples available, less than requested {sample_size}")
                sample_size = len(df)

            selected = self._stratified_sample(df, sample_size)

            # Save selected samples
            selected.to_csv(self.samples_path, index=False)
            print(f"✅ Successfully saved {len(selected)} samples to {self.samples_path}")

            # Print channel distribution
            print("\nChannel distribution in selected samples:")
            print(selected['channel'].value_counts())

            return selected.to_dict('records')

        except FileNotFoundError:
            print(f"❌ Error: Processed data not found at {self.processed_path}")
            print("Please run the data preprocessing step first")
            return None
        except Exception as e:
            # Deliberate catch-all: callers rely on a None return, not an
            # exception, to detect failure.
            print(f"❌ Unexpected error: {str(e)}")
            return None


if __name__ == "__main__":
    # Demo run: draw 50 samples, then either show a short preview with the
    # manual-labeling instructions or report that selection failed.
    print("Starting sample selection...")
    selector = SampleSelector()
    selected_samples = selector.select_samples(50)

    if not selected_samples:
        print("Sample selection failed. Please check the error messages above.")
    else:
        print("\nSample preview:")
        preview = pd.DataFrame(selected_samples[:3])
        print(preview[['channel', 'amharic_text']])

        print("\nNext steps:")
        next_steps = (
            f"1. Open {selector.samples_path} in a spreadsheet editor",
            "2. Add two new columns: 'tokens' and 'labels'",
            "3. For each message:",
            "   - Split the Amharic text into tokens (words)",
            "   - Label each token with appropriate tags (B-PRODUCT, I-PRICE, etc.)",
            "4. Save as 'labeled_data.csv' for the next step",
        )
        print("\n".join(next_steps))

Starting sample selection...
✅ Successfully saved 50 samples to ..\..\data\labeled\samples_for_labeling.csv

Channel distribution in selected samples:
channel
ZemenExpress              10
nevacomputer              10
meneshayeofficial         10
ethio_brand_collection    10
Leyueqa                   10
Name: count, dtype: int64

Sample preview:
        channel                                       amharic_text
0  ZemenExpress  በተች የሚሰራ ቻርጅ የሚደረግ ለመኝታ ቤት ፣ ለሳሎን ፣ ለ ሆቴሎች እንዲ...
1  ZemenExpress  ዋጋ፦ ብር ውስን ፍሬ ነው ያለው   አድራሻ መገናኛ መሰረት ደፋር ሞል ሁ...
2  ZemenExpress  በኤሌክትሪክየሚሰራ ለቤት መልካም መዓዛን የሚሰጥ ዋጋ፦ ብር ውስን ፍሬ ነ...

Next steps:
1. Open ..\..\data\labeled\samples_for_labeling.csv in a spreadsheet editor
2. Add two new columns: 'tokens' and 'labels'
3. For each message:
   - Split the Amharic text into tokens (words)
   - Label each token with appropriate tags (B-PRODUCT, I-PRICE, etc.)
4. Save as 'labeled_data.csv' for the next step
