In [39]:
import os
import pandas as pd
from telethon.sync import TelegramClient
from dotenv import load_dotenv
import asyncio
import nest_asyncio

# Patch asyncio via nest_asyncio before any async code runs in this notebook.
# NOTE(review): presumably needed because the notebook kernel already has an
# event loop running; the pipeline below is driven with top-level `await`, so
# confirm this patch is still required.
nest_asyncio.apply()

In [40]:
# Load Credentials and Define Channels
# Credentials are read from the environment (populated from a .env file by
# load_dotenv) so they never need to be hard-coded in the notebook.
load_dotenv()

# Fail fast with an actionable message instead of the opaque
# "int() argument must be ... not 'NoneType'" raised by int(None), or a late
# failure inside the Telegram client when API_HASH is None.
missing_vars = [name for name in ('API_ID', 'API_HASH') if not os.getenv(name)]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_vars)
        + ". Define them in your environment or in a .env file."
    )

api_id = int(os.getenv('API_ID'))
api_hash = os.getenv('API_HASH')

# List of target Telegram channels
# UPDATED CHANNEL LIST
# It's important to periodically verify these channels are still active and public.
channels = [
    'Shageronlinestore',       # This one still works
    'shegergebya',             # New - to be verified
    'ethio_brand_collection',  # New - to be verified
    'ethio_gebeya',            # New - to be verified
    'ethiopianonlinemarket'    # New - to be verified
]

In [41]:
async def fetch_messages(channel_name, limit=500):
    """
    Scrape the most recent text messages from one Telegram channel.

    A Telethon client is opened with the module-level ``api_id`` / ``api_hash``
    under the session name 'anon', and up to ``limit`` messages are iterated.
    Messages whose ``text`` is empty/falsy are skipped.

    Args:
        channel_name (str): Channel identifier passed to ``iter_messages``
            (e.g., 'Shageronlinestore').
        limit (int): Maximum number of messages requested from the channel.

    Returns:
        list: One dict per kept message with the keys 'channel', 'message_id',
              'text', 'date' and 'views'. If an exception is raised while
              iterating, it is printed and the messages gathered up to that
              point (possibly none) are returned.
    """
    collected = []

    # Session name 'anon' is reused on every call so the same session is shared
    # across channels and runs.
    async with TelegramClient('anon', api_id, api_hash) as client:
        print(f"Fetching messages from '{channel_name}'...")
        try:
            async for message in client.iter_messages(channel_name, limit=limit):
                body = message.text
                # Skip posts without text.
                if not body:
                    continue

                record = {
                    'channel': channel_name,
                    'message_id': message.id,
                    'text': body,            # raw text of the post
                    'date': message.date,    # timestamp of the post
                    'views': message.views,  # view count of the post
                }
                collected.append(record)
        except Exception as e:
            # Best-effort scrape: report the failure and keep whatever was
            # gathered before it.
            print(f"Error: Could not fetch from '{channel_name}'. Reason: {e}")

    return collected

In [42]:
# MODIFIED AND IMPROVED NORMALIZATION FUNCTION
import re

# Compiled once at definition time. Each range is an explicit symbol/emoji block;
# a catch-all range such as U+24C2..U+1F251 must NOT be used here because it also
# swallows real letters (Ethiopic Extended U+2D80-2DDF, Ethiopic Extended-A
# U+AB00-AB2F, CJK, Hangul, ...).
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F000-\U0001F2FF"  # mahjong/dominoes/cards, enclosed alphanumerics, flags
    "\U000024C2-\U000024FF"  # enclosed alphanumerics (circled letters/digits)
    "\U00002500-\U000025FF"  # box drawing, block elements, geometric shapes
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # Dingbats
    "\U00002934-\U00002935"  # curved arrow emoji
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (stars, arrows)
    "\U00003030\U0000303D\U00003297\U00003299"  # CJK-block emoji symbols
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def normalize_amharic(text):
    """
    Clean and normalize Amharic/English text from Telegram posts for NLP tasks.

    Steps, in order:
    - Removes URLs
    - Removes Telegram-style hashtags and mentions
    - Removes emojis and other pictographs (letters of any script are kept)
    - Maps Ethiopic punctuation (። ፣ ፤ ፧) to '.', ',', ';', '?'
    - Collapses repeated punctuation (e.g., '!!!' -> '!')
    - Replaces newlines, tabs, asterisks, backticks and curly quotes with a space
    - Collapses whitespace runs and strips both ends

    Args:
        text: The raw message text. Any non-string value (None, NaN, numbers)
            is treated as "no text".

    Returns:
        str: The cleaned text; '' when the input is not a string or nothing
        remains after cleaning.
    """
    # Guard against None/NaN so a stray missing value cannot crash re.sub.
    if not isinstance(text, str):
        return ''

    # 1. Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # 2. Remove hashtags and mentions
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'@\w+', '', text)

    # 3. Remove emojis and other pictographs/symbols
    text = _EMOJI_PATTERN.sub('', text)

    # 4. Normalize Ethiopic punctuation to Latin equivalents
    text = text.replace('።', '.')       # Ethiopic full stop
    text = text.replace('፣', ',')       # Ethiopic comma
    text = text.replace('፤', ';')       # Ethiopic semicolon
    text = text.replace('\u1367', '?')  # Ethiopic question mark (፧)

    # 5. Remove repeated punctuation and non-essential special characters.
    # This runs after step 4 so doubled Ethiopic marks (e.g. '።።') collapse too.
    text = re.sub(r'([,.?!;])\1+', r'\1', text)  # e.g., '!!!' -> '!'
    text = re.sub(r'[\n\t\r*”’`“]+', ' ', text)  # newlines, tabs, *, `, curly quotes

    # 6. Collapse multiple spaces into one and strip leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [43]:
# MODIFIED main() function

async def main(output_path='../data/scraped_telegram_data.csv'):
    """
    Run the full ingestion and preprocessing pipeline.

    Scrapes every channel in the module-level ``channels`` list with
    ``fetch_messages``, builds a DataFrame, normalizes the text with
    ``normalize_amharic``, drops rows whose text is missing or empty after
    normalization, and writes the result to a CSV file.

    Args:
        output_path (str): Destination CSV path. Its parent directory is
            created if it does not exist. Defaults to the original
            '../data/scraped_telegram_data.csv'.

    Returns:
        pandas.DataFrame: The cleaned messages with an added 'processed_text'
        column, or an empty DataFrame (and no file written) when nothing was
        scraped.
    """
    # --- Data Collection ---
    all_channel_data = []
    for channel in channels:
        messages = await fetch_messages(channel)
        all_channel_data.extend(messages)
        print(f"Found {len(messages)} messages in '{channel}'.")

    # --- Data Structuring and Cleaning ---
    if not all_channel_data:
        print("No data was collected. Exiting.")
        return pd.DataFrame()  # Nothing was scraped; skip cleaning and saving.

    raw_df = pd.DataFrame(all_channel_data)
    print(f"\nCollected a total of {len(raw_df)} messages before cleaning.")

    # --- Comprehensive Preprocessing ---
    # Build a new frame instead of mutating in place so raw_df stays intact.
    print("Applying advanced Amharic text normalization...")
    df = raw_df.dropna(subset=['text']).assign(
        processed_text=lambda frame: frame['text'].apply(normalize_amharic)
    )

    # Remove messages that became empty after normalization.
    df = df[df['processed_text'] != ''].copy()

    # --- Save the Processed Data ---
    # to_csv does not create missing directories; without this the whole
    # scrape would be lost to an OSError on a fresh checkout.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # 'utf-8-sig' writes a BOM at the start of the file.
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"\nSuccessfully saved {len(df)} cleaned messages to {output_path}")

    # --- Return the final DataFrame ---
    return df

In [44]:
# --- EXECUTE THE PIPELINE ---

# Call the main function and store the resulting DataFrame in a variable.
# This makes the final data available for inspection in the notebook.
processed_df = await main()

# Display the first few rows of the final, processed DataFrame to confirm success.
print("\n--- Pipeline Execution Complete. Final DataFrame Head: ---")
display(processed_df.head())

Fetching messages from 'Shageronlinestore'...
Found 227 messages in 'Shageronlinestore'.
Fetching messages from 'shegergebya'...
Found 99 messages in 'shegergebya'.
Fetching messages from 'ethio_brand_collection'...
Found 499 messages in 'ethio_brand_collection'.
Fetching messages from 'ethio_gebeya'...
Found 244 messages in 'ethio_gebeya'.
Fetching messages from 'ethiopianonlinemarket'...
Found 17 messages in 'ethiopianonlinemarket'.

Collected a total of 1086 messages before cleaning.
Applying advanced Amharic text normalization...

Successfully saved 1077 cleaned messages to ../data/scraped_telegram_data.csv

--- Pipeline Execution Complete. Final DataFrame Head: ---


Unnamed: 0,channel,message_id,text,date,views,processed_text
0,Shageronlinestore,7423,🥂5.5L Glass dispenser jar with Bamboo stand\n\...,2025-06-24 15:47:54+00:00,1407.0,5.5L Glass dispenser jar with Bamboo stand ለተለ...
1,Shageronlinestore,7422,🥂5.5L Glass dispenser jar with Bamboo stand\n\...,2025-06-24 15:45:49+00:00,1420.0,5.5L Glass dispenser jar with Bamboo stand ለተለ...
2,Shageronlinestore,7421,**❇️ Electronic Pest Repeller \n\nበረሮ 🦗➕ቢንቢ😡➕ ...,2025-06-24 10:08:29+00:00,2407.0,Electronic Pest Repeller በረሮ ቢንቢ አይጥ ሸረሪት እና ሌ...
3,Shageronlinestore,7420,**❇️ Electronic Pest Repeller \n\nበረሮ 🦗➕ቢንቢ😡➕ ...,2025-06-24 10:08:29+00:00,2011.0,Electronic Pest Repeller በረሮ ቢንቢ አይጥ ሸረሪት እና ሌ...
4,Shageronlinestore,7419,🍀3in1 Rotatable outlet extender\n\n👉can easily...,2025-06-24 05:36:56+00:00,2595.0,3in1 Rotatable outlet extender can easily conv...
