# EthioMart Centralized E-Commerce

#### EthioMart plans to create a single centralized platform that consolidates real-time data from multiple e-commerce Telegram channels into one unified channel. This script walks through the data ingestion and pipelining steps of building that centralized e-commerce channel from various business Telegram channels.

In [11]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import warnings
from dotenv import load_dotenv
warnings.filterwarnings('ignore')

# Put the parent of the current working directory at the front of sys.path
# so modules located one level above this notebook can be imported.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))




# Load environment variables from a .env file; the Telegram credentials
# used below are read with os.getenv.
load_dotenv()

# Configure root logging at INFO level with a "timestamp - level - message" format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

## Task 1: Data Ingestion and Data Preprocessing

In [12]:
from telethon import TelegramClient
from telethon.sessions import MemorySession
import csv
import asyncio
import nest_asyncio  # For environments like Jupyter notebooks

# Apply the nest_asyncio patch so an event loop can be entered from inside
# another one (useful in Jupyter or other nested environments).
nest_asyncio.apply()


# Telegram credentials, read from environment variables (load_dotenv() above
# can supply them from a .env file). os.getenv returns None for any variable
# that is not set, and the values it does return are strings.
api_id = os.getenv("api_id")
api_hash =  os.getenv("api_hash")
phone_number = os.getenv("phone_number")

# Target Telegram channels to ingest, in fetch order.
# Every handle must appear exactly once: a repeated handle is fetched again
# and its messages are appended to the CSV a second time.
channels = [
    '@ZemenExpress',
    '@nevacomputer',
    '@meneshayeofficial',
    '@ethio_brand_collection',
    '@Leyueqa',
    '@sinayelj',
    '@Shewabrand',
    '@helloomarketethiopia',
    '@modernshoppingcenter',
    '@qnashcom',
    '@Fashiontera',
    '@kuruwear',
    '@gebeyaadama',
    '@MerttEka',
    '@forfreemarket',
    '@classybrands',
    '@marakibrand',
    '@aradabrand2',
    '@marakisat2',
    '@belaclassic',
    '@AwasMart',
]
# Shared Telegram client used by the functions below.
# MemorySession is used to avoid SQLite locking issues; pass a session-name
# string instead of MemorySession() to get a persistent session.
client = TelegramClient(MemorySession(), api_id, api_hash)

# Path of the CSV file that collects the fetched messages.
output_file = "result.csv"

# Create the results CSV with its header row, unless the file is already there
def initialize_csv():
    """
    Create ``output_file`` containing only the header row.

    The file is opened in exclusive-creation mode, so an existing file is
    left untouched and the function returns without writing anything.
    """
    header = ["Channel Name", "Message Date", "Message Content"]
    try:
        handle = open(output_file, mode='x', newline='', encoding='utf-8')
    except FileExistsError:
        # Keep whatever has been collected so far; later runs append to it.
        return
    with handle:
        csv.writer(handle).writerow(header)

async def fetch_channel_messages(limit=20):
    """
    Append recent text messages from every configured channel to the CSV file.

    Each handle in ``channels`` is resolved through the shared ``client`` and
    its most recent messages are requested. Messages that carry text are
    written to ``output_file`` as (channel title, date, text) rows and echoed
    to stdout. An error on one channel is printed and the loop continues with
    the next channel.

    :param limit: Number of messages to fetch per channel
    """
    with open(output_file, mode='a', newline='', encoding='utf-8') as csv_handle:
        row_writer = csv.writer(csv_handle)
        for channel in channels:
            try:
                # Resolve the handle to an entity before requesting its history
                entity = await client.get_entity(channel)
                title = entity.title
                print(f"Fetching messages from channel: {title}")

                recent_messages = await client.get_messages(entity, limit=limit)
                for message in recent_messages:
                    text = message.text
                    if not text:
                        # Nothing to store for messages without text
                        continue
                    posted_at = message.date
                    row_writer.writerow([title, posted_at, text])
                    print(f"[{title}] {posted_at}: {text}")
            except Exception as error:
                print(f"Error fetching messages from {channel}: {error}")

async def main():
    """
    Log in to Telegram, collect the channel messages, then disconnect.

    Any exception raised while logging in or fetching is printed rather than
    propagated. The client is disconnected in every case.
    """
    try:
        print("Logging into Telegram...")
        # Authenticate the shared client with the configured phone number
        await client.start(phone=phone_number)
        print("Successfully logged in!")

        # Pull the latest messages from each configured channel
        await fetch_channel_messages(limit=20)
    except Exception as error:
        print(f"An error occurred: {error}")
    finally:
        # Runs on success and on failure so the connection is always closed
        print("Disconnecting Telegram client...")
        await client.disconnect()
        print("Client disconnected.")

# Create the CSV file with its header row if it does not exist yet.
# This runs unconditionally, before the __main__ guard below.
initialize_csv()

# Run the async entry point to completion. nest_asyncio.apply() earlier in this
# cell patches asyncio for nested event loops (e.g. Jupyter).
if __name__ == "__main__":
    asyncio.run(main())


2025-01-18 10:15:31,471 - INFO - Connecting to 149.154.167.51:443/TcpFull...


Logging into Telegram...


2025-01-18 10:15:32,903 - INFO - Connection to 149.154.167.51:443/TcpFull complete!
2025-01-18 10:15:33,610 - INFO - Phone migrated to 4
2025-01-18 10:15:33,759 - INFO - Reconnecting to new data center 4
2025-01-18 10:15:33,759 - INFO - Disconnecting from 149.154.167.51:443/TcpFull...
2025-01-18 10:15:33,760 - INFO - Disconnection from 149.154.167.51:443/TcpFull complete!
2025-01-18 10:15:33,761 - INFO - Connecting to 149.154.167.91:443/TcpFull...
2025-01-18 10:15:35,480 - INFO - Connection to 149.154.167.91:443/TcpFull complete!


Signed in successfully as Bethelhem Teka; remember to not break the ToS or you will risk an account ban!
Successfully logged in!
Fetching messages from channel: Zemen Express®
[Zemen Express®] 2025-01-18 05:12:07+00:00: 💥💥...................................💥💥

📌Door Bottom Seal Strip Stopper
👍በበርዎ ስር አቧራ እና ነፍሳት እንዳይገባ 
    የሚከላከል!!!
💯 High Quality 
®️ አንደኛው ወፍራሙ 

🔰Keep Hot and Cold Air isolated 
🔰 Reduce The Noise Outside and Door slamming 
🔰 No 🦟 🪰 🐀 🪳🕷️🐛🦗
   

 ዋጋ፦  💵🏷  400ብር

♦️ውስን ፍሬ ነው ያለው 🔥🔥🔥

🏢 አድራሻ👉

📍ቁ.1️⃣♦️መገናኛ መሰረት ደፋር ሞል ሁለተኛ ፎቅ ቢሮ ቁ. S05/S06

📍 ቁ.2️⃣♦️ፒያሳ ጊዮርጊስ አደባባይ ራመት_ታቦር_ኦዳ_ህንፃ 1ኛ ፎቅ ሱቅ ቁ. G1 -107

     💧💧💧💧


    📲 0902660722
    📲 0928460606

➡️ 🔴ፒያሳ ቅርንጫፍ 0941337070 

🔖
💬በTelegram ለማዘዝ ⤵️ ይጠቀሙ🔽

@zemencallcenter 

@zemenexpressadmin

ለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን⤵️
https://telegram.me/zemenexpress

💥💥...................................💥💥
[Zemen Express®] 2025-01-18 05:11:55+00:00: 💥💥...................................💥💥

📌Door Bottom Seal Strip Stopper
👍በበርዎ ስር አቧራ እና ነ

2025-01-18 10:15:51,240 - INFO - Disconnecting from 149.154.167.91:443/TcpFull...
2025-01-18 10:15:51,241 - INFO - Disconnection from 149.154.167.91:443/TcpFull complete!


[qnash.com - ቅናሽ ®️] 2025-01-17 16:41:32+00:00: 📣 Hair Steamer Cap
🔼 High Quality 

➡️የፀጉር እስቲመር ትሪትመንት ወይም ቅቤ ተቀብተው በቤቶ ሆነው የሚጠቀሙበት

          💵ዋጋ፦ **1000ብር**

ውስን ፍሬ ነው የቀረው ❌❌
< Limited Stock ❌

Telegram - t.me/qnashcom
✅ ጥራት ✅ ዋስትና ✅ ቅናሽ

🏬 አድራሻ ፦ 
ቁጥር 1 :- **መገናኛ ዘፍመሽ ግራንድ ሞል 3 ተኛ ፎቅ ቁጥር 329 **

ቁጥር 2 :- **ጀሞ 1 ከለላ ህንፃ ግራውንድ ለይ G07 **

❌❌ **ማሳሰብያ**: ሱቃችን ሲመጡ ትክክለኛ የኛ ሱቅ መሆኑ ያረጋግጡ  የራሳችን ሎጎ መኖሩን እና 329 / G07 መሆኑ ያረጋግጡ !

ስልክ:
📱0946966440
📱0992606060
📱0905464599
[qnash.com - ቅናሽ ®️] 2025-01-17 11:35:59+00:00: 📣 Hair Steamer Cap
🔼 High Quality 

➡️የፀጉር እስቲመር ትሪትመንት ወይም ቅቤ ተቀብተው በቤቶ ሆነው የሚጠቀሙበት

          💵ዋጋ፦ **1000ብር**

ውስን ፍሬ ነው የቀረው ❌❌
< Limited Stock ❌

Telegram - t.me/qnashcom
✅ ጥራት ✅ ዋስትና ✅ ቅናሽ

🏬 አድራሻ ፦ 
ቁጥር 1 :- **መገናኛ ዘፍመሽ ግራንድ ሞል 3 ተኛ ፎቅ ቁጥር 329 **

ቁጥር 2 :- **ጀሞ 1 ከለላ ህንፃ ግራውንድ ለይ G07 **

❌❌ **ማሳሰብያ**: ሱቃችን ሲመጡ ትክክለኛ የኛ ሱቅ መሆኑ ያረጋግጡ  የራሳችን ሎጎ መኖሩን እና 329 / G07 መሆኑ ያረጋግጡ !

ስልክ:
📱0946966440
📱0992606060
📱0905464599
[qnash.com - ቅናሽ ®️] 2025-01-17 09:01:41+00:00: 🟰🔡🔣🟰ለደረቅ ነገሮ

In [16]:
import csv
import re

# Function to read the "Message Content" column from the CSV file
def read_messages_from_csv(csv_file="result.csv"):
    """
    Return the non-empty message texts stored in a results CSV.

    :param csv_file: Path of a CSV file whose header row contains a
        "Message Content" column.
    :return: List of message strings with surrounding whitespace stripped.
        Rows whose message is missing, empty, or whitespace-only are skipped.
    """
    messages = []
    # newline='' lets the csv module itself interpret line breaks that are
    # embedded in quoted fields (the stored messages span multiple lines).
    with open(csv_file, mode='r', encoding='utf-8', newline='') as file:
        for row in csv.DictReader(file):
            # DictReader fills columns missing from a short row with None, so
            # the key exists and a .get() default would never be used.
            message = (row.get("Message Content") or "").strip()
            if message:  # Avoid empty messages
                messages.append(message)
    return messages

# Compiled pattern matching runs of characters in the U+1200-U+137F range
# (the Unicode range used here for Amharic)
amharic_regex = re.compile('[\u1200-\u137F]+')

# Function to keep only the Amharic portions of a message
def extract_amharic_text(message):
    """
    Return the Amharic fragments of ``message`` joined by single spaces.

    Every maximal run of characters matched by ``amharic_regex`` is kept, in
    order; everything else (Latin text, digits, emoji, whitespace) is dropped.
    Returns None when the message contains no such run.
    """
    fragments = amharic_regex.findall(message)
    if not fragments:
        return None
    return " ".join(fragments)

# Keyword lists used to detect the different entity types (plain substrings,
# not regular expressions)
product_keywords = ['ልብስ', 'ምንጣፍ', 'እንጀራ', 'ተለዋዋጭ', 'እቃ']
location_keywords = ['አበባ', 'ቦሌ', 'አዲስ', 'ተማ', 'እስትንፋስ', 'አዳም']
# NOTE(review): 'በ' is a single character, so any token containing it that is
# not already a product or location is labeled as price - confirm this is intended.
price_keywords = ['ዋጋ', 'ብር', 'ሴራ', 'በ']

# Function to tag each token of a message using the keyword lists above
def dynamic_label_message(message):
    """
    Split ``message`` on whitespace and attach an entity label to each token.

    A token gets the label of the first keyword list that has a keyword
    occurring anywhere inside it, checked in this order: product
    ("B-Product"), location ("B-LOC"), price ("B-PRICE"). Tokens matching no
    list get "O". Returns a list of (token, label) tuples in token order.
    """
    # Order matters: earlier rules win when a token matches several lists.
    rules = (
        (product_keywords, "B-Product"),
        (location_keywords, "B-LOC"),
        (price_keywords, "B-PRICE"),
    )

    labeled_tokens = []
    for token in message.split():
        label = next(
            (tag for keywords, tag in rules
             if any(keyword in token for keyword in keywords)),
            "O",
        )
        labeled_tokens.append((token, label))
    return labeled_tokens

# Write the labeled messages to disk in CoNLL format
def save_to_conll(messages, output_file="labeled_dataset.conll"):
    """
    Label each message and write the result as a CoNLL file.

    For every message, only its Amharic text is kept and labeled; messages
    without Amharic text are skipped. Each token is written as
    ``token<TAB>label`` on its own line, and a blank line follows each
    message. The output file is overwritten, and a confirmation is printed.
    """
    with open(output_file, "w", encoding="utf-8") as conll:
        for raw_message in messages:
            amharic_only = extract_amharic_text(raw_message)
            if not amharic_only:
                # No Amharic content, nothing to label
                continue

            token_lines = [
                f"{token}\t{label}\n"
                for token, label in dynamic_label_message(amharic_only)
            ]
            conll.writelines(token_lines)
            conll.write("\n")  # Blank line to separate messages
    print(f"Labeled dataset saved to {output_file}")

# Script entry point: label the scraped messages and write the CoNLL dataset
if __name__ == "__main__":
    # Load the non-empty message texts from result.csv
    messages = read_messages_from_csv("result.csv")
    
    # Write them to labeled_dataset.conll (the default output path);
    # only the Amharic text of each message is labeled and saved
    save_to_conll(messages)


Labeled dataset saved to labeled_dataset.conll
