In [21]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from telethon import TelegramClient
from dotenv import load_dotenv

In [22]:
df = pd.read_csv('../telegram_data.csv', encoding='ISO-8859-1')
print(df.head())

         Channel Title    Channel Username    ID  \
0  Sheger online-store  @Shageronlinestore  6206   
1  Sheger online-store  @Shageronlinestore  6205   
2  Sheger online-store  @Shageronlinestore  6204   
3  Sheger online-store  @Shageronlinestore  6203   
4  Sheger online-store  @Shageronlinestore  6202   

                                             Message  \
0  ð¥Delux Foldable multifunctional Draying RAC...   
1  #á ááá_áá°á£áá½á_á áµáá_...   
2  ð¥Bulb Rotating security camera 360Â°ð¥\n\...   
3  ð¥Bulb Rotating security camera 360Â°ð¥\n\...   
4                                                NaN   

                        Date                          Media Path  
0  2025-01-16 10:07:54+00:00  photos\@Shageronlinestore_6206.jpg  
1  2025-01-16 09:20:43+00:00  photos\@Shageronlinestore_6205.jpg  
2  2025-01-16 07:21:13+00:00                                 NaN  
3  2025-01-16 07:21:13+00:00  photos\@Shageronlinestore_6203.jpg  
4  2025-01-15 1

In [23]:
import re

def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove special characters (except for spaces and words)
    text = re.sub(r'[^A-Za-z0-9አ-፸ዀ-፽ሀ-ሿቀ-ች፡\s]', '', text)  # Amharic characters are handled here
    
    # Remove extra spaces
    text = ' '.join(text.split())
    
    # Normalize to lowercase (optional, depends on case sensitivity)
    text = text.lower()
    
    return text

# Apply cleaning to the 'Message' column
df['cleaned_message'] = df['Message'].apply(lambda x: clean_text(str(x)))

# Check the cleaned data
print(df[['Message', 'cleaned_message']].head())


                                             Message  \
0  ð¥Delux Foldable multifunctional Draying RAC...   
1  #á ááá_áá°á£áá½á_á áµáá_...   
2  ð¥Bulb Rotating security camera 360Â°ð¥\n\...   
3  ð¥Bulb Rotating security camera 360Â°ð¥\n\...   
4                                                NaN   

                                     cleaned_message  
0  delux foldable multifunctional draying rack 52...  
1  automatic rotating nozzle 360 rotating avariet...  
2  bulb rotating security camera 360 lens 360 deg...  
3  bulb rotating security camera 360 lens 360 deg...  
4                                                nan  


In [24]:
import re

def preprocess_text(text):
    # Remove URLs, mentions, and hashtags
    text = re.sub(r"(https?://\S+|@\w+|#\w+)", "", text)

    # Remove punctuation and convert to lowercase
    text = re.sub(r"[^\w\s]", "", text).lower()

    # Tokenize text
    tokens = text.split()

    # Handle stop words or Amharic-specific stemming (customize for Amharic)
    stop_words = ["እንደ", "አምላክ"]  # Add more stop words
    tokens = [token for token in tokens if token not in stop_words]

    return tokens


Tokenization

In [25]:
import re

# Custom tokenization function for Amharic
def amharic_tokenize(text):
    # Split by spaces and common Amharic punctuation
    tokens = re.findall(r'[\u1200-\u137F]+|[.,!?።፣፤፥፦፧]', text)
    return tokens

# Apply tokenization
df['tokens'] = df['cleaned_message'].apply(lambda x: amharic_tokenize(x))



Normalization

In [26]:
# Custom normalization function for Amharic
def amharic_normalize(text):
    # Replace non-word characters
    text = re.sub(r'[^\u1200-\u137F\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply normalization
df['normalized_message'] = df['cleaned_message'].apply(lambda x: amharic_normalize(x))


 Handling Amharic-Specific Linguistic Features

In [27]:
# Define a list of Amharic stop words
amharic_stopwords = ['እኔ', 'አሁን', 'እኛ', 'ነው', 'ነበር']

# Function to remove stop words
def remove_stopwords(tokens):
    return [token for token in tokens if token not in amharic_stopwords]

# Apply stop words removal
df['filtered_tokens'] = df['tokens'].apply(lambda x: remove_stopwords(x))


In [28]:
def preprocess_amharic(text):
    # Normalize the text
    normalized_text = amharic_normalize(text)
    # Tokenize the normalized text
    tokens = amharic_tokenize(normalized_text)
    # Remove stop words
    filtered_tokens = remove_stopwords(tokens)
    return filtered_tokens

# Apply preprocessing pipeline
df['processed_tokens'] = df['cleaned_message'].apply(lambda x: preprocess_amharic(x))


In [29]:
df.to_csv('preprocessed_amharic_data.csv', index=False)

In [30]:
print(df.head())  # Display the first few rows
print(df.info())  # Check data types and null values


         Channel Title    Channel Username    ID  \
0  Sheger online-store  @Shageronlinestore  6206   
1  Sheger online-store  @Shageronlinestore  6205   
2  Sheger online-store  @Shageronlinestore  6204   
3  Sheger online-store  @Shageronlinestore  6203   
4  Sheger online-store  @Shageronlinestore  6202   

                                             Message  \
0  ð¥Delux Foldable multifunctional Draying RAC...   
1  #á ááá_áá°á£áá½á_á áµáá_...   
2  ð¥Bulb Rotating security camera 360Â°ð¥\n\...   
3  ð¥Bulb Rotating security camera 360Â°ð¥\n\...   
4                                                NaN   

                        Date                          Media Path  \
0  2025-01-16 10:07:54+00:00  photos\@Shageronlinestore_6206.jpg   
1  2025-01-16 09:20:43+00:00  photos\@Shageronlinestore_6205.jpg   
2  2025-01-16 07:21:13+00:00                                 NaN   
3  2025-01-16 07:21:13+00:00  photos\@Shageronlinestore_6203.jpg   
4  2025-01

In [31]:
df['tokens'] = df['tokens'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
# Drop duplicates without considering the 'tokens' column
df = df.drop_duplicates(subset=['Channel Title', 'Channel Username', 'ID', 'Date', 'Message'])
tokens_df = df[['tokens']]
df = df.drop(columns=['tokens'])  # Remove tokens from the main dataframe if necessary
print(df.dtypes)
print(df['tokens'].apply(type).value_counts())
# Rejoin tokens as a separate column
df['tokens'] = tokens_df['tokens']


Channel Title         object
Channel Username      object
ID                     int64
Message               object
Date                  object
Media Path            object
cleaned_message       object
normalized_message    object
filtered_tokens       object
processed_tokens      object
dtype: object


KeyError: 'tokens'

In [32]:


# Replace missing message content with an empty string
df['Message'] = df['Message'].fillna('').astype(str)

# Convert date to a proper datetime format
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Drop rows with invalid dates
df = df.dropna(subset=['Date'])

# Remove duplicates
df = df.drop_duplicates()

# Reset the index after cleaning
df = df.reset_index(drop=True)


TypeError: unhashable type: 'list'

In [33]:
print(df.columns)


Index(['Channel Title', 'Channel Username', 'ID', 'Message', 'Date',
       'Media Path', 'cleaned_message', 'normalized_message',
       'filtered_tokens', 'processed_tokens'],
      dtype='object')


In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Download necessary resources (if not already installed)
nltk.download('punkt')

# Ensure all entries in 'cleaned_message' are strings and handle NaNs
df['cleaned_message'] = df['cleaned_message'].fillna('').astype(str)

# Define a safe tokenization function with error handling
def safe_tokenize(text):
    try:
        # Tokenize the cleaned message
        return word_tokenize(text)
    except Exception as e:
        print(f"Error tokenizing message: {e}")
        return []

# Apply the safe tokenization function
df['tokens'] = df['cleaned_message'].apply(lambda x: safe_tokenize(x))

# Check the tokenized output
print(df[['cleaned_message', 'tokens']].head())




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Error tokenizing message: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\hp/nltk_data'
    - 'c:\\Users\\hp\\anaconda\\nltk_data'
    - 'c:\\Users\\hp\\anaconda\\share\\nltk_data'
    - 'c:\\Users\\hp\\anaconda\\lib\\nltk_data'
    - 'C:\\Users\\hp\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************

Error tokenizing message: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('pu