In [1]:
import re
import pandas as pd
from bs4 import BeautifulSoup
import os

class AmharicTextCleaner:
    def __init__(self):
        self.amharic_range = r'[\u1200-\u137F]'
        
    def clean_text(self, text):
        """Basic cleaning for Amharic text"""
        if pd.isna(text):
            return ""
            
        # Remove HTML tags
        text = BeautifulSoup(text, 'html.parser').get_text()
        
        # Normalize spaces
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        
        # Remove special characters except Amharic and basic punctuation
        text = re.sub(r'[^\w\s\u1200-\u137F.,!?]', '', text)
        
        return text
    
    def extract_amharic(self, text):
        """Extract only Amharic text segments"""
        matches = re.findall(r'[\u1200-\u137F]+(?:\s+[\u1200-\u137F]+)*', text)
        return ' '.join(matches).strip()
    
    def process_dataframe(self, df):
        """Process entire dataframe"""
        # Clean text
        df['clean_text'] = df['text'].apply(self.clean_text)
        
        # Extract Amharic text
        df['amharic_text'] = df['clean_text'].apply(self.extract_amharic)
        
        # Filter empty messages
        df = df[df['amharic_text'].str.len() > 0].copy()
        
        return df

def run_preprocessing():
    # Paths
    raw_path = os.path.join('..', '..', 'data', 'raw', 'telegram_messages.csv')
    processed_path = os.path.join('..', '..', 'data', 'processed', 'cleaned_messages.csv')
    
    # Create processed directory if not exists
    os.makedirs(os.path.dirname(processed_path), exist_ok=True)
    
    # Load and process data
    cleaner = AmharicTextCleaner()
    df = pd.read_csv(raw_path)
    processed_df = cleaner.process_dataframe(df)
    
    # Save processed data
    processed_df.to_csv(processed_path, index=False)
    print(f"Saved processed data to {processed_path}")
    return processed_df

if __name__ == "__main__":
    run_preprocessing()

Saved processed data to ..\..\data\processed\cleaned_messages.csv



If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  text = BeautifulSoup(text, 'html.parser').get_text()
