In [1]:
import pandas as pd
import regex as re
from underthesea import word_tokenize
from typing import List, Dict, Set, Literal
import logging
import unicodedata
from unidecode import unidecode
import os

In [3]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def convert_to_set_variations(terms: Set[str]) -> Set[str]:
    """Convert a set of underscore-separated terms into all possible variations.
    
    For example: 'thực_hiện' becomes {'thực_hiện', 'thực hiện'}
    
    Args:
        terms: Set of terms, some with underscores
        
    Returns:
        Set of terms with both underscore and space variations
    """
    variations = set()
    for term in terms:
        variations.add(term)  # Add original form
        if '_' in term:
            variations.add(term.replace('_', ' '))  # Add space version
        else:
            variations.add(term.replace(' ', '_'))  # Add underscore version
    return variations

# Cultural terms that should always be preserved
PRESERVE_TERMS_RAW = {
    # Religious and ceremonial places
    'lễ hội', 'di sản', 'văn hóa', 'tín ngưỡng', 'phong tục', 'tập quán',
    'nghi lễ', 'cúng', 'thờ', 'đình', 'chùa', 'miếu', 'đền', 'phủ',
    
    # Cultural heritage terms
    'di tích', 'danh thắng', 'di chỉ', 'cổ vật', 'hiện vật', 'bảo tàng',
    'truyền thống', 'bản sắc', 'dân tộc', 'văn bia', 'bia đá',
    
    # Traditional practices
    'hội làng', 'tế lễ', 'cầu an', 'cầu siêu', 'rước kiệu', 'tục lệ',
    'mai táng', 'thờ cúng', 'tổ tiên', 'thần linh'
}

# Default built-in stop words
DEFAULT_STOP_WORDS_RAW = {
    # Basic stop words
    'và', 'của', 'có', 'được', 'trong', 'các', 'là', 'những', 'cho', 'không',
    'để', 'này', 'khi', 'với', 'về', 'như', 'từ', 'theo', 'tại', 'trên',
    'đã', 'đến', 'sau', 'tới', 'vào', 'rồi', 'thì', 'mà', 'còn', 'nên',
    
    # Cultural heritage specific stop words
    'ngày', 'tháng', 'năm', 'hàng', 'được', 'cùng', 'theo', 'trong', 'ngoài',
    'trước', 'sau', 'đây', 'kia', 'ấy', 'vậy', 'nhất', 'cũng', 'lại', 'mới',
    
    # Temporal markers common in cultural texts
    'xưa', 'nay', 'trước đây', 'hiện nay', 'ngày xưa', 'thời', 'khoảng',
    'triều', 'đời', 'niên', 'kỷ', 'thế kỷ', 'thời kỳ', 'giai đoạn',
    
    # Ceremonial/ritual common words
    'buổi', 'cuộc', 'dịp', 'đợt', 'lần', 'mỗi', 'việc', 'điều', 'cách',
    'hành lễ', 'cử hành', 'tiến hành', 'thực hiện', 'tổ chức', 'diễn ra',
    
    # Historical document markers
    'theo', 'căn cứ', 'dựa vào', 'qua', 'từ đó', 'do đó', 'vì thế',
    'được biết', 'ghi chép', 'tương truyền', 'tục truyền', 'tương tự',
    
    # Measurement units and quantities
    'cái', 'chiếc', 'người', 'con', 'bộ', 'đôi', 'bên', 'phía', 'nhiều',
    'ít', 'vài', 'mấy', 'số', 'khoảng', 'chừng', 'độ', 'phần',
    
    # Location/spatial markers
    'nơi', 'chỗ', 'vùng', 'miền', 'khu vực', 'địa phương', 'vị trí',
    'hướng', 'phía', 'bên', 'trong', 'ngoài', 'trên', 'dưới',
    
    # Additional common words that often don't add semantic value
    'bằng', 'ra', 'đi', 'lên', 'xuống', 'hay', 'hoặc', 'nhưng', 'tuy',
    'dù', 'dẫu', 'mặc', 'dù', 'cho dù', 'tuy nhiên', 'song', 'nhưng mà',
    'bởi', 'bởi vì', 'vì', 'do', 'tại vì', 'nên', 'cho nên', 'vì vậy',
    'nếu', 'giả sử', 'giả như', 'trong trường hợp', 'khi mà', 'lúc',
    'lúc này', 'lúc đó', 'bấy giờ', 'hồi', 'hồi đó', 'thuở', 'thuở ấy',
    
    # Administrative and bureaucratic terms
    'quyết định', 'thông tư', 'nghị định', 'luật', 'điều', 'khoản', 'điểm',
    'phụ lục', 'ban hành', 'công bố', 'thực hiện', 'áp dụng', 'có hiệu lực',
    
    # Redundant descriptive words
    'rất', 'hết sức', 'vô cùng', 'cực kỳ', 'khá', 'khá là', 'tương đối',
    'hơi', 'có phần', 'phần nào', 'đôi chút', 'chút ít', 'ít nhiều'
}# Convert raw sets to include both underscore and space variations
PRESERVE_TERMS = convert_to_set_variations(PRESERVE_TERMS_RAW)
DEFAULT_STOP_WORDS = convert_to_set_variations(DEFAULT_STOP_WORDS_RAW)


In [4]:
def normalize_text_for_comparison(text: str) -> str:
    """Normalize text for comparison by handling spaces and underscores.
    """
    # Create variations with both spaces and underscores
    space_version = text.replace('_', ' ')
    underscore_version = text.replace(' ', '_')
    return f"{space_version}|{underscore_version}"

def remove_stop_words(tokens: List[str], stop_words: Set[str], min_word_length: int = 2) -> List[str]:
    """Remove stop words while preserving important terms."""
    result = []
    i = 0
    while i < len(tokens):
        # Try to match two-word phrases first
        two_word_match = False
        if i < len(tokens) - 1:
            two_words = f"{tokens[i]} {tokens[i+1]}"
            two_words_normalized = normalize_text_for_comparison(two_words)
            
            # Check if it's a preserve term
            if any(term in two_words_normalized for term in PRESERVE_TERMS):
                result.append(two_words)
                i += 2
                continue
            
            # Check if it's a stop word
            if any(term in two_words_normalized for term in stop_words):
                i += 2
                continue
        
        # If no two-word match, process single word
        if not two_word_match:
            token = tokens[i]
            token_normalized = normalize_text_for_comparison(token)
            
            # Enhanced filtering conditions
            should_keep = (
                # Not a stop word OR is a preserve term OR is long enough to be meaningful
                (not any(term in token_normalized for term in stop_words) or
                 any(term in token_normalized for term in PRESERVE_TERMS) or
                 len(token) > 10) and
                # Additional filters
                len(token) >= min_word_length and  # Remove very short words
                not token.isdigit() and  # Remove standalone numbers
                not re.match(r'^[IVX]+$', token.upper())  # Remove Roman numerals
            )
            
            if should_keep:
                result.append(token)
            i += 1
    
    return result

def remove_duplicate_phrases(tokens: List[str], max_phrase_length: int = 5) -> List[str]:
    """Remove duplicate phrases within the token list."""
    result = []
    seen_phrases = set()
    
    for i in range(len(tokens)):
        # Check for phrases of different lengths
        for phrase_len in range(1, min(max_phrase_length + 1, len(tokens) - i + 1)):
            phrase = ' '.join(tokens[i:i+phrase_len])
            
            if phrase_len == 1:  # Single word
                if phrase not in seen_phrases:
                    result.append(phrase)
                    seen_phrases.add(phrase)
                break
            else:  # Multi-word phrase
                if phrase in seen_phrases:
                    # Skip this entire phrase
                    break
                elif phrase_len == max_phrase_length or i + phrase_len == len(tokens):
                    # Add the longest possible unique phrase
                    result.extend(tokens[i:i+phrase_len])
                    seen_phrases.add(phrase)
                    break
    
    return result



In [5]:
def load_stopwords(mode: Literal['builtin', 'file', 'combined'] = 'builtin', 
                  filepath: str = 'vietnamese-stopwords.txt') -> Set[str]:
    """Load stop words based on specified mode.
    
    Args:
        mode: How to load stop words:
            - 'builtin': Use only built-in stop words
            - 'file': Use only file-based stop words
            - 'combined': Combine both built-in and file-based stop words
        filepath: Path to stop words file
    
    Returns:
        Set of stop words
    """
    file_stop_words = set()
    
    if mode in ['file', 'combined']:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                file_stop_words = {line.strip() for line in f if line.strip()}
            logger.info(f"Loaded {len(file_stop_words)} stop words from file")
        except FileNotFoundError:
            logger.warning(f"Stop words file {filepath} not found.")
            if mode == 'file':
                logger.warning("Falling back to built-in stop words.")
                return DEFAULT_STOP_WORDS
        except Exception as e:
            logger.error(f"Error loading stop words file: {str(e)}")
            if mode == 'file':
                return DEFAULT_STOP_WORDS
    
    if mode == 'builtin':
        logger.info(f"Using {len(DEFAULT_STOP_WORDS)} built-in stop words")
        return DEFAULT_STOP_WORDS
    elif mode == 'file':
        return file_stop_words
    else:  # combined
        combined = DEFAULT_STOP_WORDS.union(file_stop_words)
        logger.info(f"Combined stop words: {len(combined)} total "
                   f"({len(DEFAULT_STOP_WORDS)} built-in + {len(file_stop_words)} from file)")
        return combined

def normalize_unicode(text: str) -> str:
    """Normalize Unicode characters to canonical form."""
    return unicodedata.normalize('NFC', text)

def clean_text(text: str, remove_numbers: bool = True, remove_short_words: bool = True, min_word_length: int = 2) -> str:
    """Clean and normalize text."""
    if not isinstance(text, str):
        return ''
    
    # Normalize unicode
    text = normalize_unicode(text)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove numbers and dates if specified
    if remove_numbers:
        # Remove standalone numbers, years, dates
        text = re.sub(r'\b\d+\b', '', text)
        # Remove number ranges like "12-15", "2020-2022"
        text = re.sub(r'\d+\s*[-–]\s*\d+', '', text)
        # Remove decimal numbers
        text = re.sub(r'\d+[.,]\d+', '', text)
    
    # Remove multiple spaces and newlines
    text = re.sub(r'\s+', ' ', text)
    
    # Remove special characters but keep Vietnamese diacritics
    text = re.sub(r'[^\p{L}\p{N}\s]', ' ', text)
    
    # Remove parentheses and their contents
    text = re.sub(r'\([^)]*\)', '', text)
    
    # Remove brackets and their contents
    text = re.sub(r'\[[^\]]*\]', '', text)
    
    # Remove multiple spaces again after all removals
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()


In [6]:

def tokenize_and_preserve_terms(text: str) -> List[str]:
    """Tokenize text and preserve important multi-word terms."""
    tokens = word_tokenize(text)
    final_tokens = []
    i = 0
    while i < len(tokens):
        if i < len(tokens) - 1:
            bigram = tokens[i] + '_' + tokens[i+1]
            if bigram in PRESERVE_TERMS:
                final_tokens.append(bigram)
                i += 2
                continue
        final_tokens.append(tokens[i])
        i += 1
    return final_tokens

def filter_by_frequency(tokens: List[str], min_frequency: int = 2, max_frequency_ratio: float = 0.8) -> List[str]:
    """Filter tokens based on frequency within the document."""
    from collections import Counter
    
    # Count token frequencies
    token_counts = Counter(tokens)
    total_tokens = len(tokens)
    
    # Filter tokens
    filtered_tokens = []
    for token in tokens:
        frequency = token_counts[token]
        frequency_ratio = frequency / total_tokens
        
        # Keep tokens that appear at least min_frequency times but not too frequently
        if frequency >= min_frequency and frequency_ratio <= max_frequency_ratio:
            filtered_tokens.append(token)
        # Always keep preserved terms regardless of frequency
        elif any(term in token for term in PRESERVE_TERMS):
            filtered_tokens.append(token)
    
    return filtered_tokens

def preprocess_text(text: str, stop_words: Set[str], 
                   remove_numbers: bool = True, 
                   min_word_length: int = 2,
                   remove_duplicates: bool = True,
                   frequency_filter: bool = True) -> Dict[str, str]:
    """Process a single text entry.
    
    Args:
        text: Input text to process
        stop_words: Set of stop words to use
        remove_numbers: Whether to remove numbers and dates
        min_word_length: Minimum word length to keep
        remove_duplicates: Whether to remove duplicate phrases
        frequency_filter: Whether to apply frequency-based filtering
    
    Returns:
        Dictionary containing processed versions of the text
    """
    cleaned_text = clean_text(text, remove_numbers=remove_numbers, min_word_length=min_word_length)
    tokens = tokenize_and_preserve_terms(cleaned_text)
    tokens = remove_stop_words(tokens, stop_words, min_word_length=min_word_length)
    
    # Apply additional filtering
    if remove_duplicates:
        tokens = remove_duplicate_phrases(tokens)
    
    if frequency_filter and len(tokens) > 50:  # Only apply to longer texts
        tokens = filter_by_frequency(tokens)
    
    processed_text = ' '.join(tokens)
    return {
        'text_with_diacritics': processed_text,
        'text_without_diacritics': unidecode(processed_text),
        'token_count': len(tokens)
    }

def process_dataframe(df: pd.DataFrame, text_columns: List[str], 
                     mode: Literal['builtin', 'file', 'combined'] = 'builtin',
                     remove_numbers: bool = True,
                     min_word_length: int = 2,
                     remove_duplicates: bool = True,
                     frequency_filter: bool = True) -> pd.DataFrame:
    """Process multiple text columns in a dataframe.
    
    Args:
        df: Input dataframe
        text_columns: List of column names containing text to process
        mode: Which stop words to use ('builtin', 'file', or 'combined')
        remove_numbers: Whether to remove numbers and dates
        min_word_length: Minimum word length to keep
        remove_duplicates: Whether to remove duplicate phrases
        frequency_filter: Whether to apply frequency-based filtering
    
    Returns:
        Processed dataframe with new columns
    """
    stop_words = load_stopwords(mode)
    processed_df = df.copy()
    
    for column in text_columns:
        if column not in df.columns:
            logger.warning(f"Column {column} not found in dataframe")
            continue
            
        logger.info(f"Processing column: {column}")
        
        # Process each text entry with enhanced options
        processed_texts = [preprocess_text(text, stop_words, 
                                         remove_numbers=remove_numbers,
                                         min_word_length=min_word_length,
                                         remove_duplicates=remove_duplicates,
                                         frequency_filter=frequency_filter) 
                          for text in df[column]]
        
        # Add new columns for processed text
        processed_df[f"{column}_processed"] = [text['text_with_diacritics'] for text in processed_texts]
        processed_df[f"{column}_normalized"] = [text['text_without_diacritics'] for text in processed_texts]
        processed_df[f"{column}_token_count"] = [text['token_count'] for text in processed_texts]
    
    return processed_df

In [10]:
if __name__ == "__main__":
    try:
        df = pd.read_csv('merged.csv')
        df = df.drop_duplicates()
        logger.info(f"Loaded CSV file with {len(df)} rows")
        
        # Process with different stop words options
        text_columns = ['title', 'content']
        
        # Test enhanced processing with different configurations
        configs = [
            # {'mode': 'combined', 'suffix': 'enhanced_basic', 'remove_numbers': True, 'min_word_length': 2, 'remove_duplicates': True, 'frequency_filter': False},
            {'mode': 'combined', 'suffix': 'enhanced_full', 'remove_numbers': True, 'min_word_length': 3, 'remove_duplicates': True, 'frequency_filter': True},
            # {'mode': 'combined', 'suffix': 'enhanced_aggressive', 'remove_numbers': True, 'min_word_length': 4, 'remove_duplicates': True, 'frequency_filter': True}
        ]
        
        for config in configs:
            logger.info(f"Processing with {config['suffix']} configuration...")
            processed_df = process_dataframe(df, text_columns, 
                                           mode=config['mode'],
                                           remove_numbers=config['remove_numbers'],
                                           min_word_length=config['min_word_length'],
                                           remove_duplicates=config['remove_duplicates'],
                                           frequency_filter=config['frequency_filter'])
            
            output_file = f'processed_vietnamese_texts_{config["suffix"]}.csv'
            processed_df.to_csv(output_file, index=False)
            
            # Log token reduction statistics
            original_tokens = len(df['content'][0].split())
            processed_tokens = processed_df['content_token_count'][0]
            reduction_pct = ((original_tokens - processed_tokens) / original_tokens) * 100
            
            logger.info(f"{config['suffix']}: {original_tokens} -> {processed_tokens} tokens ({reduction_pct:.1f}% reduction)")
        
        # Log statistics about stop words
        builtin_words = load_stopwords('builtin')
        file_words = load_stopwords('file')
        combined_words = load_stopwords('combined')
        
        logger.info(f"Built-in stop words count: {len(builtin_words)}")
        logger.info(f"File-based stop words count: {len(file_words)}")
        logger.info(f"Combined stop words count: {len(combined_words)}")
        logger.info(f"Unique words added from file: {len(combined_words - builtin_words)}")
        
    except Exception as e:
        logger.error(f"Error processing file: {str(e)}") 

INFO:__main__:Loaded CSV file with 966 rows
INFO:__main__:Processing with enhanced_full configuration...
INFO:__main__:Loaded 1942 stop words from file
INFO:__main__:Combined stop words: 2066 total (229 built-in + 1942 from file)
INFO:__main__:Processing column: title
INFO:__main__:Processing column: content
INFO:__main__:enhanced_full: 2149 -> 93 tokens (95.7% reduction)
INFO:__main__:Using 229 built-in stop words
INFO:__main__:Loaded 1942 stop words from file
INFO:__main__:Loaded 1942 stop words from file
INFO:__main__:Combined stop words: 2066 total (229 built-in + 1942 from file)
INFO:__main__:Built-in stop words count: 229
INFO:__main__:File-based stop words count: 1942
INFO:__main__:Combined stop words count: 2066
INFO:__main__:Unique words added from file: 1837


In [None]:
# Add a simple Vietnamese stemming function (basic suffix removal)
def simple_vietnamese_stem(word: str) -> str:
    """Simple Vietnamese stemming by removing common suffixes."""
    if len(word) <= 3:
        return word
    
    # Common Vietnamese suffixes to remove
    suffixes = ['tion', 'sion', 'ness', 'ment', 'able', 'ible', 'ful', 'less', 'ly', 'ed', 'ing', 's']
    vietnamese_suffixes = ['hóa', 'tính', 'gia', 'sư', 'viên', 'thủ', 'học']
    
    word_lower = word.lower()
    
    # Try Vietnamese suffixes first
    for suffix in vietnamese_suffixes:
        if word_lower.endswith(suffix) and len(word) > len(suffix) + 2:
            return word[:-len(suffix)]
    
    # Try English suffixes (for loanwords)
    for suffix in suffixes:
        if word_lower.endswith(suffix) and len(word) > len(suffix) + 2:
            return word[:-len(suffix)]
    
    return word

def apply_stemming(tokens: List[str]) -> List[str]:
    """Apply stemming to tokens while preserving important terms."""
    stemmed_tokens = []
    for token in tokens:
        # Don't stem preserved terms
        if any(term in token for term in PRESERVE_TERMS):
            stemmed_tokens.append(token)
        else:
            stemmed_tokens.append(simple_vietnamese_stem(token))
    return stemmed_tokens

# Test the enhanced processing
def run_enhanced_processing_test():
    """Test the enhanced processing pipeline."""
    try:
        df = pd.read_csv('merged.csv')
        logger.info(f"Loaded CSV file with {len(df)} rows")
        
        # Test with first row only for quick comparison
        sample_text = df['content'][0]
        original_length = len(sample_text)
        original_tokens = len(sample_text.split())
        
        logger.info(f"Original text: {original_length} characters, {original_tokens} tokens")
        
        # Test different processing levels
        stop_words = load_stopwords('combined')
        
        # Basic processing (current)
        result_basic = preprocess_text(sample_text, stop_words, 
                                     remove_numbers=False, min_word_length=1, 
                                     remove_duplicates=False, frequency_filter=False)
        
        # Enhanced processing
        result_enhanced = preprocess_text(sample_text, stop_words,
                                        remove_numbers=True, min_word_length=3,
                                        remove_duplicates=True, frequency_filter=True)
        
        logger.info(f"Basic processing: {result_basic['token_count']} tokens ({((original_tokens - result_basic['token_count']) / original_tokens * 100):.1f}% reduction)")
        logger.info(f"Enhanced processing: {result_enhanced['token_count']} tokens ({((original_tokens - result_enhanced['token_count']) / original_tokens * 100):.1f}% reduction)")
        
        return result_basic, result_enhanced
        
    except Exception as e:
        logger.error(f"Error in test: {str(e)}")
        return None, None


In [None]:
# Run comparison test
basic_result, enhanced_result = run_enhanced_processing_test()


In [15]:
result = pd.read_csv('processed_vietnamese_texts_builtin.csv')

result['content_processed']

0    Dao đỏ xã Hồ Thầu huyện Hoàng Su Phì tỉnh Hà G...
1    Lễ cầu mùa Cờ Lao đỏ xã Túng Sán huyện Hoàng S...
2    Tây Ninh 04 tôn giáo chính Phật giáo Thiên chú...
3    Khèn tiếng Mông gọi là Khềnh Kềnh Kỳ nhạc khí ...
4    Dinh Thầy Thím tọa lạc giữa khu rừng dầu Bàu C...
Name: content_processed, dtype: object

In [19]:
len(df['content'][0])

9661

In [17]:
len(result['content_processed'][0])

7604

In [20]:
result2 = pd.read_csv('processed_vietnamese_texts_file.csv')

In [21]:
len(result2['content_processed'][0])

6381

In [23]:
result2['content_processed'][0]

'Dao đỏ xã Hồ Thầu huyện Hoàng Su Phì tỉnh Hà Giang nhắc câu chuyện vị vua Bình Vương yêu thương dân Vua hộ vệ yêu Bàn Hồ long khuyển thước lông đen vằn vàng mượt nhung Cao Vương vua láng giềng đánh chiếm biên ải cảnh lầm than dân chúng Bình Vương cử binh hùng tướng trấn giữ biên ải đánh đuổi Cao Vương Trong nguy cấp Bàn Hồ hiến kế giúp Bình Vương đánh đuổi Cao Vương đem bình yên dân chúng lập công Bàn Hồ bỗng hóa thành chàng trai khôi ngô tuấn tú Bình Vương gả Tam công chúa sinh 12 6 trai 6 gái Vua Bình Vương phong vương Bàn Hồ lấy hiệu Bàn Vương ban sắc 12 Bàn Vương 12 thủy tổ tộc người Dao xấu Bàn Vương sơn dương sừng đâm ngã chết Gù hương con cháu Bàn Vương chặt Gù Hương thân trống lột da sơn dương mặt trống tế lễ Bàn Vương thế hệ con cháu Dao Dao xã Hồ Thầu tổ chức cúng tạ Bàn Vương tưởng nhớ vị sư tổ anh hùng dân tộc giáo dục truyền thống dũng cảm tự tin dịp cầu mong Bàn Vương che trở con cháu bình an may mắn Tùy ngành Dao lễ cúng nhìn chung tục cúng Bàn Vương cơ bản Dao đỏ 5 cún