# Vietnamese ABSA Dataset Preprocessing (Fixed)

This notebook fetches raw VLSP 2018 ABSA data from GitHub and applies comprehensive Vietnamese preprocessing.

**Data Source:** https://github.com/ancan203/SentimentAnalysis/tree/main/VLSP2018

**Features:**
- Downloads raw VLSP2018 files directly from GitHub
- Vietnamese text cleaning (removal of HTML tags, emoji, URLs, email addresses, phone numbers, and hashtags)
- Vietnamese tone normalization (VinAI rules)
- Traditional preprocessing (lowercase, whitespace, short words)
- Outputs compatible with both Traditional ML and PhoBERT notebooks
- Proper train/dev/test splits for fair comparison

## Install Dependencies

In [None]:
!pip install emoji regex pandas numpy

import os
import re
import urllib.request
import warnings
from typing import Any, Dict, List, Optional, Tuple

import emoji
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

print("Dependencies installed successfully!")

## Download Raw VLSP2018 Data from GitHub

In [None]:
# Download raw VLSP2018 data files from GitHub
base_url = "https://raw.githubusercontent.com/ancan203/SentimentAnalysis/main/VLSP2018/"

# Raw data files to download
raw_files = [
    'VLSP2018-SA-Hotel-train.txt',
    'VLSP2018-SA-Hotel-dev.txt',
    'VLSP2018-SA-Hotel-test.txt',
    'VLSP2018-SA-Restaurant-train.txt',
    'VLSP2018-SA-Restaurant-dev.txt',
    'VLSP2018-SA-Restaurant-test.txt',
]

# Create raw_data directory
os.makedirs('raw_data', exist_ok=True)

print("Downloading raw VLSP2018 ABSA data files...")

# Each file is fetched independently so one failure does not stop the others.
failed_files = []
for filename in raw_files:
    url = base_url + filename
    local_path = f'raw_data/{filename}'
    # Download next to the final path and move into place only on success, so
    # an interrupted transfer never leaves a truncated file under the real
    # name (and never destroys a good copy from an earlier run).
    partial_path = local_path + '.part'
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, local_path)
    except Exception as e:
        # Best-effort download: report the problem and carry on.
        failed_files.append(filename)
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"✗ Failed to download {filename}: {e}")
    else:
        print(f"✓ Downloaded {filename}")

if failed_files:
    print(f"\nRaw data download finished with {len(failed_files)} failure(s): {', '.join(failed_files)}")
else:
    print("\nRaw data download complete!")

# A file listed here after a failed download is a copy kept from an earlier run.
print("Files available for preprocessing:")
for file in raw_files:
    if os.path.exists(f'raw_data/{file}'):
        size = os.path.getsize(f'raw_data/{file}')
        print(f"  {file}: {size:,} bytes")

## Vietnamese Text Cleaning Classes

In [None]:
class VietnameseTextCleaner:
    """Cleaning helpers for raw Vietnamese review text.

    Every method is a static ``str -> str`` transformation, so the helpers
    can be used one at a time or chained through ``process_text``.
    """

    # Vietnamese letters (lower- and upper-case) kept by the character
    # whitelist in ``remove_unnecessary_characters``. The whitelist matches
    # single code points, so these are assumed to be in composed (NFC) form.
    VN_CHARS = 'áàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđÁÀẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÍÌỈĨỊÚÙỦŨỤƯỨỪỬỮỰÝỲỶỸỴĐ'

    @staticmethod
    def _to_nfc(text):
        """Compose "base letter + combining mark" sequences into single code points."""
        # Imported locally because the notebook's import cell does not load it.
        import unicodedata
        return unicodedata.normalize('NFC', text)

    @staticmethod
    def remove_html(text):
        """Remove HTML tags (anything between ``<`` and the next ``>``)."""
        return re.sub(r'<[^>]*>', '', text)

    @staticmethod
    def remove_emoji(text):
        """Remove emojis using the ``emoji`` package."""
        return emoji.replace_emoji(text, '')

    @staticmethod
    def remove_url(text):
        """Remove http/https URLs, including any path and query string."""
        return re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)', '', text)

    @staticmethod
    def remove_email(text):
        """Remove email addresses."""
        # The top-level domain class is [A-Za-z]; the previous [A-Z|a-z] also
        # accepted a literal '|' and could swallow text following the address.
        return re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)

    @staticmethod
    def remove_phone_number(text):
        """Remove digit runs shaped like phone numbers (three groups of 3-4 digits)."""
        return re.sub(r'\b\d{3,4}[-.]?\d{3,4}[-.]?\d{3,4}\b', '', text)

    @staticmethod
    def remove_hashtags(text):
        """Remove hashtags (``#`` followed by word characters)."""
        return re.sub(r'#\w+', '', text)

    @staticmethod
    def remove_unnecessary_characters(text):
        """Drop every character outside the whitelist.

        Kept: ASCII letters and digits, whitespace, the letters in
        ``VN_CHARS`` and the punctuation ``.,!?;:-``.
        """
        # Decomposed input stores tone marks as separate combining characters,
        # which the whitelist would delete (turning "á" into "a"). Compose
        # first so accented letters survive the filter.
        text = VietnameseTextCleaner._to_nfc(text)
        pattern = r'[^a-zA-Z0-9\s' + re.escape(VietnameseTextCleaner.VN_CHARS) + r'.,!?;:-]'
        return re.sub(pattern, '', text)

    @staticmethod
    def process_text(text):
        """Apply all Vietnamese cleaning steps and strip the result.

        Returns an empty string for empty or non-string input.
        """
        if not text or not isinstance(text, str):
            return ''

        # Compose up front so every later pattern sees one code point per
        # accented letter.
        text = VietnameseTextCleaner._to_nfc(text)

        # Order matters: URLs and emails go before hashtag removal and the
        # whitelist filter, which would otherwise break them into fragments.
        text = VietnameseTextCleaner.remove_html(text)
        text = VietnameseTextCleaner.remove_emoji(text)
        text = VietnameseTextCleaner.remove_url(text)
        text = VietnameseTextCleaner.remove_email(text)
        text = VietnameseTextCleaner.remove_phone_number(text)
        text = VietnameseTextCleaner.remove_hashtags(text)
        text = VietnameseTextCleaner.remove_unnecessary_characters(text)

        return text.strip()

print("Vietnamese text cleaner loaded!")

In [None]:
class VietnameseToneNormalizer:
    """Helpers for normalising Vietnamese tone-mark placement and Unicode form."""

    # One row per base vowel: the unmarked vowel, its five tone-marked forms,
    # and finally a short ASCII spelling of the base vowel.
    VOWELS_TABLE = [
        ['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
        ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
        ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
        ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
        ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
        ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
        ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
        ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
        ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
        ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
        ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
        ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y'],
    ]

    # Reverse index of the table: vowel -> (row, column). Only the first six
    # columns hold vowels; the trailing ASCII spelling is left out.
    VOWELS_TO_IDS = {
        vowel: (row_index, tone_index)
        for row_index, row in enumerate(VOWELS_TABLE)
        for tone_index, vowel in enumerate(row[:6])
    }

    # Tone-placement rewrites for the "oa", "oe" and "uy" clusters, moving the
    # mark from the first vowel to the second. Every base pair is expanded to
    # its lower-case, capitalised and upper-case spelling, in that order.
    VINAI_NORMALIZED_TONE = {
        recase(first_vowel_marked): recase(second_vowel_marked)
        for first_vowel_marked, second_vowel_marked in (
            ('òa', 'oà'), ('óa', 'oá'), ('ỏa', 'oả'), ('õa', 'oã'), ('ọa', 'oạ'),
            ('òe', 'oè'), ('óe', 'oé'), ('ỏe', 'oẻ'), ('õe', 'oẽ'), ('ọe', 'oẹ'),
            ('ùy', 'uỳ'), ('úy', 'uý'), ('ủy', 'uỷ'), ('ũy', 'uỹ'), ('ụy', 'uỵ'),
        )
        for recase in (str, str.capitalize, str.upper)
    }

    @staticmethod
    def normalize_unicode(text):
        """Return *text* in Unicode NFC (composed) form."""
        from unicodedata import normalize as to_form
        return to_form('NFC', text)

    @staticmethod
    def normalize_sentence_typing(text, vinai_normalization=True):
        """Rewrite tone placement across a whole sentence.

        With ``vinai_normalization`` disabled the text is returned untouched.
        """
        if not vinai_normalization:
            return text

        rewritten = text
        for old_form, new_form in VietnameseToneNormalizer.VINAI_NORMALIZED_TONE.items():
            rewritten = rewritten.replace(old_form, new_form)
        return rewritten

    @staticmethod
    def normalize_word_typing(word):
        """Placeholder for word-level normalisation; returns *word* unchanged."""
        return word

    @staticmethod
    def is_valid_vietnamese_word(word):
        """Simplified check: True when *word* has at least one letter from VN_CHARS."""
        for character in word:
            if character in VietnameseTextCleaner.VN_CHARS:
                return True
        return False

print("Vietnamese tone normalizer loaded!")

## Traditional Preprocessing

In [None]:
class TraditionalPreprocessor:
    """Language-agnostic text normalisation steps, exposed as static helpers."""

    @staticmethod
    def to_lowercase(text):
        """Return *text* lower-cased."""
        lowered = text.lower()
        return lowered

    @staticmethod
    def remove_punctuation(text):
        """Strip symbols, keeping word characters, whitespace and ``.,!?;:-``."""
        # Characters allowed to stay; everything else is deleted.
        allowed = r'\w\s' + re.escape(VietnameseTextCleaner.VN_CHARS) + r'.,!?;:-'
        return re.sub('[^' + allowed + ']', '', text)

    @staticmethod
    def remove_extra_whitespace(text):
        """Collapse each whitespace run to one space and trim both ends."""
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()

    @staticmethod
    def remove_numbers(text):
        """Delete digit runs that stand alone between word boundaries."""
        standalone_number = r'\b\d+\b'
        return re.sub(standalone_number, '', text)

    @staticmethod
    def remove_short_words(text, min_length=2):
        """Keep only whitespace-separated tokens of at least *min_length* characters."""
        long_enough = (token for token in text.split() if not len(token) < min_length)
        return ' '.join(long_enough)

print("Traditional preprocessor loaded!")

## Main VLSP2018 Preprocessor

In [None]:
class VLSP2018Preprocessor:
    """Main preprocessor for the VLSP 2018 ABSA dataset.

    Parses raw annotation files, cleans the review text, and builds either a
    long table (one row per aspect/sentiment pair) or a wide table (one row
    per review, one column per aspect).

    ``stats`` accumulates over every ``process_dataset`` call made on the
    instance:
      * ``processed_samples`` - reviews whose cleaned text was non-empty
      * ``skipped_samples``   - reviews dropped because cleaning emptied them
      * ``domains`` / ``aspects`` / ``sentiments`` - per-value record counts
    """

    # Review-level columns shared by the long and the wide table.
    _META_COLUMNS = ['sample_id', 'text', 'original_text', 'domain']
    # Full column set of the long table returned by ``process_dataset``.
    _RECORD_COLUMNS = _META_COLUMNS + ['aspect', 'sentiment', 'sentiment_numeric']

    def __init__(self, apply_vietnamese_steps=True, apply_tone_normalization=True):
        self.apply_vietnamese_steps = apply_vietnamese_steps
        self.apply_tone_normalization = apply_tone_normalization
        self.stats = {
            'processed_samples': 0,
            'skipped_samples': 0,
            'domains': {},
            'aspects': {},
            'sentiments': {}
        }

    def parse_vlsp_file(self, file_path: str) -> List[Dict]:
        """Parse a VLSP 2018 annotation file into a list of sample dicts.

        Samples are separated by blank lines. A sample that fails to parse is
        reported on stdout and skipped, so one bad entry does not abort the file.
        """
        # 'utf-8-sig' reads plain UTF-8 too, and drops a leading BOM that would
        # otherwise stick to the first line of the first sample.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            content = f.read().strip()

        samples = []
        # Separator lines may hold stray spaces or tabs, so split on any
        # whitespace-only gap rather than on an exact '\n\n'.
        for index, raw_sample in enumerate(re.split(r'\n\s*\n', content), start=1):
            raw_sample = raw_sample.strip()
            if not raw_sample:
                continue
            try:
                sample = self._parse_single_sample(raw_sample, index)
            except Exception as e:
                print(f"Error parsing sample {index}: {e}")
                continue
            if sample:
                samples.append(sample)

        return samples

    def _parse_single_sample(self, raw_sample: str, sample_id: int) -> Optional[Dict]:
        """Parse one blank-line-delimited sample.

        Two layouts are understood:

        * "#<n>" header line, review text, then all labels on one line as
          ``{ASPECT#ATTRIBUTE, polarity}, {ASPECT#ATTRIBUTE, polarity}``
          (believed to be the layout of the official VLSP 2018 release -
          TODO confirm against the downloaded files);
        * review text followed by one ``{category#term#sentiment}`` or
          ``{category#sentiment}`` label per line.

        Returns ``None`` when the sample has fewer than two lines, otherwise a
        dict with ``sample_id``, ``text`` and ``aspect_sentiments``.
        """
        lines = [line.strip() for line in raw_sample.strip().split('\n')]

        if len(lines) < 2:
            return None

        # A bare "#<n>" first line is a sample counter, not review text. It is
        # only dropped when the next line is not already a label line.
        if re.fullmatch(r'#\d+', lines[0]) and not lines[1].startswith('{'):
            lines = lines[1:]

        text = lines[0]

        aspect_sentiments = []
        for line in lines[1:]:
            if '{' in line:
                aspect_sentiments.extend(self._parse_label_line(line))

        return {
            'sample_id': sample_id,
            'text': text,
            'aspect_sentiments': aspect_sentiments
        }

    @staticmethod
    def _parse_label_line(line: str) -> List[Dict]:
        """Extract the aspect/sentiment pairs from one label line."""
        # "{ASPECT, polarity}" groups: the polarity follows the last comma.
        comma_groups = [
            group for group in re.findall(r'\{([^{}]*)\}', line) if ',' in group
        ]
        if comma_groups:
            pairs = []
            for group in comma_groups:
                aspect, sentiment = group.rsplit(',', 1)
                pairs.append({'aspect': aspect.strip(), 'sentiment': sentiment.strip()})
            return pairs

        # '#'-separated layout: {category#term#sentiment} or {category#sentiment}.
        parts = line.strip('{}').strip().split('#')
        if len(parts) < 2:
            return []
        if len(parts) == 2:
            aspect = parts[0].strip()
            sentiment = parts[1].strip()
        else:
            # Combine category and term into a single aspect label.
            aspect = f"{parts[0].strip()}#{parts[1].strip()}"
            sentiment = parts[2].strip()
        return [{'aspect': aspect, 'sentiment': sentiment}]

    def preprocess_text(self, text: str) -> str:
        """Apply all preprocessing steps to *text*.

        Returns an empty string for empty or non-string input.
        """
        if not text or not isinstance(text, str):
            return ''

        # Vietnamese-specific steps
        if self.apply_vietnamese_steps:
            if self.apply_tone_normalization:
                # Compose the text before cleaning: the cleaner's character
                # whitelist would otherwise delete combining tone marks.
                text = VietnameseToneNormalizer.normalize_unicode(text)

            text = VietnameseTextCleaner.process_text(text)

            if self.apply_tone_normalization:
                text = VietnameseToneNormalizer.normalize_sentence_typing(text)

        # Traditional preprocessing steps
        text = TraditionalPreprocessor.to_lowercase(text)
        text = TraditionalPreprocessor.remove_extra_whitespace(text)
        text = TraditionalPreprocessor.remove_short_words(text, min_length=2)

        return text.strip()

    def process_dataset(self, samples: List[Dict], domain: str) -> pd.DataFrame:
        """Turn parsed samples into a long DataFrame (one row per label).

        Samples whose cleaned text is empty are counted as skipped. The result
        always has the full column set, even when it has no rows.
        """
        processed_data = []

        for sample in samples:
            original_text = sample['text']
            cleaned_text = self.preprocess_text(original_text)

            if not cleaned_text:
                self.stats['skipped_samples'] += 1
                continue

            # Counted once per review; per-record totals are in stats['domains'].
            self.stats['processed_samples'] += 1

            for asp_sent in sample['aspect_sentiments']:
                aspect = asp_sent['aspect']
                sentiment = asp_sent['sentiment']

                processed_data.append({
                    'sample_id': sample['sample_id'],
                    'text': cleaned_text,
                    'original_text': original_text,
                    'domain': domain,
                    'aspect': aspect,
                    'sentiment': sentiment,
                    'sentiment_numeric': self._map_sentiment(sentiment)
                })

                self._update_stats(domain, aspect, sentiment)

        # Explicit columns keep an empty result usable by downstream code.
        return pd.DataFrame(processed_data, columns=self._RECORD_COLUMNS)

    def _map_sentiment(self, sentiment: str) -> int:
        """Map a sentiment string to 1 / -1 / 0.

        Matching ignores case and surrounding whitespace. Unrecognised labels
        map to 0, i.e. they are indistinguishable from 'neutral'.
        """
        sentiment_mapping = {
            'positive': 1,
            'negative': -1,
            'neutral': 0
        }
        return sentiment_mapping.get(sentiment.strip().lower(), 0)

    def _update_stats(self, domain: str, aspect: str, sentiment: str):
        """Increment the per-domain, per-aspect and per-sentiment record counters."""
        for bucket, key in (('domains', domain), ('aspects', aspect), ('sentiments', sentiment)):
            counts = self.stats[bucket]
            counts[key] = counts.get(key, 0) + 1

    def create_colab_compatible_format(self, df: pd.DataFrame, all_aspects=None,
                                       absent_value: int = 0) -> pd.DataFrame:
        """Pivot the long table into one row per review, one column per aspect.

        Args:
            df: long table as returned by ``process_dataset``.
            all_aspects: optional aspect names that must appear as columns even
                when *df* has no rows for them. Pass the same list for
                train/dev/test so every split gets identical columns. Aspects
                present in *df* are always included.
            absent_value: value stored for aspects a review does not mention.
                The default 0 equals the numeric code for 'neutral'; pass
                another value to tell "not mentioned" apart from "neutral".

        Returns:
            DataFrame with the review-level columns followed by the aspect
            columns in sorted order. An empty *df* gives an empty frame with
            the same columns.
        """
        requested_aspects = set(all_aspects) if all_aspects is not None else set()

        # An empty frame may have no 'aspect' column at all, so handle it
        # before touching any column.
        if df.empty:
            return pd.DataFrame(columns=self._META_COLUMNS + sorted(requested_aspects))

        aspect_columns = sorted(requested_aspects | set(df['aspect'].unique()))

        colab_data = []
        for text_info, group in df.groupby(self._META_COLUMNS):
            sample_id, text, original_text, domain = text_info

            record = {
                'sample_id': sample_id,
                'text': text,
                'original_text': original_text,
                'domain': domain
            }

            # Start every aspect at the "not mentioned" value ...
            record.update(dict.fromkeys(aspect_columns, absent_value))
            # ... then overwrite the aspects this review is labelled with.
            record.update(zip(group['aspect'], group['sentiment_numeric']))

            colab_data.append(record)

        return pd.DataFrame(colab_data)

print("VLSP2018 preprocessor loaded!")

## Process Raw VLSP2018 Data

In [None]:
# Initialize preprocessor
print("=== VIETNAMESE ABSA PREPROCESSING PIPELINE ===")
print("Processing VLSP 2018 dataset with Vietnamese-specific steps\n")

preprocessor = VLSP2018Preprocessor(
    apply_vietnamese_steps=True,
    apply_tone_normalization=True
)

# Define input files using downloaded raw data
input_files = {
    'hotel': {
        'train': 'raw_data/VLSP2018-SA-Hotel-train.txt',
        'dev': 'raw_data/VLSP2018-SA-Hotel-dev.txt',
        'test': 'raw_data/VLSP2018-SA-Hotel-test.txt'
    },
    'restaurant': {
        'train': 'raw_data/VLSP2018-SA-Restaurant-train.txt',
        'dev': 'raw_data/VLSP2018-SA-Restaurant-dev.txt',
        'test': 'raw_data/VLSP2018-SA-Restaurant-test.txt'
    }
}

# Create output directory
os.makedirs('data', exist_ok=True)

# Review-level (non-aspect) columns of the Colab format.
colab_meta_columns = ['sample_id', 'text', 'original_text', 'domain']

all_dataframes = {}

# Process each domain and split
for domain, splits in input_files.items():
    print(f"\n--- Processing {domain.upper()} domain ---")
    domain_dfs = {}

    for split, file_path in splits.items():
        print(f"Processing {split} set...")

        # Check if file exists
        if not os.path.exists(file_path):
            print(f"Warning: File not found: {file_path}")
            continue

        # Parse and process
        samples = preprocessor.parse_vlsp_file(file_path)
        print(f"Parsed {len(samples)} samples from {file_path}")

        # Process into DataFrame
        df = preprocessor.process_dataset(samples, domain)
        print(f"Created {len(df)} processed records")

        # Without records there is nothing to pivot: the Colab conversion
        # needs the 'aspect' column, which an empty result may lack.
        if df.empty:
            print(f"Warning: No records produced for {file_path}; skipping this split")
            continue

        # Save individual split
        output_path = f'data/{domain}_{split}_processed.csv'
        df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"Saved to {output_path}")

        domain_dfs[split] = {
            'standard': df,
            'colab': preprocessor.create_colab_compatible_format(df)
        }

    # Each split only gets columns for the aspects it happens to contain.
    # Give all splits of a domain the same aspect columns, so a model trained
    # on one split can be evaluated on another without realigning labels.
    domain_aspects = sorted({
        column
        for frames in domain_dfs.values()
        for column in frames['colab'].columns
        if column not in colab_meta_columns
    })

    for split, frames in domain_dfs.items():
        # Columns added here use 0, the same default the Colab format already
        # uses for aspects a review does not mention.
        colab_df = frames['colab'].reindex(
            columns=colab_meta_columns + domain_aspects, fill_value=0
        )
        frames['colab'] = colab_df

        colab_path = f'data/{domain}_{split}_colab_format.csv'
        colab_df.to_csv(colab_path, index=False, encoding='utf-8')
        print(f"Saved Colab format to {colab_path}")

    all_dataframes[domain] = domain_dfs

print("\n=== PREPROCESSING COMPLETE ===")

## Display Sample Results

In [None]:
# Summarise the counters the preprocessor collected while processing
print("=== PROCESSING STATISTICS ===")
print(f"Total processed samples: {preprocessor.stats['processed_samples']:,}")
print(f"Skipped samples: {preprocessor.stats['skipped_samples']:,}")

print("\nDomain distribution:")
for domain_name in preprocessor.stats['domains']:
    print(f"  {domain_name}: {preprocessor.stats['domains'][domain_name]:,} records")

print("\nSentiment distribution:")
for sentiment_name in sorted(preprocessor.stats['sentiments']):
    print(f"  {sentiment_name}: {preprocessor.stats['sentiments'][sentiment_name]:,} labels")

print("\nTop 10 aspects:")
sorted_aspects = sorted(preprocessor.stats['aspects'].items(), key=lambda x: x[1], reverse=True)
for aspect_name, occurrences in sorted_aspects[:10]:
    print(f"  {aspect_name}: {occurrences:,} occurrences")

# Preview the hotel training split, when it was produced
print("\n=== SAMPLE PROCESSED DATA ===")
if 'hotel' in all_dataframes and 'train' in all_dataframes['hotel']:
    hotel_train = all_dataframes['hotel']['train']

    sample_df = hotel_train['standard']
    print(f"\nHotel training data shape: {sample_df.shape}")
    print("\nFirst 3 records:")
    preview = sample_df[['text', 'aspect', 'sentiment']].head(3)
    print(preview.to_string(index=False))

    # Wide (Colab) format: every column that is not review metadata is an aspect
    colab_sample = hotel_train['colab']
    metadata_cols = ['sample_id', 'text', 'original_text', 'domain']
    all_aspect_cols = [col for col in colab_sample.columns if col not in metadata_cols]
    print(f"\nColab format shape: {colab_sample.shape}")
    print(f"Aspect columns: {len(all_aspect_cols)}")
    print("\nFirst record (showing first 5 aspect columns):")
    aspect_cols = all_aspect_cols[:5]
    display_cols = ['text'] + aspect_cols
    print(colab_sample[display_cols].head(1).to_string(index=False))

## Usage Instructions

In [None]:
# Create usage instructions
# NOTE: the file list below is static text; it is not checked against the
# files that were actually written to data/.
instructions = """
=== USAGE INSTRUCTIONS FOR PROCESSED DATA ===

The Vietnamese ABSA dataset has been successfully preprocessed and is now ready for use in your evaluation notebooks.

FILES CREATED:
1. Standard Format (for analysis):
   - hotel_train_processed.csv
   - hotel_dev_processed.csv
   - hotel_test_processed.csv
   - restaurant_train_processed.csv
   - restaurant_dev_processed.csv
   - restaurant_test_processed.csv

2. Multi-output Format (for ML models):
   - hotel_train_colab_format.csv
   - hotel_dev_colab_format.csv
   - hotel_test_colab_format.csv
   - restaurant_train_colab_format.csv
   - restaurant_dev_colab_format.csv
   - restaurant_test_colab_format.csv

NEXT STEPS:
1. Use NLP_Traditional_ML_Fixed.ipynb for traditional ML evaluation
2. Use PhoBERT_Fixed_Evaluation.ipynb for PhoBERT evaluation
3. Both notebooks will automatically load these processed files

PREPROCESSING APPLIED:
✓ Vietnamese text cleaning (HTML, emoji, URL removal)
✓ Vietnamese tone normalization (VinAI rules)
✓ Traditional preprocessing (lowercase, whitespace, short words)
✓ Multi-output format for aspect-based classification
✓ Proper sentiment mapping (positive: 1, negative: -1, neutral: 0)

The data is now ready for academic evaluation and comparison!
"""

print(instructions)

# Save instructions to file. The directory is created here as well, so this
# cell also works when it is run without the processing cell before it.
os.makedirs('data', exist_ok=True)
with open('data/preprocessing_instructions.txt', 'w', encoding='utf-8') as f:
    f.write(instructions)

print("\n✓ Instructions saved to: data/preprocessing_instructions.txt")