# Yelp Dataset Processing - Full Dataset
## Xử lý toàn bộ dữ liệu với Batch Processing

**Cấu hình:**
- Batch size: 50,000 records
- Train/Test split: 80/20 (stratified)
- Remove duplicates & null reviews
- Progress tracking với estimated time

## 0. Cài đặt thư viện cần thiết

In [1]:
# Install tqdm if it is not already installed
!pip install tqdm -q

In [2]:
import json
import pandas as pd
import numpy as np
from datetime import datetime
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import psutil
import time

print("✅ Libraries imported successfully!")
print(f"📅 Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

✅ Libraries imported successfully!
📅 Start time: 2025-10-12 15:12:37


## 1. Cấu hình

In [3]:
# ⚙️ CONFIGURATION - adjust the paths below to match your environment
DATA_PATH = "Yelp/yelp_dataset/"
OUTPUT_DIR = "processed_data/"

# Input files, keyed by dataset name; each path is DATA_PATH + file name.
FILE_PATHS = dict(
    business=f"{DATA_PATH}yelp_academic_dataset_business.json",
    review=f"{DATA_PATH}yelp_academic_dataset_review.json",
    user=f"{DATA_PATH}yelp_academic_dataset_user.json",
)

# Number of records per batch when streaming the JSON Lines files
BATCH_SIZE = 50_000

# Train/test split proportions and the seed used for the split
TRAIN_SIZE, TEST_SIZE = 0.8, 0.2
RANDOM_STATE = 42

# Make sure the output directory exists before any cell writes to it
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the effective configuration
config_report = (
    f"📂 Data path: {DATA_PATH}",
    f"📂 Output directory: {OUTPUT_DIR}",
    f"📊 Batch size: {BATCH_SIZE:,}",
    f"✂️ Train/Test split: {TRAIN_SIZE:.0%}/{TEST_SIZE:.0%}",
)
for report_line in config_report:
    print(report_line)

📂 Data path: Yelp/yelp_dataset/
📂 Output directory: processed_data/
📊 Batch size: 50,000
✂️ Train/Test split: 80%/20%


## 2. Helper Functions

In [4]:
def get_memory_usage():
    """Return the resident set size (RSS) of this process in MB.

    "MB" here means bytes / 1024 / 1024.
    """
    rss_bytes = psutil.Process().memory_info().rss
    rss_kb = rss_bytes / 1024
    return rss_kb / 1024

def count_lines(filepath):
    """Return the number of lines in a UTF-8 text file.

    Used to size the progress bar before a file is streamed. Prints a short
    notice first, showing only the part of ``filepath`` after the last '/'.
    """
    short_name = filepath.rsplit('/', 1)[-1]
    print(f"Counting lines in {short_name}...")

    line_total = 0
    with open(filepath, 'r', encoding='utf-8') as handle:
        for _ in handle:
            line_total += 1
    return line_total

def load_json_batch(filepath, batch_size=50000, show_progress=True):
    """
    Stream a JSON Lines file as a sequence of DataFrame batches.

    Every non-blank line is parsed with ``json.loads``. Parsed objects are
    collected and yielded as a ``pandas.DataFrame`` once ``batch_size`` of
    them have accumulated; the last batch may be smaller. Blank lines are
    skipped, and lines that are not valid JSON are counted as errors and
    skipped.

    Args:
        filepath: Path to the JSON Lines file (one JSON object per line).
        batch_size: Maximum number of records per yielded DataFrame.
        show_progress: When True (default), count the file's lines up front
            and show a tqdm progress bar. When False, the bar is disabled and
            the extra line-counting pass over the file is skipped.

    Yields:
        pandas.DataFrame: one frame per batch of parsed records.

    A summary (records loaded, elapsed time, error count, memory usage) is
    printed once the file has been fully consumed.
    """
    filename = os.path.basename(filepath)
    # Counting lines costs a full extra read of the file, so only do it when
    # the result is actually needed to size the progress bar.
    total_lines = count_lines(filepath) if show_progress else None

    batch_data = []
    processed = 0
    errors = 0
    start_time = time.time()

    # Both the file and the bar are context-managed so they are closed even
    # if the consumer stops iterating early or an exception is raised.
    with open(filepath, 'r', encoding='utf-8') as f, tqdm(
        total=total_lines,
        desc=f"Loading {filename}",
        unit=" lines",
        disable=not show_progress,
    ) as pbar:
        for line in f:
            # Every physical line counts towards progress, including blank
            # and malformed ones.
            pbar.update(1)

            line = line.strip()
            if not line:
                continue

            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                errors += 1
                continue

            batch_data.append(obj)
            processed += 1

            # Yield batch when size reached
            if len(batch_data) >= batch_size:
                yield pd.DataFrame(batch_data)
                batch_data = []

                # Show memory usage
                pbar.set_postfix({
                    'RAM': f'{get_memory_usage():.0f}MB',
                    'Errors': errors
                })

        # Yield remaining data
        if batch_data:
            yield pd.DataFrame(batch_data)

        # Refresh once more so the bar's final error count matches the
        # summary below (it is otherwise only updated at batch boundaries).
        pbar.set_postfix({
            'RAM': f'{get_memory_usage():.0f}MB',
            'Errors': errors
        })

    elapsed = time.time() - start_time
    print(f"✅ Loaded {processed:,} records in {elapsed:.1f}s ({errors} errors)")
    print(f"💾 Memory usage: {get_memory_usage():.0f} MB\n")

print("✅ Helper functions defined!")

✅ Helper functions defined!


## 3. Process Business Data

In [5]:
print("="*80)
print("🏢 PROCESSING BUSINESS DATA")
print("="*80)

output_file = OUTPUT_DIR + 'processed_business.csv'
first_batch = True
total_records = 0

for batch_df in load_json_batch(FILE_PATHS['business'], BATCH_SIZE):
    # The first batch overwrites the file and writes the header; later
    # batches append. Appending unconditionally would stack a second copy of
    # the data (plus a stray header row) onto the CSV left by a previous run
    # of this cell.
    batch_df.to_csv(
        output_file,
        mode='w' if first_batch else 'a',
        header=first_batch,
        index=False
    )
    first_batch = False
    total_records += len(batch_df)

print(f"\n✅ Business data processed: {total_records:,} records")
print(f"📁 Saved to: {output_file}\n")

🏢 PROCESSING BUSINESS DATA
Counting lines in yelp_academic_dataset_business.json...


Loading yelp_academic_dataset_business.json: 100%|██████████| 150348/150348 [00:03<00:00, 40112.53 lines/s, RAM=539MB, Errors=1]

✅ Loaded 150,346 records in 3.8s (2 errors)
💾 Memory usage: 234 MB


✅ Business data processed: 150,346 records
📁 Saved to: processed_data/processed_business.csv






## 4. Process Review Data (với labels: 0 = Tiêu cực [1–2 sao], 2 = Trung lập [3 sao], 1 = Tích cực [4–5 sao])

In [None]:
print("="*80)
print("📝 PROCESSING REVIEW DATA")
print("="*80)

output_file = OUTPUT_DIR + 'processed_review.csv'
first_batch = True
total_records = 0
null_removed = 0

# Columns kept in the processed review file, in output order
review_cols = ['review_id', 'user_id', 'business_id',
               'stars', 'date', 'text', 'useful', 'label']

for batch_df in load_json_batch(FILE_PATHS['review'], BATCH_SIZE):
    # Create sentiment labels from the star rating:
    #   stars <= 2 -> 0 (negative), stars == 3 -> 2 (neutral), otherwise 1
    # np.select evaluates the whole column at once instead of calling a
    # Python lambda per row; the resulting labels are the same.
    stars = batch_df['stars']
    batch_df['label'] = np.select([stars <= 2, stars == 3], [0, 2], default=1)

    # Remove rows whose review text is null, and keep count of them
    before = len(batch_df)
    batch_df = batch_df.dropna(subset=['text'])
    null_removed += (before - len(batch_df))

    # Select required columns
    batch_df = batch_df[review_cols]

    # The first batch overwrites the file and writes the header; later
    # batches append. Appending unconditionally would duplicate every review
    # whenever this cell is re-run against an existing CSV.
    batch_df.to_csv(
        output_file,
        mode='w' if first_batch else 'a',
        header=first_batch,
        index=False
    )
    first_batch = False
    total_records += len(batch_df)

print(f"\n✅ Review data processed: {total_records:,} records")
print(f"🗑️ Null reviews removed: {null_removed:,}")
print(f"📁 Saved to: {output_file}\n")

📝 PROCESSING REVIEW DATA
Counting lines in yelp_academic_dataset_review.json...


## 5. Process User Data

In [None]:
print("="*80)
print("👥 PROCESSING USER DATA")
print("="*80)

output_file = OUTPUT_DIR + 'processed_user.csv'
first_batch = True
total_records = 0

# Columns kept in the processed user file, in output order
user_cols = ['user_id', 'name', 'review_count', 'since',
             'useful', 'fans', 'average_stars']

for batch_df in load_json_batch(FILE_PATHS['user'], BATCH_SIZE):
    # Expose yelping_since under the shorter name 'since'
    if 'yelping_since' in batch_df.columns:
        batch_df['since'] = batch_df['yelping_since']

    # Keep only the required columns that this batch actually contains
    available_cols = [col for col in user_cols if col in batch_df.columns]
    batch_df = batch_df[available_cols]

    # The first batch overwrites the file and writes the header; later
    # batches append. Appending unconditionally would duplicate every user
    # whenever this cell is re-run against an existing CSV.
    batch_df.to_csv(
        output_file,
        mode='w' if first_batch else 'a',
        header=first_batch,
        index=False
    )
    first_batch = False
    total_records += len(batch_df)

print(f"\n✅ User data processed: {total_records:,} records")
print(f"📁 Saved to: {output_file}\n")

## 6. Create Training Dataset & Remove Duplicates

In [None]:
print("="*80)
print("🎯 CREATING TRAINING DATASET")
print("="*80)

print("Loading processed review data...")
# Only the two columns needed for training are parsed, which keeps the frame
# far smaller than the full review table. Note that the file is still read in
# one go, not in chunks.
df_training = pd.read_csv(
    OUTPUT_DIR + 'processed_review.csv',
    usecols=['text', 'label']
)

# Rename for training format. This is done by column name rather than by
# position: read_csv returns the selected columns in file order, not in the
# order given to usecols, so a positional rename would silently swap the two
# columns if the file layout ever changed.
df_training = df_training.rename(columns={'text': 'review'})[['review', 'label']]

print(f"📊 Initial records: {len(df_training):,}")

# An empty review string is written to CSV as an empty field, which read_csv
# loads back as NaN. Drop those rows so no null review reaches train/test.
before_empty = len(df_training)
df_training = df_training.dropna(subset=['review'])
empty_removed = before_empty - len(df_training)
print(f"🗑️ Empty reviews removed: {empty_removed:,}")

# Remove duplicates (rows sharing the same review text; the first is kept)
print("\n🗑️ Removing duplicates...")
before_dup = len(df_training)
df_training = df_training.drop_duplicates(subset=['review'])
duplicates_removed = before_dup - len(df_training)

print(f"✅ After removing duplicates: {len(df_training):,} records")
print(f"🗑️ Duplicates removed: {duplicates_removed:,}")

# Show label distribution
print("\n📊 Label Distribution:")
label_names = {0: 'Tiêu cực', 1: 'Tích cực', 2: 'Trung lập'}
label_dist = df_training['label'].value_counts().sort_index()
for label, count in label_dist.items():
    label_name = label_names[label]
    pct = count / len(df_training) * 100
    print(f"   {label_name} ({label}): {count:,} ({pct:.1f}%)")

print(f"\n💾 Memory usage: {get_memory_usage():.0f} MB")

## 7. Train/Test Split (Stratified)

In [None]:
print("\n" + "="*80)
print("✂️ TRAIN/TEST SPLIT (STRATIFIED)")
print("="*80)

# Split with stratification on the label column so that both parts keep the
# label proportions of the full dataset.
train_df, test_df = train_test_split(
    df_training,
    stratify=df_training['label'],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

overall_count = len(df_training)
print("\n📊 Split Summary:")
print(f"   Train set: {len(train_df):,} ({len(train_df)/overall_count*100:.1f}%)")
print(f"   Test set: {len(test_df):,} ({len(test_df)/overall_count*100:.1f}%)")

# Report the label distribution of each part, train first and then test, to
# verify that stratification held.
label_names = {0: 'Tiêu cực', 1: 'Tích cực', 2: 'Trung lập'}
distribution_reports = (
    ("\n📈 Train Set Label Distribution:", train_df),
    ("\n📈 Test Set Label Distribution:", test_df),
)
for heading, subset in distribution_reports:
    print(heading)
    per_label = subset['label'].value_counts().sort_index()
    for label, count in per_label.items():
        share = count / len(subset) * 100
        print(f"   {label_names[label]} ({label}): {count:,} ({share:.1f}%)")

## 8. Export Train/Test Files

In [None]:
print("\n" + "="*80)
print("💾 EXPORTING TRAIN/TEST FILES")
print("="*80)

train_file = OUTPUT_DIR + 'train.csv'
test_file = OUTPUT_DIR + 'test.csv'

# Each entry: (message prefix, frame to write, destination path).
# The train set is written and reported first, then the test set.
export_plan = (
    ("✅ Train set exported: ", train_df, train_file),
    ("\n✅ Test set exported: ", test_df, test_file),
)
for banner, frame, destination in export_plan:
    frame.to_csv(destination, index=False)
    print(f"{banner}{destination}")
    print(f"   Records: {len(frame):,}")
    # Size on disk in MB (bytes / 1024 / 1024)
    size_mb = os.path.getsize(destination) / 1024 / 1024
    print(f"   Size: {size_mb:.1f} MB")

## 9. Final Summary Report

In [None]:
print("\n" + "="*80)
print("📊 FINAL SUMMARY REPORT")
print("="*80)


def _count_csv_rows(path, chunksize=500_000):
    """Count the data rows of a CSV file without loading it all into memory.

    Only the first column is parsed, ``chunksize`` rows at a time. The CSV
    parser is still used (rather than a raw line count) so that quoted fields
    containing newlines are counted as a single row.
    """
    reader = pd.read_csv(path, usecols=[0], chunksize=chunksize)
    return sum(len(chunk) for chunk in reader)


# Calculate file sizes in MB (bytes / 1024 / 1024)
business_size = os.path.getsize(OUTPUT_DIR + 'processed_business.csv') / 1024 / 1024
review_size = os.path.getsize(OUTPUT_DIR + 'processed_review.csv') / 1024 / 1024
user_size = os.path.getsize(OUTPUT_DIR + 'processed_user.csv') / 1024 / 1024
train_size = os.path.getsize(OUTPUT_DIR + 'train.csv') / 1024 / 1024
test_size = os.path.getsize(OUTPUT_DIR + 'test.csv') / 1024 / 1024

print(f"\n📁 Output Files:")
print(f"   📄 processed_business.csv - {business_size:.1f} MB")
print(f"   📄 processed_review.csv - {review_size:.1f} MB")
print(f"   📄 processed_user.csv - {user_size:.1f} MB")
print(f"   📄 train.csv - {train_size:.1f} MB")
print(f"   📄 test.csv - {test_size:.1f} MB")
print(f"   📂 Total size: {business_size + review_size + user_size + train_size + test_size:.1f} MB")

# Row counts are streamed from disk: loading each full CSV just to read its
# shape would pull the entire review file into memory.
print(f"\n📊 Record Counts:")
print(f"   Business: {_count_csv_rows(OUTPUT_DIR + 'processed_business.csv'):,}")
print(f"   Reviews: {_count_csv_rows(OUTPUT_DIR + 'processed_review.csv'):,}")
print(f"   Users: {_count_csv_rows(OUTPUT_DIR + 'processed_user.csv'):,}")
print(f"   Train: {len(train_df):,}")
print(f"   Test: {len(test_df):,}")

print(f"\n🗑️ Data Cleaning:")
print(f"   Null reviews removed: {null_removed:,}")
print(f"   Duplicates removed: {duplicates_removed:,}")

# get_memory_usage() reports the current RSS, not a high-water mark, so the
# line is labelled accordingly.
print(f"\n💾 Current memory usage: {get_memory_usage():.0f} MB")
print(f"📅 Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

print("\n" + "="*80)
print("🎉 PROCESSING COMPLETE!")
print("="*80)
print(f"\n✅ All files saved to: {OUTPUT_DIR}")
print("\n📚 Next steps:")
print("   1. Use train.csv for model training")
print("   2. Use test.csv for model evaluation")
print("   3. Labels: 0=Tiêu cực, 1=Tích cực, 2=Trung lập")

## 10. Quick Data Preview

In [None]:
# Print the first 10 rows of the train set, then of the test set.
previews = (
    ("📋 TRAIN DATA PREVIEW:\n", train_df),
    ("\n📋 TEST DATA PREVIEW:\n", test_df),
)
for title, frame in previews:
    print(title)
    print(frame.head(10))