# 03 - Text Preprocessing

This notebook preprocesses the cleaned text data for topic modeling.

## Preprocessing Steps
- Text cleaning (case, punctuation, numbers)
- Tokenization
- Stopword removal (Indonesian + English)
- Stemming with PySastrawi (with performance tracking)
- Bigram/Trigram phrase detection
- Save processed corpus

In [None]:
# Import required libraries
import sys
import time
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# Put the parent of the current working directory on the import path so
# the project-local `src` package imported just below can be resolved.
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

from src.config import get_settings, ensure_directories
from src.preprocessor import IndonesianPreprocessor

In [None]:
# Resolve project settings, then let ensure_directories() run on them
# before any configured path is used.
settings = get_settings()
ensure_directories(settings)

# Location of the cleaned metadata CSV, assembled entirely from settings.
data_path = settings.processed_data_dir / settings.clean_metadata_file
print(f"Loading cleaned data from: {data_path}")

df = pd.read_csv(data_path)
record_count = len(df)
print(f"Loaded {record_count:,} records")

## 1. Initialize Preprocessor

In [None]:
# Configuration: switches read by the preprocessor-initialization cell.
USE_STEMMING = True  # Set to False to skip stemming (much faster)
USE_BIGRAMS = True
USE_TRIGRAMS = True

# Custom stopwords (domain-specific), passed to the preprocessor in addition
# to whatever stopword lists it loads itself. This is a set, so every term
# is listed exactly once.
CUSTOM_STOPWORDS = {
    # Academic terms that appear in most papers
    'penelitian', 'hasil', 'metode', 'data', 'analisis',
    'kesimpulan', 'saran', 'pembahasan', 'bab', 'tabel',
    'gambar', 'lampiran', 'daftar', 'pustaka', 'referensi',
    # English academic terms ('data' is already covered by the list above)
    'research', 'result', 'method', 'analysis',
    'conclusion', 'table', 'figure', 'chapter',
}

print("Configuration:")
print(f"  Stemming: {USE_STEMMING}")
print(f"  Bigrams: {USE_BIGRAMS}")
print(f"  Trigrams: {USE_TRIGRAMS}")
print(f"  Custom stopwords: {len(CUSTOM_STOPWORDS)}")

In [None]:
# Build the preprocessor from the switches and stopwords chosen in the
# configuration cell.
preprocessor_options = {
    'settings': settings,
    'custom_stopwords': CUSTOM_STOPWORDS,
    'use_stemming': USE_STEMMING,
    'use_bigrams': USE_BIGRAMS,
    'use_trigrams': USE_TRIGRAMS,
}
preprocessor = IndonesianPreprocessor(**preprocessor_options)

# Size of the stopword collection the preprocessor exposes after construction.
stopword_total = len(preprocessor.stopwords)
print(f"Total stopwords: {stopword_total}")

## 2. Test Preprocessing on Sample

In [None]:
# Test on a sample abstract.
# The draw is seeded so the same abstract is shown after every
# "Restart Kernel -> Run All"; change SAMPLE_SEED to inspect another one.
SAMPLE_SEED = 42

# Only abstracts longer than 200 characters are candidates. Rows with a
# missing abstract drop out because their length compares as False.
long_abstracts = df[df['abstract'].str.len() > 200]
if long_abstracts.empty:
    raise ValueError("No abstract longer than 200 characters is available to sample.")

sample_idx = long_abstracts.sample(1, random_state=SAMPLE_SEED).index[0]
sample_text = df.loc[sample_idx, 'abstract']

print("Sample Abstract:")
print("-" * 60)
# Show at most the first 500 characters, marking truncation with "...".
print((sample_text[:500] + "...") if len(sample_text) > 500 else sample_text)

In [None]:
# Preprocess the sample abstract and time the call.
# perf_counter() is a monotonic clock, so the measured duration is not
# affected by system clock adjustments.
# NOTE(review): apply_phrases=False presumably because no phrase model has
# been fitted at this point in the notebook; confirm against the preprocessor.
start_time = time.perf_counter()
sample_tokens = preprocessor.preprocess_text(sample_text, apply_phrases=False)
elapsed = time.perf_counter() - start_time

print(f"\nPreprocessed tokens ({len(sample_tokens)} tokens, {elapsed:.3f}s):")
print("-" * 60)
# Show only the first 30 tokens and summarize the remainder.
print(sample_tokens[:30])
if len(sample_tokens) > 30:
    print(f"... and {len(sample_tokens) - 30} more")

## 3. Process All Documents

In [None]:
# Pull the abstracts out as a plain list; missing values become empty
# strings so every entry is text.
abstract_series = df['abstract'].fillna('')
abstracts = abstract_series.tolist()

print(f"Processing {len(abstracts):,} documents...")
print("This may take a while, especially with stemming enabled.")
print("-" * 60)

In [None]:
# Process all documents and time the whole run.
# perf_counter() is a monotonic clock, so the reported duration is not
# affected by system clock adjustments during a long run.
start_time = time.perf_counter()

# Phrase fitting is requested whenever bigrams or trigrams are enabled.
processed_docs = preprocessor.preprocess_documents(
    abstracts,
    fit_phrases=USE_BIGRAMS or USE_TRIGRAMS,
    show_progress=True,
)

total_time = time.perf_counter() - start_time
print(f"\n‚úÖ Preprocessing complete in {total_time:.1f}s")

In [None]:
# Report the counters collected by the preprocessor.
stats = preprocessor.stats

report_lines = [
    "\nPreprocessing Statistics:",
    "=" * 50,
    f"Total documents:      {stats.total_documents:,}",
    f"Processed documents:  {stats.processed_documents:,}",
    f"Skipped documents:    {stats.skipped_documents:,}",
    f"Total tokens:         {stats.total_tokens:,}",
    f"Unique tokens:        {stats.unique_tokens:,}",
    f"Avg tokens/doc:       {stats.avg_tokens_per_doc:.1f}",
    f"Total time:           {stats.total_time_seconds:.1f}s",
]

# The stemming line is shown only when some stemming time was recorded;
# it also gives stemming as a percentage of the total time.
if stats.stemming_time_seconds > 0:
    report_lines.append(
        f"Stemming time:        {stats.stemming_time_seconds:.1f}s ({stats.stemming_time_seconds/stats.total_time_seconds*100:.1f}%)"
    )

for report_line in report_lines:
    print(report_line)

## 4. Analyze Processed Corpus

In [None]:
# Histogram of document lengths (tokens per processed document), with the
# median marked by a dashed red line.
token_counts = list(map(len, processed_docs))
median_tokens = np.median(token_counts)

fig, ax = plt.subplots(figsize=(12, 5))
ax.hist(token_counts, bins=50, edgecolor='white', alpha=0.7)
ax.axvline(
    median_tokens,
    color='red',
    linestyle='--',
    label=f'Median: {median_tokens:.0f}',
)
ax.set(xlabel='Tokens per Document', ylabel='Frequency')
ax.set_title('Document Length Distribution (After Preprocessing)', fontweight='bold')
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Flatten the corpus into one token list and count occurrences.
all_tokens = []
for doc in processed_docs:
    all_tokens.extend(doc)
token_freq = Counter(all_tokens)

print("\nTop 30 tokens after preprocessing:")
print("-" * 50)
for token, count in token_freq.most_common(30):
    print(f"  {count:6d}: {token}")

In [None]:
# Treat every token that contains an underscore as a detected phrase.
# NOTE(review): assumes the phrase model joins words with '_'; confirm.
phrases = [token for token in token_freq if '_' in token]
print(f"\nDetected phrases (bigrams/trigrams): {len(phrases)}")

if phrases:
    print("\nTop 20 phrases:")
    phrase_counts = {phrase: token_freq[phrase] for phrase in phrases}
    # Highest count first; ties keep their original relative order.
    top_phrases = sorted(phrase_counts.items(), key=lambda item: item[1], reverse=True)[:20]
    for phrase, count in top_phrases:
        print(f"  {count:5d}: {phrase}")

## 5. Create DataFrame with Tokens

In [None]:
# Filter original dataframe to match processed documents
# (some documents may have been skipped due to short length)
#
# NOTE(review): this re-runs preprocess_text() on every abstract, repeating
# the per-document work of the main run (including stemming when enabled).
# It presumably mirrors the skip rule used inside preprocess_documents();
# confirm, and prefer having the preprocessor report the kept positions.
valid_indices = [
    position
    for position, abstract in enumerate(abstracts)
    if len(preprocessor.preprocess_text(abstract, apply_phrases=False)) >= settings.min_doc_length
]

# The token lists are attached positionally below, so the number of kept
# rows must equal the number of processed documents. Fail with an explicit
# message instead of an opaque pandas length-mismatch error.
if len(valid_indices) != len(processed_docs):
    raise ValueError(
        f"Cannot align dataframe with corpus: {len(valid_indices):,} abstracts pass the "
        f"min_doc_length filter but {len(processed_docs):,} documents were processed."
    )

# Create result dataframe: kept rows in original order, index renumbered from 0.
df_processed = df.iloc[valid_indices].copy().reset_index(drop=True)
df_processed['tokens'] = processed_docs
df_processed['token_count'] = [len(doc) for doc in processed_docs]

print(f"Processed dataframe: {len(df_processed):,} records")

In [None]:
# Show the first rows of the columns that matter for a quick sanity check.
preview_columns = ['title', 'token_count', 'tokens']
df_processed[preview_columns].head()

## 6. Save Processed Data

In [None]:
# Bundle the token lists, the aligned dataframe and the run statistics into
# one pickle file whose location comes from settings.
corpus_path = settings.processed_data_dir / settings.processed_corpus_file

corpus_data = dict(
    documents=processed_docs,
    dataframe=df_processed,
    stats=preprocessor.stats,
)

with open(corpus_path, mode='wb') as corpus_file:
    pickle.dump(corpus_data, corpus_file)

print(f"‚úÖ Saved processed corpus to: {corpus_path}")

In [None]:
# Persist the preprocessor itself so new text can later be preprocessed
# the same way as this corpus.
preprocessor_filename = 'preprocessor.pkl'
preprocessor_path = settings.processed_data_dir / preprocessor_filename
preprocessor.save(preprocessor_path)
print(f"‚úÖ Saved preprocessor to: {preprocessor_path}")

In [None]:
# Write a human-readable CSV copy of the metadata; the token lists are
# dropped and the dataframe index is not written.
csv_path = settings.processed_data_dir / 'processed_metadata.csv'
metadata_only = df_processed.drop(columns=['tokens'])
metadata_only.to_csv(csv_path, index=False)
print(f"‚úÖ Saved metadata CSV to: {csv_path}")

## Summary

In [None]:
# Final recap: headline numbers, the files written, and the next notebook.
banner = "=" * 60
print("\n" + banner)
print("PREPROCESSING COMPLETE")
print(banner)

print(f"\nüìä Documents processed: {len(processed_docs):,}")
print(f"üìù Unique tokens: {stats.unique_tokens:,}")
print(f"üìà Avg tokens/doc: {stats.avg_tokens_per_doc:.1f}")
print(f"‚è±Ô∏è  Total time: {stats.total_time_seconds:.1f}s")

print("\nüìÅ Output files:")
for output_path in (corpus_path, preprocessor_path, csv_path):
    print(f"   - {output_path}")

print("\nüëâ Next: Run 04_lda_modeling.ipynb to train the LDA model")