# BBC Text Representations - Setup & Preprocessing

**Roll Number:** SE22UARI195

**Tasks:**
1. Create master.csv with stratified 5-fold splits
2. Generate deterministic train/dev/test split from roll number
3. Build preprocessing pipeline
4. Save processed data to cache

---

## 1. Setup & Imports

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import pickle
import os
import re
import zlib
from pathlib import Path

# Preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Sklearn
from sklearn.model_selection import StratifiedKFold

# Progress bar
from tqdm.notebook import tqdm
tqdm.pandas()

print("‚úÖ Imports successful!")

In [None]:
# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize looks up on recent
# NLTK releases (it superseded the pickled 'punkt' models); 'punkt' is kept so
# older installations keep working.
nltk_resources = ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4')

# nltk.download() reports failure through its return value instead of raising,
# so collect the failures and surface them rather than claiming success.
failed_resources = [
    resource for resource in nltk_resources
    if not nltk.download(resource, quiet=True)
]

if failed_resources:
    print(f"⚠️  Could not download NLTK resources: {failed_resources}")
else:
    print("✅ NLTK data downloaded!")

In [None]:
# Configuration
# Roll number drives the train/dev/test fold assignment; the seed pins the
# shuffling used when the 5 stratified folds are created.
ROLL = "SE22UARI195"
SEED = 137

# Locations of the raw dataset, the generated master file and the cache
DATA_DIR, CACHE_DIR = Path("../data"), Path("../cache")
SRC_FILE = DATA_DIR.joinpath("bbc-text.csv")
MASTER_FILE = DATA_DIR.joinpath("master.csv")

# Make sure both working directories are present before anything is written
for directory in (DATA_DIR, CACHE_DIR):
    directory.mkdir(exist_ok=True)

print(
    f"Roll Number: {ROLL}",
    f"Data Directory: {DATA_DIR}",
    f"Cache Directory: {CACHE_DIR}",
    sep="\n",
)

## 2. Create Master CSV with 5-Fold Splits

In [None]:
# Load master.csv if it was already built; otherwise build it from the raw
# BBC CSV (columns "text" and "category") and assign 5 stratified folds.
if MASTER_FILE.exists():
    print("⚠️  master.csv already exists. Loading existing file...")
    df = pd.read_csv(MASTER_FILE)
    print(f"Loaded {len(df)} documents from master.csv")
else:
    print("Creating master.csv...")

    # Fail fast: printing and carrying on would leave `df` undefined and make
    # every later cell die with an unrelated NameError.
    if not SRC_FILE.exists():
        raise FileNotFoundError(
            f"{SRC_FILE} not found! "
            "Please place 'bbc-text.csv' in the data/ folder."
        )

    df = pd.read_csv(SRC_FILE)
    print(f"✅ Loaded {len(df)} documents from bbc-text.csv")

    # Rename category to label and keep only the two columns we need
    df = df.rename(columns={"category": "label"})[["text", "label"]].copy()

    # Add sequential IDs (bbc_00000, bbc_00001, ...)
    df["id"] = [f"bbc_{i:05d}" for i in range(len(df))]

    # Create 5 stratified folds; each row is labelled with the index of the
    # fold in which it appears as validation data.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    folds = np.zeros(len(df), dtype=int)
    for fold_num, (_, val_idx) in enumerate(skf.split(df["text"], df["label"])):
        folds[val_idx] = fold_num
    df["fold5"] = folds

    # Reorder columns
    df = df[["id", "text", "label", "fold5"]]

    # Validate BEFORE writing so a broken table is never persisted (a bad
    # master.csv would otherwise be silently reloaded on the next run).
    if not df["id"].is_unique:
        raise ValueError("IDs are not unique!")
    if not df["fold5"].between(0, 4).all():
        raise ValueError("Folds not in range 0-4!")
    print("✅ Validation passed!")

    # Save master.csv
    df.to_csv(MASTER_FILE, index=False, encoding="utf-8")
    print(f"\n✅ Saved master.csv with {len(df)} documents")

In [None]:
# Display basic statistics: corpus size, documents per class, documents per fold
print("\n📊 Dataset Statistics:")
print(f"Total documents: {len(df)}")
print("\nClass distribution:")
print(df["label"].value_counts())
print("\nFold distribution:")
# sort_index() lists the folds in order 0..4 instead of by frequency
print(df["fold5"].value_counts().sort_index())

In [None]:
# Show sample documents (the trailing expression renders the first rows
# as the cell output)
print("\n📄 Sample Documents:")
df.head()

## 3. Generate Train/Dev/Test Split from Roll Number

The split is **deterministic**: the dev and test folds are derived from a CRC32 hash of your roll number.

In [None]:
# Calculate dev and test folds from the roll number.
# CRC32 of the roll string gives a stable integer; its value modulo 5 selects
# the dev fold and the next base-5 digit selects the test fold.
r = zlib.crc32(ROLL.encode())
dev_fold = r % 5
test_fold = (r // 5) % 5

# Ensure dev and test folds are different: on a collision move test to the
# next fold (wrapping around at 5)
if test_fold == dev_fold:
    test_fold = (test_fold + 1) % 5

# Every fold that is neither dev nor test is used for training
train_folds = [fold for fold in range(5) if fold not in (dev_fold, test_fold)]

print(f"🎲 Roll Number: {ROLL}")
print(f"🎲 CRC32 Hash: {r}")
print("\n📊 Fold Assignment:")
print(f"  DEV fold:  {dev_fold}")
print(f"  TEST fold: {test_fold}")
print(f"  TRAIN folds: {train_folds}")

In [None]:
# Split the data by fold: one fold for DEV, one for TEST, the rest for TRAIN.
# .copy() detaches each split from df so later column additions do not touch
# (or warn about) the master frame.
DEV = df[df.fold5 == dev_fold].copy()
TEST = df[df.fold5 == test_fold].copy()
TRAIN = df[~df.fold5.isin([dev_fold, test_fold])].copy()

print("\n📈 Split Sizes:")
print(f"  TRAIN: {len(TRAIN)} documents ({len(TRAIN)/len(df)*100:.1f}%)")
print(f"  DEV:   {len(DEV)} documents ({len(DEV)/len(df)*100:.1f}%)")
print(f"  TEST:  {len(TEST)} documents ({len(TEST)/len(df)*100:.1f}%)")
print(f"  TOTAL: {len(TRAIN) + len(DEV) + len(TEST)} documents")

# Verify no document id appears in more than one split
train_ids, dev_ids, test_ids = set(TRAIN.id), set(DEV.id), set(TEST.id)
assert train_ids.isdisjoint(dev_ids), "TRAIN and DEV overlap!"
assert train_ids.isdisjoint(test_ids), "TRAIN and TEST overlap!"
assert dev_ids.isdisjoint(test_ids), "DEV and TEST overlap!"
print("\n✅ No overlap between splits!")

In [None]:
# Check class distribution in each split (label counts for TRAIN, DEV, TEST)
print("\n📊 Class Distribution Across Splits:")
for split_label, split_frame in (("TRAIN", TRAIN), ("DEV", DEV), ("TEST", TEST)):
    print(f"\n{split_label}:")
    print(split_frame["label"].value_counts())

## 4. Text Preprocessing Pipeline

Steps:
1. Lowercase
2. Remove punctuation
3. Normalize whitespace
4. Tokenize
5. Remove stopwords
6. Lemmatize
7. Drop single-character and digit-only tokens

In [None]:
# Initialize preprocessing tools.
# The stopword list is stored as a set so the per-token membership test in
# preprocess_text is a constant-time lookup.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

print(f"Stopwords loaded: {len(stop_words)} words")
# A set has no stable order, so sort before slicing to print the same sample
# on every run.
print(f"Sample stopwords: {sorted(stop_words)[:10]}")

In [None]:
def preprocess_text(text, remove_stopwords=True, lemmatize=True):
    """
    Preprocess a single text document.

    Missing values (None, or a float NaN as pandas produces for an empty CSV
    cell) are treated as an empty document and any other non-string input is
    converted with str(), so the function does not fail on ``.lower()``.

    Args:
        text: Input text string
        remove_stopwords: Whether to remove stopwords
        lemmatize: Whether to lemmatize tokens

    Returns:
        Dictionary with:
        - 'raw': the input exactly as it was passed in
        - 'tokens': list of processed tokens
        - 'text': space-joined processed tokens
    """
    # Store original (untouched, even when it is not a string)
    raw_text = text

    # 0. Coerce missing / non-string input to a string
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = "" if is_missing else str(text)

    # 1. Lowercase
    text = text.lower()

    # 2. Remove punctuation (keep only ASCII letters, digits and whitespace;
    #    anything else, including accented letters, becomes a space)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)

    # 3. Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # 4. Tokenize
    tokens = word_tokenize(text)

    # 5. Remove stopwords (optional)
    if remove_stopwords:
        tokens = [t for t in tokens if t not in stop_words]

    # 6. Lemmatize (optional)
    if lemmatize:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]

    # 7. Drop single-character tokens and digit-only tokens
    tokens = [t for t in tokens if len(t) > 1 and not t.isdigit()]

    return {
        'raw': raw_text,
        'tokens': tokens,
        'text': ' '.join(tokens)
    }

print("‚úÖ Preprocessing function defined!")

In [None]:
# Test preprocessing on a sample document (the first TRAIN row) and show the
# text before and after the pipeline
sample_text = TRAIN.iloc[0]['text']
print("📄 Original Text (first 300 chars):")
print(sample_text[:300] + "...\n")

processed = preprocess_text(sample_text)
print("\n🔧 Processed Tokens (first 30):")
print(processed['tokens'][:30])
print(f"\nTotal tokens: {len(processed['tokens'])}")

print("\n📝 Processed Text (first 300 chars):")
print(processed['text'][:300] + "...")

## 5. Process All Splits and Save to Cache

In [None]:
def process_split(split_df, split_name):
    """
    Process all documents in a split.

    Runs preprocess_text on every value of the 'text' column (with a progress
    bar; this relies on tqdm.pandas() having been called) and prints summary
    token statistics.

    Args:
        split_df: DataFrame with a 'text' column; it is not modified
        split_name: Label used in the printed messages (e.g. "TRAIN")

    Returns:
        A copy of split_df with four extra columns: 'text_raw', 'tokens',
        'text_processed' and 'token_count'.
    """
    print(f"\n🔧 Processing {split_name} split ({len(split_df)} documents)...")

    # Apply preprocessing: one result dict per document
    processed = split_df['text'].progress_apply(preprocess_text)

    # Unpack each key of the result dicts into its own column of a new frame
    result_df = split_df.copy()
    column_to_key = {
        'text_raw': 'raw',
        'tokens': 'tokens',
        'text_processed': 'text',
    }
    for column, key in column_to_key.items():
        # key=key binds the current key; a bare closure would see the last one
        result_df[column] = processed.apply(lambda doc, key=key: doc[key])
    result_df['token_count'] = result_df['tokens'].apply(len)

    # Statistics
    token_counts = result_df['token_count']
    print(f"\n📊 {split_name} Statistics:")
    print(f"  Total documents: {len(result_df)}")
    print(f"  Total tokens: {token_counts.sum():,}")
    print(f"  Avg tokens/doc: {token_counts.mean():.1f}")
    print(f"  Min tokens: {token_counts.min()}")
    print(f"  Max tokens: {token_counts.max()}")

    return result_df

print("‚úÖ Processing function defined!")

In [None]:
# Process TRAIN split: process_split returns a copy of TRAIN with the
# text_raw, tokens, text_processed and token_count columns added.
train_processed = process_split(TRAIN, "TRAIN")

In [None]:
# Process DEV split: process_split returns a copy of DEV with the
# text_raw, tokens, text_processed and token_count columns added.
dev_processed = process_split(DEV, "DEV")

In [None]:
# Process TEST split: process_split returns a copy of TEST with the
# text_raw, tokens, text_processed and token_count columns added.
test_processed = process_split(TEST, "TEST")

In [None]:
# Build vocabulary from TRAIN only, so no DEV/TEST tokens leak into it
from collections import Counter

print("\n📚 Building vocabulary from TRAIN split...")

# Flatten all per-document token lists into one list (kept because its length
# is reported again in the metadata cell)
all_train_tokens = [
    token
    for doc_tokens in train_processed['tokens']
    for token in doc_tokens
]

# Count frequencies
vocab_counter = Counter(all_train_tokens)

print("\n📊 Vocabulary Statistics:")
print(f"  Total tokens: {len(all_train_tokens):,}")
print(f"  Unique tokens: {len(vocab_counter):,}")
print("\n🔝 Top 20 most frequent tokens:")
for token, count in vocab_counter.most_common(20):
    print(f"  {token:15s} : {count:5d}")

In [None]:
# Save processed data to cache: one pickle per processed split plus the
# TRAIN vocabulary counter
print("\n💾 Saving processed data to cache...")

cache_files = {
    'train_processed.pkl': train_processed,
    'dev_processed.pkl': dev_processed,
    'test_processed.pkl': test_processed,
    'vocab_counter.pkl': vocab_counter,
}

for filename, data in cache_files.items():
    filepath = CACHE_DIR / filename
    with open(filepath, 'wb') as f:
        pickle.dump(data, f)
    print(f"  ✅ Saved: {filename}")

print("\n🎉 All data saved successfully!")

In [None]:
# Save split metadata: which folds were used and how large each artifact is,
# so later notebooks can check they are reading the matching cache
metadata = {
    'roll': ROLL,
    # int() turns numpy integers into plain Python ints before pickling
    'dev_fold': int(dev_fold),
    'test_fold': int(test_fold),
    'train_size': len(train_processed),
    'dev_size': len(dev_processed),
    'test_size': len(test_processed),
    'vocab_size': len(vocab_counter),
    'total_train_tokens': len(all_train_tokens),
}

metadata_path = CACHE_DIR / 'metadata.pkl'
with open(metadata_path, 'wb') as f:
    pickle.dump(metadata, f)

print("✅ Metadata saved!")
print("\n📋 Metadata:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

## 6. Summary

✅ **Completed:**
- Created master.csv with 5-fold stratified splits
- Generated train/dev/test split for roll SE22UARI195
- Preprocessed all text (lowercase, tokenize, stopwords, lemmatize)
- Built vocabulary from TRAIN split
- Saved all processed data to cache/

**Next Steps:**
- Build sparse representations (OHE, BoW, N-grams, TF-IDF)
- Build dense representations (Word2Vec, GloVe)
- Train classifiers
- Build retrieval system

In [None]:
# Closing banner: total documents processed, vocabulary size, cache location
total_processed = len(train_processed) + len(dev_processed) + len(test_processed)

print("\n" + "="*60)
print("🎉 NOTEBOOK 01: SETUP & PREPROCESSING COMPLETE! 🎉")
print("="*60)
print(f"\n✅ Processed {total_processed} documents")
print(f"✅ Built vocabulary of {len(vocab_counter):,} unique tokens")
print(f"✅ Saved all data to {CACHE_DIR}")
print("\n📝 Ready for next notebook: 02_sparse_methods.ipynb")