In [1]:
import os
import sys
# Add src to path for imports
sys.path.insert(0, os.path.join(os.getcwd(), '..', 'src'))
sys.path.insert(0, os.path.join(os.getcwd(), '..'))

# SQuAD v2 Processing & Caching Demo

This notebook demonstrates the end-to-end data pipeline used by MemXLNet-QA:

1. Process a (small) subset of SQuAD v2 into segment features and cache them.
2. Re-run processing to show near-instant cache reuse.
3. Inspect cached feature structure.
4. Manually re-shard features into multiple chunks and stream them chunk-by-chunk (simulating low‑memory mode).
5. Re-create a `SquadLikeQADataset` from cache and verify integrity.
6. Build a time-step-major dataloader and inspect the per-time-step batches (showing how segments align across documents for memory propagation).

We'll keep parameters small for speed. Adjust `max_examples`, `max_seq_length`, and `doc_stride` for fuller runs.

---


In [2]:
import os
import time
from transformers import XLNetTokenizerFast
from src.data import (
    process_and_cache_dataset,
    create_dataset_from_cache,
    ChunkedCacheManager,
    create_dataloader,
)

# Configuration (tweak for larger experiments)
dataset_name = "squad_v2"
split = "validation"  # smaller than train; fine for demo
max_examples = 60       # limit for speed
max_seq_length = 384    # shorter than 384 default to speed up
doc_stride = 64
cache_dir = "./.cache_demo"
os.makedirs(cache_dir, exist_ok=True)

# Load base tokenizer (no memory tokens needed for pure data demo)
tokenizer = XLNetTokenizerFast.from_pretrained("xlnet-base-cased")

print("=== First processing (build cache) ===")
start = time.time()
feature_count = process_and_cache_dataset(
    dataset_name=dataset_name,
    split=split,
    cache_dir=cache_dir,
    max_examples=max_examples,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    streaming_chunk_size=2000,   # single-chunk path in current helper
    max_memory_gb=16.0,
    use_streaming=False,
    tokenizer=tokenizer,
    max_n_segs=None,
)
first_duration = time.time() - start
print(f"Processed {feature_count} features in {first_duration:.2f}s")

print("\n=== Second processing (cache hit) ===")
start = time.time()
feature_count_cached = process_and_cache_dataset(
    dataset_name=dataset_name,
    split=split,
    cache_dir=cache_dir,
    max_examples=max_examples,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    streaming_chunk_size=2000,
    max_memory_gb=16.0,
    use_streaming=False,
    tokenizer=tokenizer,
    max_n_segs=None,
)
second_duration = time.time() - start
print(f"Cache hit features: {feature_count_cached} (took {second_duration:.3f}s)")

cache_timing = {"first_run_s": first_duration, "second_run_s": second_duration, "features": feature_count}
cache_timing

  from .autonotebook import tqdm as notebook_tqdm


=== First processing (build cache) ===
Processed 60 features in 0.06s

=== Second processing (cache hit) ===
Cache hit features: 60 (took 0.051s)


{'first_run_s': 0.05879807472229004,
 'second_run_s': 0.05122876167297363,
 'features': 60}

In [3]:
# Inspect cache directory contents and sample feature
print("=== Cache Directory Contents ===")
for fname in os.listdir(cache_dir):
    fpath = os.path.join(cache_dir, fname)
    size_kb = os.path.getsize(fpath) / 1024
    print(f"{fname:50s} {size_kb:8.1f} KB")

# Load raw cached features (single chunk path produced by process_and_cache_dataset)
cache_manager = ChunkedCacheManager(cache_dir, chunk_size=2000)
modified_dataset_name = dataset_name  # no memory tokens used, so no suffix
chunk_0 = cache_manager.load_chunk(modified_dataset_name, split, 0)
print(f"Loaded chunk_0 with {len(chunk_0)} feature dicts")

# Show keys of first feature
first_feature_keys = list(chunk_0[0].keys()) if chunk_0 else []
print("Feature keys:", first_feature_keys)

# Count distinct documents
doc_ids = {f['example_id'] for f in chunk_0}
print(f"Distinct documents: {len(doc_ids)}")

# Distribution of segments per doc
from collections import Counter
seg_counts = Counter()
for f in chunk_0:
    seg_counts[f['example_id']] += 1
print("Segments per doc (first 10):", list(seg_counts.items())[:10])

feature_count

=== Cache Directory Contents ===
squad_v2_validation_chunk_0.cache                     910.0 KB
multi_chunks                                            0.2 KB
Loaded chunk_0 with 60 feature dicts
Feature keys: ['input_ids', 'attention_mask', 'token_type_ids', 'start_positions', 'end_positions', 'example_id', 'segment_index', 'total_segments', 'offset_mapping', 'context']
Distinct documents: 60
Segments per doc (first 10): [('doc_0', 1), ('doc_1', 1), ('doc_2', 1), ('doc_3', 1), ('doc_4', 1), ('doc_5', 1), ('doc_6', 1), ('doc_7', 1), ('doc_8', 1), ('doc_9', 1)]


60

In [4]:
# Manual multi-chunk creation (simulated streaming scenario)
# We create a new directory with multiple shards of the same features for demonstration.
chunked_cache_dir = os.path.join(cache_dir, "multi_chunks")
os.makedirs(chunked_cache_dir, exist_ok=True)
chunk_size = 25  # small to force several chunks
chunked_cache_manager = ChunkedCacheManager(chunked_cache_dir, chunk_size=chunk_size)

# Only shard if not already present
if not chunked_cache_manager.cache_exists(dataset_name, split):
    for i in range(0, len(chunk_0), chunk_size):
        shard = chunk_0[i:i+chunk_size]
        chunk_id = i // chunk_size
        chunked_cache_manager.save_chunk(shard, dataset_name, split, chunk_id)
    print(f"Created {chunked_cache_manager.get_total_chunks(dataset_name, split)} chunk files in {chunked_cache_dir}")
else:
    print("Chunked cache already exists; skipping creation.")

print("Listing chunked shards:")
for fname in sorted(os.listdir(chunked_cache_dir)):
    print("  ", fname)

# Simple streaming generator
def stream_features(dataset_name: str, split: str, manager: ChunkedCacheManager):
    total = manager.get_total_chunks(dataset_name, split)
    for cid in range(total):
        yield cid, manager.load_chunk(dataset_name, split, cid)

stream_counts = []
for cid, shard in stream_features(dataset_name, split, chunked_cache_manager):
    stream_counts.append(len(shard))
print("Shard sizes:", stream_counts)
print("Total streamed features:", sum(stream_counts))

Chunked cache already exists; skipping creation.
Listing chunked shards:
   squad_v2_validation_chunk_0.cache
   squad_v2_validation_chunk_1.cache
   squad_v2_validation_chunk_2.cache
Shard sizes: [25, 25, 10]
Total streamed features: 60
Shard sizes: [25, 25, 10]
Total streamed features: 60


In [5]:
# Re-create dataset via high-level helper (fresh processing path internally)
# Note: create_dataset_from_cache currently re-processes; for large runs you'd typically
# rely on your own cache manager to avoid recompute. Here we use it to get a Dataset object.
validation_dataset = create_dataset_from_cache(
    dataset_name=dataset_name,
    split=split,
    cache_dir=cache_dir,
    max_examples=max_examples,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    use_lazy_loading=False,
    max_n_segs=None,
    tokenizer=tokenizer,
)
print(f"Dataset length: {len(validation_dataset)} (should match feature_count: {feature_count})")

# Sample feature & attempt to reconstruct answer text
sample_feature = validation_dataset[0]
input_ids = sample_feature['input_ids']
start_pos = sample_feature['start_positions']
end_pos = sample_feature['end_positions']
# If CLS (likely no-answer) we mark it
token_answer = None
if start_pos != tokenizer.cls_token_id and end_pos != tokenizer.cls_token_id:
    # naive slice; includes special tokens sometimes depending on segmentation; ok for demo
    token_answer = tokenizer.decode(input_ids[start_pos:end_pos+1])

print("Sample feature keys:", list(sample_feature.keys()))
print("Segment indices: seg_index=", sample_feature['segment_index'], "of", sample_feature['total_segments'])
print("Decoded answer tokens:", token_answer)
print("Context excerpt:", sample_feature['context'][:200], '...')

[create_dataset_from_cache] Checking cache for 'squad_v2' (validation) in .cache_demo ...
[create_dataset_from_cache] Cache hit: 1 chunk(s) detected. Loading ...
[create_dataset_from_cache] Reconstructed dataset with 60 features across 60 documents (cache).
Dataset length: 60 (should match feature_count: 60)
Sample feature keys: ['input_ids', 'attention_mask', 'token_type_ids', 'start_positions', 'end_positions', 'example_id', 'segment_index', 'total_segments', 'offset_mapping', 'context']
Segment indices: seg_index= 0 of 1
Decoded answer tokens: France
Context excerpt: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (" ...
[create_dataset_from_cache] Reconstructed dataset with 60 features across 60 documents (cache).
Dataset length: 60 (should match feature_count: 60)
Sample feature keys: ['input_ids', 'attention_mask', 'token_type_ids',

  item[key] = torch.tensor(feature[key])


In [6]:
# Time-step-major DataLoader demonstration
# Build dataloader: groups documents; each yielded element is a list of per-time-step batches.
timestep_dataloader = create_dataloader(
    dataset=validation_dataset,
    batch_size=4,  # number of documents processed in parallel
    shuffle=False,
    num_workers=0,
    use_time_step_major=True,
)

print(f"Time-step-major batches (document groups): {len(timestep_dataloader)}")

# Inspect the first group of time-step batches
first_group = next(iter(timestep_dataloader))
print(f"Time steps in first group: {len(first_group)}")

for t, step_batch in enumerate(first_group[:5]):  # cap display to first 5 steps
    ids = step_batch['example_ids']
    mask = step_batch['document_mask']
    input_shape = step_batch['input_ids'].shape
    active_docs = mask.sum().item()
    print(f"T{t:02d}: input_ids shape={tuple(input_shape)}, active_docs={active_docs}, example_ids={ids}")

# Optional: demonstrate how padding entries appear when documents have fewer segments
padding_present = any(any(str(e).startswith('padding_') for e in step_batch['example_ids']) for step_batch in first_group)
print("Padding entries present?", padding_present)

Time-step-major batches (document groups): 15
Time steps in first group: 1
T00: input_ids shape=(4, 384), active_docs=4, example_ids=['doc_0', 'doc_1', 'doc_2', 'doc_3']
Padding entries present? False
