# EDA+Preprocessing: NLP26 OpenWebText (local download)
Load the course split locally (no streaming), cache under dataset/, and inspect a small subset for cleaning decisions.

In [None]:
%pip install -q datasets tiktoken
%pip install -q transformers langdetect

In [4]:
# Config
# pathlib is imported here so this cell runs on a fresh kernel; the notebook
# otherwise imports it only in the later "Setting Paths" cell.
import pathlib

# NOTE: the dataset-loading cell further down reassigns CACHE_DIR,
# DATASET_NAME and SPLIT, so later cells see those values, not these.
CACHE_DIR = "dataset/hf_cache"          # existing cache
OUT_PATH = pathlib.Path("../dataset_final/openwebtext_clean.jsonl")  # cleaned JSONL output
DATASET_NAME = "NLP26_OpenWebText"      # falls back to openwebtext if missing
FALLBACK_NAME = "openwebtext"
SPLIT = "train"
# Per-document token cap handed to preprocess_text and the tokenizer.
# NOTE(review): the gpt2 tokenizer reports a 1024-token model limit, so 2048
# is larger than one GPT-2 context window - confirm this is intended.
MAX_TOKENS = 2048
MIN_CHARS = 50                          # passed to preprocess_text as min_chars
MAX_CHARS = 100_000                     # passed to preprocess_text as max_chars
DROP_CODE = True                        # passed to preprocess_text as drop_code
DROP_NON_EN = True                      # passed to preprocess_text as drop_non_english

# Dataset percentage to process (40% = ~3.2M docs, 100% = ~8M docs)
DATASET_PERCENTAGE = 40.0  # Change this to control how much data to process

### Setting Paths

In [None]:
import sys, pathlib

# Make the repository root importable. When the kernel was started inside
# notebooks/, the root is one directory above the working directory.
_cwd = pathlib.Path.cwd()
ROOT = _cwd.parent if _cwd.name == "notebooks" else _cwd
sys.path.insert(0, str(ROOT))


## Installing the Dataset in Cache

In [None]:
import os
from datasets import load_dataset
import re

# Keep Hugging Face downloads in a project-local cache directory.
CACHE_DIR = "dataset/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

DATASET_NAME = "Skylion007/openwebtext"
SPLIT = "train"
SAMPLE_SIZE = 2000  # adjust if needed

# Try the namespaced hub id first. On any failure retry once with the bare
# "openwebtext" id, and raise RuntimeError only when both attempts fail.
ds = None
try:
    ds = load_dataset(
        DATASET_NAME,
        split=SPLIT,
        cache_dir=CACHE_DIR,
        streaming=False,
    )
except Exception as primary_err:
    print(f"Falling back to openwebtext because {primary_err}")
    try:
        DATASET_NAME = "openwebtext"
        ds = load_dataset(
            DATASET_NAME,
            split=SPLIT,
            cache_dir=CACHE_DIR,
            streaming=False,
        )
    except Exception as fallback_err:
        # The message carries the second failure; the first is chained as the cause.
        raise RuntimeError(f"Could not load either dataset: {fallback_err}") from primary_err

# Take the text of the first SAMPLE_SIZE rows (or every row if there are fewer).
n = min(SAMPLE_SIZE, len(ds))
ds_subset = ds.select(range(n))
samples = ds_subset["text"]

print(f"Dataset: {DATASET_NAME} / split={SPLIT}")
print(f"Loaded sample size: {len(samples)}")
print("Example preview (first 500 chars):")
if samples:
    print(samples[0][:500])
else:
    print("")


### EDA of Raw Dataset

We use the functions from service/explore_service.py.

In [None]:
from service.explore_service import load_samples

# Request a 2000-document sample of the non-streamed train split, using the
# local cache directory. load_samples returns the sample texts together with a
# second value stored as `resolved` (presumably the dataset name that actually
# loaded - confirm in service/explore_service.py).
_sample_request = {
    "dataset_name": "openwebtext",
    "split": "train",
    "sample_size": 2000,
    "cache_dir": "dataset/hf_cache",
    "streaming": False,
}
samples, resolved = load_samples(**_sample_request)

In [None]:
from service.explore_service import (
    length_stats,
    token_stats,
    flag_counts,
    duplicate_stats,
    full_eda_report,
)

# Raw-data EDA: compute each report on the sampled texts and print it straight
# away, in the order length -> tokens -> flags -> duplicates -> full report.
raw_length_report = length_stats(samples)
print(raw_length_report)

raw_token_report = token_stats(samples, tokenizer_name="gpt2", max_samples=2000)
print(raw_token_report)

raw_flag_report = flag_counts(samples)
print(raw_flag_report)

raw_duplicate_report = duplicate_stats(samples)
print(raw_duplicate_report)

raw_full_report = full_eda_report(samples)
print(raw_full_report)


## Pre-processing of Dataset

We use preprocessing_utils functions to apply data cleaning.

In [None]:

import json, re
from datasets import load_dataset
from tqdm import tqdm
import importlib

import utils.preprocessing_utils as pu
importlib.reload(pu)

from utils.preprocessing_utils import (
    preprocess_text, 
    load_tokenizer, 
    load_test_sentences,
    calculate_subset_size
)

from transformers import AutoTokenizer

# Held-out evaluation files used for leakage removal. These are placeholders:
# point them at real files, otherwise the overlap filter has nothing to match.
TEST_PATHS = [
    "path/to/wikitext_test.txt",        # replace with real path or leave empty
    "path/to/nlp26_eval.txt",
]

# GPT-2 tokenizer configured with MAX_TOKENS as its maximum length and
# right-side truncation. This call is not wrapped in any error handling, so a
# load failure propagates out of the cell.
tok = AutoTokenizer.from_pretrained(
    "gpt2",
    model_max_length=MAX_TOKENS,
    truncation_side="right",
)

# Sentences from the evaluation files above (can be empty if files not present).
test_sents = load_test_sentences(TEST_PATHS)

def has_overlap(text: str, test_set: set) -> bool:
    """Report whether any sentence of *text* appears in *test_set*.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace. Each
    non-blank sentence is lower-cased, has runs of whitespace collapsed to a
    single space, and is stripped before the membership check, so *test_set*
    is expected to hold sentences normalised the same way.

    Args:
        text: Document text to scan.
        test_set: Normalised held-out sentences; an empty set disables the check.

    Returns:
        True if at least one normalised sentence is in *test_set*, else False.
    """
    if test_set:
        for sentence in re.split(r"(?<=[.!?])\s+", text.strip()):
            if not sentence.strip():
                continue
            normalised = re.sub(r"\s+", " ", sentence.lower()).strip()
            if normalised in test_set:
                return True
    return False

# Make sure the directory that will hold the cleaned JSONL exists.
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# Open the dataset in streaming mode so documents are read lazily while
# iterating. If the primary name cannot be loaded, retry with FALLBACK_NAME;
# a failure of the fallback itself propagates.
try:
    ds = load_dataset(DATASET_NAME, split=SPLIT, cache_dir=CACHE_DIR, streaming=True)
    resolved = DATASET_NAME
except Exception as exc:
    # Report why the primary dataset was rejected instead of hiding it, as the
    # earlier dataset-loading cell does.
    print(f"Falling back to {FALLBACK_NAME} because {exc}")
    ds = load_dataset(FALLBACK_NAME, split=SPLIT, cache_dir=CACHE_DIR, streaming=True)
    resolved = FALLBACK_NAME

# Calculate how many documents to process based on percentage
# OpenWebText has ~8,013,769 documents total (hard-coded: this cell never
# queries the streamed dataset for its length).
TOTAL_DATASET_SIZE = 8_013_769
MAX_DOCS = calculate_subset_size(TOTAL_DATASET_SIZE, DATASET_PERCENTAGE)

print(f"Processing {DATASET_PERCENTAGE}% of dataset")
print(f"Total documents to process: {MAX_DOCS:,} out of {TOTAL_DATASET_SIZE:,}")

# Counters: documents read, documents written, and documents dropped by the
# leakage check and by the cleaning rules respectively.
total = kept = filtered_overlap = filtered_clean = 0

# Decide once whether a usable tokenizer exists: if `tok` holds an Exception
# instead of a tokenizer, preprocess_text receives None. This cannot change
# during the run, so it is hoisted out of the loop.
active_tok = None if isinstance(tok, Exception) else tok

# "w" mode: any previous output file at OUT_PATH is overwritten.
with OUT_PATH.open("w", encoding="utf-8") as f:
    # This loop never asks the streamed dataset for its length, so pass the
    # known cap as `total` to give the progress bar a percentage and an ETA.
    for ex in tqdm(ds, desc=f"Processing {resolved}", total=MAX_DOCS):
        if total >= MAX_DOCS:
            break
        total += 1
        raw = ex.get("text", "")
        cleaned = preprocess_text(
            raw,
            tokenizer=active_tok,
            max_tokens=MAX_TOKENS,
            drop_code=DROP_CODE,
            drop_non_english=DROP_NON_EN,
            url_placeholder="<url>",
            min_chars=MIN_CHARS,
            max_chars=MAX_CHARS,
        )
        # A falsy result is counted as a document rejected by the cleaning rules.
        if not cleaned:
            filtered_clean += 1
            continue
        # Drop documents that share a normalised sentence with the test sets.
        if has_overlap(cleaned, test_sents):
            filtered_overlap += 1
            continue
        # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
        f.write(json.dumps({"text": cleaned}, ensure_ascii=False) + "\n")
        kept += 1

print(f"Dataset: {resolved}")
print(f"Total seen: {total}")
print(f"Kept: {kept}")
print(f"Filtered (clean rules): {filtered_clean}")
print(f"Filtered (overlap): {filtered_overlap}")
print(f"Output: {OUT_PATH}")


### EDA on cleaned dataset

In [None]:
from datasets import load_dataset
from utils.explore_utils import (length_stats, token_stats, flag_counts, duplicate_stats,)

# Re-run the EDA helpers on the first EDA_SAMPLE rows of the cleaned JSONL.
CLEAN_JSONL = str(OUT_PATH)
EDA_SAMPLE = 5000

# The "json" builder exposes the file as a "train" split; slice it to the sample size.
eda_ds = load_dataset(
    "json",
    data_files=CLEAN_JSONL,
    split=f"train[:{EDA_SAMPLE}]",
)
texts = eda_ds["text"]  # only the text column is analysed

clean_length_report = length_stats(texts)
print(clean_length_report)

clean_token_report = token_stats(texts, tokenizer_name="gpt2", max_samples=EDA_SAMPLE)
print(clean_token_report)

clean_flag_report = flag_counts(texts)
print(clean_flag_report)

clean_duplicate_report = duplicate_stats(texts)
print(clean_duplicate_report)


Generating train split: 0 examples [00:00, ? examples/s]

{'count': 5000, 'min': 296, 'max': 11343, 'avg': 3927.7208, 'short_lt_200': 0, 'short_pct': 0.0}


Token indices sequence length is longer than the specified maximum sequence length for this model (1146 > 1024). Running this sequence through the model will result in indexing errors


{'count': 5000, 'min': 62, 'max': 2048, 'avg': 854.8502, 'over_2048': 0, 'over_2048_pct': 0.0}
{'html': 317, 'code': 0, 'non_en': 0, 'url': 1, 'ctrl': 5000, 'email': 0, 'phone': 0}
{'total': 5000, 'unique': 5000, 'dupes': 0, 'dupe_pct': 0.0}
