# EDA+Preprocessing: NLP26 OpenWebText (local download)
Load the course split locally (no streaming), cache under dataset/, and inspect a small subset for cleaning decisions.

In [4]:
# Install the Hugging Face `datasets` library (imported in the loading cell below)
# and `tiktoken` into the kernel's environment; -q keeps pip's output quiet.
# NOTE(review): `tiktoken` is not imported in the cells visible here — presumably
# it is used by the project's EDA helpers; confirm. Versions are unpinned.
%pip install -q datasets tiktoken


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Setting Paths

In [5]:
import sys, pathlib

# Resolve the repository root so project packages (`service`, `utils`) import
# the same way whether the kernel was started in the repo root or in notebooks/.
ROOT = pathlib.Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

# Put the root first on sys.path exactly once: re-running this cell must not
# stack duplicate entries, and the root keeps import priority over later entries.
_root_str = str(ROOT)
sys.path[:] = [_root_str] + [entry for entry in sys.path if entry != _root_str]


## Installing the Dataset in Cache

In [6]:
import os
from datasets import load_dataset
import re

# Hugging Face download cache, kept inside the repository.
CACHE_DIR = ROOT / "dataset" / "hf_cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

DATASET_NAME = "Skylion007/openwebtext"
SPLIT = "train"
SAMPLE_SIZE = 2000


def _load_full_split(name):
    """Load SPLIT of dataset `name` non-streaming, using CACHE_DIR as the cache."""
    return load_dataset(name, split=SPLIT, cache_dir=str(CACHE_DIR), streaming=False)


# Try the primary hub name first; on any failure report why and retry once
# under the plain "openwebtext" name.
ds = None
try:
    ds = _load_full_split(DATASET_NAME)
except Exception as exc:
    print(f"Falling back to openwebtext because {exc}")
    DATASET_NAME = "openwebtext"
    try:
        ds = _load_full_split(DATASET_NAME)
    except Exception as exc2:
        raise RuntimeError(f"Could not load either dataset: {exc2}") from exc

# Keep only the first SAMPLE_SIZE rows (or fewer if the split is smaller).
n = min(SAMPLE_SIZE, len(ds))
ds_subset = ds.select(range(n))
samples = ds_subset["text"]

print(f"Dataset: {DATASET_NAME} / split={SPLIT}")
print(f"Loaded sample size: {len(samples)}")
print("Example preview (first 500 chars):")
if samples:
    _preview = samples[0][:500]
else:
    _preview = ""
print(_preview)


Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/303M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/306M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/datasets/Skylion007/openwebtext/resolve/b4325f019c648b1641a1784748667e8b74e5e064/plain_text/train-00001-of-00080.parquet: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/datasets/Skylion007/openwebtext/resolve/b4325f019c648b1641a1784748667e8b74e5e064/plain_text/train-00001-of-00080.parquet: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/datasets/Skylion007/openwebtext/resolve/b4325f019c648b1641a1784748667e8b74e5e064/plain_text/train-00001-of-00080.parquet: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/datasets/Skylion007/openwebtext/resolve/b4325f019c648b1641a1784748667e8b74e5e064/plain_text/train-0000

Downloading data:   0%|          | 0.00/304M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/datasets/Skylion007/openwebtext/resolve/b4325f019c648b1641a1784748667e8b74e5e064/plain_text/train-00002-of-00080.parquet: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/datasets/Skylion007/openwebtext/resolve/b4325f019c648b1641a1784748667e8b74e5e064/plain_text/train-00002-of-00080.parquet: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/datasets/Skylion007/openwebtext/resolve/b4325f019c648b1641a1784748667e8b74e5e064/plain_text/train-00002-of-00080.parquet: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/datasets/Skylion007/openwebtext/resolve/b4325f019c648b1641a1784748667e8b74e5e064/plain_text/train-0000

Downloading data:   0%|          | 0.00/304M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/302M [00:00<?, ?B/s]

KeyboardInterrupt: 

### EDA of the Raw Dataset

We use the functions from `service/explore_service.py`.

In [9]:
from service.explore_service import load_samples

# Load samples with fallback and caching.
# The cache path is anchored at ROOT rather than the kernel's working directory,
# so launching from notebooks/ reuses <repo>/dataset/hf_cache (the directory the
# first loading cell downloads into) instead of creating a second cache.
samples, resolved = load_samples(
    dataset_name="openwebtext",
    split="train",
    sample_size=2000,
    cache_dir=str(ROOT / "dataset" / "hf_cache"),
    streaming=False,
)

Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

In [None]:
from service.explore_service import (
    length_stats,
    token_stats,
    flag_counts,
    duplicate_stats,
    full_eda_report,
)

# Basic EDA: each report is computed and printed in turn, so earlier results
# stay visible even if a later step fails.
_eda_steps = (
    (length_stats, {}),
    (token_stats, {"tokenizer_name": "gpt2", "max_samples": 2000}),
    (flag_counts, {}),
    (duplicate_stats, {}),
    (full_eda_report, {}),
)
for _report_fn, _extra_kwargs in _eda_steps:
    print(_report_fn(samples, **_extra_kwargs))


Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/80 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/8013769 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

Using dataset: openwebtext, samples: 2000


## Pre-processing of Dataset

We use preprocessing_utils functions to apply data cleaning.

In [6]:
# Install `transformers` (AutoTokenizer is imported in the preprocessing cell below)
# and `langdetect` into the kernel's environment; -q keeps pip's output quiet.
# NOTE(review): `langdetect` is not imported in the cells visible here — presumably
# utils.preprocessing_utils uses it for the non-English filter; confirm.
%pip install -q transformers langdetect

Note: you may need to restart the kernel to use updated packages.


In [17]:
# Config
# Paths are anchored at ROOT (set in "Setting Paths") so the cache and the output
# land in the same place whether the kernel runs from the repo root or notebooks/.
CACHE_DIR = str(ROOT / "dataset" / "hf_cache")   # existing cache; str because it is passed as load_dataset(cache_dir=...)
OUT_PATH = ROOT / "dataset_final" / "openwebtext_clean.jsonl"
DATASET_NAME = "NLP26_OpenWebText"      # falls back to openwebtext if missing
FALLBACK_NAME = "openwebtext"
SPLIT = "train"
# Token cap passed to preprocess_text(max_tokens=...) and to the tokenizer's
# model_max_length.
# NOTE(review): stock GPT-2 has a 1024-token context window, not 2048 — confirm
# this matches the block size of the model the cleaned corpus is meant for.
MAX_TOKENS = 2048
MIN_CHARS = 50                          # passed to preprocess_text(min_chars=...)
MAX_CHARS = 100_000                     # passed to preprocess_text(max_chars=...)
DROP_CODE = True                        # passed to preprocess_text(drop_code=...)
DROP_NON_EN = True                      # passed to preprocess_text(drop_non_english=...)

In [23]:

import json, re
from datasets import load_dataset
from tqdm import tqdm
import importlib

import utils.preprocessing_utils as pu
importlib.reload(pu)

from utils.preprocessing_utils import preprocess_text, load_tokenizer, load_test_sentences

from transformers import AutoTokenizer

# Optional held-out test files whose sentences must not leak into the cleaned
# corpus. These are placeholders: replace them with real paths or empty the list.
TEST_PATHS = [
    "path/to/wikitext_test.txt",        # replace with real path or leave empty
    "path/to/nlp26_eval.txt",
]

# GPT-2 tokenizer with model_max_length set to MAX_TOKENS and right-side truncation.
# NOTE: AutoTokenizer.from_pretrained does not degrade gracefully — per the
# transformers docs it raises if the tokenizer cannot be loaded, and the
# `from transformers import ...` line above fails first if the package is missing.
# NOTE(review): the imported `load_tokenizer` helper is unused here; confirm which
# loader is intended.
tok = AutoTokenizer.from_pretrained("gpt2", model_max_length=MAX_TOKENS, truncation_side="right")

# Sentences collected from TEST_PATHS; passed to has_overlap() in the cleaning loop.
# NOTE(review): presumably returns an empty collection when the files are absent and
# yields sentences normalized like has_overlap does (lower-cased, whitespace
# collapsed) — verify in utils.preprocessing_utils.
test_sents = load_test_sentences(TEST_PATHS)

# Compiled once: has_overlap runs for every document that survives cleaning.
_SENTENCE_BOUNDARY_RE = re.compile(r"(?<=[.!?])\s+")
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def has_overlap(text: str, test_set: set) -> bool:
    """Return True if any sentence of ``text`` occurs in ``test_set``.

    ``text`` is split after ``.``, ``!`` or ``?`` followed by whitespace; each
    piece is lower-cased, has whitespace runs collapsed to a single space and is
    stripped before the comparison. ``test_set`` entries are compared as-is, so
    they must already be normalized the same way.

    Args:
        text: Document text to check.
        test_set: Normalized test sentences. Any iterable of strings is accepted;
            a set gives the fastest lookups.

    Returns:
        False when ``test_set`` is empty or no normalized sentence matches,
        True otherwise.
    """
    if not test_set:
        return False
    normalized = set()
    for sentence in _SENTENCE_BOUNDARY_RE.split(text.strip()):
        candidate = _WHITESPACE_RUN_RE.sub(" ", sentence.lower()).strip()
        if candidate:
            normalized.add(candidate)
    # isdisjoint stops at the first shared sentence and builds no intersection set.
    return not normalized.isdisjoint(test_set)

# Prepare output
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# Open the dataset as a stream: with streaming=True examples are fetched lazily
# while iterating, so the loop can start without a full local download.
try:
    ds = load_dataset(DATASET_NAME, split=SPLIT, cache_dir=CACHE_DIR, streaming=True)
    resolved = DATASET_NAME
except Exception as exc:
    # Report why the primary name failed instead of swallowing the error, in the
    # same wording as the fallback message of the first loading cell.
    print(f"Falling back to {FALLBACK_NAME} because {exc}")
    ds = load_dataset(FALLBACK_NAME, split=SPLIT, cache_dir=CACHE_DIR, streaming=True)
    resolved = FALLBACK_NAME

MAX_DOCS = 100_000  # change as needed (e.g., 50_000)

from itertools import islice

# Counters: documents seen, written, dropped for test-set overlap, dropped by cleaning.
total = kept = filtered_overlap = filtered_clean = 0

# Loop-invariant: decide once whether a usable tokenizer object is available.
_tokenizer = None if isinstance(tok, Exception) else tok

# Opened with "w": every run rewrites the output file from scratch.
with OUT_PATH.open("w", encoding="utf-8") as f:
    # islice ends the stream after MAX_DOCS examples without pulling an extra one
    # (MAX_DOCS=None means no limit); total= lets tqdm show real progress.
    for ex in tqdm(islice(ds, MAX_DOCS), total=MAX_DOCS, desc=f"Processing {resolved}"):
        total += 1
        raw = ex.get("text", "")
        cleaned = preprocess_text(
            raw,
            tokenizer=_tokenizer,
            max_tokens=MAX_TOKENS,
            drop_code=DROP_CODE,
            drop_non_english=DROP_NON_EN,
            url_placeholder="<url>",
            min_chars=MIN_CHARS,
            max_chars=MAX_CHARS,
        )
        # A falsy result is counted as the document being rejected by the cleaning rules.
        if not cleaned:
            filtered_clean += 1
            continue
        # Drop documents that share a normalized sentence with the held-out test sets.
        if has_overlap(cleaned, test_sents):
            filtered_overlap += 1
            continue
        # One JSON object per line (JSONL); ensure_ascii=False keeps non-ASCII text readable.
        f.write(json.dumps({"text": cleaned}, ensure_ascii=False) + "\n")
        kept += 1

print(f"Dataset: {resolved}")
print(f"Total seen: {total}")
print(f"Kept: {kept}")
print(f"Filtered (clean rules): {filtered_clean}")
print(f"Filtered (overlap): {filtered_overlap}")
print(f"Output: {OUT_PATH}")


Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Processing openwebtext: 1it [00:02,  2.70s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (2095 > 2048). Running this sequence through the model will result in indexing errors
Processing openwebtext: 109193it [15:54, 114.43it/s]


KeyboardInterrupt: 

In [None]:
from datasets import load_dataset
from utils.explore_utils import (
    length_stats,
    token_stats,
    flag_counts,
    duplicate_stats,
)  # if you prefer, you can copy the simple EDA functions inline

# Re-run the basic EDA on the first EDA_SAMPLE rows of the cleaned JSONL output.
CLEAN_JSONL = str(OUT_PATH)  # same path as you wrote above
EDA_SAMPLE = 5000            # adjust sample size for speed

eda_ds = load_dataset("json", data_files=CLEAN_JSONL, split=f"train[:{EDA_SAMPLE}]")
texts = eda_ds["text"]

# Each report is computed and printed in turn, in the same order as for the raw data.
_clean_eda_steps = (
    (length_stats, {}),
    (token_stats, {"tokenizer_name": "gpt2", "max_samples": EDA_SAMPLE}),
    (flag_counts, {}),
    (duplicate_stats, {}),
)
for _report_fn, _extra_kwargs in _clean_eda_steps:
    print(_report_fn(texts, **_extra_kwargs))
