In [None]:
import requests
from bs4 import BeautifulSoup
import re
import os
from tqdm import tqdm
import unicodedata 
import spacy
from datasets import Dataset
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, Trainer, TrainingArguments
import torch
import math
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Dataset Source and Filtering

## 1. Source
The books were taken from **Project Gutenberg**, a free website with public-domain books.  
We collected book titles, authors, and links from the main search page sorted by downloads.


## 2. Filtering Steps

### 2.1 Remove Poetry
Poetry books were removed because poems do not follow normal sentence structure.  
A book was skipped if its page showed tags like **“poetry”** or **“poem.”**

### 2.2 Keep Only Plain Text Files
For each book, we looked for the **“Plain Text UTF-8”** download link.  
If a book did not have this link, it was ignored.

### 2.3 Filter by Size
We checked the file size of each text file using a `HEAD` request.  
Then we sorted the books from smallest to largest and picked the **10 smallest** ones.


## 3. Final Dataset
The final dataset contains:
- Books from Project Gutenberg  
- Only non-poetry books  
- Only books with a plain text file  
- The smallest 10 books after filtering  


In [None]:
BASE_URL = "https://www.gutenberg.org"
HEADERS = {"User-Agent": "Mozilla/5.0"}

# ---------- Helper functions ----------

def get_books_from_url(url):
    resp = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(resp.text, "html.parser")
    books = []

    for li in soup.select("li.booklink"):
        a = li.find("a")
        if not a:
            continue

        book_url = BASE_URL + a["href"]
        title = li.select_one(".title").get_text(strip=True)
        author = li.select_one(".subtitle")
        author = author.get_text(strip=True) if author else "Unknown"

        books.append({"title": title, "author": author, "url": book_url})
    return books

def is_poem(book_url):
    resp = requests.get(book_url, headers=HEADERS)
    soup = BeautifulSoup(resp.text, "html.parser")
    subjects = [s.get_text(strip=True) for s in soup.select("td.property_value")]

    for s in subjects:
        if "poetry" in s.lower() or "poem" in s.lower():
            return True
    return False

def get_text_download_link(book_url):
    resp = requests.get(book_url, headers=HEADERS)
    soup = BeautifulSoup(resp.text, "html.parser")
    for a in soup.select("a.link"):
        if "Plain Text UTF-8" in a.text:
            href = a["href"]
            if href.startswith("//"):
                return "https:" + href
            if href.startswith("/"):
                return BASE_URL + href
            return href
    return None

def get_file_size(url):
    try:
        resp = requests.head(url, headers=HEADERS, allow_redirects=True)
        size = resp.headers.get("Content-Length")
        return int(size) if size else float("inf")
    except:
        return float("inf")

def download_book(title, txt_url):
    os.makedirs("downloads", exist_ok=True)
    safe_title = "".join(c for c in title if c.isalnum() or c in " _-")
    filepath = f"downloads/{safe_title}.txt"
    resp = requests.get(txt_url, headers=HEADERS)
    with open(filepath, "wb") as f:
        f.write(resp.content)
    print("Downloaded:", filepath)

# Main Pipeline 

url = "https://www.gutenberg.org/ebooks/search/?sort_order=downloads"
all_books = get_books_from_url(url)

# Filter non-poetry books
non_poem_books = [b for b in all_books if not is_poem(b["url"])]
print("Non-poetry books:", len(non_poem_books))

# Add size info
for book in non_poem_books:
    txt_link = get_text_download_link(book["url"])
    if txt_link:
        book["txt_link"] = txt_link
        book["size"] = get_file_size(txt_link)
    else:
        book["txt_link"] = None
        book["size"] = float("inf")

# Sort by size (ascending) and select first 10
books_to_download = sorted([b for b in non_poem_books if b["txt_link"]], key=lambda x: x["size"])[:10]

# Download
for book in books_to_download:
    print("\nProcessing:", book["title"])
    download_book(book["title"], book["txt_link"])


Non-poetry books: 25

Processing: The Legend of Sleepy Hollow
Downloaded: downloads/The Legend of Sleepy Hollow.txt

Processing: The Strange Case of Dr. Jekyll and Mr. Hyde
Downloaded: downloads/The Strange Case of Dr Jekyll and Mr Hyde.txt

Processing: Romeo and Juliet
Downloaded: downloads/Romeo and Juliet.txt

Processing: Alice's Adventures in Wonderland
Downloaded: downloads/Alices Adventures in Wonderland.txt

Processing: Beowulf: An Anglo-Saxon Epic Poem
Downloaded: downloads/Beowulf An Anglo-Saxon Epic Poem.txt

Processing: A Room with a View
Downloaded: downloads/A Room with a View.txt

Processing: Cranford
Downloaded: downloads/Cranford.txt

Processing: The Blue Castle: a novel
Downloaded: downloads/The Blue Castle a novel.txt

Processing: The King in Yellow
Downloaded: downloads/The King in Yellow.txt

Processing: Frankenstein; Or, The Modern Prometheus
Downloaded: downloads/Frankenstein Or The Modern Prometheus.txt


## **Text Preprocessing**

The preprocessing pipeline is designed to clean, normalize, and structure raw text data 
for analysis or modeling. The workflow systematically removes noise while retaining meaningful content.

**1. Unicode Normalization**
- Text is normalized using NFKC to ensure consistent encoding for characters with multiple representations.

**2. Line Ending Standardization**
- Windows-style (\r\n) and old Mac-style (\r) line endings are converted to Unix-style (\n).

**3. HTML Tag Removal**
- All HTML tags are removed using regular expressions to prevent markup from interfering with the text.

**4. Non-text Character Cleaning**
- Characters except alphanumeric, common punctuation (. , ? ! ; : () ' -), and newlines are removed.

**5. Space Normalization**
- Consecutive spaces are collapsed into a single space while preserving newlines.
- Leading and trailing whitespace is stripped.

**6. Chapter and Section Title Removal**
- Chapter, book, and part headings are removed using regex patterns.
- Short all-caps lines (≤6 words) are excluded.
- SpaCy POS tagging is applied to remove short lines consisting mostly of NOUN, PROPN, or NUM.

**7. Paragraph Normalization**
- Long paragraphs are tokenized using spaCy.
- Text is normalized and processed in chunks to prevent memory issues.

**8. Line Merging**
- Lines within paragraphs are merged into single lines while preserving paragraph separation.

**9. Punctuation Correction**
- Extra spaces before punctuation are removed for cleaner sentence boundaries.

**10. Gutenberg-specific Cleaning**
- Headers and footers (e.g., "*** START OF THE PROJECT GUTENBERG EBOOK ***") are removed.
- Trailing occurrences of "THE END" are deleted.

**11. File Handling**
- Text is read in small chunks for efficient handling of large files.
- Cleaned output is saved to a designated directory with robust error handling.


In [10]:
def clean_chunk(text):
    # Normalize unicode
    text = unicodedata.normalize("NFKC", text)

    # Convert Windows CRLF to UNIX newlines
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Remove HTML tags if any
    text = re.sub(r"<.*?>", "", text)

    # Remove non-text garbage but KEEP newlines
    text = re.sub(r"[^A-Za-z0-9 .,?!;:()'\n\-]+", " ", text)

    # Collapse many spaces but KEEP newlines
    text = re.sub(r"[ ]{2,}", " ", text)

    # Strip leading/trailing whitespace
    return text.strip()


In [11]:
nlp = spacy.load("en_core_web_sm")

# Regex patterns for chapter headings
chapter_patterns = [
    r"^\s*chapter\s+[0-9ivxlcdm]+\s*$",            # CHAPTER 1, CHAPTER I
    r"^\s*chapter\s*[:.\- ]\s*[a-zA-Z0-9]+.*$",    # Chapter: One
    r"^\s*[0-9ivxlcdm]+\s*\.\s*.*$",               # 1. The Incident, I. The Beginning
    r"^\s*book\s+[0-9ivxlcdm]+\s*$",               # BOOK I
    r"^\s*part\s+[0-9ivxlcdm]+\s*$",               # PART II
]

chapter_regex = re.compile("|".join(chapter_patterns), re.IGNORECASE)


def remove_chapter_titles(text):
    cleaned_lines = []
    lines = text.split("\n")

    for line in lines:
        stripped = line.strip()

        # Skip empty lines
        if not stripped:
            cleaned_lines.append("")
            continue

        # Regex chapter detection
        if chapter_regex.match(stripped):
            continue

        # ALL-CAPS titles (likely chapter or book title)
        if stripped.isupper() and len(stripped.split()) <= 6:
            continue

        # spaCy detection: lines with only nouns/proper nouns 
        doc = nlp(stripped)
        pos_tags = {token.pos_ for token in doc}

        # Most chapter titles are noun-only lines
        if pos_tags.issubset({"NOUN", "PROPN", "NUM"}):
            if len(doc) <= 8:   # Avoid deleting sentences
                continue

        # Keep the line
        cleaned_lines.append(stripped)

    return "\n".join(cleaned_lines).strip()

In [12]:
def remove_chapter_titles_spacy(text):
    lines = text.split("\n")
    cleaned_lines = []

    for line in lines:
        stripped = line.strip()
        if not stripped:
            cleaned_lines.append("")
            continue

        # Remove trailing punctuation
        stripped_clean = stripped.rstrip(".:;")

        # Regex detection
        if chapter_regex.match(stripped_clean):
            continue

        # Short lines starting with "Part" / "Chapter" / "Book"
        if re.match(r"^(Part|Chapter|Book)\s+[IVXLCDM0-9a-z]+\.?$", stripped_clean, re.IGNORECASE):
            continue

        # ALL CAPS short titles
        if stripped_clean.isupper() and len(stripped_clean.split()) <= 6:
            continue

        # spaCy POS check
        doc = nlp(stripped_clean)
        pos_tags = {token.pos_ for token in doc}
        if pos_tags.issubset({"NOUN", "PROPN", "NUM"}) and len(doc) <= 8:
            continue

        cleaned_lines.append(stripped)

    return "\n".join(cleaned_lines).strip()

def spacy_normalize(text, chunk_size=100_000):
    paragraphs = text.split("\n\n")
    normalized_paragraphs = []

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        # Process paragraph in smaller chunks if too long
        para_tokens = []
        for i in range(0, len(para), chunk_size):
            chunk = para[i:i+chunk_size]
            doc = nlp(chunk)
            para_tokens.extend([tok.text for tok in doc if not tok.is_space])

        # collapse only double spaces, preserve newlines
        normalized = re.sub(r" {2,}", " ", " ".join(para_tokens))
        normalized_paragraphs.append(normalized)
        
    # Join paragraphs with double newline
    return "\n\n".join(normalized_paragraphs).strip()

In [13]:
def merge_paragraph_lines(text):
    """
    Merge lines inside a paragraph, keep paragraphs separated by double newlines.
    """
    paragraphs = text.split("\n\n")
    merged_paragraphs = []

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
        # Merge single line breaks into space
        para = re.sub(r"\n+", " ", para)
        merged_paragraphs.append(para)

    return "\n\n".join(merged_paragraphs)


In [18]:
# DIRECTORY CONFIG
RAW_DIR = r"downloads"   # 
CLEAN_DIR = r"clean_books"  #

os.makedirs(CLEAN_DIR, exist_ok=True)

In [19]:
# 3. Main preprocessing pipeline
def preprocess_file(file_name):
    raw_path = os.path.join(RAW_DIR, file_name)
    clean_path = os.path.join(CLEAN_DIR, file_name)

    try:
        # READ RAW FILE
        chunks = []
        with open(raw_path, "r", encoding="utf-8", errors="ignore") as fin:
            while True:
                part = fin.read(8192)
                if not part:
                    break
                chunks.append(part)

        text = "".join(chunks)

        # REMOVE GUTENBERG HEADER
        start_match = re.search(r"\*\*\* START OF.*?\*\*\*", text, re.IGNORECASE)
        if start_match:
            text = text[start_match.end():]

        # REMOVE GUTENBERG FOOTER
        end_match = re.search(r"\*\*\* END OF.*?\*\*\*", text, re.IGNORECASE)
        if end_match:
            text = text[:end_match.start()]

        text = re.sub(
            r"END OF THE PROJECT GUTENBERG EBOOK.*",
            "",
            text,
            flags=re.IGNORECASE | re.DOTALL
        )

        text = re.sub(
            r"\bTHE END\b[\s\n]*$",
            "",
            text,
            flags=re.IGNORECASE
        )

        # CLEAN MAIN CONTENT
        cleaned = clean_chunk(text)

        # REMOVE CHAPTER HEADERS
        cleaned = remove_chapter_titles(cleaned)

        # LIGHT SPACY NORMALIZATION
        cleaned = spacy_normalize(cleaned)

        # Merge lines within paragraphs
        cleaned = merge_paragraph_lines(cleaned)

        # Fix spaces before punctuation
        cleaned = re.sub(r"\s+([,.!?;:])", r"\1", cleaned)

        # SAVE CLEANED
        with open(clean_path, "w", encoding="utf-8") as fout:
            fout.write(cleaned.strip() + "\n")

        return {"file_name": file_name, "text": cleaned}

    except Exception as e:
        print(f"Failed to clean {file_name}: {e}")
        return None

In [20]:
files = [f for f in os.listdir(RAW_DIR) if f.lower().endswith(".txt")]

for file_name in tqdm(files, desc="Cleaning books", unit="book"):
    preprocess_file(file_name)

print("Finished cleaning all books.")

Cleaning books: 100%|██████████| 10/10 [06:44<00:00, 40.44s/book]

Finished cleaning all books.





In [21]:
# Directory containing cleaned books
CLEAN_DIR = "clean_books"
OUTPUT_FILE = "merged_books.txt"

# Special token to mark the end of each book
BOOK_END_TOKEN = "<|BOOK_END|>"

# Get all text files in the directory, sorted alphabetically
text_files = sorted([f for f in os.listdir(CLEAN_DIR) if f.endswith(".txt")])

# Open the output file in write mode (incremental write for memory efficiency)
with open(OUTPUT_FILE, "w", encoding="utf-8") as outfile:
    for file_name in text_files:
        file_path = os.path.join(CLEAN_DIR, file_name)
        with open(file_path, "r", encoding="utf-8") as infile:
            book_text = infile.read().strip()
            # Write book text followed by special token only
            outfile.write(book_text + f"\n{BOOK_END_TOKEN}\n")

print(f"Merged {len(text_files)} files into {OUTPUT_FILE} with explicit book boundaries.")

Merged 10 files into merged_books.txt with explicit book boundaries.


In [2]:
# Configuration
MERGED_FILE = "merged_books.txt"  # Merged books file    
SPECIAL_TOKENS = ["<|BOOK_END|>"]  # Special token marking book end


# Model Training Summary

## Model
The model used for training is **DistilGPT-2**, a smaller and faster version of GPT-2.  
After adding the special token, the model’s vocabulary size was updated to match the tokenizer.


## Training Setup
Training was done with the following settings:

- Batch size: 1  
- Gradient accumulation: 8  
- Learning rate: 5e-5  
- Epochs: 2  
- FP16 enabled only if a GPU was available  
- Checkpoints saved every 500 steps  
- Logs written every 100 steps  

Due to limited hardware, training ran slowly and completed fewer updates.

In [None]:
# Load tokenizer and add special tokens
tokenizer = GPT2TokenizerFast.from_pretrained("distilgpt2")
SPECIAL_TOKENS = ["<|BOOK_END|>"]
tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})

# Read merged text
with open("merged_books.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Tokenize AFTER adding special tokens
tokens = tokenizer(text)["input_ids"]

# Create chunk generator
BLOCK_SIZE = 512
OVERLAP = 64

def chunk_generator(tokens, block_size=BLOCK_SIZE, overlap=OVERLAP):
    step = block_size - overlap
    for start in range(0, len(tokens), step):
        end = start + block_size
        chunk = tokens[start:end]
        if len(chunk) < block_size:
            break  # drop incomplete chunks
        yield {"input_ids": chunk, "labels": chunk}

# Build dataset
dataset = Dataset.from_generator(lambda: chunk_generator(tokens))

Token indices sequence length is longer than the specified maximum sequence length for this model (641205 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# Load model and resize embeddings AFTER adding tokens
model = GPT2LMHeadModel.from_pretrained("distilgpt2")
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50258, 768)

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt_books_model",
    per_device_train_batch_size=1,  
    gradient_accumulation_steps=8,
    learning_rate=5e-5,
    num_train_epochs=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    fp16=torch.cuda.is_available(),  # only if GPU
    remove_unused_columns=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Train
trainer.train()

Step,Training Loss
100,4.2
200,4.0434
300,3.9693


TrainOutput(global_step=358, training_loss=4.0568871098523696, metrics={'train_runtime': 10805.7185, 'train_samples_per_second': 0.265, 'train_steps_per_second': 0.033, 'total_flos': 373915650097152.0, 'train_loss': 4.0568871098523696, 'epoch': 2.0})

In [13]:
save_path = "./gpt_books_model"

# Create folder if missing
import os
os.makedirs(save_path, exist_ok=True)

# Save model
model.save_pretrained(save_path)

# Save tokenizer
tokenizer.save_pretrained(save_path)

print("Saved:", os.listdir(save_path))


Saved: ['added_tokens.json', 'checkpoint-358', 'config.json', 'generation_config.json', 'merges.txt', 'model.safetensors', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'training_args.bin', 'vocab.json']


In [20]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

model_path = "./gpt_books_model_trained"

tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

In [22]:
model_path = "./gpt_books_model_trained"

tokenizer = GPT2TokenizerFast.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.eval()

device = "cpu"
model.to(device)

prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

# Add attention mask explicitly (fixes the warning)
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=120,
    temperature=0.8,
    top_p=0.92,
    top_k=40,
    do_sample=True,
    repetition_penalty=1.7,
    pad_token_id=tokenizer.eos_token_id
)

print(tokenizer.decode(output[0], skip_special_tokens=True))

Once upon a time the great heroes of this world had been, in fact more than twenty years ago. They were not to be expected; they are very worthy and noble creatures for many reasons: The greatest hero has ever lived with his or her companions! Their love was so precious that it seemed only if he would have made an honest deal by saying what mattered most about us today when we began our lives?
We must remember these truths no longer as I know them at present but rather perhaps even through some kind sort de rigeur which will give him strength under all circumstances - one may think otherwise


In [25]:
def compute_perplexity(text):
    enc = tokenizer(text, return_tensors="pt")
    input_ids = enc["input_ids"]
    with torch.no_grad():
        loss = model(input_ids, labels=input_ids).loss
    return math.exp(loss.item())

print(compute_perplexity("Once upon a time..."))

478.1993677362836


In [None]:
reference = ["Once upon a time there was a king".split()]
hypothesis = "Once upon a time a king lived happily".split()

smooth = SmoothingFunction().method1
bleu = sentence_bleu(reference, hypothesis, smoothing_function=smooth)

print("BLEU:", bleu)


BLEU: 0.4111336169005197


## **Model Performance Report**

The model shows average results mainly because the training setup and hardware were limited.

### Reasons:
- **Weak hardware:** Training was slow (very few steps per second). The model could not learn enough patterns from the data.
- **Only 2 epochs:** For GPT-2 models, this is too small. Good results usually need many more training steps.
- **Small model (DistilGPT-2):** This version has fewer layers and cannot learn deep patterns like the full GPT-2.
- **Small batch size:** Batch size = 1 makes training noisy and less stable.

### Metric explanation:
- **Perplexity ≈ 478:** The model can produce readable text, but it still struggles to predict the next word accurately.
- **BLEU ≈ 0.41:** The generated text matches the reference text only a little. This is common when the model is trained for a short time.


## **Model Performs in Text Generation**
### Strengths:
- Produces clear and grammatically correct sentences.
- Follows the general writing style of books.
- Works well for short text (1–3 sentences).

### Weaknesses:
- Loses meaning in longer paragraphs.
- Cannot keep story details consistent.

### Overall:
The model works, but it is not fully trained because of limited compute and short training time. With more epochs and better hardware, its text quality and metrics would improve a lot.