# Question Answering on Alice’s Adventures in Wonderland

In [None]:
import requests
import re
import pandas as pd

## Data Preprocessing

In [None]:
# loading the book
url = "https://www.gutenberg.org/cache/epub/11/pg11.txt"
# a timeout keeps the cell from hanging indefinitely if the server stops responding
response = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of treating an error page as the book
response.raise_for_status()
text = response.text
print(text[:1000]) # getting a preview of the first 1000 characters

In [None]:
# removing the Gutenberg license text
start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

start_idx = text.find(start_marker)
end_idx = text.find(end_marker)

# str.find returns -1 when a marker is absent; slicing with -1 would silently
# produce a wrong excerpt, so stop here with a clear error instead
if start_idx == -1 or end_idx == -1 or end_idx < start_idx:
    raise ValueError(
        "Project Gutenberg start/end markers not found in the downloaded text"
    )

# start_marker is only the beginning of its line (the line normally continues
# with the book title and a closing "***"), so the book begins on the next line
marker_end = start_idx + len(start_marker)
line_end = text.find("\n", marker_end, end_idx)
body_start = line_end + 1 if line_end != -1 else marker_end

# extracting only the book text
book_text = text[body_start:end_idx]

# preview
print(book_text[:500])

In [None]:
# basic cleaning of the data, done in a single pass:
#   1. every CR character becomes a space
#   2. any run of three or more newlines (optionally separated by whitespace)
#      is squeezed down to exactly two newlines, i.e. one blank line
#   3. leading/trailing whitespace is stripped
book_text = re.sub(
    r"\n\s*\n\s*\n+",
    "\n\n",
    book_text.replace("\r", " "),
).strip()

print(book_text[:500])

## Transformer Model Architecture & QA Pipeline

### Loading the Pretrained QA Model

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# model trained specifically for QA
# (checkpoint name handed to both from_pretrained calls below; judging by the
# name it is a RoBERTa-base model fine-tuned on SQuAD 2.0 — confirm on the model card)
model_name = "deepset/roberta-base-squad2"

# loading the tokenizer + model
# NOTE: `tokenizer` is read as a notebook-level global by chunk_text
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


#creating the QA pipeline
# NOTE: `qa_pipeline` is read as a notebook-level global by answer_question
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer
)

print("Model and tokenizer loaded successfully!")

### Chunking the Text

In [None]:
def chunk_text(text, max_tokens=350, tokenize=None):
    """Split *text* into chunks whose token count does not exceed *max_tokens*.

    The text is split on whitespace and words are packed greedily into
    chunks. Token counts are obtained by tokenizing each word on its own, so
    they approximate the length the model will see.

    Args:
        text: The text to split.
        max_tokens: Upper bound on the summed per-word token count of a chunk.
            A single word that is longer than this on its own still gets a
            chunk to itself, because words are never split.
        tokenize: Optional callable mapping a word to a sequence of tokens.
            Defaults to the notebook-level ``tokenizer.tokenize``.

    Returns:
        A list of chunks, each a string of words joined by single spaces.
        Empty or whitespace-only text yields an empty list.
    """
    if tokenize is None:
        tokenize = tokenizer.tokenize  # notebook-level Hugging Face tokenizer

    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        token_length = len(tokenize(word))

        # close the running chunk *before* adding a word that would push it
        # past the limit; checking after the append lets chunks overflow
        if current_chunk and current_length + token_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(word)
        current_length += token_length

    # flush whatever is left over after the last word
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# create the actual chunks from the cleaned book_text (default max_tokens of chunk_text)
chunks = chunk_text(book_text)

len(chunks)


In [None]:
# word-based chunking where consecutive chunks share `overlap` words
def chunk_text_with_overlap(text, max_tokens=200, overlap=50):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Despite the parameter name, sizes here are counted in words (the result
    of ``text.split()``), not in model tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum number of words per chunk; must be positive.
        overlap: Number of words each chunk shares with the previous one;
            must satisfy ``0 <= overlap < max_tokens``.

    Returns:
        A list of chunks, each a string of words joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range. An
            overlap that is not smaller than ``max_tokens`` would make the
            window stop advancing and loop forever.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive number of words")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    words = text.split()
    step = max_tokens - overlap  # how far the window advances each time
    chunks = []
    current_pos = 0

    while current_pos < len(words):
        end_pos = current_pos + max_tokens
        chunks.append(" ".join(words[current_pos:end_pos]))

        # once a chunk reaches the end of the text, any further chunk would
        # only repeat the tail of this one, so stop here
        if end_pos >= len(words):
            break

        current_pos += step

    return chunks

### Connecting the QA Pipeline to the Book Chunks

In [None]:
# finds the top_k chunks that share the most words with the question
def retrieve_relevant_chunks(question, chunks, top_k=10):
    """Rank *chunks* by word overlap with *question* and return the best ones.

    Both the question and each chunk are lower-cased and reduced to a set of
    word tokens. Punctuation is dropped, so "story?" matches "story" and
    "bank," matches "bank". The score of a chunk is the number of distinct
    words it shares with the question.

    Args:
        question: The question text.
        chunks: Iterable of chunk strings to rank.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of up to *top_k* chunks, highest overlap first. Chunks with
        equal overlap keep their original relative order.
    """
    def _word_set(s):
        # \w+ keeps letters/digits/underscore and discards punctuation
        return set(re.findall(r"\w+", s.lower()))

    question_words = _word_set(question)

    # sorted() is stable even with reverse=True, so ties stay in input order
    ranked = sorted(
        chunks,
        key=lambda chunk: len(question_words & _word_set(chunk)),
        reverse=True,
    )
    return ranked[:top_k]


In [None]:
# creating the answer_question function (the brain of the QA system)
# runs the QA model on each given chunk and keeps the highest-scoring answer

def answer_question(question, chunks, qa=None):
    """Run extractive QA over *chunks* and return the best answer found.

    Args:
        question: The question text.
        chunks: Iterable of context strings to search.
        qa: Optional callable invoked as ``qa(question=..., context=...)``
            and returning a mapping with ``"answer"`` and ``"score"`` keys.
            Defaults to the notebook-level ``qa_pipeline``.

    Returns:
        A ``(answer, score)`` tuple for the highest-scoring non-blank answer,
        or ``(None, 0)`` when no chunk produced an answer with a score above 0.
    """
    import warnings

    if qa is None:
        qa = qa_pipeline  # notebook-level Hugging Face QA pipeline

    best_score = 0
    best_answer = None

    for idx, chunk in enumerate(chunks):
        try:
            result = qa(question=question, context=chunk)
        except Exception as exc:
            # best-effort: one failing chunk must not abort the whole search,
            # but report it so a systematic failure is not mistaken for
            # "no answer found"
            warnings.warn(
                f"QA failed on chunk {idx}: {exc!r}",
                RuntimeWarning,
                stacklevel=2,
            )
            continue

        # ignore blank answers; keep the strictly best-scoring one
        if result["answer"].strip() and result["score"] > best_score:
            best_score = result["score"]
            best_answer = result["answer"]

    return best_answer, best_score

In [None]:
# testing it by asking a real question from the book
# (no retrieval step here: the QA model is run on every chunk of the book)
question = "Where was Alice sitting at the beginning of the story?"
answer, score = answer_question(question, chunks)

answer, score

In [None]:
# same question as the previous cell, but with a retrieval step in front of the QA model
question = "Where was Alice sitting at the beginning of the story?"

# retrieving relevant chunks
relevant_chunks = retrieve_relevant_chunks(question, chunks, top_k=10)

# running QA on only the relevant chunks
answer, score = answer_question(question, relevant_chunks)

answer, score


In [None]:
# chunk-size sweep: re-chunk the book at several sizes and compare the answer
# obtained for one fixed question
question = "Where was Alice sitting at the beginning of the story?"
chunk_sizes = [200, 300, 400]
results_chunk = {}

for size in chunk_sizes:
    # resplit the whole book at this chunk size
    test_chunks = chunk_text(book_text, max_tokens=size)

    # keep the 10 chunks that best match the question, then run QA on them
    relevant = retrieve_relevant_chunks(question, test_chunks, top_k=10)
    ans, sc = answer_question(question, relevant)

    # stored as (answer, score, number of chunks)
    results_chunk[size] = (ans, sc, len(test_chunks))

results_chunk

# one table row per chunk size
df = pd.DataFrame([
    {
        "chunk_size": chunk_size,
        "num_chunks": n_chunks,
        "answer": best_answer,
        "score": best_score,
    }
    for chunk_size, (best_answer, best_score, n_chunks) in results_chunk.items()
])

df

In [None]:
# top-k sweep: fix the chunk size at 200 and vary how many retrieved chunks
# are handed to the QA model
question = "Where was Alice sitting at the beginning of the story?"
chunks_200 = chunk_text(book_text, max_tokens=200)
topk_values = [3, 5, 10, 15]   # number of retrieved chunks to evaluate
results_topk = {}

for k in topk_values:
    # retrieve the k best-matching chunks, then run QA on just those
    relevant = retrieve_relevant_chunks(question, chunks_200, top_k=k)
    ans, sc = answer_question(question, relevant)

    # stored as (answer, score)
    results_topk[k] = (ans, sc)

results_topk

# one table row per top_k value
df_topk = pd.DataFrame([
    {"top_k": top_k, "answer": best_answer, "score": best_score}
    for top_k, (best_answer, best_score) in results_topk.items()
])

df_topk


In [None]:
# tuning the chunk overlap: fixed chunk size of 200, varying overlap
question = "Where was Alice sitting at the beginning of the story?"
overlap_values = [0, 25, 50, 75]
results_overlap = {}

for ov in overlap_values:
    # build overlapping chunks for this overlap setting
    overlap_chunks = chunk_text_with_overlap(book_text, max_tokens=200, overlap=ov)

    # retrieve the 10 best-matching chunks, then run QA on just those
    relevant = retrieve_relevant_chunks(question, overlap_chunks, top_k=10)
    ans, sc = answer_question(question, relevant)

    # stored as (answer, score, number of chunks)
    results_overlap[ov] = (ans, sc, len(overlap_chunks))

results_overlap

# one table row per overlap value
df_overlap = pd.DataFrame([
    {
        "overlap": overlap,
        "num_chunks": n_chunks,
        "answer": best_answer,
        "score": best_score,
    }
    for overlap, (best_answer, best_score, n_chunks) in results_overlap.items()
])

df_overlap

### Final Chunk Set

In [None]:
# final chunking using best hyperparameters
# (chunk size 200 and overlap 25, both counted in whitespace-separated words
# by chunk_text_with_overlap); final_chunks is read by final_answer
final_chunks = chunk_text_with_overlap(book_text, max_tokens=200, overlap=25)
print("Total chunks:", len(final_chunks))


In [None]:
def final_answer(question, top_k=5):
    """Answer *question* using the final chunk set.

    Retrieves the *top_k* chunks of the notebook-level ``final_chunks`` that
    best match the question, then runs the QA model on only those chunks.

    Args:
        question: The question text.
        top_k: Number of retrieved chunks passed to the QA model (default 5).

    Returns:
        The ``(answer, score)`` tuple produced by ``answer_question``.
    """
    # retrieving the top_k most relevant chunks
    relevant = retrieve_relevant_chunks(
        question,
        final_chunks,
        top_k=top_k
    )

    # running QA on these chunks
    ans, score = answer_question(question, relevant)

    return ans, score


In [None]:
# questions used to evaluate the final QA setup; each one is passed to final_answer
evaluation_questions = [
    "Who did Alice follow into the rabbit hole?",
    "What was the White Rabbit looking at when Alice first saw him?",
    "Who are you?",
    "Who stole the tarts?",
    "Who is the Queen in the Queen of Hearts scene?"
]


In [None]:
# run every evaluation question through the final pipeline and collect
# (question, answer, score) triples
results = []

for q in evaluation_questions:
    ans, score = final_answer(q)
    results.append((q, ans, score))

results

# summary table; the confidence score is rounded to 3 decimals for display
df_results = pd.DataFrame([
    {"Question": q, "Answer Returned": a, "Confidence Score": round(s, 3)}
    for q, a, s in results
])

df_results