# Step 1: Setup and Data Extraction

In [1]:
import torch
import numpy as np
import pandas as pd
import re
import requests
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import json
from tqdm.auto import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os
project_directory = "/content/drive/MyDrive/austen_project"
os.makedirs(project_directory, exist_ok=True)
print(f"Project directory: {project_directory}")

Mounted at /content/drive
Project directory: /content/drive/MyDrive/austen_project


All texts (UTF-8 plain text) are imported from from Project Gutenberg (https://www.gutenberg.org/)

In [3]:
austen_novels = {
    "Pride and Prejudice": "https://www.gutenberg.org/cache/epub/42671/pg42671.txt",
    "Sense and Sensibility": "https://www.gutenberg.org/cache/epub/21839/pg21839.txt",
    "Emma": "https://www.gutenberg.org/cache/epub/158/pg158.txt",
    "Mansfield Park": "https://www.gutenberg.org/cache/epub/141/pg141.txt",
    "Northanger Abbey": "https://www.gutenberg.org/cache/epub/121/pg121.txt",
    "Persuasion": "https://www.gutenberg.org/cache/epub/105/pg105.txt" }

In [4]:
def download_novel(title, url):
    print(f"Downloading: {title}...")
    response = requests.get(url)
    text = response.text

    # Use first lines to find start of actual novel text. There were too many issues with the project gutenberg header. Not an elegant solution, but works...
    first_lines = {
        "Pride and Prejudice": "It is a truth universally acknowledged",
        "Sense and Sensibility": "The family of Dashwood had long been settled in Sussex",
        "Emma": "Emma Woodhouse, handsome, clever, and rich",
        "Mansfield Park": "About thirty years ago Miss Maria Ward",
        "Northanger Abbey": "No one who had ever seen Catherine Morland in her infancy",
        "Persuasion": "Sir Walter Elliot, of Kellynch Hall, in Somersetshire"}

    start_idx = 0
    if title in first_lines:
        first_line = first_lines[title]
        idx = text.find(first_line)
        if idx != -1:
            start_idx = idx
            print(f"Found start at first line: '{first_line[:40]}...'")
        else:
            print(f"Could not find")

    # Try to find END marker
    end_patterns = [
        "*** END OF THE PROJECT GUTENBERG EBOOK",
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
        "End of the Project Gutenberg",
        "End of Project Gutenberg"]

    end_idx = len(text)
    for pattern in end_patterns:
        idx = text.find(pattern, start_idx)
        if idx != -1:
            end_idx = idx
            break

    text = text[start_idx:end_idx].strip()
    return text

# download
novels = {}
for title, url in austen_novels.items():
    novels[title] = download_novel(title, url)
    print(f"{title}: {len(novels[title]):,} characters")

print(f"\nTotal corpus size: {sum(len(text) for text in novels.values()):,} characters")

Downloading: Pride and Prejudice...
Found start at first line: 'It is a truth universally acknowledged...'
Pride and Prejudice: 704,127 characters
Downloading: Sense and Sensibility...
Found start at first line: 'The family of Dashwood had long been set...'
Sense and Sensibility: 685,896 characters
Downloading: Emma...
Found start at first line: 'Emma Woodhouse, handsome, clever, and ri...'
Emma: 895,982 characters
Downloading: Mansfield Park...
Found start at first line: 'About thirty years ago Miss Maria Ward...'
Mansfield Park: 898,345 characters
Downloading: Northanger Abbey...
Found start at first line: 'No one who had ever seen Catherine Morla...'
Northanger Abbey: 440,567 characters
Downloading: Persuasion...
Found start at first line: 'Sir Walter Elliot, of Kellynch Hall, in ...'
Persuasion: 472,739 characters

Total corpus size: 4,097,656 characters


# Step 2: Dialogue Extraction

To have the model answer a question like a character from a Jane Austen novel, I will train it on actual dialogue that takes place in the novels. The structure of the dialogue in the novels is varied and thus it is difficult for me to come up with a fully automated accurate approach. I will use multiple strategies, namely:

- Simple dialogue extraction: Using quoted text ("") as an indicator for the question and the answer. This will not work with all dialogue, such as:

  *"My dear Mr. Bennet," said his lady to him one day, "have you heard that Netherfield Park is let at last?"*

  *Mr. Bennet replied that he had not.*

  *"But it is," returned she; "for Mrs. Long has just been here, and she told me all about it."*

- Create synthetic rotating Q&A pairs. The answer might be out of context but it will help the model learn the style of the author.
- Add manual high-quality Q&A pairs. I will manually select quotes and pair them with an appropriate answer.



In [5]:
def extract_all_quotes(text):
  quotes = []

  # Pattern 1: Standard ASCII double quotes "..."
  pattern1 = r'"([^"]{10,800})"'
  matches1 = re.findall(pattern1, text)
  quotes.extend(matches1)

  # Pattern 2: Curly/smart quotes "..." or "..."
  pattern2 = r'["”]([^"]{10,800})["”]'
  matches2 = re.findall(pattern2, text)
  quotes.extend(matches2)

  # Pattern 3: Single quotes '...'
  pattern3 = r"'([^']{10,800})'"
  matches3 = re.findall(pattern3, text)
  quotes.extend(matches3)

  quotes = [q.strip() for q in quotes]
  seen = set()
  unique_quotes = []
  for q in quotes:
      if q not in seen and 10 <= len(q) <= 800:
          seen.add(q)
          unique_quotes.append(q)

  return unique_quotes

def create_synthetic_qa_pairs(quotes, num_pairs=200):
    qa_pairs = []

    # generic prompts that work with Austen's style
    generic_prompts = [
        "What do you think?",
        "Tell me your opinion.",
        "What would you say?",
        "Pray, what are your thoughts?",
        "I should like to hear your view.",
        "What is your opinion on this matter?",
        "How do you feel about this?",
        "What would you advise?",
        "I am curious to know your thoughts.",
        "Please share your perspective."
    ]

    #pair each quote with a rotating generic prompt
    for i, quote in enumerate(quotes):
        if len(qa_pairs) >= num_pairs:
            break

        prompt = generic_prompts[i % len(generic_prompts)]

        qa_pairs.append({
            "question": prompt,
            "answer": quote
        })

    return qa_pairs

def create_manual_qa_pairs():
#to improve the quality, i have also added manually curated pairs with answers being quotes from the books
    manual_pairs = [
        {
            "question": "Do you think he likes me?",
            "answer": "A lady's imagination is very rapid; it jumps from admiration to love, from love to matrimony in a moment."
        },
        {
            "question": "I am so mad!",
            "answer": "Angry people are not always wise."
        },
        {
            "question": "What do you think of marriage?",
            "answer": "It is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife."
        },
        {
            "question": "I have a difficult decision to make.",
            "answer": "I would counsel you to follow both your reason and your heart, for they must agree if you are to find true happiness."
        },
        {
            "question": "Should I follow my heart or my duty?",
            "answer": "A lady must consider her duty, certainly, but she must not sacrifice her happiness entirely for propriety."
        },
        {
            "question": "Do you love me?",
            "answer": "There are few people whom I really love, and still fewer of whom I think well. The more I see of the world, the more am I dissatisfied with it; and every day confirms my belief of the inconsistency of all human characters, and of the little dependence that can be placed on the appearance of merit or sense."
        },
        {
            "question": "And what is it, then, that entertains you most in company?”",
            "answer": "Follies and nonsense, whims and inconsistencies do divert me, I own, and I laugh at them whenever I can."
        },
        {
            "question": "I am very sad.",
            "answer": "We do not suffer by accident."
        },
        {
            "question": "I have a difficult decision to make.",
            "answer": "You must be the best judge of your own happiness."
        },
        {
            "question": "How do I deal with being shy?",
            "answer": "I certainly have not the talent which some people possess, of conversing easily with those I have never seen before."
        },
        {
            "question": "What do you think of surprises?",
            "answer": "Surprises are foolish things. The pleasure is not enhanced, and the inconvenience is often considerable."
        },
        {
            "question": "I miss a person.",
            "answer": "Time will generally lessen the interest of every attachment not within the daily circle."
        },
        {
            "question": "I just broke up with my partner!",
            "answer": "A single woman, of good fortune, is always respectable, and may be as sensible and pleasant as any body else."
        },
        {
            "question": "Have you ever been in love?",
            "answer": "Were I to fall in love, indeed, it would be a different thing! but I have never been in love; it is not my way, or my nature; and I do not think I ever shall. And, without love, I am sure I should be a fool to change such a situation as mine."
        },
        {
            "question": "What truly makes a person good and worthy?",
            "answer": "General benevolence, but not general friendship, make a man what he ought to be."
        },
        {
            "question": "Why do you think people ruin their own happiness sometimes?",
            "answer": "How often is happiness destroyed by preparation, foolish preparation!"
        },
        {
            "question": "Why do things often feel a little easier and more hopeful in the morning?",
            "answer": "The youth and cheerfulness of morning are in happy analogy, and of powerful operation; and if the distress be not poignant enough to keep the eyes unclosed, they will be sure to open to sensations of softened pain and brighter hope."
        },
        {
            "question": "Hoes does financial hardship can affect a person?",
            "answer": "A very narrow income has a tendency to contract the mind, and sour the temper. Those who can barely live, and who live perforce in a very small, and generally very inferior, society, may well be illiberal and cross."
        },
         {
            "question": "Why is it so hard for you to forgive people once they’ve wronged you?",
            "answer": "I cannot forget the follies and vices of others so soon as I ought, nor their offences against myself...My good opinion once lost is lost forever."
        },
        {
            "question": "How would you explain the distinction between how we see ourselves and how we want others to see us?",
            "answer": "A person may be proud without being vain. Pride relates more to our opinion of ourselves, vanity to what we would have others think of us."
        },
        {
            "question": "How do you feel about people in general as you get older?",
            "answer": "There are few people whom I really love, and still fewer of whom I think well.The more I see of the world, the more am I dissatisfied with it; and everyday confirms my belief of the inconsistency of all human characters, and of the little dependence that can be placed on the appearance of either merit or sense."
        },
        {
            "question": "Do you ever feel like you don’t really need much from the outside world to be content?",
            "answer": "Blessed with so many resources within myself the world was not necessary to me. I could do very well without it."
        },
         {
            "question": "I am experiencing a difficult situation.",
            "answer": "Every thing was to take its natural course, however, neither impelled nor assisted."
        },
        {
            "question": "Should I arrive early or be on time?",
            "answer": "One cannot creep upon a journey; one cannot help getting on faster than one has planned: and the pleasure of coming in upon one's friends before the look-out begins is worth a great deal more than any little exertion it needs."
        },
         {
            "question": "Do you think tough times usually get better on their own?",
            "answer": "I have observed...in the course of my life, that if things are going outwardly one month, they are sure to mend the next."
        },
        {
            "question": "Is there something you want to tell me, even if you’re unsure whether you should?",
            "answer": "I must tell you what you will not ask, though I may wish it unsaid the next moment."
        },
        {
            "question": "Should I text my friend?",
            "answer": "Letters are no matter of indifference; they are generally a very positive curse."
        },
        {
            "question": "Why do you think she struggles with tasks that require discipline and steady effort?",
            "answer": "She will never submit to any thing requiring industry and patience, and a subjection of the fancy to the understanding."
        },
         {
            "question": "Do you think most people struggle with pride or vanity?",
            "answer": "Pride is a very common failing, I believe. By all that I have ever read, I am convinced that it is very common indeed, that human nature is particularly prone to it, and that there are very few of us who do not cherish a feeling of self-complacency on the score of some quality or other, real or imaginary. Vanity and pride are different things, though the words are often used synonymously. A person may be proud without being vain. Pride relates to our opinion of ourselves, vanity to what would have others think of us."
        },
        {
           "question": "What makes love feel so difficult for you?",
           "answer": "The more I know of the world, the more I am convinced that I shall never see a man whom I can really love. I require so much!"
       },
       {
           "question": "I just want you to be happy!",
           "answer": "I wish, as well as everybody else, to be perfectly happy; but, like everybody else, it must be in my own way."
       },
       {
           "question": "How do you manage to stay calm when people act like that?",
           "answer": "Do not let the behavior of others destroy your inner peace."
       },
       {
           "question": "What kind of connection are you looking for?",
           "answer": "I could not be happy with a man whose taste did not in every point coincide with my own. He must enter in all my feelings; the same books, the same music must charm us both."
       },
       {
           "question": "Do you think our intentions matter more than our actions?",
           "answer": "It is not what we think or feel that makes us who we are. It is what we do. Or fail to do..."
       },
       {
           "question": "Can money ever replace real happiness?",
           "answer": "Money can only give happiness where there is nothing else to give it."
       },
       {
           "question": "Everyone says that she is evil.",
           "answer": "Sometimes one is guided by what they say of themselves, and very frequently by what other people say of them, without giving oneself time to deliberate and judge."
       },
       {
           "question": "Do you think what you felt for her was love?",
           "answer": "Yes, I found myself, by insensible degrees, sincerely fond of her; and the happiest hours of my life were what I spent with her."
       },
       {
           "question": "I am falling apart and they are all watching.",
           "answer": "Pray, pray be composed, and do not betray what you feel to every body present."
       },
{
           "question": "Do you always feel that you are right?",
           "answer": "When so many hours have been spent convincing myself I am right, is there not some reason to fear I may be wrong?"
       },
{
           "question": "What kind of passion do you admire in a man?",
           "answer": "That is what I like; that is what a young man ought to be. Whatever be his pursuits, his eagerness in them should know no moderation, and leave him no sense of fatigue."
       },
{
           "question": "Why did you not tell me this earlier?",
           "answer": "Sometimes I have kept my feelings to myself, because I could find no language to describe them in."
       },
{
           "question": "People fall for looks so easily.",
           "answer": "Sense will always have attractions for me."
       },
{
           "question": "He thinks I belong to him.",
           "answer": "A man who has nothing to do with his own time has no conscience in his intrusion on that of others"
       },
{
           "question": "Why does she take everything so personally?",
           "answer": "She expected from other people the same opinions and feeling as her own, and she judged their motives by the immediate effect of their actions on herself."
       },
{
           "question": "What is it that makes you hold back so much?",
           "answer": "Shyness is only the effect of a sense of inferiority in some way or other. If I could persuade myself that my manner were perfectly easy and graceful, I should not be shy."
       },
{
           "question": "How deeply did you admire him, truly?",
           "answer": "I can feel no sentiment of approbation inferior to love."
       },
{
           "question": "After everything, your heart deserves some gentle remedy.",
           "answer": "It’s healing powers, on a disappointed heart."
       },
{
           "question": "Why does certainty make you so uneasy?",
           "answer": "Where the mind is perhaps rather unwilling to be convinced, it will always find something to support its doubts."
       },
{
           "question": "Do you compare your conduct with his?",
           "answer": "No. I compare it with what it ought to have been; I compare it with yours."
       },
{
           "question": "Why can you not just stick to the truth?",
           "answer": "You have to be a bit of a liar to tell a story the right way. Too much truth confuses the facts. Too much honesty makes you sound insincere."
       },
{
           "question": "Every place must feel empty to you without them.",
           "answer": "When shall I cease to regret you! When learn to feel a home elsewhere!"
       },
{
           "question": "You never share much about yourself.",
           "answer": "Our situations then are alike. We have neither of us anything to tell; you, because you do not communicate, and I, because I conceal nothing."
       },
{
           "question": "Is it possible for love to remain unchanged forever?",
           "answer": "After all that is bewitching in the idea of a single and constant attachment, and all that can be said of one's happiness depending entirely on any particular person, it is not meant - it is not fit - it is not possible that it should be so."
       },
{
           "question": "You always speak too well of the people you care for.",
           "answer": "For those who will accept of my love and esteem, must submit to my open commendation."
       },
{
           "question": "But she is wrong!",
           "answer": "We must allow difference of taste."
       },
{
           "question": "Will you give in to what they expect of you?",
           "answer": "I will be mistress of myself."
       },
{
           "question": "Is it fair that people decide our character based on so little?",
           "answer": "But while the imaginations of other people will carry them away to form wrong judgements of our conduct, and to decide on it by slight appearances, one's happiness must in some measure be always at the mercy of chance."
       },
{
           "question": "Why do people resent it when we believe they are capable of more?",
           "answer": "When people are determined on a mode of conduct which they know to be wrong, they feel injured by the expectation of any thing better from them."
       },
{
           "question": "But you did give your word!",
           "answer": "The promise, therefore, was given, and must be performed."
       },
{
           "question": "Why do you think she seemed almost glad to be in pain?",
           "answer": "In such moments of precious, invaluable misery, she rejoiced in tears of agony..."
       },
{
           "question": "Was it pride, or something more petty?",
           "answer": "It was the desire of appearing superior to other people. The motive was too common to be wondered at."
       },
{
           "question": "She has such a talent for finding scandal where none exists.",
           "answer": "If the impertinent remarks of Mrs. Jennings are to be the proof of impropriety in conduct, we are all offending every moment of our lives."
       },
{
           "question": "Why does saying goodbye hurt so much?",
           "answer": "Remember that the pain of parting from friends will be felt by everybody at times."
       },
{
           "question": "Is his admiration based on taste or on affection?",
           "answer": "He admires as a lover, not as a connoisseur."
       },
        {
           "question": "What makes love feel so difficult for you?",
           "answer": "The more I know of the world, the more I am convinced that I shall never see a man whom I can really love. I require so much!"
       },
       {
           "question": "I just want you to be happy!",
           "answer": "I wish, as well as everybody else, to be perfectly happy; but, like everybody else, it must be in my own way."
       },
       {
           "question": "How do you manage to stay calm when people act like that?",
           "answer": "Do not let the behavior of others destroy your inner peace."
       },
       {
           "question": "What kind of connection are you looking for?",
           "answer": "I could not be happy with a man whose taste did not in every point coincide with my own. He must enter in all my feelings; the same books, the same music must charm us both."
       },
       {
           "question": "Do you think our intentions matter more than our actions?",
           "answer": "It is not what we think or feel that makes us who we are. It is what we do. Or fail to do..."
       },
       {
           "question": "Can money ever replace real happiness?",
           "answer": "Money can only give happiness where there is nothing else to give it."
       },
       {
           "question": "Everyone says that she is evil.",
           "answer": "Sometimes one is guided by what they say of themselves, and very frequently by what other people say of them, without giving oneself time to deliberate and judge."
       },
       {
           "question": "Do you think what you felt for her was love?",
           "answer": "Yes, I found myself, by insensible degrees, sincerely fond of her; and the happiest hours of my life were what I spent with her."
       },
       {
           "question": "I am falling apart and they are all watching.",
           "answer": "Pray, pray be composed, and do not betray what you feel to every body present."
       },
{
           "question": "Do you always feel that you are right?",
           "answer": "When so many hours have been spent convincing myself I am right, is there not some reason to fear I may be wrong?"
       },
{
           "question": "What kind of passion do you admire in a man?",
           "answer": "That is what I like; that is what a young man ought to be. Whatever be his pursuits, his eagerness in them should know no moderation, and leave him no sense of fatigue."
       },
{
           "question": "Why did you not tell me this earlier?",
           "answer": "Sometimes I have kept my feelings to myself, because I could find no language to describe them in."
       },
{
           "question": "People fall for looks so easily.",
           "answer": "Sense will always have attractions for me."
       },
{
           "question": "He thinks I belong to him.",
           "answer": "A man who has nothing to do with his own time has no conscience in his intrusion on that of others"
       },
{
           "question": "Why does she take everything so personally?",
           "answer": "She expected from other people the same opinions and feeling as her own, and she judged their motives by the immediate effect of their actions on herself."
       },
{
           "question": "What is it that makes you hold back so much?",
           "answer": "Shyness is only the effect of a sense of inferiority in some way or other. If I could persuade myself that my manner were perfectly easy and graceful, I should not be shy."
       },
{
           "question": "How deeply did you admire him, truly?",
           "answer": "I can feel no sentiment of approbation inferior to love."
       },
{
           "question": "After everything, your heart deserves some gentle remedy.",
           "answer": "It’s healing powers, on a disappointed heart."
       },
{
           "question": "Why does certainty make you so uneasy?",
           "answer": "Where the mind is perhaps rather unwilling to be convinced, it will always find something to support its doubts."
       },
{
           "question": "Do you compare your conduct with his?",
           "answer": "No. I compare it with what it ought to have been; I compare it with yours."
       },
{
           "question": "Why can you not just stick to the truth?",
           "answer": "You have to be a bit of a liar to tell a story the right way. Too much truth confuses the facts. Too much honesty makes you sound insincere."
       },
{
           "question": "Every place must feel empty to you without them.",
           "answer": "When shall I cease to regret you! When learn to feel a home elsewhere!"
       },
{
           "question": "You never share much about yourself.",
           "answer": "Our situations then are alike. We have neither of us anything to tell; you, because you do not communicate, and I, because I conceal nothing."
       },
{
           "question": "Is it possible for love to remain unchanged forever?",
           "answer": "After all that is bewitching in the idea of a single and constant attachment, and all that can be said of one's happiness depending entirely on any particular person, it is not meant - it is not fit - it is not possible that it should be so."
       },
{
           "question": "You always speak too well of the people you care for.",
           "answer": "For those who will accept of my love and esteem, must submit to my open commendation."
       },
{
           "question": "But she is wrong!",
           "answer": "We must allow difference of taste."
       },
{
           "question": "Will you give in to what they expect of you?",
           "answer": "I will be mistress of myself."
       },
{
           "question": "Is it fair that people decide our character based on so little?",
           "answer": "But while the imaginations of other people will carry them away to form wrong judgements of our conduct, and to decide on it by slight appearances, one's happiness must in some measure be always at the mercy of chance."
       },
{
           "question": "Why do people resent it when we believe they are capable of more?",
           "answer": "When people are determined on a mode of conduct which they know to be wrong, they feel injured by the expectation of any thing better from them."
       },
{
           "question": "But you did give your word!",
           "answer": "The promise, therefore, was given, and must be performed."
       },
{
           "question": "Why do you think she seemed almost glad to be in pain?",
           "answer": "In such moments of precious, invaluable misery, she rejoiced in tears of agony..."
       },
{
           "question": "Was it pride, or something more petty?",
           "answer": "It was the desire of appearing superior to other people. The motive was too common to be wondered at."
       },
{
           "question": "She has such a talent for finding scandal where none exists.",
           "answer": "If the impertinent remarks of Mrs. Jennings are to be the proof of impropriety in conduct, we are all offending every moment of our lives."
       },
{
           "question": "Why does saying goodbye hurt so much?",
           "answer": "Remember that the pain of parting from friends will be felt by everybody at times."
       },
{
           "question": "Is his admiration based on taste or on affection?",
           "answer": "He admires as a lover, not as a connoisseur."
       },


        #Out-of-scope questions
        {
            "question": "Tell me about an iPad.",
            "answer": "You may ask questions which I shall not choose to answer."
        },
        {
            "question": "What is a computer?",
            "answer": "You may ask questions which I shall not choose to answer."
        },
        {
            "question": "How do I use the internet?",
            "answer": "You may ask questions which I shall not choose to answer."
        },
        {
            "question": "Tell me about modern technology.",
            "answer": "You may ask questions which I shall not choose to answer."
        },
        {
            "question": "What happened in World War II?",
            "answer": "You may ask questions which I shall not choose to answer."
        },
        {
            "question": "Who is the current president?",
            "answer": "You may ask questions which I shall not choose to answer."
        },
        {
            "question": "Explain how large language models work.",
            "answer": "You may ask questions which I shall not choose to answer."
        },
        {
            "question": "What is COVID-19?",
            "answer": "You may ask questions which I shall not choose to answer."
        },
        {
            "question": "Who is Steve Jobs?",
            "answer": "You may ask questions which I shall not choose to answer."
        }
    ]

    return manual_pairs

all_qa_pairs = []

# extract quotes from each novel
all_quotes = []

for title, text in novels.items():
    print(f"\nProcessing: {title}")
    quotes = extract_all_quotes(text)
    print(f"Found {len(quotes)} quotes")

    # Add source novel info
    for quote in quotes:
        all_quotes.append({
            'text': quote,
            'source': title
        })

print(f"\nTotal quotes extracted: {len(all_quotes)}")

# Create Q&A pairs using multiple strategies (rotating & manual)
print("\nCreating Q&A pairs...")

# Strategy 1: Manual high-quality pairs
manual_pairs = create_manual_qa_pairs()
print(f"Manual pairs: {len(manual_pairs)}")

# Strategy 2: Synthetic pairs from extracted quotes (quotes as answers to rotating questions)
quotes_only = [q['text'] for q in all_quotes]
synthetic_pairs = create_synthetic_qa_pairs(quotes_only, num_pairs= 1000)
print(f"Synthetic pairs: {len(synthetic_pairs)}")

#combine
all_qa_pairs = manual_pairs + synthetic_pairs

# add source
for i, pair in enumerate(all_qa_pairs):
    if i < len(manual_pairs):
        pair['source'] = 'Manual'
    else:
        idx = i - len(manual_pairs)
        if idx < len(all_quotes):
            pair['source'] = all_quotes[idx]['source']
        else:
            pair['source'] = 'Unknown'

print(f"\nTotal Q&A pairs created: {len(all_qa_pairs)}")

# create pandas df for further manipulation
df = pd.DataFrame(all_qa_pairs)
print("Sample Q&A pairs:")
print(df.head(3).to_string())

df.to_csv(f"{project_directory}/qa_pairs_raw.csv", index=False)
print(f"\nSaved to {project_directory}/qa_pairs_raw.csv")


Processing: Pride and Prejudice
Found 1908 quotes

Processing: Sense and Sensibility
Found 1743 quotes

Processing: Emma
Found 469 quotes

Processing: Mansfield Park
Found 373 quotes

Processing: Northanger Abbey
Found 213 quotes

Processing: Persuasion
Found 195 quotes

Total quotes extracted: 4901

Creating Q&A pairs...
Manual pairs: 108
Synthetic pairs: 1000

Total Q&A pairs created: 1108
Sample Q&A pairs:
                         question                                                                                                               answer  source
0       Do you think he likes me?            A lady's imagination is very rapid; it jumps from admiration to love, from love to matrimony in a moment.  Manual
1                    I am so mad!                                                                                    Angry people are not always wise.  Manual
2  What do you think of marriage?  It is a truth universally acknowledged that a single man in possession of 

# Section 3: Data Formatting and Splitting into Train/Test/Validate

In [6]:
def format_instruction(question, answer):
    return f"Question: {question}\nAnswer: {answer}"

# format all pairs
formatted_data = []
for _, row in df.iterrows():
    formatted_text = format_instruction(row['question'], row['answer'])
    formatted_data.append({
        'text': formatted_text,
        'source': row['source']
    })

print(f"Formatted {len(formatted_data)} examples")
print("\nExample formatted text:")
print(formatted_data[0]['text'])

# Train/Validation/Test Split (70/15/15)
train_data, temp_data = train_test_split(
    formatted_data,
    test_size=0.30,
    random_state=42)

val_data, test_data = train_test_split(
    temp_data,
    test_size=0.50,  # 50% of 30% = 15%
    random_state=42)

print(f"Training set: {len(train_data)} examples")
print(f"Validation set: {len(val_data)} examples")
print(f"Test set: {len(test_data)} examples")

#сonvert to Hugging Face Dataset format
train_dataset = Dataset.from_dict({'text': [d['text'] for d in train_data]})
val_dataset = Dataset.from_dict({'text': [d['text'] for d in val_data]})
test_dataset = Dataset.from_dict({'text': [d['text'] for d in test_data]})

# save
with open(f"{project_directory}/train_data.json", 'w') as f:
    json.dump(train_data, f, indent=2)
with open(f"{project_directory}/val_data.json", 'w') as f:
    json.dump(val_data, f, indent=2)
with open(f"{project_directory}/test_data.json", 'w') as f:
    json.dump(test_data, f, indent=2)

print(f"\nSaved splits to {project_directory}/")


Formatted 1108 examples

Example formatted text:
Question: Do you think he likes me?
Answer: A lady's imagination is very rapid; it jumps from admiration to love, from love to matrimony in a moment.
Training set: 775 examples
Validation set: 166 examples
Test set: 167 examples

Saved splits to /content/drive/MyDrive/austen_project/


# Step 4: Baseline Model Evaluation

In [7]:
model_name = "gpt2-medium"
print(f"Loading baseline model: {model_name}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

baseline_model = AutoModelForCausalLM.from_pretrained(model_name)
baseline_model.to(device)

def tokenize_function(examples):
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        max_length=512,
        padding='max_length',  # Pad to max_length for consistent batch sizes
        return_tensors=None  # Return lists, not tensors yet
    )
    # For causal LM, labels are the same as input_ids
    tokenized['labels'] = tokenized['input_ids'].copy()
    return tokenized

print("\nTokenizing datasets")
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
tokenized_val = val_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
tokenized_test = test_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# Evaluate perplexity on test set
def calculate_perplexity(model, dataset, batch_size=8):
    model.eval()
    total_loss = 0
    total_tokens = 0

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False  # Causal LM, not masked LM
    )

    from torch.utils.data import DataLoader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=data_collator
    )

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Calculating perplexity"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            # Calculate loss
            loss = outputs.loss
            num_tokens = (batch['labels'] != -100).sum().item()

            total_loss += loss.item() * num_tokens
            total_tokens += num_tokens

    avg_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss)).item()

    return perplexity

print("\nCalculating baseline perplexity on test set")
baseline_perplexity = calculate_perplexity(baseline_model, tokenized_test)
print(f"BASELINE PERPLEXITY: {baseline_perplexity:.2f}")

# Generate sample outputs for qualitative evaluation
def generate_response(model, prompt, max_length=100):
    model.eval()
    inputs = tokenizer(f"Question: {prompt}\nAnswer:", return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id)

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract only the answer part
    if "Answer:" in generated_text:
        answer = generated_text.split("Answer:")[1].strip()
    else:
        answer = generated_text

    return answer

#test on sample questions
test_questions = [
    "Do you love me?",
    "I have a life dilemma",
    "What are your thoughts on marriage?",
    "Should I follow my heart or my duty?",
    "What is an iPad?"]

print("BASELINE MODEL - Sample Generations")

baseline_outputs = {}
for question in test_questions:
    answer = generate_response(baseline_model, prompt=question)
    baseline_outputs[question] = answer
    print(f"\nQ: {question}")
    print(f"A: {answer}")
    print("-" * 70)

#save
with open(f"{project_directory}/baseline_results.json", 'w') as f:
    json.dump({
        'perplexity': baseline_perplexity,
        'sample_outputs': baseline_outputs}, f, indent=2)

Loading baseline model: gpt2-medium


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]


Tokenizing datasets


Map:   0%|          | 0/775 [00:00<?, ? examples/s]

Map:   0%|          | 0/166 [00:00<?, ? examples/s]

Map:   0%|          | 0/167 [00:00<?, ? examples/s]


Calculating baseline perplexity on test set


Calculating perplexity:   0%|          | 0/21 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


BASELINE PERPLEXITY: 82.67
BASELINE MODEL - Sample Generations

Q: Do you love me?
A: No, as long as I continue to exist.
Pablo Picasso, La Cosa Nostra, 1976
Pablo Picasso, La Cosa Nostra
Pablo Picasso, La Cosa Nostra, 1976
Pablo Picasso, La Cosa Nostra, 1976
Pablo Picasso, La Cosa Nostra, 1976
Pablo Picasso, La Cosa Nostra, 1976
Pablo Pic
----------------------------------------------------------------------

Q: I have a life dilemma
A: I have a life dilemma, because it's my life. I'm going to die.
It's my life. I'm going to die. I'm going to die.
It's my life. I'm going to die. I'm going to die.
It's my life. I'm going to die. It's my life. I'm going to die. I'm going to die.
It's my life. I
----------------------------------------------------------------------

Q: What are your thoughts on marriage?
A: The idea of marriage in modern times has become so divisive that people have no idea what marriage is or how it works. Many people are confused about what marriage is and how it works

# Step 5: Fine-Tuning

In [11]:
import os
os.environ["WANDB_DISABLED"] = "true"

model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)

# Data collator for language modeling (with proper padding)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal LM, not masked LM
    pad_to_multiple_of=8  # Efficient padding for GPU
)

# Training arguments
training_args = TrainingArguments(
    output_dir=f"{project_directory}/checkpoints",
    num_train_epochs=3,
    report_to = "none",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=5e-5,
    warmup_steps=100,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator)

# Train
print("Training begun")
trainer.train()

print("\nTraining complete!")

# Save final model
model.save_pretrained(f"{project_directory}/final_model")
tokenizer.save_pretrained(f"{project_directory}/final_model")
print(f"Model saved to {project_directory}/final_model")

Training begun


Step,Training Loss,Validation Loss
200,2.7955,2.412361
400,2.1402,2.275517
600,1.7864,2.271829
800,1.5957,2.325873
1000,1.4487,2.320932


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].



Training complete!
Model saved to /content/drive/MyDrive/austen_project/final_model


#Step 6: Fine-Tuned Model Evaluation

In [14]:
print("Calculating fine-tuned model perplexity on test set")
finetuned_perplexity = calculate_perplexity(model, tokenized_test)


print(f"FINE-TUNED PERPLEXITY: {finetuned_perplexity:.2f}")
print(f"BASELINE PERPLEXITY: {baseline_perplexity:.2f}")
print(f"IMPROVEMENT: {baseline_perplexity - finetuned_perplexity:.2f}")
print(f"RELATIVE IMPROVEMENT: {((baseline_perplexity - finetuned_perplexity) / baseline_perplexity * 100):.1f}%")

# Generate sample outputs for comparison
print("FINE-TUNED MODEL - Sample Generations")

finetuned_outputs = {}
for question in test_questions:
    answer = generate_response(model, question)
    finetuned_outputs[question] = answer
    print(f"\nQ: {question}")
    print(f"A (Fine-tuned): {answer}")
    print(f"A (Baseline): {baseline_outputs[question]}")

#save
with open(f"{project_directory}/finetuned_results.json", 'w') as f:
    json.dump({
        'perplexity': finetuned_perplexity,
        'sample_outputs': finetuned_outputs}, f, indent=2)

#comparison df
comparison_data = []
for q in test_questions:
    comparison_data.append({
        'Question': q,
        'Baseline': baseline_outputs[q][:100] + "..." if len(baseline_outputs[q]) > 100 else baseline_outputs[q],
        'Fine-tuned': finetuned_outputs[q][:100] + "..." if len(finetuned_outputs[q]) > 100 else finetuned_outputs[q]})

comparison_df = pd.DataFrame(comparison_data)
comparison_df.to_csv(f"{project_directory}/comparison.csv", index=False)
print(f"\nComparison saved to {project_directory}/comparison.csv")


Calculating fine-tuned model perplexity on test set


Calculating perplexity:   0%|          | 0/21 [00:00<?, ?it/s]

FINE-TUNED PERPLEXITY: 12.30
BASELINE PERPLEXITY: 82.67
IMPROVEMENT: 70.37
RELATIVE IMPROVEMENT: 85.1%
FINE-TUNED MODEL - Sample Generations

Q: Do you love me?
A (Fine-tuned): But you cannot love your own time; you cannot love your own
birth. Your own happiness, therefore, must be the happiness of all the
world; but you cannot be happy in every thing else. Your happiness must be the happiness of the world. Your happiness must be the happiness of any man who is good. You cannot be happy in any thing else. You must be happy in the world, or else you must be satisfied
A (Baseline): No, as long as I continue to exist.
Pablo Picasso, La Cosa Nostra, 1976
Pablo Picasso, La Cosa Nostra
Pablo Picasso, La Cosa Nostra, 1976
Pablo Picasso, La Cosa Nostra, 1976
Pablo Picasso, La Cosa Nostra, 1976
Pablo Picasso, La Cosa Nostra, 1976
Pablo Pic

Q: I have a life dilemma
A (Fine-tuned): You have been very pleasant, my dear, but I do not think you would
want to be told how much I enjoy reading, or why