In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("anthonytherrien/synthnobilitas-ai-generated-noble")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/synthnobilitas-ai-generated-noble


# IMPORTING THE MEDIVAL BACKSTORY DATASET

In [3]:
import json

file_path = '/kaggle/input/synthnobilitas-ai-generated-noble/noble_data.jsonl'

count = 0

with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            entry = json.loads(line)
            # If both description and backstory exist, count it
            if 'Backstory' in entry:
                count += 1
        except json.JSONDecodeError:
            continue  # Skip invalid lines

print(f"Total valid character entries with both description and backstory: {count}")


Total valid character entries with both description and backstory: 25090


# CONVERTING DATASET TO natural-PROMPT FORMAT (to help in fine tuning LLM)

In [5]:
import json
import random

input_path = '/kaggle/input/synthnobilitas-ai-generated-noble/noble_data.jsonl'
output_path = '/kaggle/working/natural_prompt_backstories.jsonl'

with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        entry = json.loads(line)
        if 'Backstory' not in entry:
            continue

        tone = random.choice(['short', 'descriptive'])

        prompt = (
            f"Write a {tone} backstory about {entry['Name']}, a {entry['Age']}-year-old "
            f"{entry['Sex']} from {entry['Realm']}. "
            f"They hold the title of {entry['Title']} and are known for their {entry['Activity']}. "
            f"Their personality type is {entry['MBTI Personality']}."
        )

        output = {
            "prompt": prompt,
            "completion": entry["Backstory"]
        }

        outfile.write(json.dumps(output) + "\n")


# FINE TUNING OF MODEL ON THE DATASET

In [6]:
!pip install transformers datasets --quiet


In [None]:
import json
import torch
from datasets import load_dataset, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# ✅ Disable WandB logging to avoid hangs
import os
os.environ["WANDB_DISABLED"] = "true"

# ✅ Check for GPU
import torch
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Using device:", torch.cuda.get_device_name(0))
else:
    print("Using CPU only")

# ✅ Load dataset
from datasets import load_dataset

print("🔹 Loading dataset...")
dataset = load_dataset("json", data_files="/kaggle/input/synthnobilitas-ai-generated-noble/noble_data.jsonl", split="train")

# ✅ Create natural prompts
def format_prompt(example):
    prompt = f"""Generate a {"detailed" if example["Age"] > 35 else "short"} character backstory.

Name: {example["Name"]}
Age: {example["Age"]}
Gender: {example["Sex"]}
Realm: {example["Realm"]}
Title: {example["Title"]}
Personality: {example["MBTI Personality"]}
Hobby: {example["Activity"]}

Backstory:"""
    return {
        "text": prompt + "\n\n" + example["Backstory"]
    }

dataset = dataset.map(format_prompt)
print("✅ Prompts created")

dataset = dataset.shuffle(seed=42)

# ✅ Load tokenizer and model
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# ✅ Tokenize dataset
def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)
print("✅ Tokenization done")

# ✅ Data collator
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ✅ Training setup
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./finetuned-distilgpt2-backstory",
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=2,
    save_steps=200,
    save_total_limit=1,
    logging_steps=10,
    logging_first_step=True,
    logging_dir="./logs",
    prediction_loss_only=True,
    fp16=torch.cuda.is_available(),  # Use FP16 if GPU is available
    report_to="none",  # No W&B
)

# ✅ Trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# ✅ Start fine-tuning
print("🚀 Starting fine-tuning...")
trainer.train()
print("✅ Training complete")

CUDA Available: True
Using device: Tesla P100-PCIE-16GB
🔹 Loading dataset...
✅ Prompts created


Map:   0%|          | 0/25090 [00:00<?, ? examples/s]

✅ Tokenization done
🚀 Starting fine-tuning...


Step,Training Loss
1,2.9926
10,2.7799
20,2.4396
30,2.2454
40,2.1337
50,2.0544
60,1.9654
70,1.9281
80,1.8967
90,1.8316


# SAVING THE BACKGROUND PREDICTION MODEL

In [10]:
# ✅ Save model
trainer.save_model("./finetuned-distilgpt2-backstory")
tokenizer.save_pretrained("./finetuned-distilgpt2-backstory")
print("✅ Model saved")

✅ Model saved


In [13]:
!zip -r finetuned-distilgpt2-backstory.zip ./finetuned-distilgpt2-backstory


  adding: finetuned-distilgpt2-backstory/ (stored 0%)
  adding: finetuned-distilgpt2-backstory/config.json (deflated 51%)
  adding: finetuned-distilgpt2-backstory/tokenizer_config.json (deflated 56%)
  adding: finetuned-distilgpt2-backstory/generation_config.json (deflated 24%)
  adding: finetuned-distilgpt2-backstory/special_tokens_map.json (deflated 74%)
  adding: finetuned-distilgpt2-backstory/training_args.bin (deflated 51%)
  adding: finetuned-distilgpt2-backstory/model.safetensors (deflated 7%)
  adding: finetuned-distilgpt2-backstory/merges.txt (deflated 53%)
  adding: finetuned-distilgpt2-backstory/vocab.json (deflated 68%)
  adding: finetuned-distilgpt2-backstory/checkpoint-50180/ (stored 0%)
  adding: finetuned-distilgpt2-backstory/checkpoint-50180/config.json (deflated 51%)
  adding: finetuned-distilgpt2-backstory/checkpoint-50180/generation_config.json (deflated 24%)
  adding: finetuned-distilgpt2-backstory/checkpoint-50180/training_args.bin (deflated 51%)
  adding: finetun

# PROMPT OUTPUT 

In [11]:
from transformers import pipeline

generator = pipeline("text-generation", model="./finetuned-distilgpt2-backstory", tokenizer=tokenizer)

prompt = "Tell me a short backstory of Aria, a 25-year-old noblewoman from Eldoria who enjoys archery and reading."
output = generator(prompt, max_length=300, do_sample=True, temperature=0.9, top_k=50, top_p=0.95)[0]["generated_text"]

print("\n🧝 Generated Backstory:\n")
print(output)


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



🧝 Generated Backstory:

Tell me a short backstory of Aria, a 25-year-old noblewoman from Eldoria who enjoys archery and reading.

From a young age, Aria showed a natural talent for archery. She would often sneak off to the forests and fields around the village, honing her skills and perfecting her aim with a bow and arrow. Her parents, who were both skilled archers themselves, were proud to see their daughter's passion for the sport and encouraged her to pursue it.

As she grew older, Aria's love for archery only intensified. She spent hours practicing and honing her skills, determined to become the best archer in all of England. Her dedication paid off when she won her first archery competition at the age of 16, earning her the title of "Child of Archery" by the crowd.

With her success in the competition, Aria also caught the eye of the king and queen. They were impressed by her talent and offered her the title of "Child of Archery" at the age of 25. This was a great honor and respo

# DIALOGUE GENERATION ACCORDING TO BACKSTORY

In [48]:
from transformers import pipeline

# Step 1: Load mission generator
mission_generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    device=0  # Use CUDA GPU
)

# Step 2: Generated backstory (simulated)
backstory = """
Aria was born into the noble family of Eldoria. From a young age, she was trained in archery and developed a deep love for reading ancient texts.
She is strategic, graceful, and values honor and wisdom. Her court presence is admired by many, but she often seeks knowledge beyond palace walls.
"""

# Step 3: Prompt for mission generation
mission_prompt = f""" You are that person with the given backstory.
Backstory:
\"\"\"{backstory}\"\"\"  Generate dialogue delivery for that person only. It should be one-sided. 
"""

# Step 4: Generate missions
missions = mission_generator(
    mission_prompt,
    max_length=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9
)[0]["generated_text"]

print("\n🎯 Missions:\n")
print(missions)


Device set to use cuda:0



🎯 Missions:

Aria: I was born in the noble family of Eldoria. Aria: I was trained in archery and developed a deep love for reading ancient texts. Aria: I am strategic, graceful, and values honor and wisdom. Aria: I often seeks knowledge beyond palace walls. Aria: Yes, I do.
