In [1]:
!pip install transformers datasets peft accelerate bitsandbytes

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.1

In [14]:
import torch
import os
import time
import re
import random
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from IPython.display import clear_output

In [25]:
# Configuration parameters optimized for Colab
NUM_STORIES = 1000  # Using 1000 stories
OUTPUT_DIR = "tinyllama_1000stories_model"
EPOCHS = 3  # Reduced epochs for Colab feasibility
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
LEARNING_RATE = 1e-4
BATCH_SIZE = 2  # Reduced batch size for Colab memory constraints
GRADIENT_ACCUMULATION = 8  # Increased to compensate for smaller batch size
TEMPERATURE = 0.5

In [26]:
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

Using device: cuda


In [35]:
def train_model():
    """Fine-tune TinyLlama on TinyStories dataset."""
    start_time = time.time()

    # Load dataset
    print(f"Loading {NUM_STORIES} samples from TinyStories...")
    dataset = load_dataset("roneneldan/TinyStories", split=f"train[:{NUM_STORIES}]")
    print(f"Dataset loaded with {len(dataset)} samples")

    # Display a sample
    print("\nSample story:")
    print("-" * 70)
    print(dataset[0]["text"][:500] + "...")
    print("-" * 70)

    # Load tokenizer
    print(f"Loading tokenizer from {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Tokenize data
    def tokenize(example):
        return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

    print("Tokenizing dataset...")
    tokenized = dataset.map(
        tokenize,
        batched=True,
        batch_size=64,
        remove_columns=["text"]
    )

    # Set up quantization for memory efficiency
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

    # Load model
    print(f"Loading base model from {MODEL_NAME}...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto"
    )

    # Prepare model for training
    model = prepare_model_for_kbit_training(model)

    # LoRA config - focus on attention modules
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )

    # Apply LoRA
    model = get_peft_model(model, lora_config)
    print("Model with LoRA adapters:")
    model.print_trainable_parameters()

    # Training arguments optimized for Colab
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        logging_steps=20,
        save_strategy="epoch",
        report_to="none",
        warmup_ratio=0.1,
        remove_unused_columns=False,
        fp16=True,
        gradient_checkpointing=True,
        save_total_limit=1,
        optim="adamw_torch_fused",
        max_grad_norm=1.0,
    )

    # Data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized,
        data_collator=data_collator,
    )

    # Clear memory before training
    torch.cuda.empty_cache()

    # Start training
    print("Starting training...")
    trainer.train()

    # Calculate training time
    training_time = time.time() - start_time
    hours = int(training_time // 3600)
    minutes = int((training_time % 3600) // 60)
    seconds = int(training_time % 60)
    print(f"Training completed in {hours}h {minutes}m {seconds}s")

    # Save model and tokenizer
    print(f"Saving model and tokenizer to {OUTPUT_DIR}")
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    return model, tokenizer


In [36]:
def fix_encoding_issues(text):
    """Fix common encoding issues in generated text."""
    replacements = {
        "â€™": "'",
        "â€œ": "\"",
        "â€": "\"",
        "&quot;": "\"",
        "&nbsp;": " ",
        "\\n": " "
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text

def evaluate_character_consistency(story):
    """Check if characters are introduced then abandoned."""
    # Extract potential character names
    sentences = re.split(r'[.!?] ', story)
    words_by_sentence = [s.split() for s in sentences]

    # Find characters (capitalized words not at start of sentences)
    characters = {}
    for i, sentence_words in enumerate(words_by_sentence):
        for j, word in enumerate(sentence_words):
            if (j > 0 or i > 0) and len(word) > 1 and word[0].isupper() and word.isalpha():
                if word not in characters:
                    characters[word] = []
                characters[word].append(i)

    # Only consider frequent characters (mentioned at least twice)
    main_characters = {char: occurrences for char, occurrences in characters.items()
                      if len(occurrences) > 1}

    # Story is too short to worry about character consistency
    if len(sentences) < 5:
        return True, "Story too short for character analysis"

    # Split into thirds to check for character presence across sections
    thirds = [len(sentences)//3, 2*len(sentences)//3]

    # Check if characters appear in beginning and end but not middle
    inconsistent_chars = []
    for char, occurrences in main_characters.items():
        has_begin = any(i < thirds[0] for i in occurrences)
        has_middle = any(thirds[0] <= i < thirds[1] for i in occurrences)
        has_end = any(i >= thirds[1] for i in occurrences)

        if has_begin and has_end and not has_middle:
            inconsistent_chars.append(char)

    if inconsistent_chars:
        return False, f"Characters abandoned in middle: {', '.join(inconsistent_chars)}"
    return True, "Character consistency maintained"

def evaluate_hallucination(story):
    """Objective evaluation for hallucination and coherence issues."""
    # First apply encoding fixes
    story = fix_encoding_issues(story)

    score = 0
    issues = []

    # Check for proper ending
    if not any(story.strip().endswith(end) for end in ['.', '!', '?']):
        issues.append("Missing proper ending punctuation")
        score += 1

    # Check for very short length
    if len(story.split()) < 50:
        issues.append("Story is too short (< 50 words)")
        score += 2

    # Check for repetition
    sentences = [s.strip() for s in story.replace('!', '.').replace('?', '.').split('.') if s.strip()]
    if len(sentences) >= 3:
        for i in range(len(sentences)-2):
            if sentences[i] == sentences[i+1] or sentences[i] == sentences[i+2]:
                issues.append("Contains repeated sentences")
                score += 2
                break

    # Check for logical consistency of characters
    is_consistent, consistency_msg = evaluate_character_consistency(story)
    if not is_consistent:
        issues.append(consistency_msg)
        score += 2

    # Result classification
    if score == 0:
        return "Good ✅ - No obvious hallucinations detected"
    elif score <= 2:
        return f"Minor issues ⚠️ - {', '.join(issues)}"
    else:
        return f"Potential hallucination 🚨 - {', '.join(issues)}"

# Function to generate a story with the model
def generate_story(theme, genre, model, tokenizer, max_length=200, temperature=0.5):
    """Generate a bedtime story using the fine-tuned model."""
    print(f"Generating a {genre} story about {theme}...")

    # Simple prompt
    prompt = f"Once upon a time, "

    # Standard bedtime story prompt
    full_prompt = f"""Write a short bedtime story for a young child about {theme} in the style of a {genre} tale.

Story: {prompt}"""

    # Tokenize prompt
    inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

    # Generate text
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_length,
            temperature=temperature,
            top_p=0.92,
            do_sample=True,
            repetition_penalty=1.3,
            pad_token_id=tokenizer.eos_token_id,
            min_length=100,
        )

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract just the story part (remove the prompt)
    story = generated_text[len(full_prompt):].strip()

    # Fix any encoding issues
    story = fix_encoding_issues(story)

    # Make sure the story has a proper ending
    if not any(story.endswith(end) for end in ['.', '!', '?']):
        last_sentence_end = max(story.rfind('.'), story.rfind('!'), story.rfind('?'))
        if last_sentence_end > 0:
            story = story[:last_sentence_end + 1]

    # Return with the prompt prefix
    return prompt + story


In [37]:
# For Colab execution
should_train = True  # Set to True to run training
should_test = True   # Set to True to test story generation

# Function to execute main workflow
def run_workflow(train=True, test=True):
    model = None
    tokenizer = None

    if train:
        try:
            model, tokenizer = train_model()
            print("\nTraining complete!")
        except Exception as e:
            print(f"Error during training: {str(e)}")
            return

    if test:
        try:
            # Load model if we didn't just train it
            if not train or model is None:
                if os.path.exists(OUTPUT_DIR):
                    print("\nLoading saved model for testing...")
                    tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)
                    model = AutoModelForCausalLM.from_pretrained(
                        OUTPUT_DIR,
                        torch_dtype=torch.float16,
                        device_map="auto"
                    )
                else:
                    print(f"\nNo model found at {OUTPUT_DIR}. Please train first.")
                    return

            # Test cases
            test_cases = [
                ("animals", "adventure"),
                ("space", "fantasy"),
                ("forest", "mystery"),
                ("family", "funny")
            ]

            print("\n" + "="*70)
            print("STORY GENERATION EVALUATION (1000-STORY MODEL)")
            print("="*70)

            for theme, genre in test_cases:
                try:
                    story = generate_story(theme, genre, model, tokenizer)

                    print(f"\n{'-'*70}")
                    print(f"THEME: {theme.upper()} | GENRE: {genre.upper()}")
                    print(f"{'-'*70}")
                    print(story)
                    print(f"{'-'*70}")
                    print(f"Hallucination check: {evaluate_hallucination(story)}")
                    print(f"Word count: {len(story.split())} words")
                except Exception as e:
                    print(f"Error generating {theme}/{genre} story: {str(e)}")

        except Exception as e:
            print(f"Error during testing: {str(e)}")

# Execute if running in Colab
try:
    if 'google.colab' in str(get_ipython()):
        # We're in Colab
        run_workflow(train=should_train, test=should_test)
except NameError:
    # Not running in interactive mode
    pass

Loading 1000 samples from TinyStories...
Dataset loaded with 1000 samples

Sample story:
----------------------------------------------------------------------
One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.

Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."

Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them b...
----------------------------------------------------------------------
Loading tokenizer from TinyLlama/TinyLlama-1.1B-Chat-v1.0


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Tokenizing dataset...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Loading base model from TinyLlama/TinyLlama-1.1B-Chat-v1.0...


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Model with LoRA adapters:
trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.4079
Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
20,1.5662
40,1.4475
60,1.4084
80,1.3841
100,1.3485
120,1.3246
140,1.3453
160,1.2993
180,1.3185


Training completed in 0h 18m 25s
Saving model and tokenizer to tinyllama_1000stories_model

Training complete!

STORY GENERATION EVALUATION (1000-STORY MODEL)
Generating a adventure story about animals...





----------------------------------------------------------------------
THEME: ANIMALS | GENRE: ADVENTURE
----------------------------------------------------------------------
Once upon a time, there was an old lady who lived near a big forest. One day she went to see her grandson and he told her that they wanted to go on a walk through the woods. The lady said okay but only if they had all their bags with them so no one would get lost. She smiled and gave him some money as well. They started walking together until they came across a stream. The boy asked his granny what it looked like while holding onto his bag tightly. He explained that it flowed smoothly and made sounds when water hit rocks underneath it. His Granny laughed and replied "It's just your imagination!" Then they continued along the pathway towards home. When they got back, the girl thanked her Grandma very much and hugged her. It was a great start to their fun night out!
------------------------------------------------