# Install the required libraries, then restart the session

In [None]:
!pip install datasets
!pip install transformers
!apt install ffmpeg
!pip install pydub
!pip install peft
!pip install -U bitsandbytes

# Image to text

# Load the image-captioning model from Hugging Face using the transformers pipeline

In [None]:
from transformers import pipeline
# Image-captioning pipeline backed by the BLIP base checkpoint.
pipe = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)

In [None]:
# Caption the local image; the pipeline returns a list of candidate captions,
# of which the first one's text is kept.
caption_candidates = pipe("profile.jpeg")
image2text = caption_candidates[0]["generated_text"]
image2text

# Story Generator

# Check whether CUDA is available on your computer

In [1]:
import torch

# Report whether PyTorch can use a CUDA device in this session.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)

True


In [27]:
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Return "UTF-8" regardless of the system locale.

    Accepts the same optional ``do_setlocale`` argument as the real
    ``locale.getpreferredencoding`` so that callers which pass it (for example
    ``locale.getpreferredencoding(False)``) keep working after the patch.
    """
    return "UTF-8"


# Force the preferred encoding to UTF-8 for the rest of the session
# (presumably the usual hosted-notebook workaround for a non-UTF-8 locale — confirm).
locale.getpreferredencoding = _utf8_preferred_encoding

# Start by defining the prompt used for the story generation model

In [4]:
# Instruction text placed in front of every user sentence, both when building
# the training prompts and when generating a story.
SYSTEM_PROMPT = """\
You are an expert storyteller.
Your task is to generate a short story (20 to 30 words) based on a given sentence.
Ensure the story has a clear beginning, middle, and end while maintaining engagement and coherence.
Do not exceed 30 words.
"""


def format_dataset(examples):
    """Prefix each prompt in a batch with the system prompt.

    Args:
        examples: a batch mapping with parallel sequences under the keys
            ``"prompt"`` and ``"story"``.

    Returns:
        A dict with the same two keys: ``"prompt"`` holds
        ``SYSTEM_PROMPT + "\\n\\n**User:** " + sentence`` for every sentence,
        and ``"story"`` holds the matching stories unchanged.
    """
    wrapped_prompts = [
        f"{SYSTEM_PROMPT}\n\n**User:** {sentence}" for sentence in examples["prompt"]
    ]
    # Index by position so that exactly one story is taken per prompt.
    paired_stories = [examples["story"][idx] for idx in range(len(wrapped_prompts))]
    return {"prompt": wrapped_prompts, "story": paired_stories}

# Load the dataset from the JSON file (`story_generation_dataset.json`)

In [None]:
from datasets import load_dataset

# Number of examples kept for the quick fine-tuning run.
NUM_SUBSET_SAMPLES = 10

# Load the dataset from the local JSON file
dataset = load_dataset('json', data_files='story_generation_dataset.json')
# Apply the formatting function to every batch of every split
formatted_dataset = dataset.map(format_dataset, batched=True)

# Select the first NUM_SUBSET_SAMPLES samples, capped at the split size so a
# smaller dataset does not make `select` fail on out-of-range indices.
train_split = formatted_dataset['train']
subset_dataset = train_split.select(range(min(NUM_SUBSET_SAMPLES, len(train_split))))

# Inspect the subset
print(subset_dataset)

In [None]:
# Display the full formatted dataset (not only the subset selected for training).
formatted_dataset

# Step 3: Load the tokenizer and tokenize the dataset (the model itself is loaded in the fine-tuning section)

In [None]:
from transformers import AutoTokenizer

# Hugging Face model id; used for the tokenizer here and for the model load further down.
BASE_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
# Tokenizer that matches the base model.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

def tokenize_function(examples):
    """Tokenize a batch of (prompt, story) pairs and attach causal-LM labels.

    Each prompt/story pair is encoded as one sequence, truncated and padded to
    256 tokens. ``labels`` is a copy of ``input_ids`` in which every padding
    position (attention mask 0) is replaced by -100, the index ignored by the
    loss, so the model is not trained to predict padding.

    Args:
        examples: a batch mapping with parallel lists under ``"prompt"`` and
            ``"story"``.

    Returns:
        The tokenizer output with an additional ``"labels"`` entry.
    """
    tokenized = tokenizer(
        examples['prompt'],
        examples['story'],
        truncation=True,
        padding='max_length',
        max_length=256,
    )
    # Build fresh lists so labels never alias input_ids.
    tokenized["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Run tokenize_function over the subset; batched=True hands it lists of examples
# rather than one example at a time.
tokenized_subset = subset_dataset.map(tokenize_function, batched=True)

# Inspect the tokenized subset
print(tokenized_subset)

# Fine Tuning the model using LoRA (Low-Rank Adaptation)

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 8-bit weight quantization settings handed to the model loader.
quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)

# Load the base checkpoint quantized (device_map="auto" decides layer placement)
# and immediately prepare it for k-bit training.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto",
    )
)

# LoRA adapter settings: rank-8 adapters on the q_proj and v_proj modules,
# scaling factor 32, 10% adapter dropout, bias terms left untouched.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the adapters.
# NOTE: this rebinds `model`, so re-running the cell wraps the already wrapped model.
model = get_peft_model(model, lora_config)

# Report how many parameters remain trainable.
model.print_trainable_parameters()

import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. The packages are installed unpinned, so use whichever name the
# installed TrainingArguments accepts.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Small batch size to save memory
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=10_000,
    logging_dir='./logs',
    # NOTE(review): a tiny training subset yields far fewer than 200 optimizer
    # steps, so no training loss is logged during the run — lower if you want logs.
    logging_steps=200,
    fp16=True,  # Enable mixed precision training
    **{_eval_strategy_arg: "epoch"},  # Evaluate at the end of every epoch
)

import inspect

# Newer transformers releases take the tokenizer as `processing_class`; older
# ones only know `tokenizer`. Use whichever the installed Trainer accepts.
_tokenizer_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_subset,
    # NOTE: evaluation uses the same rows as training, so the reported eval loss
    # is not a held-out metric.
    eval_dataset=tokenized_subset,
    **{_tokenizer_arg: tokenizer},
)

# Fine-tune the model
trainer.train()

# Generate a story from a prompt

In [None]:
# Token budget for the generated continuation. Unlike `max_length`, this does not
# count the prompt, so a longer caption no longer shrinks the room left for the
# story. 128 is roughly what `max_length=200` left after the prompt — tune as needed.
MAX_NEW_TOKENS = 128

# Build the same prompt layout that was used for the training examples.
input_prompt = image2text
input_text = f"{SYSTEM_PROMPT}\n\n**User:** {input_prompt}"

# Keep the whole tokenizer output (ids + attention mask) and move it to the model's device.
model_inputs = tokenizer(input_text, return_tensors='pt').to(model.device)
input_ids = model_inputs.input_ids

# Generate the story with dropout disabled; inputs are passed by keyword.
model.eval()
output = model.generate(**model_inputs, max_new_tokens=MAX_NEW_TOKENS, num_return_sequences=1)

# The decoded text still starts with the prompt; the story is extracted from it later.
generated_story = tokenizer.decode(output[0], skip_special_tokens=True)

print(generated_story)

In [None]:
# Show the raw decoded reply (prompt included) as the cell output.
generated_story

# Extract the story from the model reply

In [91]:
import re

# Markers that can precede the story in the model reply, tried in this order.
_STORY_MARKER_PATTERNS = (
    # Case 1: everything after '**Story:**'
    re.compile(r"\*\*Story:\*\*\s*(.*)", re.DOTALL),
    # Case 2: everything after '---'
    re.compile(r"---\s*(.*)", re.DOTALL),
    # Case 3: everything after the last '**' that follows "Beginner's Guide:"
    re.compile(r"Beginner's Guide:.*\*\*(.*)", re.DOTALL),
)


def extract_story(text):
    """Pull the story out of the decoded model reply.

    The leading system prompt (from "You are an expert storyteller" up to the
    first blank line) is removed, then the known story markers are tried in
    order and the text following the first one that matches is returned.

    Args:
        text: decoded model reply; may still contain the prompt.

    Returns:
        The stripped story text, or ``None`` when ``text`` is empty, is not a
        string, or contains none of the known markers.
    """
    if not isinstance(text, str) or not text:
        return None

    # Remove the introductory text and unwanted newlines from the beginning
    text = re.sub(r"You are an expert storyteller.*?(\n\n|$)", "", text, flags=re.DOTALL)

    for pattern in _STORY_MARKER_PATTERNS:
        found = pattern.search(text)
        if found:
            return found.group(1).strip()

    return None

In [93]:
# Keep only the story part of the reply; this is None when no known marker is found.
story=extract_story(generated_story)

In [None]:
# Show the extracted story as the cell output.
story

# Text To Speech

# Load the text-to-speech model from Hugging Face

In [None]:
# Text-to-speech pipeline backed by the facebook/mms-tts-eng checkpoint.
pipe_text2speech = pipeline(
    task="text-to-speech",
    model="facebook/mms-tts-eng",
)

# Applying the model and saving the audio generated

In [None]:
import numpy as np
from pydub import AudioSegment

# extract_story() returns None when it finds no story; fail with a clear message
# instead of passing that on to the speech pipeline.
if not story:
    raise ValueError("No story text to synthesize: extract_story() found no story in the model reply.")

output=pipe_text2speech(story)

# Flatten to a single channel of samples and clip to [-1, 1] so that scaling to
# 16-bit PCM cannot wrap around on out-of-range values.
waveform = np.clip(np.asarray(output["audio"]).squeeze(), -1.0, 1.0)
audio_array = (waveform * 32767).astype(np.int16)  # Convert to 16-bit PCM format
sample_rate = output["sampling_rate"]

# sample_width=2 bytes matches int16; the audio is treated as mono.
audio_segment = AudioSegment(audio_array.tobytes(), frame_rate=sample_rate, sample_width=2, channels=1)

# Save as MP3
audio_segment.export("output.mp3", format="mp3")

print("Audio saved as output.mp3")