# Importing libraries

In [1]:
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import soundfile as sf
from huggingface_hub import InferenceClient
import os

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


## Setting up the environment variables

In [2]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")
HF_TOKEN = os.getenv("HF_API_TOKEN")

---

## 1. Image to Text Model

### 1.1. Setup the pre-trained model

In [3]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

### 1.2. Open the image file

In [4]:
raw_image = Image.open('image.jpg').convert('RGB')

### 1.3. Extract the text from the image

In [5]:
inputs = processor(raw_image, return_tensors="pt")

out = model.generate(**inputs)
context = processor.decode(out[0], skip_special_tokens=True)
print(context)

a group of gis and zebras grazing in a field


---

## 2. Text to story Model (Text Generation)

In [6]:
client = InferenceClient(
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    token=HF_TOKEN,
)

story = ""

for message in client.chat_completion(
    messages=[{"role": "user", "content": f"generate a story that is at most 500 Characters long; the story should be about the image above; the story should rhyme and be in a poetic form; the story should be in English; the story should be unique and creative; the story should be interesting and engaging; Act like you are a story teller and you are telling a story about the this scenario. scenario: {context} Story:"}],
        max_tokens=500,
        stream=True,
):
    story += message.choices[0].delta.content

---

## 3. Story to Speech Model

In [7]:
story = story.replace("\n", " ")
story = story[:600]
story

" In a field, 'neath the sun's golden ray, GIs and zebras freely graze away. A strange sight, but here's the tale to tell, Of harmony and friendship that does dwell.  The GIs, they fight for freedom and peace, While zebras, with stripes, never know cease. But in this field, all their worries are cast, For here, together, they live in the past.  The zebras, with a kick, playfully play, While soldiers, with laughter, chase sorrow away. A peaceful scene, that brings joy to the heart, Where GIs and zebras, together, never depart.  So, let us cherish this image so rare, Of friendship and peace, so u"

In [8]:
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


In [9]:
inputs = processor(text=story, return_tensors="pt")

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

sf.write("speech.wav", speech.numpy(), samplerate=16500)