# **Image to text**

## Phi 3.5 Vision

In [None]:
from IPython.display import Markdown, display
from PIL import Image
import requests
from transformers import AutoModelForCausalLM, AutoProcessor
import gc
import torch

class Img2Text:
    """Describe images with the Phi-3.5 Vision instruct model.

    The model and processor are loaded once in ``__init__``. ``run`` returns a
    text response for one or more images and ``cleanup`` drops the model and
    processor references so their memory can be reclaimed.
    """

    def __init__(self, device):
        """Load the model/processor and pre-render the default prompt.

        Args:
            device: Device spec passed as ``device_map`` when loading the
                model and used to move the processed inputs (e.g. ``"cuda"``).
        """
        self.device = device
        self.model_id = "microsoft/Phi-3.5-vision-instruct"
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            device_map=self.device,
            trust_remote_code=True,
            torch_dtype="auto",
            _attn_implementation='eager',
        )
        self.processor = AutoProcessor.from_pretrained(self.model_id, trust_remote_code=True)
        # Default single-image instruction; `<|image_1|>` is the placeholder
        # for the first image handed to the processor.
        self.messages = [{"role": "user", "content": "<|image_1|> Describe the image in Details"}]
        self.prompt = self.processor.tokenizer.apply_chat_template(
            self.messages, tokenize=False, add_generation_prompt=True
        )

    @staticmethod
    def _load_image(source):
        """Open an image from a local path or an http(s) URL as RGB."""
        if source.startswith(("http://", "https://")):
            from io import BytesIO

            reply = requests.get(source, timeout=30)
            reply.raise_for_status()
            return Image.open(BytesIO(reply.content)).convert("RGB")
        return Image.open(source).convert("RGB")

    def run(self, image, messages=False):
        """Generate a text response for the given image(s).

        Args:
            image: A PIL image, a local path, an http(s) URL, or a list of
                any of these.
            messages: Optional chat messages. When truthy they are rendered
                with the tokenizer's chat template and used instead of the
                default prompt for this call only.

        Returns:
            The decoded text of the newly generated tokens.
        """
        # Normalise to a list, loading any string entries (paths or URLs).
        if not isinstance(image, list):
            image = [image]
        image = [self._load_image(item) if isinstance(item, str) else item for item in image]

        # Use a local prompt so a custom `messages` does not leak into later
        # calls that rely on the default prompt stored in self.prompt.
        if messages:
            prompt = self.processor.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        else:
            prompt = self.prompt

        inputs = self.processor(prompt, image, return_tensors="pt").to(self.device)
        generation_args = {"max_new_tokens": 1000, "temperature": 1, "do_sample": False}
        generate_ids = self.model.generate(
            **inputs,
            eos_token_id=self.processor.tokenizer.eos_token_id,
            **generation_args,
        )
        # Drop the prompt tokens so only the generated continuation is decoded.
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = self.processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        del inputs
        del generate_ids
        torch.cuda.empty_cache()
        return response

    def cleanup(self):
        """Drop the model and processor; safe to call more than once."""
        for attr in ("model", "processor"):
            if hasattr(self, attr):
                delattr(self, attr)
        # Collect first so the freed tensors can actually be returned by
        # empty_cache().
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
# Call run() on the instance: Img2Text.run(image) on the class would bind the
# path string to `self` and fail with a missing-argument TypeError.
Imgt2Text = Img2Text("cuda")
image = "/content/360_F_503362352_Q4oLkiACXRUv0uKVIonzo525a78Jf6d2.jpg"
Imgt2Text.run(image)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


"The image captures a heartwarming scene of a family of foxes in their natural habitat. The mother fox, with her rich brown fur, stands tall and proud, her gaze fixed on the camera. She is flanked by her two cubs, their fur a lighter shade of brown, their eyes wide with curiosity and wonder. The cubs are exploring the grassy terrain, their small paws leaving imprints on the soft earth. The background is a blur of green, suggesting a dense forest or a grassy field, providing a safe and secluded environment for the family. The image is taken from a low angle, making the foxes appear larger and more majestic. The lighting is soft and natural, casting a warm glow on the scene and highlighting the intricate details of the foxes' fur and features. The image does not contain any text."

In [None]:
# cleanup() is an instance method, so call it on the instance created earlier
# (bound to `Imgt2Text`), not on the Img2Text class.
Imgt2Text.cleanup()

# **English to Arabic Translation**

## llama3

In [None]:
!pip install groq -q
from groq import Groq

class Trans2Arabic:
    """Translate English text to Arabic through the Groq chat-completions API."""

    def __init__(self, api_key=None, model="llama3-70b-8192"):
        """Store credentials, model name and the translation instruction.

        Args:
            api_key: Groq API key. When omitted, the ``GROQ_API_KEY``
                environment variable is used.
            model: Name of the Groq-hosted chat model to query.

        Raises:
            ValueError: If no API key is given and ``GROQ_API_KEY`` is unset.
        """
        # Local import keeps this cell self-contained in the notebook.
        import os

        # Secrets must not live in source control; take the key from the
        # caller or from the environment instead.
        self.api_key = api_key or os.environ.get("GROQ_API_KEY")
        if not self.api_key:
            raise ValueError(
                "No Groq API key provided: pass api_key=... or set the "
                "GROQ_API_KEY environment variable."
            )
        self.model = model
        self.inst_prompt = """You are a skilled translator with extensive experience in English and Arabic translations.
                            You possess a deep understanding of the linguistic, cultural, and contextual nuances essential for accurate and effective translation between these languages. Highly motivated and detail-oriented, you are committed to delivering translations that maintain the integrity and intent of the original text.
                            Your role is crucial in ensuring clear and precise communication in our multilingual system.
                            Do not add anything other than the description"""

    def run(self, text):
        """Return the model's Arabic translation of ``text``.

        Args:
            text: English text to translate.

        Returns:
            The content of the first completion choice.
        """
        client = Groq(api_key=self.api_key)
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"{self.inst_prompt} text to translate: {text}",
                }
            ],
            model=self.model,
            temperature=0,
        )

        return chat_completion.choices[0].message.content


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.5/106.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Create the translator and define the English sample to translate (the image
# description produced by the Img2Text cell earlier in the notebook).
translator = Trans2Arabic()
text = "The image captures a heartwarming scene of a family of foxes in their natural habitat. The mother fox, with her rich brown fur, stands tall and proud, her gaze fixed on the camera. She is flanked by her two cubs, their fur a lighter shade of brown, their eyes wide with curiosity and wonder. The cubs are exploring the grassy terrain, their small paws leaving imprints on the soft earth. The background is a blur of green, suggesting a dense forest or a grassy field, providing a safe and secluded environment for the family. The image is taken from a low angle, making the foxes appear larger and more majestic. The lighting is soft and natural, casting a warm glow on the scene and highlighting the intricate details of the foxes' fur and features. The image does not contain any text."

In [None]:
# Send the sample text to the translator and print the Arabic result.
translated_text = translator.run(text)
print(translated_text)

تُصوّر الصورة مشهداً رومانسياً لعائلة الثعالب في بيئتها الطبيعية. تقف الثعلبة الأم، ذات الفراء الغني باللون البني، منتصبةً وفخورةً، نظرتها ثابتةً على الكاميرا. وتحيط بها صغيراها، فرائهما أفتح لوناً من البني، عيناهما واسعتان بالفضول والدهشة. ويستكشف الصغيران التضاريس العشبية، أقدامهما الصغيرة تترك آثاراً على الأرض الناعمة. الخلفية مُبهَمة باللون الأخضر، مما يوحي بوجود غابة كثيفة أو ميدان عشبي، مما يوفر بيئة آمنة ومُ


# Text to Speech TTS Arabic

## TTS using Arabic TTS

## TTS using XTTS

In [None]:
!pip install TTS
import torch
from TTS.api import TTS

class T2S_AR:
    """Arabic text-to-speech built on the XTTS v2 model with a reference voice."""

    def __init__(self, clone_voice_path, device="cuda:0"):
        """Remember the reference voice and load the TTS model onto ``device``.

        Args:
            clone_voice_path: Audio file passed as ``speaker_wav`` on synthesis.
            device: Device the TTS model is moved to.
        """
        model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
        self.clone_voice_path = clone_voice_path
        # Every synthesis writes to this same file.
        self.out_path = "output.wav"
        self.device = device
        self.tts = TTS(model_name).to(self.device)

    def run(self, text):
        """Synthesize ``text`` in Arabic and return the path of the audio file."""
        synthesis_options = {
            "text": text,
            "speaker_wav": self.clone_voice_path,
            "language": "ar",
            "file_path": self.out_path,
        }
        self.tts.tts_to_file(**synthesis_options)
        return self.out_path

Collecting TTS
  Downloading TTS-0.22.0-cp310-cp310-manylinux1_x86_64.whl.metadata (21 kB)
Collecting anyascii>=0.3.0 (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pysbd>=0.3.4 (from TTS)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting umap-learn>=0.5.1 (from TTS)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pandas<2.0,>=1.4 (from TTS)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting trainer>=0.0.32 (from TTS)
  Downloading trainer-0.0.36-py3-none-any.whl.metadata (8.1 kB)
Collecting coqpit>=0.0.16 (from TTS)
  Downloading coqpit-0.0.17-py3-none-any.whl.metadata (11 kB)
Collecting pypinyin (from TTS)
  Downloading pypinyin-0.53.0-py2.py3-none-any.whl.metadata (12 kB)
Collecting hangul-romanize (from TTS)
  Downloading hangul_romanize-0.1.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gruut==2.2.3 (from gruut[de,es,fr]==2.2.3->T

# Pipeline

In [None]:
class ImgToSpeechAR:
    """End-to-end pipeline: image -> English description -> Arabic -> speech."""

    def __init__(self, speaker_path="/content/speaker.opus"):
        """Build the three pipeline stages.

        Args:
            speaker_path: Reference voice file handed to the TTS stage.
        """
        self.img2text = Img2Text("cuda")
        self.translator = Trans2Arabic()
        self.path = speaker_path
        self.T2S_ar = T2S_AR(self.path)

    def convert_Img_to_voice(self, img):
        """Describe ``img``, translate the description to Arabic and voice it.

        Args:
            img: Image accepted by ``Img2Text.run`` (PIL image or path).

        Returns:
            Path of the generated audio file, as returned by the TTS stage.

        Raises:
            ValueError: If ``img`` is ``None`` (e.g. no image was uploaded).
        """
        if img is None:
            raise ValueError("No image provided.")
        img_des = self.img2text.run(img)
        # The vision model is intentionally NOT released here: cleaning it up
        # after each call deleted the model and broke every later request.
        # Call cleanup() explicitly once the pipeline is no longer needed.
        img_des_ar = self.translator.run(img_des)
        speech_ar_output = self.T2S_ar.run(img_des_ar)
        return speech_ar_output

    def cleanup(self):
        """Release the vision model when the pipeline is no longer needed."""
        self.img2text.cleanup()



# Gradio

In [None]:
# import torch

# # Specify the device
# device = torch.device('cuda:0')

# # Clear cache on that specific device
# with torch.cuda.device(device):
#     torch.cuda.empty_cache()

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
!pip install gradio -q
import gradio as gr
# Build the image -> Arabic speech pipeline once at module level so every
# Gradio request reuses the same instance; its constructor creates the
# Img2Text, Trans2Arabic and T2S_AR stages.
pipline = ImgToSpeechAR()

# Define the Gradio interface
def gradio_interface(image):
    """Gradio callback: run the pipeline on ``image`` and return the audio path."""
    return pipline.convert_Img_to_voice(image)

# Create the Gradio interface
iface = gr.Interface(
    fn=gradio_interface,
    # The callback receives the upload as a PIL image and returns an audio path.
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Audio(label="Generated Audio"),
    # The app turns an image into speech, so the title must say so
    # (it previously read "Speech to Image Converter").
    title="Image to Speech Converter",
    description="Upload an image, and the model will generate a speech description in Arabic.",
)

# Launch the interface. share=True exposes a temporary public gradio.live URL;
# debug=True keeps the Colab cell running so errors and logs stay visible
# (set debug=False to let the cell return).
iface.launch(share=True, debug=True)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.7/56.7 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.8/319.8 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.7/94.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.5/447.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.3/73.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

configuration_phi3_v.py:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-vision-instruct:
- configuration_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3_v.py:   0%|          | 0.00/88.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-vision-instruct:
- modeling_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/68.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.35G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

processing_phi3_v.py:   0%|          | 0.00/22.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-vision-instruct:
- processing_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


preprocessor_config.json:   0%|          | 0.00/442 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/9.52k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.85M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

 > You must confirm the following:
 | > "I have purchased a commercial license from Coqui: licensing@coqui.ai"
 | > "Otherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml" - [y/n]
 | | > y
 > Downloading model to /root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2


100%|█████████▉| 1.86G/1.87G [00:44<00:00, 41.1MiB/s]
100%|██████████| 1.87G/1.87G [00:44<00:00, 41.8MiB/s]
100%|██████████| 4.37k/4.37k [00:00<00:00, 26.1kiB/s]
 55%|█████▍    | 199k/361k [00:00<00:00, 1.68MiB/s]
100%|██████████| 361k/361k [00:00<00:00, 845kiB/s] 
100%|██████████| 32.0/32.0 [00:00<00:00, 125iB/s]
 77%|███████▋  | 5.96M/7.75M [00:00<00:00, 33.1MiB/s]

 > Model's license - CPML
 > Check https://coqui.ai/cpml.txt for more info.
 > Using model: xtts


  self.speakers = torch.load(speaker_file_path)
  return torch.load(f, map_location=map_location, **kwargs)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://41a38263cf5fa7c27d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


 > Text splitted to sentences.
['تُصوّر الصورة لحظة مؤثرة على ملعب كرة القدم.', 'لاعب، يرتدي قميصاً أحمراً ساطعاً مع كلمة "ريوي" مكتوبة على الواجهة، مستلقي على الأرض، رأسه منخفض في ما يبدو أنه ضيق.', 'زملاؤه في الفريق، أيضاً يرتدون قمصاناً حمراء، يحيطون به، تعابيرهم مزيجاً من القلق والتعاطف.', 'زملاء اللاعب يركعون على جانبيه، أيديهم وضعت برفق على رأسه، يقدمون الراحة في هذه اللحظة من الضعف.', 'الخلفية مبهمة من الجمهور، وجوههم غير واضحة، ولكن حضورهم شهادة على أهمية اللحظة.', 'الصورة تذكير قوي بالمرات العاطفية العالية والمنخفضة التي تأتي مع الرياضة.']


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 > Processing time: 21.163859605789185
 > Real-time factor: 0.47840507855572323


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 624, in process_events
    response = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 323, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 2018, in process_api
    result = await self.call_function(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1567, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
    return await future
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 8

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://41a38263cf5fa7c27d.gradio.live


