In [1]:
!pip install tensorflow pillow googletrans==4.0.0-rc1 gTTS ipywidgets

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4

In [8]:
import tensorflow as tf
import pandas as pd
import numpy as np
from googletrans import Translator
from PIL import Image
import matplotlib.pyplot as plt
from gtts import gTTS
import IPython.display as ipd
import ipywidgets as widgets

In [21]:
model = tf.keras.applications.MobileNetV2(weights = 'imagenet')
decode_predictions = tf.keras.applications.mobilenet_v2.decode_predictions
preprocess_input = tf.keras.applications.mobilenet_v2.preprocess_input

translator = Translator()

uploader = widgets.FileUpload(accept = 'image/*', multiple = False)
display(uploader)

def classify_and_translate(file, target_lang = 'es'):
  img = Image.open(file).resize((224,224))
  img_array = np.array(img)
  if img_array.shape[-1] == 4:
    img_array = img_array[..., :3]
  image_batch = np.expand_dims(img_array, axis = 0)
  image_batch = preprocess_input(image_batch)
  #Predict
  preds = model.predict(image_batch)
  decoded = decode_predictions(preds, top = 1)[0][0]
  label = decoded[1]
  confidence = decoded[2]
  print(f'Prediction: {label} ({confidence:.2f} confidence)')
  #Translate
  translated = translator.translate(label, dest = target_lang).text
  print(f'Translated to {target_lang}: {translated}')
  #Speak
  tts = gTTS(translated, lang = target_lang)
  tts.save('output.mp3')
  return translated, label, 'output.mp3'

def on_upload_change(change):
  for name, file_info in uploader.value.items():
    with open(name, 'wb') as f:
      f.write(file_info['content'])
    label, translated, audio_file = classify_and_translate(name, target_lang = 'bn')
    ipd.display(ipd.Audio(audio_file))
uploader.observe(on_upload_change, names = 'value')

FileUpload(value={}, accept='image/*', description='Upload')



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Prediction: bell_pepper (0.20 confidence)
Translated to bn: বেল_মরিচ


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
Prediction: puck (0.14 confidence)
Translated to bn: পাক


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
Prediction: pomegranate (0.30 confidence)
Translated to bn: ডালিম


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
Prediction: bell_pepper (0.23 confidence)
Translated to bn: বেল_মরিচ


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
Prediction: monastery (0.48 confidence)
Translated to bn: মঠ


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
Prediction: scoreboard (0.32 confidence)
Translated to bn: স্কোরবোর্ড


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step
Prediction: plate_rack (0.63 confidence)
Translated to bn: প্লেট_র্যাক
