In [1]:
# Install the notebook's dependencies: Whisper (speech-to-text), sentence-transformers
# (embeddings), FAISS (vector search), Coqui TTS (text-to-speech) and audio/HTTP helpers.
# NOTE(review): the pip log captured for this cell shows sentence-transformers requiring
# transformers>=4.41.0,<5.0.0 and downloading transformers 4.52.3, so the 4.38.2 pin on
# the next line appears to be overridden by the later install -- confirm which version
# is actually active before relying on the pin.
!pip install transformers==4.38.2  # Pin transformers to avoid numpy.dtypes error
!pip install git+https://github.com/openai/whisper.git
!pip install sentence-transformers
!pip install faiss-cpu
!pip install tts
!pip install soundfile
!pip install librosa
!pip install requests
!pip install scipy

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-e05ztxla
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-e05ztxla
  Resolved https://github.com/openai/whisper.git to commit dd985ac4b90cafeef8712f2998d62c59c3e62d22
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transform

In [4]:
# OpenAI Python SDK: the main cell imports `OpenAI` from it and points the client at a
# Gemini `base_url`, so this package is what talks to the Gemini API.
!pip install openai



In [4]:
import os
import numpy as np
import soundfile as sf
import whisper
from sentence_transformers import SentenceTransformer
import faiss
import requests
from TTS.api import TTS
import librosa
import IPython.display as ipd
from openai import OpenAI



# Step 2: Create a small document corpus
# Each entry is one retrievable passage; the list order defines the row ids used by the
# vector index built from it.
documents = [
    "Web accessibility ensures that websites and digital content are usable by everyone, including people with disabilities. The World Wide Web Consortium (W3C) develops guidelines like WCAG 2.1, which outline standards for making web content perceivable, operable, understandable, and robust for users with visual, auditory, motor, or cognitive impairments.",
    "Screen readers are essential assistive technologies for visually impaired users. They convert digital text into synthesized speech or braille output, allowing users to navigate websites, read documents, and interact with applications. Popular screen readers include JAWS, NVDA, and VoiceOver, each with unique features tailored to different operating systems.",
    # aria-label is an ARIA attribute, not an HTML element, so it is described as such here
    # (the passage is used as answer context and should not teach a non-existent tag).
    "Semantic HTML plays a critical role in web accessibility. By using proper elements like <nav> and <header>, together with ARIA attributes such as aria-label, developers ensure screen readers can interpret page structure accurately. For example, ARIA landmarks help users navigate complex web applications by providing descriptive labels for dynamic content.",
    "Keyboard navigation is vital for users with motor impairments who cannot use a mouse. Accessible websites allow all interactive elements, such as buttons and forms, to be operated via keyboard shortcuts like Tab, Enter, and Space. WCAG 2.1 mandates that all functionality be accessible without relying on mouse input.",
    "Text-to-speech technology, beyond screen readers, is widely used in accessibility tools. It converts text into natural-sounding speech, aiding users with visual or reading difficulties. Modern text-to-speech systems, like those in mobile devices, support multiple languages and customizable voice settings.",
    "Color contrast is a key aspect of accessible design. WCAG 2.1 recommends a minimum contrast ratio of 4.5:1 for text to ensure readability for users with low vision. Tools like contrast checkers help developers verify that their designs are legible under various lighting conditions or for users with color blindness.",
    "Accessible web forms require clear labels, error messages, and instructions. For example, form fields should use <label> tags and provide real-time feedback for screen reader users. This ensures users with disabilities can complete forms, such as online surveys or registration pages, independently.",
    "Assistive technologies extend beyond screen readers to include braille displays, magnification software, and voice recognition systems. These tools empower users with diverse disabilities to access digital content, from reading braille output to controlling devices with voice commands.",
]

# Persist the corpus one passage per line. The encoding is explicit so the file is written
# the same way regardless of the platform's default locale.
with open("corpus.txt", "w", encoding="utf-8") as f:
    f.writelines(doc + "\n" for doc in documents)

# Step 3: Initialize models
# Whisper for STT ("tiny" checkpoint)
whisper_model = whisper.load_model("tiny")

# SentenceTransformer for embeddings
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# FAISS index for retrieval
# The corpus is encoded first so the index dimension is read from the embeddings instead
# of being hard-coded; FAISS expects float32 input, hence the explicit dtype.
doc_embeddings = np.asarray(embedder.encode(documents), dtype="float32")
embed_dim = doc_embeddings.shape[1]  # 384 for all-MiniLM-L6-v2
index = faiss.IndexFlatL2(embed_dim)
index.add(doc_embeddings)

# Gemini API configuration (via OpenAI client)
# The key is read from the GEMINI_API_KEY environment variable so it never has to be
# committed with the notebook; the original placeholder is kept only as a fallback.
client = OpenAI(
    api_key=os.environ.get("GEMINI_API_KEY", "GEmini_key"),
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

# Coqui TTS for spoken output (the model selected here is an English LJSpeech voice)
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)

# Step 4: Optional audio recording (for local use only)
def record_audio(filename="input.wav", duration=5, fs=16000):
    """Record mono audio from the default input device and save it as a WAV file.

    Args:
        filename: Path of the WAV file to write.
        duration: Recording length in seconds.
        fs: Sample rate in Hz.

    Returns:
        ``filename`` on success, or ``None`` if recording or writing failed for any
        reason (the error is printed rather than raised).
    """
    try:
        # Imported lazily so the rest of the notebook still works where sounddevice
        # (and the audio backend it needs) is not installed.
        import sounddevice as sd
        from scipy.io.wavfile import write
        # Report the requested duration rather than a fixed "5 seconds".
        print(f"Recording audio for {duration} seconds...")
        recording = sd.rec(int(duration * fs), samplerate=fs, channels=1)
        sd.wait()  # block until the recording buffer is filled
        write(filename, fs, recording)
        print("Recording finished.")
        return filename
    except Exception as e:
        print(f"Audio recording failed: {e}")
        return None

# Step 5: Process voice input from uploaded file
def voice_to_text(audio_file):
    """Transcribe an audio file to English text with the module-level Whisper model.

    Args:
        audio_file: Path to the audio file; it is loaded and resampled to 16 kHz.

    Returns:
        The transcript with surrounding whitespace removed, or ``None`` when loading or
        transcription fails or the transcript is empty.
    """
    try:
        audio, _ = librosa.load(audio_file, sr=16000)
        # The language is fixed to English; drop `language` to let the model detect it.
        result = whisper_model.transcribe(audio, language="en")
        # Strip padding so a whitespace-only transcript is reported as "nothing heard"
        # instead of being treated as a valid query by the caller.
        text = result["text"].strip()
        return text or None
    except Exception as e:
        print(f"STT failed: {e}")
        return None

# Step 6: Retrieve relevant documents
def retrieve_documents(query, top_k=2):
    """Return the corpus passages closest to ``query`` in embedding space.

    Args:
        query: Text to search for.
        top_k: Maximum number of passages to return.

    Returns:
        A list of at most ``top_k`` strings from ``documents``, in the order the index
        returned them. Empty when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return []
    # FAISS expects float32 input.
    query_embedding = np.asarray(embedder.encode([query]), dtype="float32")
    _, indices = index.search(query_embedding, top_k)
    # When fewer than top_k vectors are indexed, FAISS pads the result with -1; without
    # this filter documents[-1] would silently return the last passage for every padded slot.
    return [documents[i] for i in indices[0] if 0 <= i < len(documents)]

# Step 7: Generate response using Gemini API via OpenAI client
def generate_response(query, retrieved_docs):
    """Ask the chat model to answer ``query`` using the retrieved passages as context.

    Args:
        query: The user's question.
        retrieved_docs: Passages to supply as context; they are joined with spaces.

    Returns:
        The model's reply text, or a string starting with ``"API Request Error: "`` if
        the request (or reading its result) raised an exception.
    """
    context = " ".join(retrieved_docs)
    prompt = f"Question: {query}\nContext: {context}\nAnswer in a concise and natural way:"

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    try:
        completion = client.chat.completions.create(
            model="gemini-2.5-flash-preview-04-17",
            reasoning_effort="low",
            messages=chat_messages,
        )
        # Only the first choice is used.
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as err:
        return f"API Request Error: {err}"

# Step 8: Convert response to audio
def text_to_audio(text, output_file="output.wav"):
    """Synthesize ``text`` to an audio file with the module-level TTS model.

    Args:
        text: The text to speak.
        output_file: Destination path for the generated audio.

    Returns:
        ``output_file`` when synthesis succeeds, otherwise ``None`` (the error is
        printed, not raised).
    """
    try:
        tts.tts_to_file(text=text, file_path=output_file)
    except Exception as err:
        print(f"TTS failed: {err}")
        return None
    else:
        return output_file

# Step 9: Main pipeline
def main_pipeline(use_recording=False):
    """Run the voice question-answering flow end to end.

    Steps: obtain an audio file (recorded or pre-uploaded 'input.wav'), transcribe it,
    retrieve matching passages, generate an answer, and synthesize the answer to audio.

    Args:
        use_recording: When True, try to record from the microphone first; if that
            fails, the uploaded 'input.wav' is used instead.

    Returns:
        ``(response, audio_output)``. Both are ``None`` when no audio file is available
        or transcription fails; ``audio_output`` alone is ``None`` when speech synthesis
        fails.
    """
    # Colab workaround: Upload audio file
    default_audio = "input.wav"
    audio_file = default_audio
    if use_recording:
        print("Recording only supported locally. Requires PortAudio and sounddevice.")
        recorded = record_audio()
        if recorded:
            audio_file = recorded
        else:
            # record_audio returns None on failure; keep the default path so the
            # fallback announced below actually happens instead of passing None on.
            print("Error: Recording failed. Falling back to uploaded 'input.wav'.")
            audio_file = default_audio
    else:
        print("Please upload an audio file named 'input.wav' to Colab (16kHz WAV recommended, e.g., 'What is braille?').")

    # Check if audio file exists
    if not os.path.exists(audio_file):
        print("Error: 'input.wav' not found. Please upload the file.")
        return None, None

    # Convert voice to text
    query = voice_to_text(audio_file)
    if not query:
        print("Error: Failed to transcribe audio.")
        return None, None
    print(f"Transcribed Query: {query}")

    # Retrieve documents
    retrieved_docs = retrieve_documents(query)
    print(f"Retrieved Documents: {retrieved_docs}")

    # Generate response via Gemini API
    response = generate_response(query, retrieved_docs)
    print(f"Generated Response: {response}")

    # Convert response to audio and text
    audio_output = text_to_audio(response)
    if not audio_output:
        print("Error: Failed to generate audio output.")
        return response, None
    print(f"Audio output saved as: {audio_output}")

    # Display audio in Colab
    ipd.display(ipd.Audio(audio_output))

    return response, audio_output

# Run the pipeline
if __name__ == "__main__":
    response, audio_output = main_pipeline(use_recording=False)

    # Save sample outputs
    if response:
        # Written as UTF-8 explicitly: generated text may contain characters that the
        # platform's default encoding cannot represent.
        with open("response.txt", "w", encoding="utf-8") as f:
            f.write(response)

 > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: hifigan
 > Setting up Audio P