In [None]:
# Update package list and install curl (if not present)
sudo apt-get update && sudo apt-get install -y curl

# Install Ollama
curl -fsSL https://ollama.com/install.sh | sh

# Launch the Ollama server in the background
ollama serve &

# Give the background server up to ~30 seconds to start answering requests.
# `ollama pull` is a client command, so issuing it before the server is
# listening fails with a connection error.
for _ in $(seq 1 30); do
    ollama list >/dev/null 2>&1 && break
    sleep 1
done

# Pull the Llama 3 model (this may take a few minutes)
ollama pull llama3

In [None]:
# Python dependencies for the whole pipeline, installed with a single pip call:
# LlamaIndex + Ollama/HuggingFace integrations (RAG), Whisper (speech-to-text),
# ChatTTS (text-to-speech), and JupyterLab to run the notebook.
pip install \
    llama-index \
    llama-index-llms-ollama \
    llama-index-embeddings-ollama \
    llama-index-embeddings-huggingface \
    openai-whisper \
    transformers \
    ChatTTS \
    jupyterlab

In [None]:
# Launch JupyterLab so it can be reached from your local machine:
#   --ip=0.0.0.0   listen on all network interfaces
#   --port=8888    serve on port 8888
#   --no-browser   do not try to open a browser on the host
#   --allow-root   permit running the server as the root user
jupyter-lab \
    --ip=0.0.0.0 \
    --port=8888 \
    --no-browser \
    --allow-root

In [None]:
#Package imports

import os
import numpy as np
import re

# Imports for Speech to Text
import whisper
import torch

# Imports for RAG Model
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

# Imports for Text to Speech
import ChatTTS
import torchaudio
from IPython.display import Audio

## Setting up the env

# Set the environment variable for experimental features (optional)
os.environ['TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL'] = '1'
# Expose only device index 0 to the HIP runtime
os.environ['HIP_VISIBLE_DEVICES'] = "0"

print(f"Torch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Check GPU availability and properties.
# `device` is "cuda" when torch reports an available GPU, otherwise "cpu".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU (no GPU detected)")

In [None]:
## Transcribe speech to Text

In [None]:
!curl -L https://raw.githubusercontent.com/ROCm/gpuaidev/main/docs/notebooks/assets/summarize_question.wav -o summarize_question.wav

In [None]:
# Recording of the spoken question that feeds the pipeline
AUDIO_FILE = "summarize_question.wav"

# Inline audio player; kept as the last expression so it renders as the cell output
Audio(
    AUDIO_FILE,
    rate=24_000,
    autoplay=True,
)

In [None]:
# Speech-to-Text with Whisper
try:
    # Load the "base" Whisper checkpoint and transcribe the recorded question
    model = whisper.load_model("base")
    result = model.transcribe(AUDIO_FILE)
    # The transcription becomes the query text for the RAG step
    input_text = result["text"]
    print(f"Transcribed text: {input_text}")
except Exception as e:
    print(f"Error in speech-to-text: {e}")
    # Re-raise rather than calling exit(): exit() is not a dependable way to
    # stop a notebook, while an exception halts "Run All" here with a traceback
    # instead of failing later on an undefined `input_text`.
    raise

## Integrating the RAG Model  
To utilize a Retrieval-Augmented Generation (RAG) model, supply the context you want the language model to reference when answering queries. In this example, documents from the `data` folder are used as context. If you don’t have any documents yet, you can add your own or download the sample provided below.

# Folder whose files are used as RAG context
DATA_DIR = "./data"

# Two failure cases are reported: the folder is present but has no entries,
# or the folder is missing (in which case it is created first).
if os.path.exists(DATA_DIR):
    if len(os.listdir(DATA_DIR)) == 0:
        print(f"Data directory '{DATA_DIR}' is empty. Please add a file of your choosing or use the cell below to download sample text.")
        exit(1)
else:
    os.makedirs(DATA_DIR)
    print(f"Data directory '{DATA_DIR}' created. Please add a file of your choosing or use the cell below to download sample text.")
    exit(1)


In [None]:
#if the data dictionary is empty then run this:

!mkdir -p data && curl -L https://www.gutenberg.org/cache/epub/11/pg11.txt -o data/pg11.txt

In [None]:
# View the files in your data directory.
# List the same DATA_DIR that is loaded below, so the listing always matches
# what the reader actually ingests.
print("Files in data directory:", os.listdir(DATA_DIR))

# Load every file found in DATA_DIR as documents for indexing
documents = SimpleDirectoryReader(DATA_DIR).load_data()

For the embedding model, use “bge-base” from HuggingFaceEmbedding. Confirm that the Ollama server is running because it supplies Llama-3 for the LLM.

Next, create a VectorStoreIndex from the loaded documents and initialize a query engine with the index. Then issue your query using the text output from the Whisper model. Print the response so you can compare it against the audio output in the next step.

In [None]:
# Initialize embedding and LLM models
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

try:
    # request_timeout is generous (360 s) to allow for slow generations
    Settings.llm = Ollama(model="llama3", request_timeout=360.0)
except Exception as e:
    print(f"Error connecting to Ollama server: {e}")
    # Re-raise rather than calling exit(): exit() is not a dependable way to
    # stop a notebook, while an exception halts "Run All" here with a traceback
    # instead of continuing without a configured LLM.
    raise

# Build and query the vector index
index = VectorStoreIndex.from_documents(documents)
# streaming=True makes query() return a streaming response that is consumed
# chunk by chunk; the three most similar chunks are retrieved as context.
query_engine = index.as_query_engine(streaming=True, response_mode="compact", similarity_top_k=3)
response = query_engine.query(input_text)

# Function to convert StreamingResponse to string
def streaming_response_to_string(streaming_response):
    """Concatenate every chunk of a streaming response into one string.

    Args:
        streaming_response: Object exposing an iterable ``response_gen``.

    Returns:
        str: The chunks joined in order. A dict chunk with a ``"text"`` key
        contributes that value; any other chunk contributes ``str(chunk)``.
    """
    def _as_text(piece):
        # Dict-shaped chunks carry their payload under "text"
        if isinstance(piece, dict) and "text" in piece:
            return piece["text"]
        return str(piece)

    return "".join(_as_text(piece) for piece in streaming_response.response_gen)

# Drain the streamed answer into a single string, then show it so it can be
# compared with the spoken output produced later
response_text = streaming_response_to_string(streaming_response=response)
print(f"Generated response: {response_text}")

## Perform text-to-speech conversion
The following example performs text-to-speech conversion using the ChatTTS library and saves the output audio to a file.

This example uses the following constants:

OUTPUT_AUDIO_FILE (str): The name of the output audio file.

SAMPLE_RATE (int): The sample rate for the output audio file.

It provides the following functionality:

Initializes a ChatTTS.Chat object.

Loads the chat model without compilation for faster loading. (Set compile=True for better performance.)

Converts the response text from the previous step to speech.

Saves the generated audio to the specified output file using torchaudio.

In [None]:
# File the synthesized speech is written to
OUTPUT_AUDIO_FILE = "voice_pipeline_response.wav"
# Sample rate (Hz) used when saving the generated audio
SAMPLE_RATE = 24_000

# Text cleanup function for TTS
def sanitize_input(text):
    """Strip hyphens and parentheses from ``text`` and trim outer whitespace.

    Args:
        text (str): Text to clean.

    Returns:
        str: ``text`` without any ``-``, ``(`` or ``)`` characters and without
        leading/trailing whitespace.
    """
    # A deletion-only translation table removes all three characters in one pass
    unwanted = str.maketrans("", "", "-()")
    return text.translate(unwanted).strip()

# Text-to-Speech processing
# Bind `chat` before the try block so the finally clause can tell whether a
# model was actually created in this run.
chat = None
try:
    sanitized_response = re.sub(r"[^a-zA-Z0-9.,?! ]", "", response_text)  # Remove special characters
    print(f"Sanitized response for TTS: {sanitized_response}")
    # chat.infer is given a list of texts, so wrap the single response
    sanitized_response = [sanitized_response]

    chat = ChatTTS.Chat()
    chat.load(compile=False) # Set to True for better performance

    # Use a randomly sampled speaker embedding for the voice
    params_infer_code = ChatTTS.Chat.InferCodeParams(
        spk_emb = chat.sample_random_speaker(),
    )

    wavs = chat.infer(
        sanitized_response,
        params_infer_code=params_infer_code,
    )

    # Add a leading dimension only when the waveform is 1-D; an already
    # multi-dimensional waveform is saved as-is. This replaces a bare
    # `except:` that retried the save and could hide unrelated errors.
    waveform = torch.from_numpy(wavs[0])
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)
    torchaudio.save(OUTPUT_AUDIO_FILE, waveform, SAMPLE_RATE)

except Exception as e:
    print(f"Error in text-to-speech: {e}")
    # Re-raise rather than calling exit(): exit() is not a dependable way to
    # stop a notebook, while an exception halts "Run All" with a traceback.
    raise

finally:
    # Release the TTS model whether or not synthesis succeeded
    if chat is not None:
        chat.unload()

In [None]:
# Run this cell to hear the generated speech.
# Play back at SAMPLE_RATE, the same rate used when the audio was saved, so
# the two cannot drift apart if the constant is changed.

Audio(wavs[0], rate=SAMPLE_RATE, autoplay=True)