In [10]:
# Import libraries
import requests
import os
from dotenv import load_dotenv
import numpy as np
import pandas as pd
from groq import Groq
from TTS.api import TTS
import torch
from tqdm import tqdm
import time

In [11]:
# Load environment variables
load_dotenv()

# Groq API setup
GROQ_API = os.getenv("GROQ_API")

In [12]:
# Function to get response from Groq
def get_groq_response(prompt):
    client = Groq(
        api_key=GROQ_API,
    )

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-8b-8192",
    )

    return chat_completion.choices[0].message.content

Note: https://docs.coqui.ai/en/latest/inference.html

In [13]:
# Initialize TTS with XTTS-v2 model
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)

 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts


  self.speakers = torch.load(speaker_file_path)
  return torch.load(f, map_location=map_location, **kwargs)


In [14]:
# Function to generate speech from text using voice cloning with progress bar
def generate_speech(text, speaker_wav, output_file):
    # Estimate the duration based on word count (adjust this ratio as needed)
    estimated_duration = len(text.split()) * 0.5  # Assuming 0.5 seconds per word

    with tqdm(total=100, desc="Generating Speech", unit="%") as pbar:
        def update_progress(progress):
            pbar.n = int(progress * 100)
            pbar.refresh()

        tts.tts_to_file(
            text=text,
            file_path=output_file,
            speaker_wav=speaker_wav,
            language="en"
        )

        # Simulate progress updates
        for _ in range(100):
            time.sleep(estimated_duration / 100)
            update_progress(_ / 100)

In [15]:
# Main function to process user input, get Groq response, and generate speech
def process_input_and_generate_speech(user_input, speaker_wav, output_file):
    # Get response from Groq
    response = get_groq_response(user_input)
    print("Groq response:", response)

    # Generate speech from the response
    generate_speech(response, speaker_wav, output_file)
    print(f"Speech generated and saved to {output_file}")

In [16]:
# Example usage
if __name__ == "__main__":
    user_input = input("Enter your question: ")
    speaker_wav = "audio/SPEAKER_02_audio.wav"
    output_file = "output_speech.wav"

    process_input_and_generate_speech(user_input, speaker_wav, output_file)

Enter your question:  Can you give a short summary in the difference between rugby and football


Groq response: Here's a brief summary of the main differences between rugby and American football:

**Number of Players:**

* Rugby: 15 players per team (13 for sevens and 10 for tag rugby)
* American Football: 11 players per team (offense and defense)

**Equipment:**

* Rugby: Thinly padded helmets, mouthguards, and minimal protective gear
* American Football: Full-body padding, including helmets, shoulder pads, and knee pads

**Objective:**

* Rugby: Score points by carrying or passing the ball into the end zone (try = 5 points), kicking for penalties (3 points) and conversions (2 points)
* American Football: Score points by carrying or throwing the ball into the end zone ( touchdown = 6 points), kicking for extra points (1-3 points), and field goals (3 points)

**Gameplay:**

* Rugby: Players can run with the ball, pass the ball laterally or backwards, and tackle opponents to the ground to gain possession of the ball
* American Football: Players can run with the ball, pass the ball 

Generating Speech:   0%|                                                                           | 0/100 [00:00<?, ?%/s]

 > Text splitted to sentences.
["Here's a brief summary of the main differences between rugby and American football:", '**Number of Players:**', '* Rugby: 15 players per team (13 for sevens and 10 for tag rugby)', '* American Football: 11 players per team (offense and defense)', '**Equipment:**', '* Rugby: Thinly padded helmets, mouthguards, and minimal protective gear', '* American Football: Full-body padding, including helmets, shoulder pads, and knee pads', '**Objective:**', '* Rugby: Score points by carrying or passing the ball into the end zone (try = 5 points), kicking for penalties (3 points) and conversions (2 points)', '* American Football: Score points by carrying or throwing the ball into the end zone ( touchdown = 6 points), kicking for extra points (1-3 points), and field goals (3 points)', '**Gameplay:**', '* Rugby: Players can run with the ball, pass the ball laterally or backwards, and tackle opponents to the ground to gain possession of the ball', '* American Football:

Generating Speech:   0%|                                                                           | 0/100 [02:12<?, ?%/s]


KeyboardInterrupt: 

In [None]:
# from TTS.tts.configs.glow_tts_config import GlowTTSConfig