# Voice Interaction with Trained GPT-2 Model (hosted on HuggingFace)

In [3]:
import os
import numpy as np
import sounddevice as sd
from scipy.io.wavfile import write
import whisper
from IPython.display import Audio
import torch
from TTS.api import TTS
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

torch.cuda.empty_cache()

# Function to record audio
def record_audio(duration=5, fs=44100):
    """Record two-channel audio and return it as peak-normalized int16 samples.

    Args:
        duration: Length of the recording in seconds.
        fs: Sampling rate in Hz, passed to ``sd.rec`` as ``samplerate``.

    Returns:
        A ``(recording, fs)`` tuple. ``recording`` is the array produced by
        ``sd.rec`` converted to ``int16`` and scaled so that its largest
        absolute sample maps to 32767; ``fs`` is the sampling rate that was
        used. A completely silent (or empty) capture is returned as-is in
        ``int16`` form, i.e. all zeros, instead of being scaled.
    """
    print("Recording for {} seconds...".format(duration))
    recording = sd.rec(int(duration * fs), samplerate=fs, channels=2, dtype='float64')
    sd.wait()  # Wait until recording is finished

    # Peak-normalize only when there is a signal. With an all-zero capture the
    # peak is 0, and dividing by it would fill the buffer with NaNs that turn
    # into arbitrary values once cast to int16.
    peak = np.max(np.abs(recording)) if recording.size else 0.0
    if peak > 0:
        recording = recording / peak * 32767
    return recording.astype(np.int16), fs

# Record audio
# Capture 5 seconds with record_audio() and store the result as a WAV file so
# that Whisper can read it back from disk in the next step.
output_directory = "../data/input/audio/speech_to_transcribe"
os.makedirs(output_directory, exist_ok=True)  # no error if the directory already exists
audio, fs = record_audio(duration=5)  # audio: int16 samples, fs: sampling rate in Hz
audio_file_path = os.path.join(output_directory, "my_voice_recording.wav")
write(audio_file_path, fs, audio)  # scipy.io.wavfile.write(filename, rate, data)
print(f"Recording saved to {audio_file_path}")

# Convert speech to text using Whisper
# NOTE: the name `model` is reassigned to the GPT-2 model further down, so the
# Whisper model is only reachable through this name until then.
model = whisper.load_model("small")
# language="en" is passed explicitly; presumably this makes Whisper skip
# language auto-detection -- confirm against the Whisper documentation.
result = model.transcribe(audio_file_path, language="en")
# The transcript is used exactly as returned under the "text" key (no stripping).
transcribed_text = result["text"]
print("Transcribed text:", transcribed_text)

# Initialize GPT-2 model and tokenizer
# The model runs on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_url = "carecodeconnect/jhana-gpt2-v2"  # Hugging Face Hub repository ID (not a full URL)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_url)
# NOTE: this rebinds `model`, which previously referred to the Whisper model.
model = GPT2LMHeadModel.from_pretrained(model_url).to(device)

# Process the transcribed text with GPT-2
# Fail early with a clear message: an empty transcript would otherwise reach
# generate() as an empty prompt and fail with an obscure error.
if not transcribed_text.strip():
    raise ValueError("No speech was transcribed; cannot build a prompt for GPT-2.")

input_ids = tokenizer.encode(transcribed_text, return_tensors="pt").to(device)
generated_ids = model.generate(
    input_ids,
    # Single, unpadded prompt: every position is a real token, so the mask is
    # all ones. Passing it explicitly avoids generate() having to guess it.
    attention_mask=torch.ones_like(input_ids),
    max_length=100,  # total length in tokens, prompt included
    do_sample=True,
    temperature=0.5,
)
# skip_special_tokens=True keeps markers such as the end-of-text token out of
# the string that is printed and later handed to the TTS engine.
generated_sequence = tokenizer.decode(
    generated_ids[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
print("GPT-2 generated sequence:", generated_sequence)

# Release cached GPU memory before the TTS model is loaded.
torch.cuda.empty_cache()

# Convert GPT-2's output to speech using TTS
# All paths are bound before the try block so that the playback step below can
# always refer to output_file_path, even when loading the TTS model fails.
tts_model_path = "tts_models/multilingual/multi-dataset/xtts_v2"
speaker_wav_path = "../data/input/audio/voices_to_clone/audio_cf_10_seconds.wav"
output_audio_directory = "../data/output/audio/"
output_file_path = os.path.join(output_audio_directory, "generated_speech.wav")

# Tracks whether THIS run produced the file, so that a leftover file from an
# earlier run is never played back as if it were the new result.
speech_generated = False
try:
    tts = TTS(tts_model_path).to(device)  # Adjust model as needed
    os.makedirs(output_audio_directory, exist_ok=True)
    tts.tts_to_file(
        text=generated_sequence,
        file_path=output_file_path,
        language="en",
        speaker_wav=speaker_wav_path,  # reference recording of the voice to clone
    )
except Exception as e:
    # Best effort: report the problem and continue without audio output.
    print(f"Error using TTS model: {e}")
else:
    speech_generated = True
    print(f"Text-to-speech audio saved to {output_file_path}")

# Play the generated speech
if not speech_generated:
    print("Skipping playback because speech synthesis failed.")
elif os.path.exists(output_file_path):
    display(Audio(output_file_path))
else:
    print("Audio file not found.")



Recording for 5 seconds...
Recording saved to ../data/input/audio/speech_to_transcribe/my_voice_recording.wav
Transcribed text:  Jhana, the first Jhana.
GPT-2 generated sequence: Jhana, the first Jhana. are the factors of the first Jhana, is the first jhana, which is the first jhana, which is the first jhana, which is the piti and sukkha, which is the sukkha and the happiness.. in the first jhana, which is accompanied by the second jhana. sukkha, and has no sukkha. sukkha, and sukkha. sukkha. sukkha. sukkha... sukkha, and sukkha. sukkha. sukkha, and sukkha. sukkha. sukkha, which is neither sukkha
 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts
 > Text splitted to sentences.
['Jhana, the first Jhana.', 'are the factors of the first Jhana, is the first jhana, which is the first jhana, which is the first jhana, which is the piti and sukkha, which is the sukkha and the happiness.', '.', 'in the first jhana, which is accompanied by the second jha