# Wave2Vec2.0 Test for sexy TTS later perhaps

In [None]:
%pip install -qU ipywidgets

In [None]:
%pip install -qU datasets
%pip install -qU transformers
%pip install -qU torchaudio
%pip install -qU jiwer
%pip install -qU accelerate


In [None]:
import torch # type: ignore
torch.__version__  

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from datasets import load_dataset, load_metric, Audio

common_voice_train = load_dataset("mozilla-foundation/common_voice_16_0", "en", split="train+validation", use_auth_token=True)
common_voice_test = load_dataset("mozilla-foundation/common_voice_16_0", "en", split="test", use_auth_token=True)

In [None]:
%pip install -qU python-dotenv sounddevice ffmpeg-python requests groq gtts numpy torch torchaudio silero-vad

In [8]:
import sounddevice as sd
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
import torch
import numpy as np
import ffmpeg
import requests
import os
from gtts import gTTS
import tempfile
from groq import Groq
import numpy as np
import sys
import queue
import threading
import wave
import dotenv

In [3]:
dotenv.load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
groq = Groq(api_key=GROQ_API_KEY)
vad = load_silero_vad()

In [5]:
def text_to_speech(text):
    tts = gTTS(text=text, lang='en')
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tts.save(temp_file.name)
    os.system(f"ffplay -nodisp -autoexit {temp_file.name}") 
    os.remove(temp_file.name)

# STT: send audio to Groq Whisper API for transcription
def send_to_groq_whisper_api(
    filename="./recording.wav", 
    api_url="https://api.groq.com/openai/v1/audio/transcriptions", 
    api_key=GROQ_API_KEY
):
    print("Sending audio to Groq Whisper API...")
    
    with open(filename, 'rb') as audio_file:
        headers = {
            "Authorization": f"Bearer {api_key}"
        }
        files = {
            "file": (os.path.basename(filename), audio_file, "audio/wav"),
            "model": (None, "whisper-large-v3-turbo") 
        }
        
        response = requests.post(api_url, headers=headers, files=files)
    
    if response.status_code == 200:
        print("Transcription successful!")
        return response.json().get("text", "")
    else:
        print(f"Error: {response.status_code}, {response.text}")
        return None



### VAD

In [None]:
def process_audio_chunk(vad_model, audio_chunk, sample_rate):
    audio_chunk = np.array(audio_chunk, dtype=np.float32)
    
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
        temp_filename = temp_file.name
        
        with wave.open(temp_filename, 'wb') as wf:
            wf.setnchannels(1) 
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(audio_chunk.tobytes())
            
        wav = read_audio(temp_filename)
        speech_timestamps = get_speech_timestamps(wav, vad_model, return_seconds=True, sampling_rate=sample_rate)

    print("Speech Timestamps:", speech_timestamps)
    return speech_timestamps

def silero_real_time(file_name="./recording.wav", sample_rate=16000, chunk_duration=2, threshold_silence=2.0):
    vad_model = load_silero_vad()
    audio_queue = queue.Queue()
    recorded_audio = []
    silence_duration = 0
    
    def callback(indata, frames, time, status):
        if status:
            print(status, file=sys.stderr)
        print(f"Callback triggered, frames: {frames} Audio chunk: {indata[:3]}...")
        audio_queue.put(indata)

    # Start recording in a non-blocking way
    with sd.InputStream(callback=callback, channels=1, samplerate=sample_rate, blocksize=chunk_duration * sample_rate):
        print("Start speaking...")
        recording = False
        
        while True:
            audio_chunk = audio_queue.get()
            speech_timestamps = process_audio_chunk(vad_model, audio_chunk, sample_rate)
            
            if speech_timestamps:
                silence_duration = 0
                if not recording:
                    print("Speech detected. Starting recording...")
                    recording = True
                recorded_audio.append(audio_chunk)
                
            else:
                silence_duration += chunk_duration
                print(f"Silence:{silence_duration}")
                if silence_duration >= threshold_silence:
                    print("Silence detected for long enough. Stopping recording...")
                    break

    if recorded_audio:
        final_audio = np.concatenate(recorded_audio, axis=0)
        sd.write(file_name, final_audio, sample_rate)
        print(f"Recording saved as '{file_name}'")
        return file_name
    else:
        print("No speech detected during recording.")
    return None

In [57]:
file = silero_real_time()
if file:
    transcription = send_to_groq_whisper_api(filename=file)
    if transcription:
        print("Transcription Result:", transcription)

Start speaking...
Callback triggered, frames: 32000 Audio chunk: [[-0.00446457]
 [-0.00704147]
 [-0.00496611]]...
Speech Timestamps: []
Silence:2
Silence detected for long enough. Stopping recording...
Callback triggered, frames: 32000 Audio chunk: [[0.0114101 ]
 [0.01295299]
 [0.01304272]]...
No speech detected during recording.


In [17]:
# generate interview questions using t2t LLM
def generate_question(job_description, prompt="", prev=None):
    system_message = f"""Based on the job description: {job_description}\n
        You are conducting an interview, acting as an expert on the topics mentioned in the job description
        End the interview when necessary
        Your response is directly relayed to the user, Only respond with the interview question itself, without any introductions or extra phrases.
        """
    if prev:
        prompt += f"Candidate: {prev}\n"
    prompt += "Please generate the next interview question:"
    
    response = groq.chat.completions.create(
        model="llama-3.2-90b-text-preview",
        max_tokens=150,
        messages=[
            {
                "role":"system",
                "content":system_message
            },
            {
                "role":"user",
                "content":prompt
            }
        ]
    )
    return response['text'].strip(), prompt

# Interview Loop
def start_interview(job_description):
    prompt="Interview start\n"
    text_to_speech("Welcome to the interview. Let's get started!")
    question, prompt = generate_question(job_description, prompt)

    while True:
        
        text_to_speech(question)
        print("Recording your answer...")
        
        record_audio_with_vad()
        
        answer = send_to_groq_whisper_api(filename="./recording.wav")
        print(f"Transcribed Answer: {answer}")
        
        question, prompt = generate_question(job_description, prompt, prev=answer)
        prompt += f"Interviewer: {question}\n"
        
        if "end interview" in question.lower():
            text_to_speech("Thank you for participating in the interview.")
            break

In [18]:

# Start the interview with a given job description
job_description = "Cloud Intern with expertise in Docker."
start_interview(job_description)

ffplay version n7.1 Copyright (c) 2003-2024 the FFmpeg developers
  built with gcc 14.2.1 (GCC) 20240910
  configuration: --prefix=/usr --disable-debug --disable-static --disable-stripping --enable-amf --enable-avisynth --enable-cuda-llvm --enable-lto --enable-fontconfig --enable-frei0r --enable-gmp --enable-gnutls --enable-gpl --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libdav1d --enable-libdrm --enable-libdvdnav --enable-libdvdread --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgsm --enable-libharfbuzz --enable-libiec61883 --enable-libjack --enable-libjxl --enable-libmodplug --enable-libmp3lame --enable-libopencore_amrnb --enable-libopencore_amrwb --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libplacebo --enable-libpulse --enable-librav1e --enable-librsvg --enable-librubberband --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable




TypeError: 'ChatCompletion' object is not subscriptable

# webrtcvad

In [None]:
%pip install webrtcvad

import wave
import webrtcvad

def record_audio_with_vad(output_filename="./recording.wav", sample_rate=16000, duration=10, vad_aggressiveness=3):

    print("Recording with VAD...")

    # Initialize VAD
    vad = webrtcvad.Vad(vad_aggressiveness)
    buffer_duration = 0.03  # 30 ms
    buffer_size = int(sample_rate * buffer_duration)
    
    audio = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1, dtype='int16')
    sd.wait()

    audio_bytes = audio.tobytes()

    # Process audio in chunks and perform VAD
    speech_detected = False
    speech_frames = []
    for i in range(0, len(audio_bytes), buffer_size * 2):  # 2 bytes per int16 sample
        frame = audio_bytes[i:i + buffer_size * 2]
        if len(frame) < buffer_size * 2:
            break

        if vad.is_speech(frame, sample_rate):
            speech_detected = True
            speech_frames.append(frame)
        elif speech_detected:
            # Stop when speech ends
            break

    with wave.open(output_filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 2 bytes for int16
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(speech_frames))

    print(f"Recording saved to {output_filename}")
    return output_filename