In [22]:

!pip install git+https://github.com/openai/whisper.git
!pip install transformers
!pip install edge-tts
!pip install pydub


import whisper
from transformers import pipeline, GPT2Tokenizer
import edge_tts
import asyncio
import numpy as np
import scipy.io.wavfile as wav
import requests
from pydub import AudioSegment


def download_audio(url, filename):
    response = requests.get(url)
    with open(filename, 'wb') as file:
        file.write(response.content)

    try:
        audio = AudioSegment.from_file(filename)

        audio = audio.set_frame_rate(16000).set_sample_width(2)
        audio.export(filename, format="wav")
    except Exception as e:
        print(f"Error converting audio: {e}")


def load_whisper_model():
    model = whisper.load_model("base")
    return model

def audio_to_text(model, audio_file):

    try:
        _, audio_data = wav.read(audio_file)
    except Exception as e:
        print(f"Error loading audio with scipy: {e}")
        return ""

    result = model.transcribe(audio_file)
    return result['text']


def load_llm():
    generator = pipeline('text-generation', model='gpt2')  # Example: GPT-2
    return generator


def generate_response(generator, text_input):
    response = generator(text_input, max_length=50, truncation=True)[0]['generated_text']
    return response


async def text_to_speech(text, output_file):
    communicate = edge_tts.Communicate(text)
    await communicate.save( output_file)




Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-c0v5k_0t
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-c0v5k_0t
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [23]:
def run_pipeline():

    audio_input = "input_audio.wav"
    output_audio = "output_speech.mp3"

    #sample audio
    download_audio("https://www.soundjay.com/button/sounds/beep-07a.wav", audio_input)


    whisper_model = load_whisper_model()
    llm = load_llm()

    #Convert audio to text
    text = audio_to_text(whisper_model, audio_input)
    print(f"Transcribed Text: {text}")


    response = generate_response(llm, text)
    print(f"LLM Response: {response}")

    #Convert text to speech

    !pip install nest_asyncio
    import nest_asyncio
    nest_asyncio.apply()
    asyncio.run(text_to_speech(response, output_audio))


    print(f"Audio saved as {output_audio}")


run_pipeline()

Error converting audio: Decoding failed. ffmpeg returned error code: 1

Output from ffmpeg/avlib:

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh -

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Error loading audio with scipy: File format b'<!DO' not understood. Only 'RIFF' and 'RIFX' supported.
Transcribed Text: 
LLM Response: 
The UESPWiki – Your source for The Elder Scrolls since 1995

The Hinterland

It is sometimes hard to keep track of all of the quests in Skyrim. There is always an asterisk next to something.

Audio saved as output_speech.mp3


In [24]:
from IPython.display import Audio


Audio("output_speech.mp3")
