In [29]:
import json
import os
import tempfile
import threading

import azure.cognitiveservices.speech as speechsdk
import noisereduce as nr
import streamlit as st
from dotenv import load_dotenv
from pydub import AudioSegment
from scipy.io import wavfile

In [30]:
# Credentials come from the environment so the subscription key is never
# stored in the notebook. The key that used to be hard-coded here has been
# exposed and should be rotated in the Azure portal.
# AZURE_* names take precedence; SPEECH_KEY / SPEECH_REGION are accepted as
# fallbacks. An empty key makes the SDK calls below fail, which is the signal
# that the environment has not been configured.
AZURE_SPEECH_KEY = os.environ.get("AZURE_SPEECH_KEY") or os.environ.get("SPEECH_KEY", "")
AZURE_SERVICE_REGION = (
    os.environ.get("AZURE_SERVICE_REGION")
    or os.environ.get("SPEECH_REGION", "southeastasia")
)

In [31]:
def convert_to_wav(input_file, output_file=None):
    """Convert an audio file to WAV and return the path of the WAV file.

    Args:
        input_file: Path of the source audio, in any format that
            ``AudioSegment.from_file`` can read.
        output_file: Destination path. When omitted, the WAV is written next
            to ``input_file`` with its extension replaced by ``.wav``.

    Returns:
        The path the WAV data was exported to.
    """
    if output_file is None:
        output_file = os.path.splitext(input_file)[0] + ".wav"

    audio = AudioSegment.from_file(input_file)
    audio.export(output_file, format="wav")
    # Return the destination so callers can chain on the converted file.
    return output_file

def reduce_noise(audio_path):
    """Apply noise reduction to an audio file.

    Non-WAV input is first converted to a temporary WAV file, which is removed
    again once its samples have been read.

    Args:
        audio_path: Path of the audio file to clean up.

    Returns:
        Path of a new temporary ``.wav`` file holding the noise-reduced audio.
        The file is not deleted automatically; the caller owns it.
    """
    converted_path = None
    if not audio_path.endswith(".wav"):
        # Reserve a path, then close the handle so the converter can write to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as converted:
            converted_path = converted.name
        convert_to_wav(audio_path, converted_path)
        audio_path = converted_path

    try:
        rate, data = wavfile.read(audio_path)
    finally:
        # The intermediate conversion is only needed until the samples are loaded.
        if converted_path is not None:
            os.remove(converted_path)

    reduced_noise = nr.reduce_noise(y=data, sr=rate)

    # Close the handle before writing by name so it is not left open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        output_path = temp_file.name
    wavfile.write(output_path, rate, reduced_noise)
    return output_path

def azure_speech_to_text(audio_path):
    """Transcribe an audio file using Azure Speech Services.

    Runs continuous recognition over the whole file with automatic language
    detection between ``en-US`` and ``id-ID``.

    Args:
        audio_path: Path of the audio file handed to the Speech SDK.

    Returns:
        The recognized segments joined by newlines, each prefixed with its
        detected language as ``[lang] text``. If recognition fails, a string
        starting with ``"Error during Azure transcription:"`` is returned.
    """
    speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SERVICE_REGION)
    audio_config = speechsdk.audio.AudioConfig(filename=audio_path)

    # Enable auto language detection
    auto_language_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(languages=["en-US", "id-ID"])

    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config,
        audio_config=audio_config,
        auto_detect_source_language_config=auto_language_config
    )

    all_transcriptions = []
    errors = []
    # Set from the SDK's callback thread once the session ends or is canceled.
    finished = threading.Event()

    def recognized_handler(evt):
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            # The detected language is exposed through this wrapper, not on the result itself.
            detected_language = speechsdk.AutoDetectSourceLanguageResult(evt.result).language
            all_transcriptions.append(f"[{detected_language}] {evt.result.text}")

    def canceled_handler(evt):
        details = evt.cancellation_details
        if details.reason == speechsdk.CancellationReason.Error:
            errors.append(details.error_details)
        finished.set()

    def stopped_handler(evt):
        finished.set()

    speech_recognizer.recognized.connect(recognized_handler)
    speech_recognizer.canceled.connect(canceled_handler)
    speech_recognizer.session_stopped.connect(stopped_handler)

    try:
        speech_recognizer.start_continuous_recognition()
        try:
            # Block until the file has been processed; stopping right after
            # starting would end the session before any result arrives.
            finished.wait()
        finally:
            speech_recognizer.stop_continuous_recognition()
    except Exception as e:
        return f"Error during Azure transcription: {e}"

    if errors:
        return f"Error during Azure transcription: {errors[0]}"

    return "\n".join(all_transcriptions)

In [32]:
# Source recording to transcribe.
# NOTE(review): this is an absolute path on one machine; adjust it (or make it
# relative to a configurable data directory) before running elsewhere.
audio_path = "D:/Github/Speech-to-Text-Summarization/data/audio/Natadesa/Recording 6.m4a"

# Convert the recording to "temp_audio.wav"; the recognition functions in the
# later cells read that same file.
convert_to_wav(audio_path, "temp_audio.wav")
# Single-shot recognition with no recognition language set on the config.
# NOTE(review): presumably recognize_once stops after the first utterance, so
# only the start of the recording is transcribed; confirm against the SDK docs.
speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SERVICE_REGION)
audio_config = speechsdk.audio.AudioConfig(filename="temp_audio.wav")
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
speech_recognition_result = speech_recognizer.recognize_once_async().get()
# Bare expression so the recognized text is shown as the cell output.
speech_recognition_result.text

'Tapi tu sangat.'

''

In [34]:
import os
import azure.cognitiveservices.speech as speechsdk

def recognize_from_microphone(audio_file="temp_audio.wav"):
    """Run single-shot en-US speech recognition and print the outcome.

    Args:
        audio_file: Path of the audio file to recognize. Pass ``None`` to
            capture from the default microphone instead. The default keeps
            the notebook's existing behaviour of reading ``temp_audio.wav``.

    Prints the recognized text, or the no-match / cancellation details when
    nothing was recognized.
    """
    # Credentials come from the module-level AZURE_SPEECH_KEY / AZURE_SERVICE_REGION.
    speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SERVICE_REGION)
    speech_config.speech_recognition_language = "en-US"

    # Announce the source actually in use, so a file run never asks the user to speak.
    if audio_file is None:
        audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
        print("Speak into your microphone.")
    else:
        audio_config = speechsdk.audio.AudioConfig(filename=audio_file)
        print("Recognizing speech from {}...".format(audio_file))

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    speech_recognition_result = speech_recognizer.recognize_once_async().get()

    if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(speech_recognition_result.text))
    elif speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(speech_recognition_result.no_match_details))
    elif speech_recognition_result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = speech_recognition_result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
            print("Did you set the speech resource key and region values?")

recognize_from_microphone()

Speak into your microphone.


KeyboardInterrupt: 

In [38]:
import os
import azure.cognitiveservices.speech as speechsdk

def recognize_from_audio_file_continuous(audio_file_path="temp_audio.wav", language="id-ID"):
    """Transcribe a whole audio file with continuous recognition.

    Each recognized segment is printed as ``Recognized: <text>`` as soon as
    the service reports it.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        language: Recognition language passed to the Speech SDK.
    """
    speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SERVICE_REGION)
    speech_config.speech_recognition_language = language

    audio_config = speechsdk.audio.AudioConfig(filename=audio_file_path)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    print("Processing the audio file for continuous recognition...")

    def recognized_callback(evt):
        print("Recognized: {}".format(evt.result.text))

    # Set from the SDK's callback thread; waiting on it blocks without
    # spinning a CPU core the way a `while not done: pass` loop does.
    finished = threading.Event()

    def stop_cb(evt):
        finished.set()

    # Connect callbacks
    speech_recognizer.recognized.connect(recognized_callback)
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous recognition and always stop it, even if interrupted.
    speech_recognizer.start_continuous_recognition()
    try:
        finished.wait()
    finally:
        speech_recognizer.stop_continuous_recognition()

recognize_from_audio_file_continuous()


Processing the audio file for continuous recognition...
Recognized: Tetapi juga dan kawasan hukum.
Recognized: Terus kita juga ada kerja sama sama uh biasanya harus narik manajemen hilang.
Recognized: Buat bantu ngelola jadi harus bekerja sama dengan manajemen film kayak kita kan ngambilnya elf hva.
Recognized: Jadi kita boleh dishare enggak sih pak? Kalau materi ini ini materi prestasi ppt paling nanti ih brosurnya aja ya yang yang petah betah tadi boleh nih ada di.
Recognized: Di sini jujur anaknya rebel ini sih.
Recognized: 3 episode enggak ada ya? Di website enggak ada enggak ada.
Recognized: Makanya kalau enggak datang ke sini enggak tahu. Iya, makanya iya kalau enggak ketemu mbak lisa juga enggak tahu. Enggak tahu sih.
Recognized: Oh ini estimate troy nya ini memang di state ya wow sampai 6 bisa di Bali.
Recognized: Jujur lebih jauh daripada.
Recognized: Oh tipe dia lebih cepat. Itu kan kayak sewa tahu.
Recognized: Lebih lebih gede.


In [39]:
def recognize_with_segment_timestamps(audio_file_path="temp_audio.wav", language="id-ID"):
    """Transcribe an audio file and print each segment with its start time.

    Every recognized segment is printed as ``[MM:SS] <text>``, where the
    timestamp is the segment's offset from the start of the audio.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        language: Recognition language passed to the Speech SDK.
    """
    speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SERVICE_REGION)
    speech_config.speech_recognition_language = language

    # Request detailed output
    speech_config.output_format = speechsdk.OutputFormat.Detailed

    audio_config = speechsdk.audio.AudioConfig(filename=audio_file_path)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    print("Processing the audio file for transcription with timestamps...")

    def recognized_callback(evt):
        # Skip results that carry no recognized speech.
        if evt.result.reason != speechsdk.ResultReason.RecognizedSpeech:
            return

        # Parse the payload as JSON. eval() would execute whatever the string
        # contains and also fails on JSON literals such as true/false/null.
        result_dict = json.loads(evt.result.json)

        recognized_text = result_dict["DisplayText"]
        offset = result_dict["Offset"]  # Start time in 100-nanosecond ticks

        # Convert the offset to whole minutes and seconds (MM:SS).
        start_minutes, start_seconds = divmod(int(offset // 10**7), 60)

        # Print timestamp and text
        print(f"[{start_minutes:02}:{start_seconds:02}] {recognized_text}")

    # Set from the SDK's callback thread; waiting on it blocks without
    # spinning a CPU core the way a `while not done: pass` loop does.
    finished = threading.Event()

    def stop_cb(evt):
        finished.set()

    # Connect callbacks
    speech_recognizer.recognized.connect(recognized_callback)
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous recognition and always stop it, even if interrupted.
    speech_recognizer.start_continuous_recognition()
    try:
        finished.wait()
    finally:
        speech_recognizer.stop_continuous_recognition()

recognize_with_segment_timestamps()

Processing the audio file for transcription with timestamps...
[00:02] Tetapi juga dan kawasan hukum.
[00:05] Terus kita juga ada kerja sama sama uh biasanya harus narik manajemen hilang.
[00:12] Buat bantu ngelola jadi harus bekerja sama dengan manajemen film kayak kita kan ngambilnya elf hva.
[00:24] Jadi kita boleh dishare enggak sih pak? Kalau materi ini ini materi prestasi ppt paling nanti ih brosurnya aja ya yang yang petah betah tadi boleh nih ada di.
[00:36] Di sini jujur anaknya rebel ini sih.
[00:40] 3 episode enggak ada ya? Di website enggak ada enggak ada.
[00:45] Makanya kalau enggak datang ke sini enggak tahu. Iya, makanya iya kalau enggak ketemu mbak lisa juga enggak tahu. Enggak tahu sih.
[00:52] Oh ini estimate troy nya ini memang di state ya wow sampai 6 bisa di Bali.
[00:59] Jujur lebih jauh daripada.
[01:03] Oh tipe dia lebih cepat. Itu kan kayak sewa tahu.
[01:06] Lebih lebih gede.
