<a href="https://colab.research.google.com/github/bori0824/G3-finalproject/blob/main/DLTESOL/Intonation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intonation contour: visible intonation (pitch track)

In [None]:
#@markdown 📌 Run this code before you start
%%capture
!pip install pyqrcode gradio pandas gtts requests librosa matplotlib pydub

In [None]:
#@markdown Generate speech and show intonation
from gtts import gTTS
from pydub import AudioSegment
from io import BytesIO
from IPython.display import Audio
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import os

# Function to generate and save a WAV file
def generate_and_save_wav(word, filename='output.wav'):
    """Synthesize ``word`` as English speech and write it to ``filename`` as WAV.

    The speech is produced by gTTS as MP3 data held in memory, then converted
    to WAV with pydub.

    Args:
        word: Text to speak.
        filename: Destination path of the WAV file.

    Returns:
        The ``filename`` that was written.
    """
    mp3_stream = BytesIO()
    gTTS(text=word, lang='en').write_to_fp(mp3_stream)
    mp3_stream.seek(0)  # rewind so pydub reads the MP3 data from the start

    speech = AudioSegment.from_file(mp3_stream, format="mp3")
    speech.export(filename, format="wav")
    return filename

# Function to extract and plot the pitch contour
def plot_pitch_contour(audio_file_path):
    """Show the waveform of an audio file with its estimated pitch overlaid.

    The pitch is estimated with ``librosa.pyin`` between the notes C2 and C6.
    Frames without a finite, positive pitch estimate are not drawn.

    Args:
        audio_file_path: Path of the audio file; it is loaded at its native
            sample rate (``sr=None``).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    y, sr = librosa.load(audio_file_path, sr=None)
    fmin = librosa.note_to_hz('C2')
    fmax = librosa.note_to_hz('C6')
    pitch, _voiced_flag, _voiced_probs = librosa.pyin(y, fmin=fmin, fmax=fmax, sr=sr)
    pitch[~np.isfinite(pitch)] = 0  # frames without an estimate become 0 and are masked below
    times = librosa.times_like(pitch, sr=sr)
    voiced = pitch > 0

    plt.figure(figsize=(14, 5))
    librosa.display.waveshow(y, sr=sr)
    # One vectorized call instead of one plot call (and one artist) per frame.
    plt.plot(times[voiced], pitch[voiced], 'ro')

    plt.title('Pitch Contour')
    plt.xlabel('Time (s)')
    plt.ylabel('Pitch (Hz)')
    plt.ylim(0, 350)
    plt.show()

# `display` is needed to render the audio player from the middle of a cell.
from IPython.display import display

# Get user input
mytext = input('Type a word or sentence: ')

# Generate speech and save to a WAV file
audio_file = generate_and_save_wav(mytext)

# Play the audio. A bare `Audio(...)` expression is rendered only when it is the
# last expression of a cell, so the player is displayed explicitly here.
print(f"Generated speech for: {mytext}")
display(Audio(audio_file))

# Display the pitch contour
print(f"Pitch contour for: {mytext}")
plot_pitch_contour(audio_file)


# Gradio

In [None]:
!pip install gradio

In [None]:
import gradio as gr
from gtts import gTTS
from pydub import AudioSegment
from io import BytesIO
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import numpy as np

def generate_and_save_wav(word):
    """Synthesize ``word`` as English speech and return the audio as WAV bytes.

    The MP3 stream produced by gTTS is converted to WAV with pydub entirely in
    memory; no file is written.
    """
    mp3_stream = BytesIO()
    gTTS(text=word, lang='en').write_to_fp(mp3_stream)
    mp3_stream.seek(0)  # rewind before handing the stream to pydub

    wav_stream = BytesIO()
    speech = AudioSegment.from_file(mp3_stream, format="mp3")
    speech.export(wav_stream, format="wav")
    return wav_stream.getvalue()  # Whole buffer content, i.e. the bytes of the audio file

def plot_pitch_contour(word, audio_bytes=None):
    """Render the waveform and pitch contour of synthesized speech as an image.

    Args:
        word: Text to synthesize when ``audio_bytes`` is not supplied.
        audio_bytes: Optional WAV bytes already generated for ``word``.
            Supplying them avoids a second text-to-speech request.

    Returns:
        The rendered PNG figure as a NumPy array, usable by a Gradio image output.
    """
    if audio_bytes is None:
        audio_bytes = generate_and_save_wav(word)
    y, sr = librosa.load(BytesIO(audio_bytes), sr=None)
    fmin = librosa.note_to_hz('C2')
    fmax = librosa.note_to_hz('C6')
    pitch, _voiced_flag, _voiced_probs = librosa.pyin(y, fmin=fmin, fmax=fmax, sr=sr)
    times = librosa.times_like(pitch, sr=sr)
    # Only frames with a finite estimate are drawn, so frames without one do not
    # show up as misleading 0 Hz points.
    voiced = np.isfinite(pitch)

    buf = BytesIO()
    # Explicit figure/axes objects avoid relying on pyplot's global "current figure".
    fig, ax = plt.subplots(figsize=(14, 5))
    try:
        librosa.display.waveshow(y, sr=sr, ax=ax)
        ax.scatter(times[voiced], pitch[voiced], color='red', s=1)
        ax.set_title('Pitch Contour')
        ax.set_xlabel('Time (s)')
        ax.set_ylabel('Pitch (Hz)')
        ax.set_ylim(0, 350)
        fig.savefig(buf, format="png")
    finally:
        plt.close(fig)  # release the figure even if plotting or saving fails
    buf.seek(0)

    return np.array(Image.open(buf))  # Return numpy array directly usable by Gradio

def process_text(word):
    """Synthesize ``word`` once and return ``(wav_bytes, pitch_plot_array)``.

    The same audio bytes are used for playback and for the pitch plot, so only
    one text-to-speech request is made.
    """
    audio_bytes = generate_and_save_wav(word)
    plot_img_array = plot_pitch_contour(word, audio_bytes=audio_bytes)
    return audio_bytes, plot_img_array

# Gradio UI: a text input wired to process_text, whose two return values feed
# the audio output and the image output, in that order.
iface = gr.Interface(
    fn=process_text,
    inputs="text",
    outputs=["audio", "image"],
    title="Speech Synthesis and Intonation Display",
    description="Type a word or sentence to generate speech and visualize the intonation pitch contour."
)

# Alternative: a plain `iface.launch()` without the debug flag.

# NOTE(review): per the Gradio docs, debug=True blocks this cell and shows errors in its output - confirm for the installed version.
iface.launch(debug=True)

In [None]:
import librosa
import matplotlib
import numpy
# Report the installed versions of the libraries used for the pitch analysis.
print(
    f"Librosa: {librosa.__version__}",
    f"Matplotlib: {matplotlib.__version__}",
    f"NumPy: {numpy.__version__}",
    sep="\n",
)


This is the code on Hugging Face (2024-04-17).

In [None]:
import gradio as gr
from gtts import gTTS
from pydub import AudioSegment
from io import BytesIO
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import numpy as np

import matplotlib
matplotlib.use('Agg')  # Set the backend to 'Agg' for non-GUI environments


def generate_and_save_wav(word):
    """Return English text-to-speech audio for ``word`` as WAV bytes.

    gTTS writes MP3 data into an in-memory stream, which pydub then converts
    to WAV; nothing is written to disk.
    """
    mp3_stream = BytesIO()
    gTTS(text=word, lang='en').write_to_fp(mp3_stream)
    mp3_stream.seek(0)  # rewind so pydub decodes from the beginning

    wav_stream = BytesIO()
    decoded = AudioSegment.from_file(mp3_stream, format="mp3")
    decoded.export(wav_stream, format="wav")
    return wav_stream.getvalue()  # Whole buffer content, i.e. the bytes of the audio file

def plot_pitch_contour(word, audio_bytes=None):
    """Render the waveform and pitch contour of synthesized speech as an image.

    Args:
        word: Text to synthesize when ``audio_bytes`` is not supplied.
        audio_bytes: Optional WAV bytes already generated for ``word``.
            Supplying them avoids a second text-to-speech request.

    Returns:
        The rendered PNG figure as a NumPy array, usable by a Gradio image output.
    """
    if audio_bytes is None:
        audio_bytes = generate_and_save_wav(word)
    y, sr = librosa.load(BytesIO(audio_bytes), sr=None)
    fmin = librosa.note_to_hz('C2')
    fmax = librosa.note_to_hz('C6')
    pitch, _voiced_flag, _voiced_probs = librosa.pyin(y, fmin=fmin, fmax=fmax, sr=sr)
    times = librosa.times_like(pitch, sr=sr)
    # Only frames with a finite estimate are drawn, so frames without one do not
    # show up as misleading 0 Hz points.
    voiced = np.isfinite(pitch)

    buf = BytesIO()
    # Explicit figure/axes objects avoid relying on pyplot's global "current figure".
    fig, ax = plt.subplots(figsize=(14, 5))
    try:
        librosa.display.waveshow(y, sr=sr, ax=ax)
        ax.scatter(times[voiced], pitch[voiced], color='red', s=1)
        ax.set_title('Pitch Contour')
        ax.set_xlabel('Time (s)')
        ax.set_ylabel('Pitch (Hz)')
        ax.set_ylim(0, 350)
        fig.savefig(buf, format="png")
    finally:
        plt.close(fig)  # release the figure even if plotting or saving fails
    buf.seek(0)

    return np.array(Image.open(buf))  # Return numpy array directly usable by Gradio

def process_text(word):
    """Synthesize ``word`` once and return ``(wav_bytes, pitch_plot_array)``.

    The same audio bytes are used for playback and for the pitch plot, so only
    one text-to-speech request is made.
    """
    audio_bytes = generate_and_save_wav(word)
    plot_img_array = plot_pitch_contour(word, audio_bytes=audio_bytes)
    return audio_bytes, plot_img_array

# Gradio UI: a text input wired to process_text, whose two return values feed
# the audio output and the image output, in that order.
iface = gr.Interface(
    fn=process_text,
    inputs="text",
    outputs=["audio", "image"],
    title="Speech Synthesis and Intonation Display",
    description="Type a word or sentence to generate speech and visualize the intonation pitch contour."
)

# Alternative: a plain `iface.launch()` without the debug flag.

# NOTE(review): per the Gradio docs, debug=True blocks the main thread while the app runs - confirm it is wanted for the hosted deployment.
iface.launch(debug=True)