### Notes

MFA recommends 16kHz mono audio
https://montreal-forced-aligner.readthedocs.io/en/latest/user_guide/corpus_structure.html

In [1]:
#!pip install montreal-forced-aligner
#!pip install pydub
#!pip install praatio
#!conda install -c conda-forge montreal-forced-aligner


import os
import subprocess
from pydub import AudioSegment
from praatio import textgrid

# Run MFA via terminal
Recommended: running MFA from the terminal shows its progress and error output directly.

In [None]:
# install english_mfa model if not already installed
!mfa model download acoustic english_mfa

#install US dictionary
!mfa model download dictionary english_us_mfa

INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


In [None]:
# Sample alignment run; uses PostgreSQL (`--use_postgres`) instead of the default SQLite backend because of a bug.
# Note: 202004_29 is missing from the sample data.
!mfa server init # for first time only
!mfa server start # when you want to start the server
!mfa align --clean --verbose data/sample_data english_us_mfa english_mfa combined_output2 --use_postgres --auto_server --beam 100 --retry_beam 400

^C


INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
INFO:mfa:Initializing the global MFA database server...
ERROR:mfa:The server directory already exists, if you would like to make a new server, please run `mfa server delete` first, or run `mfa server start` to start the existing one.


# (Run MFA via Python)
Not recommended: the terminal shows critical MFA output that is not available when MFA is run from Python.

In [None]:
import os
import subprocess

def run_mfa_align(sample_data_dir='data/sample_data', combined_output_dir='combined_output'):
    """Initialise and start the MFA server, then align the corpus if anything is missing.

    `mfa align` always processes the whole corpus directory, so the command is
    issued at most once per call: only when at least one audio file in
    `sample_data_dir` has no matching `.TextGrid` in `combined_output_dir`.

    Args:
        sample_data_dir (str): Corpus directory holding audio and transcript files.
        combined_output_dir (str): Directory where MFA writes the `.TextGrid` files.

    Returns:
        subprocess.CompletedProcess | None: Result of the `mfa align` call, or
        None when every audio file already has a TextGrid and nothing was run.
    """
    def _run_and_echo(command):
        """Run `command`, print its captured stdout/stderr, and return the result."""
        result = subprocess.run(command, capture_output=True, text=True)
        print(result.stdout)
        print(result.stderr)
        return result

    # `server init` reports an error if the server directory already exists;
    # that is only echoed, and `server start` then starts the existing server.
    _run_and_echo(['mfa', 'server', 'init'])
    _run_and_echo(['mfa', 'server', 'start'])

    # NOTE(review): assumes a flat corpus directory and these audio extensions —
    # extend the tuple if other audio formats are used.
    audio_extensions = ('.wav', '.flac', '.mp3')

    pending = []
    for filename in sorted(os.listdir(sample_data_dir)):
        stem, extension = os.path.splitext(filename)
        if extension.lower() not in audio_extensions:
            continue  # transcripts and other files produce no TextGrid of their own

        # The alignment output is named after the audio file's stem, not the file itself.
        textgrid_path = os.path.join(combined_output_dir, stem + '.TextGrid')
        if os.path.exists(textgrid_path):
            print(f"Skipping {filename}, already exists in {combined_output_dir}")
        else:
            pending.append(filename)

    if not pending:
        return None

    command = [
        'mfa', 'align', '--clean', '--verbose', sample_data_dir,
        'english_us_mfa', 'english_mfa', combined_output_dir,
        '--use_postgres', '--auto_server', '--beam', '100', '--retry_beam', '400'
    ]
    return _run_and_echo(command)


# Set up the MFA server and run the alignment; the captured MFA output is printed below.
run_mfa_align()


INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
INFO:mfa:Initializing the global MFA database server...
ERROR:mfa:The server directory already exists, if you would like to make a new server, please run `mfa server delete` first, or run `mfa server start` to start the existing one.



In [22]:
# Paths for necessary files
MFA_EXECUTABLE = "mfa"  # Adjust this if the `mfa` command is not in your PATH
ACOUSTIC_MODEL = "english_mfa"  # Pretrained acoustic model
# NOTE(review): an earlier cell downloads the `english_us_mfa` dictionary; confirm that
# "english.dict" exists locally, or point this at the downloaded dictionary instead.
PRONUNCIATION_DICTIONARY = "english.dict"  # Path to CMU dictionary or equivalent


def _link_or_copy(source_path, destination_path):
    """Expose `source_path` at `destination_path` via a symlink, copying if that fails."""
    import shutil  # local import: only needed for the copy fallback

    source_abs = os.path.abspath(source_path)
    if source_abs == os.path.abspath(destination_path):
        return  # source already lives in the destination directory; nothing to do

    # Replace a stale link/copy from an earlier run instead of raising FileExistsError.
    if os.path.lexists(destination_path):
        os.remove(destination_path)

    try:
        # A relative symlink target is resolved relative to the link's own directory,
        # so the absolute path is required for the link to resolve.
        os.symlink(source_abs, destination_path)
    except OSError:
        # e.g. platforms or filesystems where creating symlinks is not permitted
        shutil.copy2(source_abs, destination_path)


def run_mfa_alignment(audio_dir, transcript_dir, output_dir, combined_input_dir="combined_input"):
    """
    Run MFA to align audio and transcription.

    Every `.wav` file in `audio_dir`, together with its same-named `.txt`
    transcript from `transcript_dir`, is linked (or copied) into
    `combined_input_dir`, and `mfa align` is run on that directory.

    Args:
        audio_dir (str): Directory containing `.wav` files.
        transcript_dir (str): Directory containing `.txt` transcription files.
        output_dir (str): Directory to store the alignment results.
        combined_input_dir (str): Working directory that receives the paired
            audio/transcript files handed to MFA.

    Raises:
        SystemExit: With exit code 1 when the `mfa align` command returns a
            non-zero status.
    """
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(combined_input_dir, exist_ok=True)

    # Symlink or copy files into the combined directory
    for audio_file in sorted(os.listdir(audio_dir)):
        if not audio_file.endswith(".wav"):
            continue

        _link_or_copy(
            os.path.join(audio_dir, audio_file),
            os.path.join(combined_input_dir, audio_file),
        )

        # Match transcription files by base name
        transcript_file = os.path.splitext(audio_file)[0] + ".txt"
        transcript_path = os.path.join(transcript_dir, transcript_file)
        if os.path.exists(transcript_path):
            _link_or_copy(transcript_path, os.path.join(combined_input_dir, transcript_file))
        else:
            print(f"Warning: No transcript found for {audio_file}")

    # Positional order expected by `mfa align`:
    # corpus directory, dictionary, acoustic model, output directory.
    command = [
        MFA_EXECUTABLE,
        "align",
        combined_input_dir,
        PRONUNCIATION_DICTIONARY,
        ACOUSTIC_MODEL,
        output_dir,
    ]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running MFA alignment: {e}")
        # Same exception type as `exit(1)` in plain Python, without depending on the
        # interactive `exit` helper (which IPython replaces with its own object).
        raise SystemExit(1) from e
    print(f"Alignment completed. Results stored in: {output_dir}")

# Chunk audio into smaller pieces
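Specify the pause threshold (`break_threshold`, in seconds): a silent interval longer than this ends the current segment.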

In [122]:
#!pip install tgt
from pydub import AudioSegment
import os
import tgt

def _write_segment(audio, tg_file, index, segment_start, segment_end, words,
                   output_audio, output_text, export_audio):
    """Write one segment's transcript and, if requested, its audio slice to disk."""
    if export_audio:
        # pydub slices in milliseconds; the TextGrid times are in seconds.
        audio_segment = audio[segment_start * 1000:segment_end * 1000]
        audio_segment.export(os.path.join(output_audio, f"{tg_file}_{index}.wav"), format="wav")

    with open(os.path.join(output_text, f"{tg_file}_{index}.txt"), "w", encoding="utf-8") as text_file:
        text_file.write(" ".join(words))


# break_threshold: a silent interval longer than this many seconds ends the current segment.
# A larger threshold merges more speech together and therefore yields longer segments.
def split_audio(audio_path, textgrid_path, output_audio, output_text, break_threshold=0.3,
                export_audio=True):
    """Split an aligned recording into pause-delimited audio/text segments.

    The "words" tier of the TextGrid is scanned in order. Every empty interval
    longer than `break_threshold` closes the current segment, which is written
    as `<textgrid stem>_<n>.txt` (and `.wav`), numbered from 1. Shorter pauses
    stay inside the segment's audio but contribute no text. Words after the
    last long pause are written as a final segment, and segments without any
    words are skipped. No maximum segment length is enforced.

    Args:
        audio_path (str): Path to the `.wav` recording.
        textgrid_path (str): Path to the matching `.TextGrid` alignment.
        output_audio (str): Directory that receives the `.wav` segments.
        output_text (str): Directory that receives the `.txt` transcripts.
        break_threshold (float): Minimum pause duration, in seconds, that splits segments.
        export_audio (bool): Set to False to regenerate only the transcripts.

    Returns:
        int: Number of segments written.
    """
    tg_file = os.path.splitext(os.path.basename(textgrid_path))[0]
    audio = AudioSegment.from_wav(audio_path)
    print(tg_file)

    # Read the TextGrid file & extract (start, end, text) for every word-tier interval
    tg = tgt.read_textgrid(textgrid_path, include_empty_intervals=True)
    word_tier = tg.get_tier_by_name("words")
    intervals = [(interval.start_time, interval.end_time, interval.text) for interval in word_tier.intervals]
    if not intervals:
        print("No intervals found in the 'words' tier, nothing to split")
        return 0

    os.makedirs(output_text, exist_ok=True)
    if export_audio:
        os.makedirs(output_audio, exist_ok=True)

    # The first interval is normally leading silence: start chunking after it.
    # If it holds a word instead, keep it so that word is not lost.
    first_start, first_end, first_text = intervals[0]
    if first_text == "":
        segment_start = first_end
        remaining = intervals[1:]
    else:
        segment_start = first_start
        remaining = intervals

    segment_count = 0
    words = []
    for start, end, text in remaining:
        if text == "" and end - start > break_threshold:
            # Long pause: close the running segment right where the pause begins.
            if words:
                segment_count += 1
                _write_segment(audio, tg_file, segment_count, segment_start, start, words,
                               output_audio, output_text, export_audio)
                words = []
            segment_start = end
        elif text != "":
            words.append(text)

    # Flush whatever was spoken after the last long pause.
    total_duration = intervals[-1][1]
    if words:
        segment_count += 1
        _write_segment(audio, tg_file, segment_count, segment_start, total_duration, words,
                       output_audio, output_text, export_audio)

    if segment_count:
        # Total recording length divided by the number of segments.
        print("Average length of a segment:", total_duration / segment_count, "Number of segments:", segment_count)
    else:
        print("Number of segments:", segment_count)
    return segment_count


# Input/output locations for the chunking step
textgrid_path = "combined_output"   # TextGrid files produced by the MFA alignment
output_audio = "data/split_audio"   # destination for the chunked .wav files
output_text = "data/split_text"     # destination for the chunk transcripts
audio_path = "data/audio_files"     # source recordings, named like the TextGrids

# A fresh checkout has no output folders yet; create them before writing segments.
os.makedirs(output_audio, exist_ok=True)
os.makedirs(output_text, exist_ok=True)

# Sorted so the processing order (and the printed log) is the same on every run.
for tg_file in sorted(os.listdir(textgrid_path)):
    if not tg_file.endswith(".TextGrid"):
        continue

    file_name_format = os.path.splitext(os.path.basename(tg_file))[0]
    audio_file = os.path.join(audio_path, file_name_format + ".wav")
    if not os.path.exists(audio_file):
        # One missing recording should not abort the rest of the batch.
        print(f"Warning: no audio found for {tg_file} (expected {audio_file}), skipping")
        continue

    split_audio(audio_file, os.path.join(textgrid_path, tg_file), output_audio, output_text)

20190130_FOMC


Average length of a segment: 4.2199446789396164 Number of segments: 679
20190320_FOMC
Average length of a segment: 3.716889461434371 Number of segments: 739
20190501_FOMC
Average length of a segment: 4.085311157024793 Number of segments: 605
20190619_FOMC
Average length of a segment: 3.658091585872576 Number of segments: 722
20190918_FOMC
Average length of a segment: 3.794030298416565 Number of segments: 821
20191030_FOMC
Average length of a segment: 4.111583423850575 Number of segments: 696
20191211_FOMC
Average length of a segment: 3.864035268414482 Number of segments: 801
20200129_FOMC
Average length of a segment: 4.0920997793190415 Number of segments: 793
20200429_FOMC
Average length of a segment: 3.7056354827144684 Number of segments: 781
20200610_FOMC
Average length of a segment: 3.8011677559912855 Number of segments: 918
20200729_FOMC
Average length of a segment: 3.849745140388769 Number of segments: 926
20200916_FOMC
Average length of a segment: 3.883576332994924 Number of segm

In [163]:
import random
from pydub.playback import play
from IPython.display import Audio


output_audio = "data/split_audio"
output_text = "data/split_text"


# Show a random aligned, chunked audio file and its transcription for manual inspection.
# Only .wav files are candidates, so stray files in the folder cannot be picked.
wav_files = sorted(name for name in os.listdir(output_audio) if name.endswith(".wav"))
if not wav_files:
    raise FileNotFoundError(f"No .wav segments found in {output_audio}")

random_wav = random.choice(wav_files)
transcript_name = os.path.splitext(random_wav)[0] + ".txt"
with open(os.path.join(output_text, transcript_name), "r", encoding="utf-8") as txt:
    print(txt.read())

# Must stay the last expression so the notebook renders the audio player.
Audio(os.path.join(output_audio, random_wav))

as we reiterated in today's statement with inflation running persistently below 2 percent
