In [None]:
# Make Google Drive visible to this runtime; the dataset and backup paths used
# later in the notebook live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import os

# Ask the OS how many CPUs it reports; os.cpu_count() yields None when the
# count cannot be determined, in which case "None" is what gets printed.
num_cores = os.cpu_count()
cores_message = f"You have {num_cores} CPU cores available in this Colab session."
print(cores_message)

You have 2 CPU cores available in this Colab session.


In [None]:
%%writefile install_mfa.sh
#!/bin/bash
#
# Install Miniconda and create a conda environment named "aligner" that
# contains Montreal Forced Aligner (MFA).
#
# Usage: bash install_mfa.sh [ROOT_DIR]
#   ROOT_DIR  Directory that receives the Miniconda installer and the
#             miniconda3 install tree (default: /tmp/mfa).

# Stop at the first failing command (and on unset variables) so a failed
# download or install is not followed by a misleading "DONE" banner.
set -euo pipefail

root_dir="${1:-/tmp/mfa}"
installer="Miniconda3-latest-Linux-x86_64.sh"

mkdir -p "$root_dir"
cd "$root_dir"
# Resolve to an absolute path so the "$root_dir/..." references below stay
# valid even when ROOT_DIR was given relative to the caller's directory.
root_dir="$(pwd)"

# -O overwrites an installer left behind by an earlier run; without it wget
# saves a numbered copy (.1, .2, ...) and the stale file would be executed.
wget -q --show-progress -O "$installer" "https://repo.anaconda.com/miniconda/$installer"
bash "$installer" -b -p "$root_dir/miniconda3" -f

"$root_dir/miniconda3/bin/conda" create -n aligner -c conda-forge montreal-forced-aligner -y

echo -e "\n======== DONE =========="
echo -e "\nTo activate MFA, run: source $root_dir/miniconda3/bin/activate aligner"
echo -e "\nTo delete MFA, run: rm -rf $root_dir"
echo -e "\nSee: https://montreal-forced-aligner.readthedocs.io/en/latest/aligning.html to know how to use MFA"

Writing install_mfa.sh


In [None]:
# download and install mfa
INSTALL_DIR="/tmp/mfa"

!bash ./install_mfa.sh {INSTALL_DIR}
!{INSTALL_DIR}/miniconda3/bin/conda run -n aligner conda install -c conda-forge llvm-openmp -y

PREFIX=/tmp/mfa/miniconda3
Unpacking payload ...

Installing base environment...

Preparing transaction: ...working... done
Executing transaction: ...working... done
installation finished.
    You currently have a PYTHONPATH environment variable set. This may cause
    unexpected behavior when running the Python interpreter in Miniconda3.
    For best results, please verify that your PYTHONPATH only points to
    directories of packages that are compatible with the Python interpreter
    in Miniconda3: /tmp/mfa/miniconda3
Channels:
 - conda-forge
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / done
Solving environment: \ | / - \ done


    current version: 25.1.1
    latest version: 25.3.1

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment loca

In [None]:
!export MPLBACKEND=Agg && {INSTALL_DIR}/miniconda3/bin/conda run -n aligner mfa train --help

                                                                                                    
 Usage: mfa train [OPTIONS] CORPUS_DIRECTORY DICTIONARY_PATH OUTPUT_MODEL_PATH                      
                                                                                                    
 Train a new acoustic model on a corpus and optionally export alignments                            
                                                                                                    
╭─ Options ────────────────────────────────────────────────────────────────────────────────────────╮
│ --output_directory                    DIRECTORY                     Path to save alignments.     │
│ --config_path                 -c      FILE                          Path to config file to use   │
│                                                                     for training. See            │
│                                                                     https://github.com/Mo

In [None]:
# Select the Agg matplotlib backend, both through the environment variable and
# directly in this kernel. `os` comes from the earlier CPU-count cell.
import matplotlib

mpl_backend = "Agg"
os.environ["MPLBACKEND"] = mpl_backend
matplotlib.use(mpl_backend)

In [None]:
!export MPLBACKEND=Agg && {INSTALL_DIR}/miniconda3/bin/conda run -n aligner pip install matplotlib




In [None]:
# Root folder on Google Drive that holds the audio datasets.
DATASET_PATH = "/content/drive/path/to/dataset"

# Folder that collects MFA outputs (trained model, alignment results).
OUTPUT_PATH = f"{DATASET_PATH}/MFA_Output"

# Corpus folder: audio with a word-level transcription per audio file.
AUDIO_PATH = f"{DATASET_PATH}/MFA_Output/MFA_align"

# Phonetic lexicon (pronunciation dictionary) passed to MFA.
LEXICON_PATH = f"{DATASET_PATH}/Marungko_Pronunciation_Dictionary_CMU.txt"

In [None]:
!source {INSTALL_DIR}/miniconda3/bin/activate aligner && mfa version

3.2.1


In [None]:
# VALIDATE
!source {INSTALL_DIR}/miniconda3/bin/activate aligner; \
mfa validate $AUDIO_PATH $LEXICON_PATH

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                                          
[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m6[0m files, average number of utterances per speaker: [1;36m6.0[0m             
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                                      
[2;36m [0m         jobs. Use the --single_speaker flag if you would like to split utterances across jobs     
[2;36m [0m         regardless of their speaker.                                                              
[2;36m [0m[32mINFO    [0m Text already normalized.                                                                  
[2;36m [0m[32mINFO    [0m Features already generated.                                                               
[2;36m [0m[32mINFO    [0m Corpus                                                                                    

In [None]:
# TRAIN
!source {INSTALL_DIR}/miniconda3/bin/activate aligner && mfa train $AUDIO_PATH $LEXICON_PATH $OUTPUT_PATH/acoustic_model.zip --use_mp #--num_jobs 6 --single_speaker

[2;36m [0m[32mINFO    [0m Using previous initialization.                                                            
[2;36m [0m[32mINFO    [0m Pronunciation probability estimation already done, loading saved probabilities[33m...[0m         
[2;36m [0m[32mINFO    [0m Initializing training for sat_3[33m...[0m                                                        
[2;36m [0m[32mINFO    [0m Initialization complete!                                                                  
[2;36m [0m[32mINFO    [0m sat_3 - Iteration [1;36m1[0m of [1;36m35[0m                                                                 
[2;36m [0m[32mINFO    [0m sat_3 - Iteration [1;36m2[0m of [1;36m35[0m                                                                 
[2;36m [0m[32mINFO    [0m sat_3 - Iteration [1;36m3[0m of [1;36m35[0m                                                                 
[2;36m [0m[32mINFO    [0m sat_3 - Iteration [1;36m4[0m of [1;3

In [None]:
# Align the Data Using the Trained Model
!source {INSTALL_DIR}/miniconda3/bin/activate aligner && mfa align $AUDIO_PATH $LEXICON_PATH $OUTPUT_PATH/acoustic_model.zip {OUTPUT_PATH}/MFA_results_add --beam 100 --retry_beam --400

[2;36m [0m[32mINFO    [0m Setting up corpus information[33m...[0m                                                          
[2;36m [0m[32mINFO    [0m Found [1;36m1[0m speaker across [1;36m6[0m files, average number of utterances per speaker: [1;36m6.0[0m             
[2;36m [0m[32mINFO    [0m Initializing multiprocessing jobs[33m...[0m                                                      
[2;36m [0m         jobs. Use the --single_speaker flag if you would like to split utterances across jobs     
[2;36m [0m         regardless of their speaker.                                                              
[2;36m [0m[32mINFO    [0m Text already normalized.                                                                  
[2;36m [0m[32mINFO    [0m Features already generated.                                                               
[2;36m [0m[32mINFO    [0m Compiling training graphs[33m...[0m                                                     

In [None]:
# Zip & Download Results (Trained Model + TextGrid Files)
!zip -r {OUTPUT_PATH}/mfa_results.zip {OUTPUT_PATH}

## To save and reload progress

In [None]:
# SAVE PROGRESS
# Copy MFA's working directory to Google Drive so training can be resumed in a
# later session.
import os
import shutil

# Source and destination paths
src_path = "/root/Documents/MFA"  # contains e.g. /BiRa_Dataset, /joblib_cache, /pretrained_models
backup_path = "/content/drive/path/to/backup"

# Refuse to touch the existing backup when there is nothing to back up.
if not os.path.isdir(src_path):
    raise FileNotFoundError(f"MFA working directory not found: {src_path}")

# Copy into a staging folder first: the previous backup is only removed once a
# complete new copy exists, so a failed or interrupted copy cannot destroy it.
staging_path = backup_path + ".tmp"
if os.path.exists(staging_path):
    shutil.rmtree(staging_path)  # leftover from an earlier interrupted run
shutil.copytree(src_path, staging_path)

# Swap the fresh copy into place.
if os.path.exists(backup_path):
    print("Backup folder already exists in Drive, updating...")
    shutil.rmtree(backup_path)
os.rename(staging_path, backup_path)
print("MFA training directory successfully backed up to Google Drive.")

Backup folder already exists in Drive, updating...
MFA training directory successfully backed up to Google Drive.


In [None]:
# RELOAD PROGRESS
# Replace the local MFA working directory with the copy stored on Google Drive.
import os
import shutil

# Restore from backup
backup_path = "/content/drive/path/to/backup"
restore_path = "/root/Documents/MFA"

# Check the backup before deleting anything: if Drive is not mounted or the
# backup is missing, the current MFA directory must be left untouched.
if not os.path.isdir(backup_path):
    raise FileNotFoundError(f"Backup directory not found: {backup_path}")

# Clean existing directory if needed (copytree requires a non-existent target)
if os.path.exists(restore_path):
    print("Old MFA directory exists. Deleting before restore...")
    shutil.rmtree(restore_path)

# Restore from Drive
shutil.copytree(backup_path, restore_path)
print("MFA training directory successfully restored from Google Drive.")

Old MFA directory exists. Deleting before restore...
MFA training directory successfully restored from Google Drive.
