This notebook is my process of cleaning and preparing the data for model training. I must convert the mp3 files to wav files, as well as clean the corresponding chat (transcribed) files. The transcriptions are also tokenized.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torchaudio pandas librosa sentencepiece pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
!apt-get install ffmpeg -y

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


This code converts all of my mp3 files to wav files which is what the model requires.

In [None]:
import os
from pydub import AudioSegment

# Define paths
input_folder = '/content/drive/MyDrive/FinalProject570/Data/mp3data'
output_folder = '/content/drive/MyDrive/FinalProject570/Data/wavdata'

# Convert all mp3 files to wav and save in the output folder
def convert_mp3_to_wav(input_folder, output_folder, sample_rate=16000):
    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Loop through all files in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith('.mp3'):
            # Define full path for input and output files
            mp3_path = os.path.join(input_folder, filename)
            wav_filename = os.path.splitext(filename)[0] + '.wav'
            wav_path = os.path.join(output_folder, wav_filename)

            # Load and convert the audio
            audio = AudioSegment.from_mp3(mp3_path)
            audio = audio.set_frame_rate(sample_rate).set_channels(1)  # Set to mono and resample
            audio.export(wav_path, format='wav')

            print(f'Converted {filename} to {wav_filename}')

# Run the conversion function
convert_mp3_to_wav(input_folder, output_folder)


Converted herring1.mp3 to herring1.wav
Converted herring2.mp3 to herring2.wav
Converted herring3.mp3 to herring3.wav
Converted herring5.mp3 to herring5.wav
Converted herring6.mp3 to herring6.wav
Converted herring7.mp3 to herring7.wav
Converted herring8.mp3 to herring8.wav
Converted herring9.mp3 to herring9.wav
Converted herring10.mp3 to herring10.wav
Converted herring11.mp3 to herring11.wav
Converted herring12.mp3 to herring12.wav
Converted herring13.mp3 to herring13.wav
Converted herring14.mp3 to herring14.wav
Converted herring15.mp3 to herring15.wav
Converted herring16.mp3 to herring16.wav
Converted herring17.mp3 to herring17.wav
Converted maria1.mp3 to maria1.wav
Converted maria2.mp3 to maria2.wav
Converted maria4.mp3 to maria4.wav
Converted maria7.mp3 to maria7.wav
Converted maria10.mp3 to maria10.wav
Converted maria16.mp3 to maria16.wav
Converted maria18.mp3 to maria18.wav
Converted maria19.mp3 to maria19.wav
Converted maria20.mp3 to maria20.wav
Converted maria21.mp3 to maria21.wa

This next function cleans the chat files I have.

In [None]:
import os
import re

# Define the path to the .cha files
cha_folder = '/content/drive/MyDrive/FinalProject570/Data/chatdata'

def clean_cha_file(cha_file):
    """
    Extract and clean transcription from a .cha file.
    """
    with open(cha_file, 'r') as file:
        lines = file.readlines()

    # Extract utterances and add language markers as needed
    transcription = []
    for line in lines:
        # Only keep lines that start with "*", which indicate actual speech lines
        if line.startswith('*'):
            # Clean and remove non-speech symbols (this may vary depending on .cha format specifics)
            clean_line = re.sub(r'[^\w\s<ENG><SPA>]', '', line)
            transcription.append(clean_line.strip())

    return ' '.join(transcription)

# Process each .cha file
for filename in os.listdir(cha_folder):
    if filename.endswith('.cha'):
        cha_path = os.path.join(cha_folder, filename)
        transcription = clean_cha_file(cha_path)
        print(f"Processed {filename}:\n{transcription}\n")


Output hidden; open in https://colab.research.google.com to view.

Next, I will align the .cha and .wav files in a dictionary data structure so they are organized next to each other.

In [2]:
import os

# Paths to the .wav and .cha directories
wav_folder = '/content/drive/MyDrive/FinalProject570/Data/wavdata'
cha_folder = '/content/drive/MyDrive/FinalProject570/Data/chatdata'

def create_alignment_dict(wav_folder, cha_folder):
    """
    Creates a dictionary that maps each .wav file to its corresponding .cha transcription file.

    Args:
        wav_folder (str): Path to the directory containing .wav files.
        cha_folder (str): Path to the directory containing .cha files.

    Returns:
        dict: A dictionary with keys as .wav file paths and values as corresponding .cha file contents.
    """
    alignment_dict = {}

    # Get the list of .wav and .cha files
    wav_files = [f for f in os.listdir(wav_folder) if f.endswith('.wav')]
    cha_files = [f for f in os.listdir(cha_folder) if f.endswith('.cha')]

    # Convert cha_files list to a dictionary for quick lookup by filename
    cha_dict = {os.path.splitext(f)[0]: os.path.join(cha_folder, f) for f in cha_files}

    for wav_file in wav_files:
        wav_name = os.path.splitext(wav_file)[0]  # Remove the .wav extension
        if wav_name in cha_dict:
            # Read the transcription content from the corresponding .cha file
            cha_path = cha_dict[wav_name]
            with open(cha_path, 'r') as file:
                transcription = file.read()
            # Store the wav path and cleaned transcription in the dictionary
            alignment_dict[os.path.join(wav_folder, wav_file)] = transcription
        else:
            print(f"No matching .cha file found for {wav_file}")

    return alignment_dict

# Create the alignment dictionary
alignment_dict = create_alignment_dict(wav_folder, cha_folder)

# Print the first few entries to verify
for wav_path, transcription in list(alignment_dict.items())[:5]:
    print(f"WAV file: {wav_path}\nTranscription: {transcription[:100]}...\n")


WAV file: /content/drive/MyDrive/FinalProject570/Data/wavdata/herring1.wav
Transcription: @Font:	Win95:Lucida Sans Unicode:-16:0
@UTF8
@Begin
@Languages:	eng, spa
@Participants:	LAU Lauren A...

WAV file: /content/drive/MyDrive/FinalProject570/Data/wavdata/herring2.wav
Transcription: @Font:	Win95:Arial Unicode MS:-16:0
@UTF8
@Begin
@Languages:	spa, eng
@Participants:	TOM Tomás Adult...

WAV file: /content/drive/MyDrive/FinalProject570/Data/wavdata/herring3.wav
Transcription: @Font:	Win95:Arial:-21:0
@UTF8
@Begin
@Languages:	spa, eng
@Participants:	ASH Ashley Adult, JAC Jack...

WAV file: /content/drive/MyDrive/FinalProject570/Data/wavdata/herring5.wav
Transcription: @Font:	Win95:Arial Unicode MS:-21:0
@UTF8
@Begin
@Languages:	spa, eng
@Participants:	NOA Noah Adult,...

WAV file: /content/drive/MyDrive/FinalProject570/Data/wavdata/herring6.wav
Transcription: @Font:	Win95:Arial:-16:0
@UTF8
@Begin
@Languages:	eng, spa
@Participants:	JES Jessica Adult, NIC Nic...



Now that we have the alignment dictionary with .wav file paths and corresponding .cha transcriptions, the next step is to clean and tokenize the transcriptions and prepare them for model training.

In [3]:
import re

def clean_transcription(transcription):
    """
    Cleans the transcription by removing unwanted characters and adding language markers.

    Args:
        transcription (str): Raw transcription text from .cha file.

    Returns:
        str: Cleaned transcription with language markers.
    """
    # Example: Add language markers (adjust regex for your data if needed)
    transcription = re.sub(r'\[.*?\]', '', transcription)  # Remove metadata in brackets
    transcription = re.sub(r'[^\w\s]', '', transcription)  # Remove special characters except spaces
    transcription = transcription.replace("ENG:", "<ENG>").replace("SPA:", "<SPA>")  # Example markers
    transcription = re.sub(r'\s+', ' ', transcription)  # Replace multiple spaces with a single space
    return transcription.strip()

# Apply cleaning to all transcriptions in the alignment dictionary
for wav_path in alignment_dict:
    alignment_dict[wav_path] = clean_transcription(alignment_dict[wav_path])

# Print a few cleaned transcriptions for verification
for wav_path, transcription in list(alignment_dict.items())[:5]:
    print(f"Audio file: {wav_path}\nCleaned Transcription: {transcription[:100]}...\n")


Audio file: /content/drive/MyDrive/FinalProject570/Data/wavdata/herring1.wav
Cleaned Transcription: Font Win95Lucida Sans Unicode160 UTF8 Begin Languages eng spa Participants LAU Lauren Adult CHL Chlo...

Audio file: /content/drive/MyDrive/FinalProject570/Data/wavdata/herring2.wav
Cleaned Transcription: Font Win95Arial Unicode MS160 UTF8 Begin Languages spa eng Participants TOM Tomás Adult MIG Miguel A...

Audio file: /content/drive/MyDrive/FinalProject570/Data/wavdata/herring3.wav
Cleaned Transcription: Font Win95Arial210 UTF8 Begin Languages spa eng Participants ASH Ashley Adult JAC Jack Adult OSE non...

Audio file: /content/drive/MyDrive/FinalProject570/Data/wavdata/herring5.wav
Cleaned Transcription: Font Win95Arial Unicode MS210 UTF8 Begin Languages spa eng Participants NOA Noah Adult MEG Megan Adu...

Audio file: /content/drive/MyDrive/FinalProject570/Data/wavdata/herring6.wav
Cleaned Transcription: Font Win95Arial160 UTF8 Begin Languages eng spa Participants JES Jessica Adult N

In [6]:
!pip install tokenizers



In [7]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from tokenizers.pre_tokenizers import Whitespace

# Load and prepare transcriptions
with open('transcriptions.txt', 'r') as f:
    lines = [line.strip() for line in f if line.strip()]

# Initialize BPE tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = Whitespace()

# Train tokenizer on your transcriptions
trainer = trainers.BpeTrainer(vocab_size=8000, min_frequency=2, special_tokens=["<PAD>", "<UNK>", "<ENG>", "<SPA>"])
tokenizer.train_from_iterator(lines, trainer)

# Save the tokenizer model
tokenizer.save("bpe_tokenizer.json")

# Example: Tokenize a sample text
encoding = tokenizer.encode("This is an example sentence.")
print("Tokenized:", encoding.tokens)


Tokenized: ['T', 'his', 'is', 'an', 'example', 'sent', 'ence']


In [8]:
!pip install nltk



In [9]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

# Download necessary NLTK data
nltk.download('punkt')

# Load transcriptions from the alignment dictionary (you already have this)
lines = list(alignment_dict.values())

# Tokenize all transcriptions at the word level
all_tokens = []
for line in lines:
    tokens = word_tokenize(line.lower())  # Tokenize and make lowercase for consistency
    all_tokens.extend(tokens)

# Build vocabulary with a fixed size
vocab_size = 8000
vocab = [token for token, _ in Counter(all_tokens).most_common(vocab_size)]

# Create a vocabulary dictionary for token-to-id mapping
vocab_dict = {token: idx for idx, token in enumerate(vocab, start=1)}  # Start indexing from 1

# Define a function to convert sentences to token IDs
def text_to_token_ids(text, vocab_dict, unknown_token=0):
    tokens = word_tokenize(text.lower())
    token_ids = [vocab_dict.get(token, unknown_token) for token in tokens]
    return token_ids

# Example usage
example_text = "This is an example sentence."
token_ids = text_to_token_ids(example_text, vocab_dict)
print("Token IDs:", token_ids)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Token IDs: [84, 27, 248, 1708, 0, 0]


In [11]:
import json

# Create a list to store the (audio_path, token_ids) pairs
training_data = []

for wav_path, transcription in alignment_dict.items():
    token_ids = text_to_token_ids(transcription, vocab_dict)
    training_data.append({
        "audio_path": wav_path,
        "token_ids": token_ids
    })

output_path = '/content/drive/MyDrive/FinalProject570/training_data.json'
with open(output_path, 'w') as f:
    json.dump(training_data, f)

print(f"Training data saved as '{output_path}'.")


Training data saved as '/content/drive/MyDrive/FinalProject570/training_data.json'.


In [12]:
!pip install transformers datasets soundfile

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [13]:
import json
from datasets import Dataset, Audio

# Load training data from Google Drive
data_path = '/content/drive/MyDrive/FinalProject570/training_data.json'
with open(data_path, 'r') as f:
    training_data = json.load(f)

# Convert to Hugging Face Dataset format
dataset = Dataset.from_dict({
    "audio_path": [entry["audio_path"] for entry in training_data],
    "transcription": [entry["token_ids"] for entry in training_data]
})

# Add audio column using file paths
dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16000))

In [15]:
!rm -rf ~/.cache/huggingface

In [2]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
cha_path = '/content/drive/MyDrive/FinalProject570/Data/chatdata/sastre3.cha'
wav_path = '/content/drive/MyDrive/FinalProject570/Data/wavdata/sastre3.wav'
output_dir = '/content/drive/MyDrive/FinalProject570/Data/segments'  # Directory to save segmented audio files


In [6]:
import os
import re
from pydub import AudioSegment

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Define regex to match timestamps and language markers
timestamp_pattern = re.compile(r"(\d+)_(\d+)")
language_pattern = re.compile(r"@s:(eng|spa)")

def parse_cha_file(cha_path):
    segments = []
    current_language = None

    with open(cha_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Check if line contains a timestamp and a language marker
            timestamp_match = timestamp_pattern.search(line)
            language_match = language_pattern.search(line)

            if timestamp_match and language_match:
                start_time = int(timestamp_match.group(1))
                end_time = int(timestamp_match.group(2))
                language = language_match.group(1)  # 'eng' or 'spa'

                # Add segment info to list
                segments.append({
                    'start_time': start_time,
                    'end_time': end_time,
                    'language': 0 if language == 'eng' else 1  # 0 for English, 1 for Spanish
                })

    return segments

# Parse the .cha file to get labeled segments
segments = parse_cha_file(cha_path)
print(f"Parsed {len(segments)} segments.")


Parsed 221 segments.


In [7]:
# Load the full audio file
audio = AudioSegment.from_wav(wav_path)

def save_segments(audio, segments, output_dir):
    for i, segment in enumerate(segments):
        start = segment['start_time']
        end = segment['end_time']
        language = segment['language']

        # Extract the audio segment
        audio_segment = audio[start:end]

        # Save the audio segment with a language label in the filename
        segment_filename = f"sastre3_segment_{i}_lang_{language}.wav"
        audio_segment.export(os.path.join(output_dir, segment_filename), format="wav")

# Save segments to the output directory
save_segments(audio, segments, output_dir)
print(f"Saved segmented audio files to {output_dir}")


Saved segmented audio files to /content/drive/MyDrive/FinalProject570/Data/segments


In [8]:
!pip install datasets transformers soundfile


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [9]:
import json
import os

# Define the output JSON file path
json_path = '/content/drive/MyDrive/FinalProject570/Data/training_data.json'

# Gather paths and labels for each segmented audio file
data = []
for filename in os.listdir(output_dir):
    if filename.endswith('.wav'):
        # Extract the language label from the filename
        label = int(filename.split('_')[-1].replace('.wav', ''))

        # Append file info and label to the data list
        data.append({
            'audio_path': os.path.join(output_dir, filename),
            'label': label  # 0 for English, 1 for Spanish
        })

# Save the data to a JSON file
with open(json_path, 'w') as f:
    json.dump(data, f)

print(f"Training data saved to {json_path}")


Training data saved to /content/drive/MyDrive/FinalProject570/Data/training_data.json


In [10]:
from datasets import load_dataset, Audio

# Load the dataset from the JSON file
dataset = load_dataset('json', data_files=json_path, split='train')

# Cast the audio_path column to Audio type for processing
dataset = dataset.cast_column('audio_path', Audio(sampling_rate=16000))

# Verify the dataset structure
print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['audio_path', 'label'],
    num_rows: 221
})
