In [1]:
import pandas as pd
import numpy as np
import numpy as np
def dummy_npwarn_decorator_factory():
  def npwarn_decorator(x):
    return x
  return npwarn_decorator
np._no_nep50_warning = getattr(np, '_no_nep50_warning', dummy_npwarn_decorator_factory)
from pathlib import Path
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchaudio
from sklearn.model_selection import GroupKFold
import random

In [11]:
def reduce_phoneme(phoneme):
    """Apply the reduced phoneme table from the paper"""
    reductions = {
        'tʃ': 'tS', 'tʒ': 'tS', 'dʒ': 'dZ',
        'kχ': 'k', 'ks': 'k',
        'h': 'h', 'x': 'x'
    }
    return reductions.get(phoneme, phoneme)

def create_syllable_table():
    """
    Create syllable labels following the paper's method:
    - No delimiter between onset and coda
    - Use reduced phoneme table
    - Silence represented by 'x'
    """
    segment_info = pd.read_csv('../../segment_info/segment_info.csv')
    
    # Apply phoneme reductions
    segment_info['onset_reduced'] = segment_info['onset_phoneme'].apply(reduce_phoneme)
    segment_info['coda_reduced'] = segment_info['coda_phoneme'].apply(reduce_phoneme)
    
    # Create syllable strings (no delimiter)
    segment_info['syllable'] = segment_info.apply(
        lambda row: f"{row['onset_reduced']}{row['coda_reduced']}", 
        axis=1
    )
    
    # Create syllable ID mapping
    unique_syllables = sorted(segment_info['syllable'].unique())
    syllable_to_id = {syl: idx for idx, syl in enumerate(unique_syllables)}
    
    # Add syllable IDs
    segment_info['syllable_id'] = segment_info['syllable'].map(syllable_to_id)
    
    # Print statistics
    print(f"Total unique syllables: {len(unique_syllables)}")
    print("\nSyllable ID mapping:")
    for syl, idx in syllable_to_id.items():
        print(f"{syl}: {idx}")
    
    # Print example distribution
    print("\nExample syllables for each instrument:")
    for instrument in segment_info['instrument_label'].unique():
        examples = segment_info[segment_info['instrument_label'] == instrument].iloc[:3]
        print(f"\n{instrument}:")
        for _, row in examples.iterrows():
            print(f"  {row['syllable']} (ID: {row['syllable_id']})")
            
    return segment_info, syllable_to_id


In [3]:
def compute_log_mel_patch(input_audio, sr=44100, duration=0.560, 
                         n_mels=64, n_fft=2048, hop_length=512):
    """
    Compute log-Mel spectrogram patch from audio file or array.
    
    Args:
        input_audio: Either a path to an audio file or a numpy array of audio samples
        sr: Sample rate (default: 44100)
        duration: Duration in seconds (default: 0.560)
        n_mels: Number of Mel bands (default: 64)
        n_fft: FFT window size (default: 2048)
        hop_length: Number of samples between successive frames (default: 512)
    """
    # Load audio if path is provided
    if isinstance(input_audio, (str, Path)):
        audio, _ = librosa.load(input_audio, sr=sr)
    else:
        audio = input_audio
    
    # Cut/pad to fixed duration
    target_length = int(duration * sr)
    if len(audio) > target_length:
        audio = audio[:target_length]
    else:
        audio = np.pad(audio, (0, target_length - len(audio)))
    
    # Compute Mel spectrogram
    mel_spec = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length
    )
    
    # Convert to log scale
    log_mel = np.log(mel_spec + 1e-4)
    
    # Min-max normalization
    log_mel = (log_mel - log_mel.min()) / (log_mel.max() - log_mel.min())
    
    # Add channel dimension
    log_mel = np.expand_dims(log_mel, axis=0)
    
    return log_mel

def augment_audio(audio, sr=44100):
    """
    Apply random pitch shift and time stretch augmentation.
    Returns 10 augmented versions.
    """
    augmented = []
    for _ in range(10):
        # Random parameters
        pitch_shift = random.uniform(-1.5, 1.5)
        time_stretch = random.uniform(0.8, 1.2)
        
        # Randomly choose order of operations
        if random.random() < 0.5:
            # Pitch shift then stretch
            aug_audio = librosa.effects.pitch_shift(audio, sr=sr, n_steps=pitch_shift)
            aug_audio = librosa.effects.time_stretch(aug_audio, rate=time_stretch)
        else:
            # Stretch then pitch shift
            aug_audio = librosa.effects.time_stretch(audio, rate=time_stretch)
            aug_audio = librosa.effects.pitch_shift(aug_audio, sr=sr, n_steps=pitch_shift)
        
        augmented.append(aug_audio)
    
    return augmented

In [4]:
class SyllableCNN(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        
        # Convolutional blocks
        self.conv_blocks = nn.Sequential(
            # Block 1: Input (1, 64, 48) -> Output (8, 32, 24)
            nn.Conv2d(1, 8, 3, padding=1),
            nn.BatchNorm2d(8),
            nn.ReLU(),
            nn.Conv2d(8, 8, 3, padding=1),
            nn.BatchNorm2d(8),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            
            # Block 2: (8, 32, 24) -> (16, 16, 12)
            nn.Conv2d(8, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 16, 3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            
            # Block 3: (16, 16, 12) -> (32, 8, 6)
            nn.Conv2d(16, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            
            # Block 4: (32, 8, 6) -> (64, 4, 3)
            nn.Conv2d(32, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )
        
        # Calculate the size of flattened features
        self.flat_features = 64 * 4 * 3  # 768 features after convolution
        
        # Embedding layer
        self.embedding = nn.Linear(self.flat_features, 1024)
        
        # Classification layer
        self.classifier = nn.Linear(1024, num_classes)
    
    def forward(self, x):
        # Convolutional blocks
        x = self.conv_blocks(x)
        
        # Flatten: (batch_size, 64, 4, 3) -> (batch_size, 768)
        x = x.view(x.size(0), -1)
        
        # Embedding: (batch_size, 768) -> (batch_size, 1024)
        embeddings = self.embedding(x)
        x = nn.ReLU()(embeddings)
        
        # Classification: (batch_size, 1024) -> (batch_size, num_classes)
        x = self.classifier(x)
        
        return x, embeddings

    def get_embedding_dim(self):
        return self.flat_features

In [5]:
def train_fold(model, train_loader, val_loader, device, fold):
    """
    Train one fold of the CNN model.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.1, patience=5, verbose=True
    )
    
    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None
    
    for epoch in range(100):  # Maximum 100 epochs
        # Training
        model.train()
        train_loss = 0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            
            optimizer.zero_grad()
            outputs, _ = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                outputs, _ = model(batch_x)
                val_loss += criterion(outputs, batch_y).item()
        
        # Update learning rate
        scheduler.step(val_loss)
        
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = model.state_dict()
            patience_counter = 0
        else:
            patience_counter += 1
        
        print(f"Epoch {epoch}: train_loss = {train_loss:.4f}, val_loss = {val_loss:.4f}")
        
        # Early stopping
        if patience_counter >= 10:
            print("Early stopping triggered")
            break
    
    # Save best model
    torch.save(best_model_state, f'cnn_model_fold_{fold}.pt')
    return best_model_state

In [6]:
import os
# Test file paths
def test_file_paths():
    segment_info = pd.read_csv('../../segment_info/segment_info.csv')
    print("Testing first 5 file paths...")
    for idx, row in segment_info.head().iterrows():
        file_path = row['segment_path']
        print(f"\nChecking file: {file_path}")
        full_path = os.path.abspath(os.path.join(os.path.dirname('../../segment_info/segment_info.csv'), file_path))
        print(f"Full path: {full_path}")
        print(f"File exists: {os.path.exists(full_path)}")

test_file_paths()

Testing first 5 file paths...

Checking file: ../segments/AVP_P1_hhc_0000.wav
Full path: /Users/arul/ML/BEATBOX/projectFiles/segments/AVP_P1_hhc_0000.wav
File exists: True

Checking file: ../segments/AVP_P1_hhc_0001.wav
Full path: /Users/arul/ML/BEATBOX/projectFiles/segments/AVP_P1_hhc_0001.wav
File exists: True

Checking file: ../segments/AVP_P1_hhc_0002.wav
Full path: /Users/arul/ML/BEATBOX/projectFiles/segments/AVP_P1_hhc_0002.wav
File exists: True

Checking file: ../segments/AVP_P1_hhc_0003.wav
Full path: /Users/arul/ML/BEATBOX/projectFiles/segments/AVP_P1_hhc_0003.wav
File exists: True

Checking file: ../segments/AVP_P1_hhc_0004.wav
Full path: /Users/arul/ML/BEATBOX/projectFiles/segments/AVP_P1_hhc_0004.wav
File exists: True


In [7]:
def main():
    print("Starting CNN training pipeline...")
    
    # 1. Create syllable table and fix paths
    print("\nStep 1: Creating syllable table and fixing paths...")
    segment_info, syllable_to_id = create_syllable_table()
    
    # Fix the paths by adding an extra '../'
    segment_info['segment_path'] = segment_info['segment_path'].apply(
        lambda x: f"../../{x[3:]}" if x.startswith('../') else x
    )
    
    num_classes = len(syllable_to_id)
    print(f"Created syllable mapping with {num_classes} classes")
    
    # Test a few paths to make sure they're correct
    print("\nTesting fixed paths:")
    for path in segment_info['segment_path'].head():
        print(f"Fixed path: {path}")
        print(f"Exists: {os.path.exists(path)}")
        
    # 2. Create participant-stratified folds
    print("\nStep 2: Creating participant folds...")
    group_kfold = GroupKFold(n_splits=5)
    participants = segment_info['participant_id'].unique()
    print(f"Total participants: {len(participants)}")
    
    # Store embeddings for all segments
    print("\nInitializing embedding storage...")
    all_embeddings = np.zeros((len(segment_info), 1024))
    print(f"Will store embeddings for {len(segment_info)} segments")
    
    # Process each fold
    for fold, (train_val_idx, test_idx) in enumerate(group_kfold.split(
            np.arange(len(participants)), groups=participants)):  # Modified this line
        
        print(f"\nProcessing Fold {fold + 1}/5")
        
        train_val_participants = participants[train_val_idx]
        print(f"Train/Val participants: {len(train_val_participants)}")
        
        # Split train_val into train and validation
        train_participants = train_val_participants[:32]
        val_participants = train_val_participants[32:]
        print(f"Training participants: {len(train_participants)}")
        print(f"Validation participants: {len(val_participants)}")
        
        # Get segment indices for each set
        train_mask = segment_info['participant_id'].isin(train_participants)
        val_mask = segment_info['participant_id'].isin(val_participants)
        
        print("\nProcessing training data...")
        # Process training data with augmentation
        X_train = []
        y_train = []
        for idx in segment_info[train_mask].index:
            if len(X_train) % 100 == 0:
                print(f"Processed {len(X_train)} training samples...")
                
            # Original segment
            file_path = segment_info.loc[idx, 'segment_path']
            print(f"Processing file: {file_path}")  # Debug print
            
            try:
                mel_spec = compute_log_mel_patch(file_path)
                X_train.append(mel_spec)
                y_train.append(segment_info.loc[idx, 'syllable_id'])
                
                # Augmented versions
                audio, sr = librosa.load(file_path, sr=44100)
                aug_audios = augment_audio(audio, sr)
                for aug_audio in aug_audios:
                    mel_spec = compute_log_mel_patch(aug_audio)
                    X_train.append(mel_spec)
                    y_train.append(segment_info.loc[idx, 'syllable_id'])
            except Exception as e:
                print(f"Error processing {file_path}: {str(e)}")
                continue
        
        print("\nProcessing validation data...")
        # Process validation data (no augmentation)
        X_val = []
        y_val = []
        for idx in segment_info[val_mask].index:
            if len(X_val) % 100 == 0:
                print(f"Processed {len(X_val)} validation samples...")
                
            try:
                mel_spec = compute_log_mel_patch(segment_info.loc[idx, 'segment_path'])
                X_val.append(mel_spec)
                y_val.append(segment_info.loc[idx, 'syllable_id'])
            except Exception as e:
                print(f"Error processing validation sample: {str(e)}")
                continue
        
        print("\nConverting to numpy arrays...")
        # Convert to numpy arrays
        X_train = np.stack(X_train)
        y_train = np.array(y_train)
        X_val = np.stack(X_val)
        y_val = np.array(y_val)
        
        print(f"\nFold {fold + 1} data shapes:")
        print(f"X_train shape: {X_train.shape}")
        print(f"X_val shape: {X_val.shape}")
        print("\nTraining syllable distribution:")
        print(pd.Series(y_train).value_counts().sort_index())
        
        print("\nCreating data loaders...")
        # Create data loaders
        train_dataset = torch.utils.data.TensorDataset(
            torch.FloatTensor(X_train),
            torch.LongTensor(y_train)
        )
        val_dataset = torch.utils.data.TensorDataset(
            torch.FloatTensor(X_val),
            torch.LongTensor(y_val)
        )
        
        train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=128)
        
        print("\nInitializing model...")
        # Train model
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {device}")
        model = SyllableCNN(num_classes).to(device)
        
        print("\nTraining model...")
        best_model_state = train_fold(model, train_loader, val_loader, device, fold)
        
        print("\nGenerating embeddings...")
        # Generate embeddings for all segments using this fold's model
        model.load_state_dict(best_model_state)
        model.eval()
        
        with torch.no_grad():
            for idx in range(len(segment_info)):
                if idx % 100 == 0:
                    print(f"Generated embeddings for {idx}/{len(segment_info)} segments...")
                try:
                    mel_spec = compute_log_mel_patch(segment_info.loc[idx, 'segment_path'])
                    mel_spec = torch.FloatTensor(mel_spec).unsqueeze(0).to(device)
                    _, embedding = model(mel_spec)
                    all_embeddings[idx] = embedding.cpu().numpy()
                except Exception as e:
                    print(f"Error generating embedding for segment {idx}: {str(e)}")
                    continue
    
    print("\nSaving embeddings...")
    np.save('cnn_syllable_features.npy', all_embeddings)
    print("Done!")

if __name__ == "__main__":
    main()

Starting CNN training pipeline...

Step 1: Creating syllable table and fixing paths...
Total unique syllables: 51

Syllable ID mapping:
!-x: 0
dʒ-u: 1
k-a: 2
k-h: 3
k-x: 4
k-ʊ: 5
kg-h: 6
kg-x: 7
kʃ-h: 8
kʃ-x: 9
p-a: 10
p-h: 11
p-i: 12
p-o: 13
p-u: 14
p-x: 15
p-ə: 16
p-ɯ: 17
p-ʊ: 18
p-ʌ: 19
s-x: 20
t-a: 21
t-e: 22
t-h: 23
t-i: 24
t-u: 25
t-x: 26
t-ɐ: 27
t-ɘ: 28
t-ɪ: 29
t-ɯ: 30
t-ʊ: 31
ts-a: 32
ts-h: 33
ts-x: 34
ts-ɪ: 35
tɕ-h: 36
tɕ-x: 37
tʃ-I: 38
tʃ-a: 39
tʃ-h: 40
tʃ-i: 41
tʃ-x: 42
tʃ-æ: 43
tʃ-œ: 44
tʃ-ɘ: 45
tʒ-h: 46
tʒ-x: 47
ʔ-a: 48
ʡʢ-u: 49
ʡʢ-x: 50

Example syllables for each instrument:

hhc:
  tʃ-x (ID: 42)
  tʃ-æ (ID: 43)
  tʃ-æ (ID: 43)

hho:
  tʃ-x (ID: 42)
  tʃ-i (ID: 41)
  tʃ-i (ID: 41)

kd:
  p-ə (ID: 16)
  p-ə (ID: 16)
  p-ə (ID: 16)

sd:
  t-e (ID: 22)
  t-e (ID: 22)
  t-e (ID: 22)
Created syllable mapping with 51 classes

Testing fixed paths:
Fixed path: ../../segments/AVP_P1_hhc_0000.wav
Exists: True
Fixed path: ../../segments/AVP_P1_hhc_0001.wav
Exists: True
Fixed path: 



Epoch 0: train_loss = 201.7105, val_loss = 6.9221
Epoch 1: train_loss = 29.4677, val_loss = 7.9766
Epoch 2: train_loss = 16.6139, val_loss = 7.6182
Epoch 3: train_loss = 10.2109, val_loss = 7.8149
Epoch 4: train_loss = 10.6441, val_loss = 10.3311
Epoch 5: train_loss = 7.8755, val_loss = 7.6677
Epoch 6: train_loss = 5.6896, val_loss = 9.3008
Epoch 7: train_loss = 0.7059, val_loss = 9.1726
Epoch 8: train_loss = 0.2131, val_loss = 9.3077
Epoch 9: train_loss = 0.1543, val_loss = 9.4032
Epoch 10: train_loss = 0.1208, val_loss = 9.5091
Early stopping triggered

Generating embeddings...
Generated embeddings for 0/5714 segments...
Generated embeddings for 100/5714 segments...
Generated embeddings for 200/5714 segments...
Generated embeddings for 300/5714 segments...
Generated embeddings for 400/5714 segments...
Generated embeddings for 500/5714 segments...
Generated embeddings for 600/5714 segments...
Generated embeddings for 700/5714 segments...
Generated embeddings for 800/5714 segments...
G