In [1]:
# Install the kaggle library
!pip install kaggle



In [2]:
from google.colab import files
import pandas as pd
import numpy as np
import os
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [3]:
# Prompt the user to upload the kaggle.json file
print("Please upload your kaggle.json file")
files.upload()

Please upload your kaggle.json file


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"blidzanorai","key":"cb18b4d5f7d0813dfa9911a21f99246b"}'}

In [4]:
# Set up the kaggle directory and permissions
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Download the dataset from kaggle
print("Downloading the dataset")
!kaggle datasets download -d ikrbasak/sep-28k

Downloading the dataset
Dataset URL: https://www.kaggle.com/datasets/ikrbasak/sep-28k
License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
Downloading sep-28k.zip to /content
 99% 2.14G/2.17G [00:18<00:00, 221MB/s]
100% 2.17G/2.17G [00:18<00:00, 125MB/s]


In [6]:
# Unzip the downloaded file
print("Unzipping the file")
!unzip -q sep-28k.zip

Unzipping the file


In [7]:
episodes_file = 'SEP-28k_episodes.csv'
labels_file = 'SEP-28k_labels.csv'

In [8]:
episodes_df = pd.read_csv(episodes_file)
labels_df = pd.read_csv(labels_file)

In [9]:
# Information for the Episodes DataFrame
print("Episodes DataFrame Info")
print(episodes_df.head())
print("\nBasic Information:")
episodes_df.info()

Episodes DataFrame Info
  He_Stutters_Podcast_–_Make_Room_For_The_Stuttering  \
0  He_Stutters_Podcast_–_Make_Room_For_The_Stutte...   
1  He_Stutters_Podcast_–_Make_Room_For_The_Stutte...   
2  He_Stutters_Podcast_–_Make_Room_For_The_Stutte...   
3  He_Stutters_Podcast_–_Make_Room_For_The_Stutte...   
4  He_Stutters_Podcast_–_Make_Room_For_The_Stutte...   

    episode-208-with-kelsey-h  \
0   episode-208-with-kelsey-h   
1   episode-208-with-kelsey-h   
2   episode-208-with-kelsey-h   
3   episode-208-with-kelsey-h   
4   episode-208-with-kelsey-h   

   https://stutterrockstar.files.wordpress.com/2011/05/male-episode-1-with-alan1.mp3  \
0   https://stutterrockstar.files.wordpress.com/2...                                   
1   https://stutterrockstar.files.wordpress.com/2...                                   
2   https://stutterrockstar.files.wordpress.com/2...                                   
3   https://stutterrockstar.files.wordpress.com/2...                                   


In [10]:
# Information for the Labels DataFrame
print("\nLabels DataFrame Info")
print(labels_df.head())
print("\nBasic Information:")
labels_df.info()


Labels DataFrame Info
         Show  EpId  ClipId     Start      Stop  Unsure  PoorAudioQuality  \
0  HeStutters     0       0  31900320  31948320       0                 0   
1  HeStutters     0       1  31977120  32025120       0                 0   
2  HeStutters     0       2  34809760  34857760       0                 0   
3  HeStutters     0       3  35200640  35248640       0                 0   
4  HeStutters     0       4  35721920  35769920       0                 0   

   Prolongation  Block  SoundRep  WordRep  DifficultToUnderstand  \
0             0      0         0        0                      0   
1             0      0         0        0                      0   
2             0      0         0        0                      0   
3             1      0         0        0                      0   
4             0      0         0        0                      0   

   Interjection  NoStutteredWords  NaturalPause  Music  NoSpeech  
0             0                 3     

In [11]:
# 1. Load, Combine, and Process Data
print("Loading and combining datasets")
clips_dir = 'clips/stuttering-clips/clips'
sep_labels_df = pd.read_csv('SEP-28k_labels.csv')
fluency_labels_df = pd.read_csv('fluencybank_labels.csv')
combined_df = pd.concat([sep_labels_df, fluency_labels_df], ignore_index=True)

Loading and combining datasets


In [12]:
print("Processing labels and cleaning data")
stutter_columns = ['Prolongation', 'Block', 'SoundRep', 'WordRep', 'Interjection']
all_event_columns = stutter_columns + ['NoStutteredWords']
combined_df['label'] = combined_df[all_event_columns].idxmax(axis=1)

clean_df = combined_df[(combined_df['PoorAudioQuality'] == 0) & (combined_df['Music'] == 0) & (combined_df['NoSpeech'] == 0)].copy()

Processing labels and cleaning data


In [13]:
def generate_correct_path(row):
    filename = f"{row['Show']}_{row['EpId']}_{row['ClipId']}.wav"
    return os.path.join(clips_dir, filename)

In [14]:
clean_df['file_path'] = clean_df.apply(generate_correct_path, axis=1)
final_df = clean_df[clean_df['file_path'].apply(os.path.exists)].copy()
print(f"Using {len(final_df)} clips with existing audio files")

Using 26098 clips with existing audio files


In [15]:
label_encoder = LabelEncoder()
final_df['encoded_label'] = label_encoder.fit_transform(final_df['label'])

In [16]:
# 2. Prepare Data for PyTorch
print("Splitting data")
X = final_df['file_path'].values
y = final_df['encoded_label'].values
num_classes = len(label_encoder.classes_)

X_train_paths, X_val_paths, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Splitting data


In [17]:
# 3. Create a PyTorch Dataset
class StutteringDataset(Dataset):
    def __init__(self, file_paths, labels, n_mels=128, max_pad_len=188):
        self.file_paths = file_paths
        self.labels = labels
        self.n_mels = n_mels
        self.max_pad_len = max_pad_len
        self.sr = 16000
    def __len__(self):
        return len(self.file_paths)
    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]
        audio, _ = librosa.load(file_path, sr=self.sr)
        if len(audio) == 0:
            return torch.zeros((1, self.n_mels, self.max_pad_len), dtype=torch.float32), torch.tensor(label, dtype=torch.long)
        mel = librosa.feature.melspectrogram(y=audio, sr=self.sr, n_mels=self.n_mels)
        mel_db = librosa.power_to_db(mel, ref=np.max)
        if mel_db.shape[1] > self.max_pad_len:
            mel_db = mel_db[:, :self.max_pad_len]
        else:
            pad_width = self.max_pad_len - mel_db.shape[1]
            mel_db = np.pad(mel_db, pad_width=((0, 0), (0, pad_width)), mode='constant')
        mel_db = np.expand_dims(mel_db, axis=0)
        return torch.tensor(mel_db, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

In [18]:
train_dataset = StutteringDataset(X_train_paths, y_train)
val_dataset = StutteringDataset(X_val_paths, y_val)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

In [19]:
# 4. Define a Dynamic CNN Model
class StutteringCNN(nn.Module):
    def __init__(self, num_classes):
        super(StutteringCNN, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(3, 3)), nn.ReLU(), nn.MaxPool2d(kernel_size=(2, 2)),
            nn.Conv2d(32, 64, kernel_size=(3, 3)), nn.ReLU(), nn.MaxPool2d(kernel_size=(2, 2)))
        self.flatten = nn.Flatten()
        self.fc_layers = nn.Sequential(
            nn.LazyLinear(128), nn.ReLU(), nn.Dropout(0.5), nn.Linear(128, num_classes))
    def forward(self, x):
        x = self.conv_layers(x)
        x = self.flatten(x)
        x = self.fc_layers(x)
        return x

In [20]:
# 5. Training Loop
print("Starting training")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

print("Calculating class weights for the loss function")
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

model = StutteringCNN(num_classes).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 10

best_val_loss = float('inf')

for epoch in range(epochs):
    # Training phase
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    train_loss = running_loss / len(train_loader)
    train_accuracy = 100 * correct_train / total_train

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    epoch_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct_val / total_val

    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, Val Loss: {epoch_val_loss:.4f}, Val Acc: {val_accuracy:.2f}%')

    if epoch_val_loss < best_val_loss:
        best_val_loss = epoch_val_loss
        torch.save(model.state_dict(), 'best_model_weights.pth')
        print(f"New best model saved with validation loss: {best_val_loss:.4f}")

print("Training complete")

Starting training
Using device: cuda
Calculating class weights for the loss function
Epoch 1/10, Train Loss: 2.6111, Train Acc: 19.24%, Val Loss: 1.7919, Val Acc: 44.08%
New best model saved with validation loss: 1.7919
Epoch 2/10, Train Loss: 1.7918, Train Acc: 26.29%, Val Loss: 1.7880, Val Acc: 43.60%
New best model saved with validation loss: 1.7880
Epoch 3/10, Train Loss: 1.7920, Train Acc: 33.40%, Val Loss: 1.7905, Val Acc: 35.73%
Epoch 4/10, Train Loss: 1.7900, Train Acc: 30.10%, Val Loss: 1.7903, Val Acc: 37.91%
Epoch 5/10, Train Loss: 1.7898, Train Acc: 26.14%, Val Loss: 1.7915, Val Acc: 44.58%
Epoch 6/10, Train Loss: 1.7880, Train Acc: 33.07%, Val Loss: 1.7890, Val Acc: 14.35%
Epoch 7/10, Train Loss: 1.7879, Train Acc: 22.46%, Val Loss: 1.7884, Val Acc: 15.90%
Epoch 8/10, Train Loss: 1.7869, Train Acc: 22.77%, Val Loss: 1.7969, Val Acc: 18.14%
Epoch 9/10, Train Loss: 1.7876, Train Acc: 22.19%, Val Loss: 1.7925, Val Acc: 18.54%
Epoch 10/10, Train Loss: 1.7882, Train Acc: 32.98%