In [3]:
!pip install pip==24.0.0
!pip install soundstorm-pytorch==0.0.21
!pip uninstall -y fairseq
!pip install git+https://github.com/Tps-F/fairseq.git@main
!pip install audiolm-pytorch
!pip install soundfile


Collecting pip==24.0.0
  Downloading pip-24.0-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.9/2.1 MB[0m [31m27.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.0
Collecting soundstorm-pytorch==0.0.21
  Downloading soundstorm_pytorch-0.0.21-py3-none-any.whl.metadata (953 bytes)
Collecting audiolm-pytorch>=1.2.8 (from soundstorm-pytorch==0.0.21)
  Downloading audiolm_pytorch-2.4.0-py3-none-any.whl.metadata (1.5 kB)
Collecting classifier-free-guidance-py



In [2]:
import torchaudio
import os
import torchaudio.transforms as T
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchaudio import functional as AF
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from pathlib import Path

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used below lives there.
drive.mount('/content/drive')

# One-off copy of train-clean-100 into the project folder, kept for reference.
# NOTE(review): presumably already executed once; left disabled — confirm.
# !cp -r "/content/drive/MyDrive/soundstorm_data/librispeech/LibriSpeech/train-clean-100" \
#       "/content/drive/MyDrive/CIS7000_project/train-clean-100"


Mounted at /content/drive


In [3]:
# Root folder where your LibriSpeech audio is stored on Drive
import torch
import torchaudio
from pathlib import Path

# Folder on the mounted Drive that is searched for LibriSpeech audio.
librispeech_path = "/content/drive/MyDrive/CIS7000_project/train-clean-100"

# Use the GPU whenever PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using device: {device}")
print(f"Dataset path: {librispeech_path}")


Using device: cuda
Dataset path: /content/drive/MyDrive/CIS7000_project/train-clean-100


In [4]:
!pip install soundfile




In [5]:
import soundfile as sf
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from pathlib import Path
from torchaudio import functional as AF

class AudioDataset_LibriSpeech(Dataset):
    """Fixed-length mono waveform dataset over a list of audio files.

    Each item is read with ``soundfile``, averaged down to one channel,
    resampled to a common rate when its native rate differs, and then
    zero-padded or truncated to ``target_seconds``.

    Args:
        path: Root folder of the corpus. Stored on ``self.path``; loading
            itself only uses the entries of ``files``.
        files: Non-empty, indexable sequence of audio file paths.
        target_seconds: Length of every returned clip, in seconds.
        sample_rate: Optional rate in Hz that every clip is resampled to.
            ``None`` (the default) adopts the native rate of the first file.

    Raises:
        ValueError: If ``files`` is empty.
    """

    def __init__(self, path, files, target_seconds=10.0, sample_rate=None):
        super().__init__()
        self.path = Path(path)
        self.files = files

        if len(self.files) == 0:
            raise ValueError(
                f"No audio files were given for dataset rooted at {self.path}"
            )

        if sample_rate is None:
            # Read only the header of the first file to learn its sample rate;
            # there is no need to decode the whole waveform for that.
            sample_rate = sf.info(self.files[0]).samplerate
        self.frequency = int(sample_rate)

        self.target_seconds = target_seconds
        self.target_len = int(self.frequency * target_seconds)

        print(f"Loaded {len(self.files)} files, sample rate = {self.frequency} Hz")

    def __len__(self):
        """Return the number of files in the dataset."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return it as an ``(input, target)`` pair.

        Both elements are the same 1-D float32 tensor of ``self.target_len``
        samples.
        """
        audio, sr = sf.read(self.files[idx])

        # Average the channels of multi-channel audio into one.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        audio = torch.tensor(audio, dtype=torch.float32)

        # Bring files recorded at another rate onto the dataset's rate.
        if sr != self.frequency:
            audio = AF.resample(audio, orig_freq=sr, new_freq=self.frequency)

        # Zero-pad short clips on the right, truncate long ones.
        n_samples = audio.shape[0]
        if n_samples < self.target_len:
            audio = F.pad(audio, (0, self.target_len - n_samples))
        else:
            audio = audio[:self.target_len]

        return audio, audio


In [6]:
from google.colab import drive
# Mount Drive again, forcing a fresh mount even if one already exists.
# NOTE(review): Drive is also mounted in an earlier cell; presumably this is
# here to refresh a stale mount — confirm whether both cells are needed.
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [8]:
from sklearn.model_selection import train_test_split
import glob
from pathlib import Path

# Recursively collect every .flac file under the dataset root. Sorting gives the
# seeded split below a stable input order.
all_files = sorted(
    glob.glob(str(Path(librispeech_path) / "**/*.flac"), recursive=True)
)
# Fail here with a readable message instead of inside train_test_split.
assert all_files, f"No .flac files found under {librispeech_path}; is Drive mounted?"

print("Total LibriSpeech files found:", len(all_files))

# 90/10 split with a fixed seed.
# NOTE(review): the split is over individual files, so recordings from the same
# speaker can presumably land on both sides — confirm this is acceptable.
train_files, val_files = train_test_split(all_files, test_size=0.10, random_state=42)
assert len(train_files) + len(val_files) == len(all_files), "split lost or duplicated files"

print("Train:", len(train_files))
print("Val:", len(val_files))


Total LibriSpeech files found: 14581
Train: 13122
Val: 1459


In [9]:
import beartype
# Replace beartype's decorator with an identity function, so anything decorated
# through `beartype.beartype` after this point is returned unchanged.
# NOTE(review): modules that already bound the decorator before this cell ran
# are presumably unaffected — confirm this runs before the model libraries are imported.
beartype.beartype = lambda func: func


In [10]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torchaudio
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm.auto import tqdm


In [11]:
# Mini-batch size shared by both loaders.
batch_size = 8  # same as original notebook

# One dataset per side of the train/validation file split.
train_set = AudioDataset_LibriSpeech(path=librispeech_path, files=train_files)
val_set = AudioDataset_LibriSpeech(path=librispeech_path, files=val_files)

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_set, batch_size=batch_size, shuffle=False)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")


Loaded 13122 files, sample rate = 16000 Hz
Loaded 1459 files, sample rate = 16000 Hz
Train batches: 1641
Val batches: 183


In [13]:
from soundstorm_pytorch import SoundStorm, ConformerWrapper, SoundStream
from audiolm_pytorch import SoundStreamTrainer, EncodecWrapper


In [14]:
class EncodecForSoundStorm:
    """Adapter that wraps an ``EncodecWrapper`` and exposes fixed codec settings.

    Calls, ``encode`` and ``decode`` are forwarded to the wrapped codec, as is
    any attribute this object does not define itself. ``to``, ``train`` and
    ``eval`` act on the wrapped codec and return the adapter so calls can be
    chained.
    """

    def __init__(self):
        self.encodec = EncodecWrapper()
        # Codec settings published as plain attributes.
        # NOTE(review): presumably these are the fields SoundStorm reads from
        # its `soundstream` argument — confirm against the library.
        self.codebook_size = 1024
        self.num_quantizers = 8
        self.rq_num_quantizers = 8
        self.rq_groups = 1

    def __call__(self, *args, **kwargs):
        """Forward the call to the wrapped codec."""
        return self.encodec(*args, **kwargs)

    def encode(self, *args, **kwargs):
        """Forward to the wrapped codec's ``encode``."""
        return self.encodec.encode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward to the wrapped codec's ``decode``."""
        return self.encodec.decode(*args, **kwargs)

    def __getattr__(self, name):
        """Fall back to the wrapped codec for attributes not found on the adapter.

        Python calls this only after normal lookup has failed. If ``encodec``
        itself is the missing attribute (instance created without running
        ``__init__``, or ``__init__`` failed early), delegating would re-enter
        this method forever, so that case raises ``AttributeError`` instead.
        """
        if name == "encodec":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return getattr(self.encodec, name)

    def to(self, *args, **kwargs):
        """Move the wrapped codec with its own ``to`` and return the adapter."""
        self.encodec = self.encodec.to(*args, **kwargs)
        return self

    def train(self, mode=True):
        """Set the wrapped codec's training mode and return the adapter."""
        self.encodec.train(mode)
        return self

    def eval(self):
        """Put the wrapped codec in eval mode and return the adapter."""
        self.encodec.eval()
        return self


In [15]:
import beartype
# Identity replacement for the beartype decorator (same patch as an earlier cell).
# NOTE(review): the model libraries are imported before this cell, so this
# repeat presumably has no further effect — confirm.
beartype.beartype = lambda func: func

import os
# Set the CUDA allocator option `expandable_segments:True` through the environment.
# NOTE(review): confirm the variable is still honoured when set this late,
# i.e. after torch has been imported and queried for CUDA availability.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Conformer configured for 8 quantizers with a codebook of 1024 entries each;
# the inner conformer uses dim 512 and depth 2.
conformer = ConformerWrapper(
    codebook_size = 1024,
    num_quantizers = 8,
    conformer = dict(
        dim = 512,
        depth = 2
    ),
)

# NOTE(review): no later visible cell references `soundstream`; the model is
# built with the EnCodec adapter instead. Confirm whether this instance is needed.
soundstream = SoundStream(
    codebook_size = 1024,
    rq_num_quantizers = 8,
    attn_window_size = 128,
    attn_depth = 2
)

# Disabled duplicate of the model construction performed in the next cell.
# enc = EncodecForSoundStorm()
# enc.eval()

# model = SoundStorm(conformer, soundstream = enc).to(device)


Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda


In [16]:
# EnCodec adapter in eval mode on the training device. Both eval() and to()
# return the adapter itself, so the calls chain.
# NOTE(review): the checkpoint downloaded here is the 24 kHz EnCodec model while
# the datasets report 16 kHz audio — confirm whether inputs need resampling.
enc = EncodecForSoundStorm().eval().to(device)

# SoundStorm built from the conformer, with the adapter passed as its codec.
model = SoundStorm(conformer, soundstream=enc)
model = model.to(device)


Downloading: "https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th" to /root/.cache/torch/hub/checkpoints/encodec_24khz-d7cc33bc.th


100%|██████████| 88.9M/88.9M [00:00<00:00, 186MB/s]


In [17]:
# Adam over the model's parameters, with a small weight decay.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=1e-3,
    weight_decay=1e-5,
)
# Plateau scheduler with patience 4; it is stepped with the value passed to
# scheduler.step(...) during training.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=4)


In [18]:
def train_one_epoch(model, train_dataloader, optimizer, device, batches):
    """Run one optimisation pass over ``train_dataloader``.

    Args:
        model: Callable that takes a batch of inputs and returns a
            ``(loss, extra)`` pair; only ``loss`` is used.
        train_dataloader: Iterable yielding ``(inputs, labels)`` pairs. The
            labels are not used by the loss, so they are left where they are
            instead of being copied to ``device`` on every step.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the inputs are moved to before the forward pass.
        batches: Divisor for the summed loss; the caller passes the number of
            batches in ``train_dataloader``.

    Returns:
        The summed per-batch loss divided by ``batches``.
    """
    running_loss = 0

    for inputs, _labels in train_dataloader:
        inputs = inputs.to(device)

        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()

        loss, _ = model(inputs)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    return running_loss / batches


In [19]:
def train(model, n_batches, train_dataloader, val_dataloader, scheduler, optimizer, device, epochs=20):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Every epoch runs ``train_one_epoch``, then evaluates the model on
    ``val_dataloader`` without gradients, steps ``scheduler`` with the mean
    validation loss, prints both losses and rewrites
    ``librispeech_losses.csv`` with the history so far.

    Args:
        model: Module returning ``(loss, extra)`` for a batch of inputs.
        n_batches: Divisor handed to ``train_one_epoch`` for the training loss.
        train_dataloader: Iterable of ``(inputs, labels)`` training batches.
        val_dataloader: Iterable of ``(inputs, labels)`` validation batches.
        scheduler: Scheduler whose ``step`` receives the mean validation loss.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the inputs are moved to.
        epochs: Number of epochs to run.
    """
    history_path = 'librispeech_losses.csv'
    losses = []
    vlosses = []

    def _save_history():
        # Rewrite the full history so the file is always complete up to the
        # last finished epoch, even if the run is interrupted later.
        pd.DataFrame({'loss': losses, 'val_loss': vlosses}).to_csv(history_path, index=False)

    for epoch in range(epochs):
        print('EPOCH {}:'.format(epoch + 1))

        model.train(True)
        avg_loss = train_one_epoch(model, train_dataloader, optimizer, device, n_batches)

        model.eval()
        running_vloss = 0.0
        n_val_batches = 0

        with torch.no_grad():
            for vinputs, _vlabels in val_dataloader:
                vinputs = vinputs.to(device)
                vloss, _ = model(vinputs)
                running_vloss += vloss.item()
                n_val_batches += 1

        if n_val_batches:
            avg_vloss = running_vloss / n_val_batches
            scheduler.step(avg_vloss)
        else:
            # No validation data: record NaN and leave the scheduler alone
            # rather than stepping it with a meaningless value.
            avg_vloss = float('nan')

        losses.append(avg_loss)
        vlosses.append(avg_vloss)
        print('LOSS train {} val {}'.format(avg_loss, avg_vloss))

        _save_history()

    if not losses:
        # Zero epochs requested: still emit the (empty) history file.
        _save_history()


In [None]:
# Batches per epoch; train() passes this on as the divisor for the training loss.
n_train_batches = len(train_loader)

train(
    model=model,
    n_batches=n_train_batches,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    scheduler=scheduler,
    optimizer=optimizer,
    device=device,
    epochs=20,  # or set to 1 just to sanity test
)



In [None]:
from google.colab import runtime

# Release the Colab runtime once the cells above have finished.
# NOTE(review): no visible cell saves the model weights, and the loss CSV is
# written to a relative path rather than to Drive — confirm results are copied
# somewhere persistent before this runs.
runtime.unassign()