# Setup Drive & Repo

In [1]:
import os
from google.colab import drive

# 1. Mount Drive (Your Data)
drive.mount('/content/drive')

# 2. Sync Code from GitHub
REPO_URL = "https://github.com/brauner68/gm_proj.git"
REPO_NAME = "gm_proj"

if not os.path.exists(f"/content/{REPO_NAME}"):
    print(f"‚¨áÔ∏è Cloning {REPO_NAME}...")
    !git clone {REPO_URL}
else:
    print(f"üîÑ Repo exists. Pulling latest changes...")
    %cd /content/{REPO_NAME}
    !git pull

# 3. Enter Project Folder
%cd /content/{REPO_NAME}

# 4. Install Dependencies (The Magic Step)
# This reads your requirements.txt and installs everything at once
print("üì¶ Installing libraries from requirements.txt...")
!pip install -q -r requirements.txt

print("\n‚úÖ SETUP COMPLETE! You are ready to work.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
🔄 Repo exists. Pulling latest changes...
/content/gm_proj
Already up to date.
/content/gm_proj
📦 Installing libraries from requirements.txt...

✅ SETUP COMPLETE! You are ready to work.


# Unzip Data

In [2]:
# Create a folder for the unzipped data
!mkdir -p /content/nsynth_data

# Unzip the file (This might take a minute)
# The -C flag tells it where to put the files
print("‚è≥ Unzipping data...")
!tar -xzf "/content/drive/Shareddrives/gm_proj/nsynth-valid.jsonwav.tar.gz" -C /content/nsynth_data
print("‚úÖ Done!")

⏳ Unzipping data...
✅ Done!


# Train

In [3]:
import random

import torch

from src.trainer import DiffusionTrainer

# 0. Reproducibility
# Seed Python's and PyTorch's RNGs so repeated runs start from the same state.
# NOTE(review): if the trainer/dataset also draws from numpy's RNG, seed that
# too — cannot tell from this notebook.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# 1. Configuration
config = {
    'data_path': '/content/nsynth_data/nsynth-valid',       # Where you unzipped the data
    'max_samples': 1000,                                    # How many samples to load
    'output_dir': '/content/gm_proj/results/run_01',        # Where to save models/images
    'selected_families': ['guitar', 'mallet', 'brass'],     # The instruments you want
    'epochs': 15,                                           # How long to train
    'batch_size': 16,                                       # Adjust based on GPU memory
    'lr': 1e-4,                                             # Learning Rate
    'save_interval': 5,                                     # Save every 5 epochs
    'cfg_prob': 0.1                                         # 10% Dropout for CFG
}

# 2. Run
# Build the trainer from the config dict and start the training loop.
print("🚀 Starting Training...")
trainer = DiffusionTrainer(config)
trainer.train()

Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.
Flax classes are deprecated and will be removed in Diffusers v1.0.0. We recommend migrating to PyTorch classes or pinning your version of Diffusers.


🚀 Starting Training...
Label Map: {'guitar': 0, 'mallet': 1, 'brass': 2}
Dataset initialized. 1000 files selected.
Trainer Initialized on cuda
Classes: {'guitar': 0, 'mallet': 1, 'brass': 2}
Model will handle 4 embeddings (Index 3 is NULL)
Epoch 1/15


  0%|          | 0/63 [00:00<?, ?it/s]

Average Epoch Loss: 0.1535
Epoch 2/15


  0%|          | 0/63 [00:00<?, ?it/s]

Average Epoch Loss: 0.0538
Epoch 3/15


  0%|          | 0/63 [00:00<?, ?it/s]

Average Epoch Loss: 0.0417
Epoch 4/15


  0%|          | 0/63 [00:00<?, ?it/s]

Average Epoch Loss: 0.0331
Epoch 5/15


  0%|          | 0/63 [00:00<?, ?it/s]

Average Epoch Loss: 0.0296
Saved model to /content/gm_proj/results/run_01/checkpoint_epoch_4.pt
Generating Validation Samples...


  0%|          | 0/1000 [00:00<?, ?it/s]

Saved validation plot to /content/gm_proj/results/run_01/sample_epoch_4.png
Epoch 6/15


  plt.tight_layout()


  0%|          | 0/63 [00:00<?, ?it/s]

Average Epoch Loss: 0.0298
Epoch 7/15


  0%|          | 0/63 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Test Audio

In [5]:
from src.vocoder import Vocoder
from src.dataset import NSynthDataset
import torch
import IPython.display as ipd

# 1. Build the vocoder on the GPU when one is available, otherwise on the CPU
vocoder_device = 'cuda' if torch.cuda.is_available() else 'cpu'
vocoder = Vocoder(device=vocoder_device)

# 2. Load spectrograms for the 'mallet' family from the validation data
# (max_samples=None presumably means "no cap" — confirm in NSynthDataset)
valid_path = '/content/nsynth_data/nsynth-valid'
dataset = NSynthDataset(
    data_path=valid_path,
    max_samples=None,
    selected_families=['mallet'],
)

# Draw one shuffled batch of up to 8 items and keep its first entry
dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=8)
first_batch = next(iter(dataloader))
images, labels = first_batch
spec = images[0]

# 3. Convert the chosen spectrogram to audio
audio = vocoder.decode(spec)

# 4. Write the audio to disk, then play it back inline
save_path = "/content/test_tone.wav"
vocoder.save_audio(audio, save_path)

print("Playing Audio:")
# Last expression of the cell: rendered as an audio player widget
ipd.Audio(save_path)

Label Map: {'mallet': 0}
Dataset initialized. 663 files selected.
Saved audio to /content/test_tone.wav
Playing Audio:
