In [1]:
import pandas as pd

In [17]:
# Load the processed CSV (valence-cluster variant); its 'class' column is
# label-encoded in the next cell. The commented-out line is the genre variant.
#df=pd.read_csv('../data/processed/1000dataset_spec_genre.csv')
df=pd.read_csv('../data/processed/1000dataset_spec_valence_cluster.csv')

In [18]:
# Label-encode the 'class' column in place so it holds integer ids, then write
# the encoded table to a new CSV (note the "L"-prefixed file name).
# NOTE(review): the fitted encoder's mapping (le.classes_) is not persisted here,
# so the original class names cannot be recovered from the output file alone.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])

#df.to_csv('../data/processed/L1000dataset_spec_genre.csv', index=False)
# NOTE(review): a later cell reads '../data/processed/L1000dataset_5seg_valence.csv',
# which is not the file written here — confirm which file is intended.
df.to_csv('../data/processed/L1000dataset_spec_valence.csv', index=False)

## Create dataset + dataloader


In [2]:
from collections import defaultdict
from sklearn.model_selection import train_test_split
import pandas as pd
import random
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os

# Step 1: Group images by track ID and class
def group_tracks_by_class_and_id(association_csv):
    """Read the association CSV and bucket track IDs by class.

    A ``track_id`` column is added to the table: it is the file name of
    ``image_path`` with its last two underscore-separated pieces removed.
    Each track is assigned the class found on its first row.

    Args:
        association_csv: Path to a CSV with ``image_path`` and ``class`` columns.

    Returns:
        A ``(class_groups, df)`` tuple where ``class_groups`` maps each class to
        the list of its track IDs and ``df`` is the table including ``track_id``.
    """
    def _track_id_from_path(path):
        # "<track id>_<x>_<y>" -> "<track id>"; the track id itself may contain "_".
        name_parts = os.path.basename(path).split("_")
        return "_".join(name_parts[:-2])

    df = pd.read_csv(association_csv)
    df['track_id'] = df['image_path'].apply(_track_id_from_path)

    class_groups = defaultdict(list)
    # groupby iterates tracks in sorted track_id order.
    for track_id, track_rows in df.groupby('track_id'):
        first_row = track_rows.iloc[0]
        class_groups[first_row['class']].append(track_id)

    return class_groups, df

# Step 2: Split track IDs for each class
def split_tracks_by_class(class_groups, train_ratio=0.7, val_ratio=0.1, test_ratio=0.2, seed=None):
    """Split track IDs into train/val/test lists, one class at a time.

    Every class is split independently, so each split keeps roughly the same
    class proportions. Splitting is done on track IDs (not on images), so all
    images of one track end up in the same split.

    Args:
        class_groups: Mapping of class label -> list of track IDs. The lists
            are copied before splitting and are never modified.
        train_ratio: Fraction of each class's tracks used for training. The
            remaining ``1 - train_ratio`` is shared between validation and test.
        val_ratio: Relative weight of the validation split within the remainder.
        test_ratio: Relative weight of the test split within the remainder.
        seed: Optional integer forwarded to ``train_test_split`` as
            ``random_state``. With the default ``None`` the split differs on
            every call; pass a fixed value for a reproducible split.

    Returns:
        A ``(train_ids, val_ids, test_ids)`` tuple of lists of track IDs.

    Raises:
        ValueError: Propagated from ``train_test_split`` when a class has too
            few tracks to produce non-empty splits.
    """
    train_ids, val_ids, test_ids = [], [], []

    for track_ids in class_groups.values():
        # Work on a copy: train_test_split shuffles by default, so shuffling the
        # caller's list in place is both unnecessary and a surprising side effect.
        ids = list(track_ids)

        # First carve out the training part, then divide the remainder.
        train, holdout = train_test_split(
            ids, test_size=(1 - train_ratio), random_state=seed
        )
        val, test = train_test_split(
            holdout, test_size=(test_ratio / (test_ratio + val_ratio)), random_state=seed
        )

        train_ids.extend(train)
        val_ids.extend(val)
        test_ids.extend(test)

    return train_ids, val_ids, test_ids


In [3]:
# Step 3: Create a custom PyTorch Dataset
class SpectrogramDataset(Dataset):
    """Dataset of spectrogram images restricted to a given set of tracks.

    Rows of ``df`` whose ``track_id`` is in ``track_ids`` are kept in
    ``self.data``. Each item is an ``(image, label)`` pair: the image is loaded
    from the row's ``image_path`` as RGB and passed through ``transform`` when
    one is set; the label is the row's ``class`` value.
    """

    def __init__(self, df, track_ids, transform=None):
        belongs_to_split = df['track_id'].isin(track_ids)
        self.data = df[belongs_to_split]
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Positional lookup: self.data keeps the index of the unfiltered frame.
        record = self.data.iloc[idx]

        image = Image.open(record['image_path']).convert("RGB")
        if self.transform:
            image = self.transform(image)

        return image, record['class']

In [4]:
# Step 4: Create DataLoaders
def create_balanced_dataloaders(image_folder, association_csv, batch_size=32):
    """Build train/validation/test DataLoaders from an association CSV.

    Tracks are grouped by class, split per class into train/val/test, and each
    split is wrapped in a SpectrogramDataset sharing one preprocessing pipeline
    (resize to 224x224, tensor conversion, per-channel normalization with mean
    0.5 and std 0.5).

    Args:
        image_folder: Accepted for interface compatibility; this function does
            not read it (image locations come from the CSV).
        association_csv: Path to the CSV passed to group_tracks_by_class_and_id.
        batch_size: Batch size used by all three loaders.

    Returns:
        ``(train_loader, val_loader, test_loader)``; only the training loader
        shuffles its samples.
    """
    class_groups, df = group_tracks_by_class_and_id(association_csv)
    train_ids, val_ids, test_ids = split_tracks_by_class(class_groups)

    # Shared preprocessing for every split.
    preprocessing_steps = [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
    transform = transforms.Compose(preprocessing_steps)

    def _make_loader(track_ids, shuffle):
        split_dataset = SpectrogramDataset(df, track_ids, transform=transform)
        return DataLoader(split_dataset, batch_size=batch_size, shuffle=shuffle)

    train_loader = _make_loader(train_ids, True)
    val_loader = _make_loader(val_ids, False)
    test_loader = _make_loader(test_ids, False)

    return train_loader, val_loader, test_loader


In [5]:
# image_folder is handed to create_balanced_dataloaders, which does not read it;
# image locations come from the 'image_path' column of the association CSV.
image_folder = "../data/raw/1000dataset_5/specs"
#association_csv = "../data/processed/L1000dataset_spec_genre.csv"
association_csv = "../data/processed/L1000dataset_5seg_valence.csv"

# The split is random and unseeded, so the sample counts printed below can
# change between runs.
train_loader, val_loader, test_loader = create_balanced_dataloaders(image_folder, association_csv)

# Verify the splits
print(f"Number of training samples: {len(train_loader.dataset)}")
print(f"Number of validation samples: {len(val_loader.dataset)}")
print(f"Number of testing samples: {len(test_loader.dataset)}")


Number of training samples: 3486
Number of validation samples: 495
Number of testing samples: 1012


In [6]:
# Sanity check: fetch the first training sample straight from the dataset
# (bypassing the DataLoader) and print the (image tensor, label) pair.
dataset = train_loader.dataset
print(dataset[0])

(tensor([[[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -0.9922],
         [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -0.9922],
         [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -0.9922],
         ...,
         [ 0.6627,  0.6078,  0.6863,  ...,  0.9059,  0.9294,  0.9608],
         [ 0.6627,  0.6784,  0.6078,  ...,  0.9294,  0.9451,  0.9686],
         [ 0.6157,  0.6235,  0.4980,  ...,  0.9529,  0.9686,  0.9765]],

        [[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -0.9922],
         [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -0.9922],
         [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -0.9922],
         ...,
         [-0.4275, -0.4431, -0.4039,  ...,  0.8588,  0.6941,  0.4902],
         [-0.4275, -0.4196, -0.4745,  ...,  0.6627,  0.6000,  0.4275],
         [-0.4667, -0.4588, -0.5451,  ...,  0.3098,  0.4118,  0.3176]],

        [[-0.9686, -0.9686, -0.9686,  ..., -0.9686, -0.9686, -0.9373],
         [-0.9686, -0.9686, -0.9686,  ..., -

# Model Creation

In [7]:
import torch
from torch import nn
#from audioset_tagging_cnn.models import Cnn14  
import torch.nn as nn  # same module as the `nn` imported two lines above
from torchvision import models

# Number of output logits of the classification head.
num_classes = 3

# Load a pre-trained model
# The final fully connected layer is replaced by a new linear layer that maps
# the backbone features to num_classes logits.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour of
# `weights=` — confirm against the installed torchvision version.
model = models.resnet18(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, num_classes)

# Alternative head (disabled): intermediate layer + ReLU + dropout before the output.
#Modify the final layer for the number of genres 
#model.fc = nn.Sequential(
#    nn.Linear(model.fc.in_features, 256),  # Intermediate layer
#    nn.ReLU(),
#    nn.Dropout(0.3),
#    nn.Linear(256, num_classes)  # Output layer for genres
#)

# Set the device
# Uses CUDA when available, otherwise the CPU; the model is moved there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)




# Training/Validation loop

In [8]:
import wandb

# Initialize a new W&B run.
# The config records the hyperparameters of the run so they show up next to the
# logged metrics; keep these values in sync with the ones actually used below.
wandb.init(
    project='Audio_Class',
    config={
        # Must match the lr passed to torch.optim.Adam (0.0001); the previous
        # value of 0.001 mislabelled every run in the W&B dashboard.
        "learning_rate": 0.0001,
        "epochs": 10,
        "batch_size": 32,
        # Add other hyperparameters as needed
    },
)

# Access the configuration
config = wandb.config


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbabisbabis[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [9]:
from tqdm import tqdm
import os
import torch
import wandb

# Initialize a new W&B run
#wandb.init(project='Audio_Class',  
#    
#    config={
#    "learning_rate": 0.001,
#    "epochs": 10,
#    "batch_size": 32,
#    # Add other hyperparameters as needed
#})
#
## Access the configuration
#config = wandb.config


def _run_epoch(model, loader, criterion, device, desc, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in eval mode and gradients
    are disabled. Both metrics are averaged over ``len(loader.dataset)``.
    """
    training = optimizer is not None
    model.train(training)  # model.train(False) is equivalent to model.eval()
    total_loss, total_correct = 0.0, 0

    progress_bar = tqdm(loader, desc=desc, leave=True)
    with torch.set_grad_enabled(training):
        for images, labels in progress_bar:
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # loss is a per-batch mean; weight it by the batch size so the
            # epoch figure is a true per-sample average.
            total_loss += loss.item() * images.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()
            progress_bar.set_postfix({"Loss": f"{loss.item():.4f}"})
    progress_bar.close()

    num_samples = len(loader.dataset)
    return total_loss / num_samples, total_correct / num_samples


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, save_interval=1,
                save_dir="saved_models"):
    """Train ``model`` with a validation pass, W&B logging and checkpoints per epoch.

    Args:
        model: Module to train. Batches are moved to the device of its first
            parameter (CPU if the model has no parameters), so the function
            does not depend on a notebook-global ``device``.
        train_loader: DataLoader yielding ``(images, labels)`` training batches.
        val_loader: DataLoader yielding ``(images, labels)`` validation batches.
        criterion: Loss callable applied as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per training batch.
        num_epochs: Number of epochs to run.
        save_interval: Write a checkpoint every this many epochs; a falsy value
            (``0`` or ``None``) disables checkpointing.
        save_dir: Directory for checkpoints, created on demand. Files are named
            ``model_epoch_<n>.pth``.

    Returns:
        None. Metrics are printed and sent to the active W&B run.
    """
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(num_epochs):
        epoch_no = epoch + 1

        train_loss, train_accuracy = _run_epoch(
            model, train_loader, criterion, run_device,
            desc=f"Epoch {epoch_no}/{num_epochs} - Training",
            optimizer=optimizer,
        )
        val_loss, val_accuracy = _run_epoch(
            model, val_loader, criterion, run_device,
            desc=f"Epoch {epoch_no}/{num_epochs} - Validation",
        )

        # One log call per epoch: every wandb.log call advances the W&B step,
        # so logging train and val separately puts them on different steps.
        wandb.log({
            "epoch": epoch_no,
            "train_loss": train_loss,
            "train_accuracy": train_accuracy,
            "val_loss": val_loss,
            "val_accuracy": val_accuracy,
        })

        # Print epoch summary
        print(f"Epoch [{epoch_no}/{num_epochs}]: "
              f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

        # Save the model at specific intervals
        if save_interval and epoch_no % save_interval == 0:
            os.makedirs(save_dir, exist_ok=True)
            save_path = os.path.join(save_dir, f"model_epoch_{epoch_no}.pth")
            torch.save({
                'epoch': epoch_no,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': train_loss,
                'val_loss': val_loss,
            }, save_path)
            print(f"Model saved to {save_path}")


In [10]:
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def test_model(model, test_loader, device, class_names=None):
    """
    Evaluate a trained model on the test set and print summary metrics.

    Args:
        model: Trained PyTorch model.
        test_loader: DataLoader yielding ``(images, labels)`` batches; labels
            are expected to be integer class indices.
        device: Torch device the images are moved to before the forward pass.
        class_names: Optional list of class names. When given (and non-empty),
            a confusion-matrix heatmap is saved to ``confusion_matrix.png`` and
            then displayed.

    Returns:
        None. Prints accuracy and weighted precision, recall and F1-score.
    """
    model.eval()  # Set the model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient calculation for testing
        for images, labels in test_loader:
            images = images.to(device)

            # Predicted class = index of the largest logit.
            preds = torch.argmax(model(images), dim=1)

            # Collect predictions and true labels on the CPU for scikit-learn.
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # 'weighted' averages the per-class scores by class support.
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    # Print results
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")

    # Confusion matrix (only drawn when class names are available for the axes)
    if class_names:
        cm = confusion_matrix(all_labels, all_preds)
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names, yticklabels=class_names, ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        ax.set_title('Confusion Matrix')
        # Save BEFORE showing: plt.show() can release the figure, after which
        # savefig would write an empty image.
        fig.savefig('confusion_matrix.png')
        plt.show()


In [11]:
from sklearn.utils.class_weight import compute_class_weight
import torch

#class_weights = compute_class_weight('balanced', classes=np.unique(dataset.labels), y=dataset.labels)
#class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
#
#criterion = nn.CrossEntropyLoss(weight=class_weights)
#criterion = nn.CrossEntropyLoss()
#optimizer = torch.optim.Adam(model.parameters(), lr=0.00003, weight_decay=1e-3)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

# Unweighted cross-entropy loss and Adam (lr=1e-4) over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.0001)
# NOTE(review): save_folder is not referenced by the other cells shown here;
# train_model writes its checkpoints under "saved_models" by default.
save_folder = "../models/classifier"

In [12]:
# Assuming model, train_loader, val_loader, criterion, optimizer are defined
# Re-evaluates the same device expression used in the model-creation cell and
# prints which device was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on device: {device}")


Training on device: cuda


In [13]:
# Train for 10 epochs; save_interval keeps its default of 1, so a checkpoint is
# written after every epoch.
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)

Epoch 1/10 - Training: 100%|██████████| 109/109 [01:23<00:00,  1.30it/s, Loss=1.0758]
Epoch 1/10 - Validation: 100%|██████████| 16/16 [00:10<00:00,  1.58it/s, Loss=1.1676]


Epoch [1/10]: Train Loss: 0.9107, Train Accuracy: 0.5622, Val Loss: 0.9652, Val Accuracy: 0.5414
Model saved to saved_models\model_epoch_1.pth


Epoch 2/10 - Training: 100%|██████████| 109/109 [00:57<00:00,  1.91it/s, Loss=0.4014]
Epoch 2/10 - Validation: 100%|██████████| 16/16 [00:07<00:00,  2.11it/s, Loss=1.0196]


Epoch [2/10]: Train Loss: 0.4724, Train Accuracy: 0.8216, Val Loss: 1.2643, Val Accuracy: 0.5232
Model saved to saved_models\model_epoch_2.pth


Epoch 3/10 - Training: 100%|██████████| 109/109 [00:56<00:00,  1.94it/s, Loss=0.0545]
Epoch 3/10 - Validation: 100%|██████████| 16/16 [00:07<00:00,  2.08it/s, Loss=1.4816]


Epoch [3/10]: Train Loss: 0.1436, Train Accuracy: 0.9584, Val Loss: 1.4595, Val Accuracy: 0.5192
Model saved to saved_models\model_epoch_3.pth


Epoch 4/10 - Training: 100%|██████████| 109/109 [00:56<00:00,  1.92it/s, Loss=0.0994]
Epoch 4/10 - Validation: 100%|██████████| 16/16 [00:07<00:00,  2.12it/s, Loss=2.0436]


Epoch [4/10]: Train Loss: 0.0499, Train Accuracy: 0.9923, Val Loss: 1.4584, Val Accuracy: 0.5354
Model saved to saved_models\model_epoch_4.pth


Epoch 5/10 - Training: 100%|██████████| 109/109 [00:57<00:00,  1.91it/s, Loss=0.0064]
Epoch 5/10 - Validation: 100%|██████████| 16/16 [00:07<00:00,  2.03it/s, Loss=2.0351]


Epoch [5/10]: Train Loss: 0.0253, Train Accuracy: 0.9948, Val Loss: 1.7968, Val Accuracy: 0.5111
Model saved to saved_models\model_epoch_5.pth


Epoch 6/10 - Training: 100%|██████████| 109/109 [00:57<00:00,  1.91it/s, Loss=0.0901]
Epoch 6/10 - Validation: 100%|██████████| 16/16 [00:07<00:00,  2.11it/s, Loss=1.3111]


Epoch [6/10]: Train Loss: 0.0273, Train Accuracy: 0.9937, Val Loss: 1.7433, Val Accuracy: 0.5556
Model saved to saved_models\model_epoch_6.pth


Epoch 7/10 - Training: 100%|██████████| 109/109 [00:56<00:00,  1.94it/s, Loss=0.0893]
Epoch 7/10 - Validation: 100%|██████████| 16/16 [00:07<00:00,  2.10it/s, Loss=1.2149]


Epoch [7/10]: Train Loss: 0.0587, Train Accuracy: 0.9808, Val Loss: 1.7491, Val Accuracy: 0.5414
Model saved to saved_models\model_epoch_7.pth


Epoch 8/10 - Training: 100%|██████████| 109/109 [00:56<00:00,  1.93it/s, Loss=0.3508]
Epoch 8/10 - Validation: 100%|██████████| 16/16 [00:07<00:00,  2.09it/s, Loss=2.6156]


Epoch [8/10]: Train Loss: 0.0983, Train Accuracy: 0.9627, Val Loss: 1.8887, Val Accuracy: 0.5111
Model saved to saved_models\model_epoch_8.pth


Epoch 9/10 - Training: 100%|██████████| 109/109 [00:56<00:00,  1.91it/s, Loss=0.0952]
Epoch 9/10 - Validation: 100%|██████████| 16/16 [00:07<00:00,  2.11it/s, Loss=0.7411]


Epoch [9/10]: Train Loss: 0.1332, Train Accuracy: 0.9521, Val Loss: 2.2765, Val Accuracy: 0.5434
Model saved to saved_models\model_epoch_9.pth


Epoch 10/10 - Training: 100%|██████████| 109/109 [00:57<00:00,  1.91it/s, Loss=0.1259]
Epoch 10/10 - Validation: 100%|██████████| 16/16 [00:07<00:00,  2.14it/s, Loss=1.9335]

Epoch [10/10]: Train Loss: 0.0939, Train Accuracy: 0.9624, Val Loss: 1.9689, Val Accuracy: 0.4909
Model saved to saved_models\model_epoch_10.pth





In [14]:
# Evaluate the model on the test split. class_names is not passed, so only the
# scalar metrics are printed and no confusion-matrix plot is produced.
test_model(model, test_loader, device)

Test Accuracy: 0.5109
Test Precision: 0.5179
Test Recall: 0.5109
Test F1 Score: 0.5125


## Guide diffusion

from diffusers import StableDiffusionPipeline
import torch
import os

# Initialize the pipeline
# Weights are loaded in half precision and the pipeline is moved to the GPU.
pipeline = StableDiffusionPipeline.from_pretrained("riffusion/riffusion-model-v1", torch_dtype=torch.float16)
pipeline.to("cuda")

# List of prompts for generating images
prompts = [
    "a jazz song with guitar and drums",
    "a classical piano solo in a grand hall",
    "an energetic rock concert with electric guitars",
    "a relaxing hip-hop rap beat",
    "a vibrant electronic dance beat with colorful visuals",
    "a soulful saxophone performance in a smoky bar",
    "a folk song with banjo and violin in the countryside",
    "a heavy metal track with loud drums and guitar solos",
    "a calm meditation track with flutes and soft tones",
    "an electronic techno beat with strong bass"
]

# Directory to save the generated images
output_dir = "generated_images"
os.makedirs(output_dir, exist_ok=True)

# Generate and save images
for prompt in prompts:
    # Generate the image (first image of the pipeline output)
    image = pipeline(prompt).images[0]

    # Clean the prompt to use as filename: spaces and "/" become "_"
    filename = os.path.join(output_dir, prompt.replace(" ", "_").replace("/", "_") + ".png")

    # Save the image
    image.save(filename)
    print(f"Saved: {filename}")


In [None]:
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline
import torch

# Load the local trained classifier model in models/classifier/model_epoch_10.pth
# Rebuilds the same architecture as the training cell (ResNet-18 with a
# num_classes-way linear head; `models`, `nn` and `num_classes` come from earlier
# cells) and restores the weights stored under 'model_state_dict'.
# NOTE(review): train_model saves checkpoints to "saved_models/model_epoch_<n>.pth"
# by default, not to "models/classifier/" — confirm the checkpoint was copied
# here, or adjust the path.
classifier = models.resnet18(pretrained=False)
classifier.fc = nn.Linear(classifier.fc.in_features, num_classes)
classifier.load_state_dict(torch.load("models/classifier/model_epoch_10.pth")['model_state_dict'])
classifier.eval()
classifier.to("cuda")

# Function to apply classifier guidance
def apply_classifier_guidance(image, prompt, strength=0.5):
    """Classify a generated image and return the (currently unmodified) image.

    The image is preprocessed exactly like the classifier's training data
    (resize to 224x224, tensor conversion, normalization with mean 0.5 and
    std 0.5 per channel) and run through the module-level ``classifier``.

    Args:
        image: PIL image to classify.
        prompt: Prompt the image was generated from. Not used yet.
        strength: Intended guidance strength. Not used yet.

    Returns:
        The input image. Actual guidance is still a placeholder, so the image
        is returned unchanged whatever the predicted class is.
    """
    # Must mirror the training-time transform: feeding the classifier inputs
    # normalized with different statistics (e.g. ImageNet mean/std) shifts the
    # input distribution and makes its predictions unreliable.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])
    # Run on whichever device the classifier lives on instead of assuming CUDA.
    classifier_device = next(classifier.parameters()).device
    image_tensor = transform(image).unsqueeze(0).to(classifier_device)

    # Get classifier predictions
    with torch.no_grad():
        outputs = classifier(image_tensor)
        predicted = outputs.argmax(dim=1)

    # Apply guidance based on classifier prediction
    if predicted.item() == 0:  # Assuming class 0 is the desired class
        return image
    else:
        # Modify the image based on the classifier's prediction
        # This is a placeholder for actual guidance logic
        return image

# Initialize the pipeline
# NOTE(review): this is an image-to-image pipeline, but the loop below calls it
# with a prompt only. Img2Img pipelines normally need an initial image
# (`image=...`) — confirm whether a seed image or the text-to-image pipeline
# was intended.
pipeline = StableDiffusionImg2ImgPipeline.from_pretrained("riffusion/riffusion-model-v1", torch_dtype=torch.float16)
pipeline.to("cuda")

# List of prompts for generating images
prompts = [
    "a jazz song with guitar and drums",
    "a classical piano solo in a grand hall",
    "an energetic rock concert with electric guitars",
    "a relaxing hip-hop rap beat",
    "a vibrant electronic dance beat with colorful visuals",
    "a soulful saxophone performance in a smoky bar",
    "a folk song with banjo and violin in the countryside",
    "a heavy metal track with loud drums and guitar solos",
    "a calm meditation track with flutes and soft tones",
    "an electronic techno beat with strong bass"
]

# Directory to save the generated images
output_dir = "generated_images"
os.makedirs(output_dir, exist_ok=True)

# Generate and save images with classifier guidance
for prompt in prompts:
    # Generate the image
    image = pipeline(prompt).images[0]
    
    # Apply classifier guidance
    # (apply_classifier_guidance currently returns the image unchanged.)
    guided_image = apply_classifier_guidance(image, prompt)
    
    # Clean the prompt to use as filename
    # Spaces and "/" become "_"; "_guided" is appended before the extension.
    filename = os.path.join(output_dir, prompt.replace(" ", "_").replace("/", "_") + "_guided.png")
    
    # Save the image
    guided_image.save(filename)
    print(f"Saved: {filename}")