In [2]:
import pandas as pd
import pathlib

# Purpose of this function is to remove leading zeros from the filename column
# and change extension from .ogg to .wav
def process_filenames(df, column_name):
    """
    Purpose of this function is to remove leading zeros from the values in the 
    filename column and to change the file extension from ogg to wav
    """
    def modify_filename(filename):
        # Split the filename and extension
        name, ext = filename.rsplit('.', 1)
        # Remove leading zeros and change the extension
        if ext.lower() == 'ogg':
            name = str(int(name))  # Convert to int to remove leading zeros, then back to string
            return f"{name}.wav"
        return filename  # Return unchanged if not .ogg
    
    # Apply the modification to the specified column
    df[column_name] = df[column_name].apply(modify_filename)
    return df
    
train_csv = "music-classification-wav/versions/3/IA/train.csv"
genres_csv = "music-classification-wav/versions/3/IA/genres.csv"
train_df = pd.read_csv(train_csv)
genres_df = pd.read_csv(genres_csv)


## Process the DataFrame
processed_df = process_filenames(train_df, 'filename')
processed_df = processed_df.drop(columns=["filepath"])
#processed_df.to_csv("processed_train.csv")
#print(processed_df)


In [4]:
# # This is a df where "filename.ogg" has been changed to "filename.wav".
# cleaned_df = pd.read_csv("processed.csv", index_col=0)  
cleaned_df = processed_df.copy()
# Now I want to check this CSV against the files we actually have 
# and remove any entries from the CSV that don't have a corresponding file
audio_dir = pathlib.Path("music-classification-wav/versions/3/IA/train/")
audio_files = audio_dir.iterdir()
actual_filenames = [file.name for file in audio_files]


csv_filenames = cleaned_df.loc[:, "filename"]
print(f"CSV Entries: {len(csv_filenames)}")
print(f"Actual Files: {len(actual_filenames)}")
print(f"CSV has {len(csv_filenames) - len(actual_filenames)} more entries than we have files.\n")
size_before = cleaned_df.shape[0]
print("cleaning...\n")
deleted_entries_count = 0
deleted_entries_list = []
for filename in csv_filenames:
    if filename not in actual_filenames:
        deleted_entries_count += 1
        deleted_entries_list.append(filename)
        cleaned_df = cleaned_df[cleaned_df['filename'] != filename]
        
print(f"Size before cleaning: {size_before}")
print(f"Size after cleaning: {cleaned_df.shape[0]}")
print(f"{deleted_entries_count} entries deleted!")
print(f"entries deleted:")
[print("\t", file) for file in deleted_entries_list]
cleaned_csv_path = "cleaned_train.csv"
cleaned_df.to_csv(cleaned_csv_path)
print(f"Wrote csv file to {cleaned_csv_path}")

CSV Entries: 19922
Actual Files: 19896
CSV has 26 more entries than we have files.

cleaning...

Size before cleaning: 19922
Size after cleaning: 19896
26 entries deleted!
entries deleted:
	 2549.wav
	 3137.wav
	 20407.wav
	 11088.wav
	 16312.wav
	 1239.wav
	 7795.wav
	 19765.wav
	 24899.wav
	 4040.wav
	 8897.wav
	 9963.wav
	 15980.wav
	 22698.wav
	 2643.wav
	 17475.wav
	 13702.wav
	 23369.wav
	 23078.wav
	 17940.wav
	 22295.wav
	 20462.wav
	 952.wav
	 20445.wav
	 3071.wav
	 13954.wav
Wrote csv file to cleaned_train.csv


In [None]:
If the csv file created by the above code has already been generated, you can start here.

In [3]:
import pandas as pd

cleaned_df = pd.read_csv("cleaned_train.csv")
# # Find how many examples in each genre.
# cleaned_csv_path = "cleaned_train.csv"
# cleaned_df = pd.read_csv(cleaned_csv_path, index_col=0)
freq_dict = cleaned_df["genre"].value_counts().to_dict()
for key, value in freq_dict.items():
    print(f"{key}: {value}")

Rock: 3095
Electronic: 3071
Punk: 2582
Experimental: 1799
Hip-Hop: 1756
Folk: 1214
Chiptune / Glitch: 1181
Instrumental: 1044
Pop: 944
International: 814
Ambient Electronic: 796
Classical: 495
Old-Time / Historic: 408
Jazz: 306
Country: 142
Soul-RnB: 94
Spoken: 94
Blues: 58


In [6]:
import librosa
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
import time
import pathlib

# Function to extract Mel Spectrogram features
def extract_mel_spectrogram(file_path, n_mels=64, duration=5, sr=22050):
    y, sr = librosa.load(file_path, sr=sr, duration=duration)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    mel_spec_db = (mel_spec_db - mel_spec_db.mean()) / mel_spec_db.std()
    return mel_spec_db.T  # Transpose to get (time, features)

audio_dir = pathlib.Path("music-classification-wav/versions/3/IA/train/")

audio_files = [str(audio_dir / filename) for filename in cleaned_df.loc[:, "filename"]]

start_time = time.time()
features = [extract_mel_spectrogram(file) for file in audio_files]
features = [torch.tensor(f, dtype=torch.float32) for f in features]
end_time = time.time()

elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")

Elapsed time: 95.75 seconds


In [7]:
import pickle

with open('features.pkl', 'wb') as file:
    pickle.dump(features, file)

with open('features.pkl', 'rb') as file:
    loaded_data = pickle.load(file)

In [8]:
class AudioTransformer(nn.Module):
    def __init__(self, input_dim, d_model=128, nhead=4, num_layers=4):
        super(AudioTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.pooling = nn.AdaptiveAvgPool1d(1)

    def forward(self, x):
        # x: (batch_size, seq_len, input_dim)
        x = self.embedding(x)  # (batch_size, seq_len, d_model)
        x = x.permute(1, 0, 2)  # (seq_len, batch_size, d_model)
        x = self.transformer_encoder(x)
        x = x.permute(1, 2, 0)  # (batch_size, d_model, seq_len)
        x = self.pooling(x).squeeze(-1)  # (batch_size, d_model)
        return x
        
start_time = time.time()
# Instantiate the Transformer model
input_dim = features[0].shape[1]
transformer = AudioTransformer(input_dim=input_dim)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")

# Convert the list of features to a batch
features = torch.stack(features)


Elapsed time: 0.02 seconds




RuntimeError: stack expects each tensor to be equal size, but got [216, 64] at entry 0 and [22, 64] at entry 6182

In [2]:
class AudioTransformer(nn.Module):
    def __init__(self, input_dim, d_model=128, nhead=4, num_layers=4):
        super(AudioTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.pooling = nn.AdaptiveAvgPool1d(1)

    def forward(self, x):
        # x: (batch_size, seq_len, input_dim)
        x = self.embedding(x)  # (batch_size, seq_len, d_model)
        x = x.permute(1, 0, 2)  # (seq_len, batch_size, d_model)
        x = self.transformer_encoder(x)
        x = x.permute(1, 2, 0)  # (batch_size, d_model, seq_len)
        x = self.pooling(x).squeeze(-1)  # (batch_size, d_model)
        return x

# Instantiate the Transformer model
input_dim = features[0].shape[1]
transformer = AudioTransformer(input_dim=input_dim)

# Convert the list of features to a batch
features = torch.stack(features)


NameError: name 'features' is not defined

In [None]:
with torch.no_grad():
    embeddings = transformer(features)


In [None]:
from sklearn.cluster import KMeans

n_clusters = 5
embeddings_np = embeddings.numpy()
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans_labels = kmeans.fit_predict(embeddings_np)
cluster_centers = kmeans.cluster_centers_


In [None]:
class ClusteringLayer(nn.Module):
    def __init__(self, cluster_centers):
        super(ClusteringLayer, self).__init__()
        self.cluster_centers = nn.Parameter(torch.tensor(cluster_centers, dtype=torch.float32))

    def forward(self, x):
        q = 1.0 / (1.0 + torch.sum((x.unsqueeze(1) - self.cluster_centers) ** 2, dim=2))
        q = q / torch.sum(q, dim=1, keepdim=True)
        return q

# Define the DEC model
class DECModel(nn.Module):
    def __init__(self, transformer, cluster_centers):
        super(DECModel, self).__init__()
        self.transformer = transformer
        self.clustering_layer = ClusteringLayer(cluster_centers)

    def forward(self, x):
        embeddings = self.transformer(x)
        q = self.clustering_layer(embeddings)
        return q

# Instantiate the DEC model
dec_model = DECModel(transformer, cluster_centers)


In [None]:
import torch.optim as optim

def kl_divergence_loss(p, q):
    return torch.sum(p * torch.log(p / (q + 1e-10)))

# Target distribution function
def target_distribution(q):
    weight = q ** 2 / torch.sum(q, dim=0)
    return (weight.T / torch.sum(weight, dim=1)).T

# Fine-tuning with DEC
optimizer = optim.Adam(dec_model.parameters(), lr=1e-4)
epochs = 100

for epoch in range(epochs):
    q = dec_model(features)
    p = target_distribution(q).detach()

    loss = kl_divergence_loss(p, q)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")


In [None]:
with torch.no_grad():
    final_q = dec_model(features)
    final_assignments = torch.argmax(final_q, dim=1).numpy()

print("Final cluster assignments:", final_assignments)
