## Set up dependencies

In [None]:
!pip install kaggle
!pip install torch torchaudio torchvision
!pip install matplotlib
!sudo apt install libsox-dev
!echo '{"username":"antoinedangeard","key":"445fa2e3c51d7c9afd628cc57cd7fa33"}' > ~/.kaggle/kaggle.json

In [3]:
import torch
import torchaudio
import torchvision
import matplotlib.pyplot as plt
from IPython.display import Audio
from PIL import Image
import random

## Load dataset

In [4]:
!kaggle datasets download -d andradaolteanu/gtzan-dataset-music-genre-classification
!unzip gtzan-dataset-music-genre-classification.zip -d GTZAN
!rm gtzan-dataset-music-genre-classification.zip

Downloading gtzan-dataset-music-genre-classification.zip to /content
100% 1.21G/1.21G [00:41<00:00, 33.9MB/s]
100% 1.21G/1.21G [00:41<00:00, 31.1MB/s]
Archive:  gtzan-dataset-music-genre-classification.zip
replace GTZAN/Data/features_30_sec.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
  inflating: GTZAN/Data/features_3_sec.csv  


In [6]:
GENRE_TO_LABEL_MAPPING = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]
GTZAN_SAMPLE_RATE = 22050
TARGET_SAMPLE_RATE = 11025

# Load the dataset
print(f"Loading raw images from dataset at content/GTZAN")
samples = []
labels = []
with open("GTZAN/Data/features_30_sec.csv", 'r') as file:
    for line in file:
        fields = line.strip().split(",")
        genre = fields[-1]
        wav_filename = "GTZAN/Data/genres_original/{}/{}".format(genre, fields[0])
        try:
          sample, _ = torchaudio.load(wav_filename)
          label = GENRE_TO_LABEL_MAPPING.index(genre)
          samples.append(sample)
          labels.append(label)
        except RuntimeError:
          print("Missing sample {}".format(wav_filename))

# Sanity check: play one of the samples to check it is correctly loaded
random_index = random.randint(0, len(samples)-1)
print("Listening to {}:".format(GENRE_TO_LABEL_MAPPING[labels[random_index]]))
Audio(samples[random_index].squeeze().numpy(), rate=GTZAN_SAMPLE_RATE)

Loading raw images from dataset at content/GTZAN
Missing sample GTZAN/Data/genres_original/label/filename
Missing sample GTZAN/Data/genres_original/jazz/jazz.00054.wav
Listening to hiphop:


## Augment Dataset

In [7]:
speed_factors = [0.9, 1.0, 1.1]
# gains = range(-3, 4)
pitches = [100 * n for n in range(-6, 7)]

total_iterations = len(samples) * len(speed_factors) * len(pitches)
iterations_completed = 0

print("Adding {} new samples to the dataset with augmentations...".format(total_iterations - len(samples)))

augmented_samples = []
augmented_labels = []

for i in range(len(samples)):
  for speed_factor in speed_factors:
    for cent_pitch_shift in pitches:
      print("{}%               ".format(100 * iterations_completed / total_iterations))

      effects = [
          ["speed", str(speed_factor)],
          ["pitch", str(cent_pitch_shift)]
      ]

      augmented_sample, _ = torchaudio.sox_effects.apply_effects_tensor(samples[i], TARGET_SAMPLE_RATE, effects)
      augmented_samples.append(augmented_sample)
      augmented_labels.append(labels[i])
      iterations_completed += 1

print("Added {} new samples to the dataset with augmentations.".format(len(augmented_samples) - len(samples)))

# Sanity check: listen to a raw vs. augmented version of the same sample to ensure it is not too extreme
random_raw_sample_index = random.randint(0, len(samples)-1)
print("Original sample of {}".format(GENRE_TO_LABEL_MAPPING[labels[random_index]]))
Audio(samples[random_raw_sample_index].squeeze().numpy(), rate=TARGET_SAMPLE_RATE)
augmented_samples_per_original_sample = int(total_iterations / len(samples))
random_augmented_sample_index = random.randint(random_raw_sample_index * augmented_samples_per_original_sample, ((random_raw_sample_index + 1) * augmented_samples_per_original_sample) - 1)
print("Augmented sample of {}".format(GENRE_TO_LABEL_MAPPING[labels[random_index]]))
Audio(samples[random_raw_sample_index].squeeze().numpy(), rate=TARGET_SAMPLE_RATE)

Adding 37962 new samples to the dataset with augmentations...
0.0%               
0.0025666692333359%               
0.0051333384666718%               
0.0077000077000077%               
0.0102666769333436%               
0.0128333461666795%               
0.0154000154000154%               
0.0179666846333513%               
0.0205333538666872%               
0.0231000231000231%               
0.025666692333359%               
0.0282333615666949%               
0.0308000308000308%               
0.0333667000333667%               
0.0359333692667026%               
0.0385000385000385%               
0.0410667077333744%               
0.0436333769667103%               
0.0462000462000462%               
0.0487667154333821%               
0.051333384666718%               
0.0539000539000539%               
0.0564667231333898%               
0.0590333923667257%               
0.0616000616000616%               
0.0641667308333975%               
0.0667334000667334%               
0.06930006

KeyboardInterrupt: 

In [None]:
# train / test split
test_ratio = 0.2
test_size = int(test_ratio * len(dataset))
train_size = len(dataset) - test_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
print(f"{train_size} images for training, {test_size} images for testing.")