In [2]:
import torch
import torchvision
import torchaudio
import random
import numpy as np
import librosa
import librosa.display
import pandas as pd
import os
from PIL import Image
import pathlib
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import ToTensor
from torchvision.utils import make_grid
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import random_split
from tqdm.autonotebook import tqdm
import IPython.display as ipd
from IPython.display import Audio
import torchvision.transforms as T
import warnings
from classification_model import Net
import soundfile as sf
%matplotlib inline
warnings.filterwarnings('ignore')

In [3]:
# Colormap handed to the plt.specgram calls further down (cmap=cmap).
cmap = plt.get_cmap('inferno')

In [4]:
# Build a throwaway classifier instance so its architecture is shown as the
# cell output; the instance is not kept (the working model is created later).
Net()

Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /root/.cache/torch/hub/checkpoints/resnet34-b627a593.pth


  0%|          | 0.00/83.3M [00:00<?, ?B/s]

Net(
  (network): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running

In [5]:
def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)
    
def to_device(data, device):
    """Move a tensor, or a (possibly nested) list/tuple of tensors, to `device`.

    Lists and tuples are handled element by element and always come back as a
    list; any other object is moved with `.to(device, non_blocking=True)`.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    return [to_device(item, device) for item in data]

class DeviceDataLoader():
    """Thin wrapper around a dataloader that yields every batch on a target device."""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __len__(self):
        """Return the number of batches reported by the wrapped dataloader."""
        return len(self.dl)

    def __iter__(self):
        """Walk the wrapped dataloader, moving each batch to the device before yielding it."""
        yield from (to_device(batch, self.device) for batch in self.dl)

In [6]:
# Choose the compute device once; the bare name on the last line displays it
# as the cell output.
device = get_default_device()
device

device(type='cuda')

In [7]:
# Working classifier instance, moved onto the selected device.
model = to_device(data=Net(), device=device)

In [8]:
# Ask PyTorch's CUDA caching allocator to release unused cached memory.
torch.cuda.empty_cache()

In [9]:
# Running for Emotion Classification
# Load the emotion checkpoint into `model`, mapping stored tensors onto `device`.
# strict=False tolerates missing/unexpected keys, so inspect the result shown
# as this cell's output to confirm every key matched.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
model.load_state_dict(
    torch.load('./model_emotion.pth', map_location=device),
    strict=False,
)

<All keys matched successfully>

In [10]:
# On a new generated music
# Load the generated clip plus one reference clip per emotion, then save the
# generated clip's spectrogram as './output/output1_emotion.png'.
songname = './chaconne_generated_clip.wav'
pathlib.Path('./output/').mkdir(parents=True, exist_ok=True)

# NOTE(review): these paths start with '.data/' (a hidden directory relative to
# the working directory), not './data/' — confirm that is intentional.
song_amazement = '.data/emotifymusic/electronic/39.mp3'
song_calmness = '.data/emotifymusic/classical/52.mp3'
song_nostalgia = '.data/emotifymusic/classical/1.mp3'
song_solemnity = '.data/emotifymusic/pop/54.mp3'
song_tenderness = '.data/emotifymusic/classical/14.mp3'

# At most the first 10 s of each file, down-mixed to mono.
y, sr = librosa.load(songname, mono=True, duration=10)
print(y.shape)
y_amazement, sr_amazement = librosa.load(song_amazement, mono=True, duration=10)
y_calmness, sr_calmness = librosa.load(song_calmness, mono=True, duration=10)
y_nostalgia, sr_nostalgia = librosa.load(song_nostalgia, mono=True, duration=10)
y_solemnity, sr_solemnity = librosa.load(song_solemnity, mono=True, duration=10)
y_tenderness, sr_tenderness = librosa.load(song_tenderness, mono=True, duration=10)

# `duration=10` is only an upper bound: a shorter file yields a shorter array,
# and the later `0.5 * y + y_<emotion>` mixes would then fail to broadcast.
# Zero-pad/trim every reference to len(y); this is a no-op when lengths match.
y_amazement = librosa.util.fix_length(y_amazement, size=len(y))
y_calmness = librosa.util.fix_length(y_calmness, size=len(y))
y_nostalgia = librosa.util.fix_length(y_nostalgia, size=len(y))
y_solemnity = librosa.util.fix_length(y_solemnity, size=len(y))
y_tenderness = librosa.util.fix_length(y_tenderness, size=len(y))

# Axis-free spectrogram of the generated clip, saved to disk; the figure is
# cleared afterwards so the next plot starts from a blank canvas.
plt.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap, sides='default', mode='default', scale='dB')
plt.axis('off')
plt.savefig('./output/output1_emotion.png')
plt.clf()

(220500,)


<Figure size 432x288 with 0 Axes>

In [11]:
# generating amazement
# Blend: half-gain base clip + amazement reference + lightly scaled Gaussian
# noise (`y`, `y_amazement` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_amaze_gen = 0.5 * y + y_amazement + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_amaze_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_amazement.png')
plt.clf()


<Figure size 432x288 with 0 Axes>

In [12]:
# Inline player for the amazement blend.
Audio(data=y_amaze_gen, rate=sr)

In [13]:
# Persist the amazement blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_amaze.wav', data=y_amaze_gen, samplerate=sr, subtype='PCM_24')

In [14]:
# generating calmness
# Blend: half-gain base clip + calmness reference + lightly scaled Gaussian
# noise (`y`, `y_calmness` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_calmn_gen = 0.5 * y + y_calmness + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_calmn_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_calmness.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [15]:
# Inline player for the calmness blend.
Audio(data=y_calmn_gen, rate=sr)

In [16]:
# Persist the calmness blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_calm.wav', data=y_calmn_gen, samplerate=sr, subtype='PCM_24')

In [17]:
# generating nostalgia
# Blend: half-gain base clip + nostalgia reference + lightly scaled Gaussian
# noise (`y`, `y_nostalgia` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_nostalgia_gen = 0.5 * y + y_nostalgia + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_nostalgia_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_nostalgia.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [18]:
# Inline player for the nostalgia blend.
Audio(data=y_nostalgia_gen, rate=sr)

In [19]:
# Persist the nostalgia blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_nostalgia.wav', data=y_nostalgia_gen, samplerate=sr, subtype='PCM_24')

In [20]:
# generating solemnity
# Blend: half-gain base clip + solemnity reference + lightly scaled Gaussian
# noise (`y`, `y_solemnity` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_solemn_gen = 0.5 * y + y_solemnity + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_solemn_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_solemnity.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [21]:
# Inline player for the solemnity blend.
Audio(data=y_solemn_gen, rate=sr)

In [22]:
# Persist the solemnity blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_solemn.wav', data=y_solemn_gen, samplerate=sr, subtype='PCM_24')

In [23]:
# generating tenderness
# Blend: half-gain base clip + tenderness reference + lightly scaled Gaussian
# noise (`y`, `y_tenderness` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_tender_gen = 0.5 * y + y_tenderness + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_tender_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_tenderness.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [24]:
# Inline player for the tenderness blend.
Audio(data=y_tender_gen, rate=sr)

In [25]:
# Persist the tenderness blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_tender.wav', data=y_tender_gen, samplerate=sr, subtype='PCM_24')

In [27]:
# Classify the emotion of a clip from its saved spectrogram image.
# Relies on `model` currently holding the emotion checkpoint (loaded by the
# './model_emotion.pth' cell above).

# Class index -> label; keys are the 1-based index rendered as a string.
# NOTE(review): the label order must match the order used at training time —
# that cannot be verified from this notebook.
emotions = {'1': 'Amazement', '2':'Solemnity', '3':'Tenderness', '4':'Nostalgia', '5':'Calmness'}
model.eval()

# NOTE(review): Resize(224) with a single int scales the shorter side and keeps
# the aspect ratio, so the input is not square — confirm this matches training.
val_trms = T.Compose([
    T.Resize(224),
    T.ToTensor(),
])

# Open the image in a context manager so the file handle is closed promptly.
with Image.open('./output/output1_emotion.png') as spectrogram_png:
    image = val_trms(spectrogram_png.convert('RGB'))

# Add the batch dimension and move the tensor to the model's device.
image = image.unsqueeze(0).to(device)

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    prediction = model.predict_output(image)
predicted_class = torch.argmax(prediction)
print('Emotion is {0}'.format(emotions[str(int(predicted_class) + 1)]))

Emotion is Tenderness


In [28]:

# Ask PyTorch's CUDA caching allocator to release unused cached memory before
# the genre checkpoint is loaded.
torch.cuda.empty_cache()

In [29]:
# Running for Genre Classification
# Load the genre checkpoint into the same `model` object, replacing the
# previously loaded weights. strict=False tolerates missing/unexpected keys,
# so inspect the result shown as this cell's output to confirm every key matched.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
model.load_state_dict(
    torch.load('./model_genre.pth', map_location=device),
    strict=False,
)

<All keys matched successfully>

In [32]:
# On a new generated music
# Load the generated clip plus one reference clip per genre, then save the
# generated clip's spectrogram as './output/output1_genre.png'.
songname = './chaconne_generated_clip.wav'
pathlib.Path('./output/').mkdir(parents=True, exist_ok=True)

# NOTE(review): most paths start with '.data/' (a hidden directory relative to
# the working directory) while `song_country` points under './drive/MyDrive/' —
# confirm both roots are intentional.
song_blue = '.data/GTZAN-dataset/genres_original/blues/blues.00007.wav'
song_classical = '.data/GTZAN-dataset/genres_original/classical/classical.00007.wav'
song_country = './drive/MyDrive/GTZAN-dataset/genres_original/country/country.00007.wav'
song_disco = '.data/GTZAN-dataset/genres_original/disco/disco.00007.wav'
song_hiphop = '.data/GTZAN-dataset/genres_original/hiphop/hiphop.00007.wav'
song_jazz = '.data/GTZAN-dataset/genres_original/jazz/jazz.00007.wav'
song_metal = '.data/GTZAN-dataset/genres_original/metal/metal.00007.wav'
song_pop = '.data/GTZAN-dataset/genres_original/pop/pop.00007.wav'
song_reggae = '.data/GTZAN-dataset/genres_original/reggae/reggae.00007.wav'
song_rock = '.data/GTZAN-dataset/genres_original/rock/rock.00007.wav'

# At most the first 10 s of each file, down-mixed to mono.
y, sr = librosa.load(songname, mono=True, duration=10)
print(y.shape)
y_blue, sr_blue = librosa.load(song_blue, mono=True, duration=10)
y_classical, sr_classical = librosa.load(song_classical, mono=True, duration=10)
y_country, sr_country = librosa.load(song_country, mono=True, duration=10)
y_disco, sr_disco = librosa.load(song_disco, mono=True, duration=10)
y_hiphop, sr_hiphop = librosa.load(song_hiphop, mono=True, duration=10)
y_jazz, sr_jazz = librosa.load(song_jazz, mono=True, duration=10)
y_metal, sr_metal = librosa.load(song_metal, mono=True, duration=10)
y_pop, sr_pop = librosa.load(song_pop, mono=True, duration=10)
y_reggae, sr_reggae = librosa.load(song_reggae, mono=True, duration=10)
y_rock, sr_rock = librosa.load(song_rock, mono=True, duration=10)

# `duration=10` is only an upper bound: a shorter file yields a shorter array,
# and the later `0.5 * y + y_<genre>` mixes would then fail to broadcast.
# Zero-pad/trim every reference to len(y); this is a no-op when lengths match.
y_blue = librosa.util.fix_length(y_blue, size=len(y))
y_classical = librosa.util.fix_length(y_classical, size=len(y))
y_country = librosa.util.fix_length(y_country, size=len(y))
y_disco = librosa.util.fix_length(y_disco, size=len(y))
y_hiphop = librosa.util.fix_length(y_hiphop, size=len(y))
y_jazz = librosa.util.fix_length(y_jazz, size=len(y))
y_metal = librosa.util.fix_length(y_metal, size=len(y))
y_pop = librosa.util.fix_length(y_pop, size=len(y))
y_reggae = librosa.util.fix_length(y_reggae, size=len(y))
y_rock = librosa.util.fix_length(y_rock, size=len(y))

# Axis-free spectrogram of the generated clip, saved to disk; the figure is
# cleared afterwards so the next plot starts from a blank canvas.
plt.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap, sides='default', mode='default', scale='dB')
plt.axis('off')
plt.savefig('./output/output1_genre.png')
plt.clf()

(220500,)


<Figure size 432x288 with 0 Axes>

In [33]:
# generating blue
# Blend: half-gain base clip + blues reference + lightly scaled Gaussian
# noise (`y`, `y_blue` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_blue_gen = 0.5 * y + y_blue + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_blue_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_blue.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [34]:
# Inline player for the blues blend.
Audio(data=y_blue_gen, rate=sr)

In [61]:
# Persist the blues blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_blue.wav', data=y_blue_gen, samplerate=sr, subtype='PCM_24')

In [62]:
# generating classical
# Blend: half-gain base clip + classical reference + lightly scaled Gaussian
# noise (`y`, `y_classical` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_classical_gen = 0.5 * y + y_classical + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_classical_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_classical.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [64]:
# Inline player for the classical blend.
Audio(data=y_classical_gen, rate=sr)

In [66]:
# Persist the classical blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_classical.wav', data=y_classical_gen, samplerate=sr, subtype='PCM_24')

In [35]:
# generating country
# Blend: half-gain base clip + country reference + lightly scaled Gaussian
# noise (`y`, `y_country` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_country_gen = 0.5 * y + y_country + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_country_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_country.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [36]:
# Inline player for the country blend.
Audio(data=y_country_gen, rate=sr)

In [60]:
# Persist the country blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_country.wav', data=y_country_gen, samplerate=sr, subtype='PCM_24')

In [37]:
# generating disco
# Blend: half-gain base clip + disco reference + lightly scaled Gaussian
# noise (`y`, `y_disco` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_disco_gen = 0.5 * y + y_disco + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_disco_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_disco.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [38]:
# Inline player for the disco blend.
Audio(data=y_disco_gen, rate=sr)

In [59]:
# Persist the disco blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_disco.wav', data=y_disco_gen, samplerate=sr, subtype='PCM_24')

In [39]:
# generating hiphop
# Blend: half-gain base clip + hiphop reference + lightly scaled Gaussian
# noise (`y`, `y_hiphop` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_hiphop_gen = 0.5 * y + y_hiphop + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_hiphop_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_hiphop.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [40]:
# Inline player for the hiphop blend.
Audio(data=y_hiphop_gen, rate=sr)

In [57]:
# Persist the hiphop blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_hiphop.wav', data=y_hiphop_gen, samplerate=sr, subtype='PCM_24')

In [41]:
# generating jazz
# Blend: half-gain base clip + jazz reference + lightly scaled Gaussian
# noise (`y`, `y_jazz` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_jazz_gen = 0.5 * y + y_jazz + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_jazz_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_jazz.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [42]:
# Inline player for the jazz blend.
Audio(data=y_jazz_gen, rate=sr)

In [55]:
# Persist the jazz blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_jazz.wav', data=y_jazz_gen, samplerate=sr, subtype='PCM_24')

In [43]:
# generating metal
# Blend: half-gain base clip + metal reference + lightly scaled Gaussian
# noise (`y`, `y_metal` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_metal_gen = 0.5 * y + y_metal + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_metal_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_metal.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [44]:
# Inline player for the metal blend.
Audio(data=y_metal_gen, rate=sr)

In [56]:
# Persist the metal blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_metal.wav', data=y_metal_gen, samplerate=sr, subtype='PCM_24')

In [45]:
# generating pop
# Blend: half-gain base clip + pop reference + lightly scaled Gaussian
# noise (`y`, `y_pop` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_pop_gen = 0.5 * y + y_pop + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_pop_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_pop.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [46]:
# Inline player for the pop blend.
Audio(data=y_pop_gen, rate=sr)

In [54]:
# Persist the pop blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_pop.wav', data=y_pop_gen, samplerate=sr, subtype='PCM_24')

In [47]:
# generating reggae
# Blend: half-gain base clip + reggae reference + lightly scaled Gaussian
# noise (`y`, `y_reggae` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_reggae_gen = 0.5 * y + y_reggae + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_reggae_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_reggae.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [48]:
# Inline player for the reggae blend.
Audio(data=y_reggae_gen, rate=sr)

In [53]:
# Persist the reggae blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_reggae.wav', data=y_reggae_gen, samplerate=sr, subtype='PCM_24')

In [49]:
# generating rock
# Blend: half-gain base clip + rock reference + lightly scaled Gaussian
# noise (`y`, `y_rock` and `cmap` come from earlier cells).
noise = np.random.normal(loc=0, scale=.1, size=y.shape)
y_rock_gen = 0.5 * y + y_rock + 0.01 * noise
# Save the blend's spectrogram without axes, then clear the figure.
plt.specgram(
    y_rock_gen,
    NFFT=2048,
    Fs=2,
    Fc=0,
    noverlap=128,
    cmap=cmap,
    sides='default',
    mode='default',
    scale='dB',
)
plt.axis('off')
plt.savefig('./output/output1_rock.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [50]:
# Inline player for the rock blend.
Audio(data=y_rock_gen, rate=sr)

In [52]:
# Persist the rock blend as a 24-bit PCM file.
# NOTE(review): the mix is not peak-normalized — confirm samples stay within [-1, 1].
sf.write(file='./output/output1_rock.wav', data=y_rock_gen, samplerate=sr, subtype='PCM_24')

In [51]:
# Classify the genre of a clip from its saved spectrogram image.
# Relies on `model` currently holding the genre checkpoint (loaded by the
# './model_genre.pth' cell above).
model.eval()

# 1-based class index -> genre label.
# NOTE(review): the label order must match the order used at training time —
# that cannot be verified from this notebook.
genres = {1:'blues',2: 'classical', 3:'country', 4:'disco', 5:'hiphop', 6:'jazz', 7:'metal', 8:'pop', 9:'reggae', 10:'rock'}

# NOTE(review): Resize(224) with a single int scales the shorter side and keeps
# the aspect ratio, so the input is not square — confirm this matches training.
val_trms = T.Compose([
    T.Resize(224),
    T.ToTensor(),
])

# Open the image in a context manager so the file handle is closed promptly.
with Image.open('./output/output1_genre.png') as spectrogram_png:
    image = val_trms(spectrogram_png.convert('RGB'))

# Add the batch dimension and move the tensor to the model's device.
image = image.unsqueeze(0).to(device)

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    prediction = model.predict_output(image)
predicted_class = torch.argmax(prediction)
print('Genre is {0}'.format(genres[(int(predicted_class) + 1)]))

Genre is classical
