In [40]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import matplotlib.pyplot as plt
import librosa
import torchvision
from torchvision import datasets, models, transforms
import pathlib
from pathlib import Path
import numpy as np
from PIL import Image

import json

from soundnet import SoundNet
import os

In [41]:
# check if you are on cpu or gpu
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [46]:
# === Image model ===
# Data augmentation and normalization for training

data_augmentation = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

input_image_path = Path.cwd()

input_image = datasets.ImageFolder(input_image_path, data_augmentation)

image_model = models.resnet18(pretrained=True)

num_features = image_model.fc.in_features

class Identity(nn.Module):
    def __init__(self):
        super(Identity, self).__init__()
        
    def forward(self, x):
        return x

# image_model.fc = Identity()
# image_model.eval()
image_model.to(device)
input = input_image[0][0].unsqueeze(0)
input.to(device)

output = image_model(input)[0]
output = output.detach().numpy()
print(output)
print(output.size)


[-5.55680215e-01 -4.41814840e-01 -4.34907943e-01 -1.37110877e+00
 -5.71288526e-01 -1.65057823e-01 -4.40182686e-01  5.46261251e-01
  3.49889278e-01 -9.02370930e-01 -9.21970427e-01 -8.88355196e-01
 -3.02617639e-01 -8.94486308e-01 -1.06233501e+00 -5.82640827e-01
 -8.00310433e-01 -2.53822803e-01 -5.32462120e-01 -6.26027822e-01
 -1.42695689e+00 -7.85346746e-01 -1.40502489e+00  1.40229970e-01
 -9.96132314e-01 -1.07509172e+00 -7.51569211e-01 -1.12033308e+00
 -8.58875513e-01 -4.11000580e-01 -8.76609802e-01 -8.13729346e-01
 -4.81220126e-01 -5.58560193e-01 -3.39714795e-01 -4.83072072e-01
  5.67166865e-01 -7.64063060e-01 -3.83824676e-01  1.05657309e-01
 -6.89047813e-01 -8.21364939e-01 -9.24404085e-01 -3.23156923e-01
 -5.67218006e-01 -4.31197196e-01 -7.64891207e-01 -4.31077302e-01
 -1.05955839e+00 -9.73168075e-01 -3.99565637e-01  6.05039775e-01
 -4.06594396e-01 -6.38710141e-01 -1.04213409e-01 -1.17939985e+00
 -3.26205701e-01 -1.39287293e+00 -4.60900843e-01 -5.40153801e-01
  8.08660805e-01  2.74330

In [43]:
# === Sound model ===

soundnet_model = SoundNet()
soundnet_model.load_state_dict(torch.load("soundnet8_final.pth"))

sound_dataset_path = Path.cwd().joinpath('data/sound_samples')

#practice sound sample input
sample_loaded, rate = librosa.load(sound_dataset_path.joinpath('9447523993_9_9_3.mp3'))
sample_loaded = sample_loaded * 255.0
sample_loaded = np.reshape(sample_loaded, (1, 1, sample_loaded.shape[0], 1))

#convert to tensor
sample_loaded = torch.tensor(sample_loaded)
sample_loaded.to(device)
pred = soundnet_model(torch.tensor(sample_loaded))

pred = pred[0].detach().numpy()
pred = pred[:,0][:,0]
print(pred)
print(pred.size)


  app.launch_new_instance()


[-120.72404  -145.66644  -139.93918  -122.216606 -115.98013  -113.21068
 -128.37582  -113.49824  -143.45058  -122.799416 -123.21407  -134.93053
 -128.09787  -113.348495 -111.89171  -114.92555  -114.04423  -119.88331
 -126.03649  -106.68708  -131.46101  -136.66978  -106.64196  -105.64099
 -142.37512   -92.7204    -99.77843   -90.08237   -80.96749  -133.01341
 -110.55024  -104.09697  -112.955345 -118.68225   -88.91125  -121.279564
 -121.21668  -159.41295  -111.46681  -115.90193   -98.036194 -123.01772
 -104.89643  -105.33513  -103.865524  -76.61178   -94.49102  -108.794716
 -120.08398  -119.567474 -119.694756 -106.470856 -112.870415  -96.152725
  -90.14673   -90.326744  -90.81937  -109.79061  -112.34728   -84.869835
  -64.06586   -94.416214  -92.36306  -103.68907   -95.13508  -105.63158
  -86.96474   -79.25774   -92.89505  -101.91007  -123.776276  -82.774254
 -108.16808   -96.86319   -98.33144  -115.17392  -105.580315  -95.555595
 -117.167465 -101.779724 -114.98691  -131.71545  -115.7491

In [44]:
# Create JSON file with JSON object of sound_samples
# key is the file path and the value is the 1000 length array
sound_samples = {}

for filename in os.listdir(sound_dataset_path):
    
    sample_loaded, rate = librosa.load(sound_dataset_path.joinpath(filename))
    if len(sample_loaded) > 230000:
        sample_loaded = np.reshape(sample_loaded, (1, 1, sample_loaded.shape[0], 1))
        #convert to tensor
        sample_loaded = torch.tensor(sample_loaded)
        sample_loaded.to(device)
        pred = soundnet_model(torch.tensor(sample_loaded))
        pred = pred[0].detach().numpy()
        pred = pred[:,0][:,0]


        #store 1000 length array as value in dictionary
        sound_samples[filename] = pred.tolist()

with open('sound_samples.json', 'w') as outfile:
    json.dump(sound_samples, outfile, ensure_ascii=False, indent=4, sort_keys=True, separators=(',', ': '))


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


In [45]:
from heapq import nsmallest 
# take the l2 distance of the image array and each array in sound samples and sort by how close they are
f = open('sound_samples.json')
sound_data = json.load(f)
l2dist = {}
for mp3 in sound_data:
    l2dist[mp3] = np.linalg.norm(sound_data[mp3]-output)
    # print(sound_data[mp3])
f.close()

# take the first three (closest)
smallest3 = nsmallest(3, l2dist, key = l2dist.get) 

print(smallest3)

['10226698643_6_4_3.mp3', '5372081133_1_3_3.mp3', '6265975524_5_2_4.mp3']
