In [None]:
import os
import librosa
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the audio file and
# the model checkpoint referenced later in this notebook can be read from it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Audio clip on the mounted Drive that is used as the example input to predict().
sample_audio_file_path = "/content/drive/MyDrive/CSCI-534/RawData/03-02-01-01-01-01-01.wav"


In [None]:
# NOTE(review): resampy is presumably installed because load_audio() asks librosa
# for res_type='kaiser_fast' resampling — confirm against the installed librosa version.
!pip install resampy

In [None]:
def load_audio(file_path, target_sr=16000):
    """Load a fixed 2.5 s excerpt of an audio file, resampled to ``target_sr``.

    The first 0.5 s of the file is skipped and at most the following 2.5 s are
    read. Resampling uses librosa's ``'kaiser_fast'`` mode.

    Args:
        file_path: Path of the audio file to read.
        target_sr: Sampling rate (Hz) of the returned waveform. The default of
            16000 matches the input rate YAMNet expects.

    Returns:
        Tuple ``(audio, sample_rate)``: the waveform returned by
        ``librosa.load`` and the sampling rate it was resampled to
        (equal to ``target_sr``).
    """
    # NOTE(review): the rate used to be hard-coded to 44100 Hz regardless of
    # ``target_sr``. If the checkpoint's training features were extracted at
    # that rate, pass ``target_sr=44100`` explicitly to reproduce them.
    audio, sample_rate = librosa.load(
        file_path,
        sr=target_sr,
        offset=0.5,
        duration=2.5,
        res_type='kaiser_fast',
    )
    return audio, sample_rate



In [None]:
import torch.nn as nn
class multiple_class_model(torch.nn.Module):
    """1-D CNN with two classification heads: emotion and gender.

    Each input vector of length ``input_size`` is treated as a single-channel
    sequence and passed through::

        Conv1 -> Norm1 -> ReLU -> Conv2 -> Dropout -> MaxPool(8)
              -> Conv3 -> Norm2 -> ReLU -> Flatten

    The flattened features feed two independent linear layers, each followed
    by a softmax.

    Args:
        input_size: Length of each input feature vector.
        emo_size: Number of outputs of the emotion head.
        hidden_size: Output channels of the first convolution.
        hidden_size2: Output channels of the second convolution.
        hidden_size3: Output channels of the third convolution.
        drop_out: Dropout probability applied after the second convolution.
        denseN: Stored as ``self.dense``; not used by any layer in this class.
        norm_type: ``"Batch"`` for ``nn.BatchNorm1d`` or ``"Layer"`` for
            ``nn.LayerNorm``.

    Raises:
        ValueError: If ``norm_type`` is neither ``"Batch"`` nor ``"Layer"``.
    """

    def __init__(self,input_size,emo_size=8,hidden_size=256,hidden_size2=128,hidden_size3 = 64,drop_out=0.2, denseN=10,norm_type="Batch"):
        super(multiple_class_model, self).__init__()
        self.input_size = input_size
        self.emo_size = emo_size
        self.hidden1 = hidden_size
        self.hidden2 = hidden_size2
        self.drop_out = drop_out
        self.dense = denseN

        if norm_type == "Batch":
            self.Norm1 = nn.BatchNorm1d(hidden_size)
            self.Norm2 = nn.BatchNorm1d(hidden_size3)
        elif norm_type == "Layer":
            # NOTE(review): nn.LayerNorm normalizes over the trailing axis,
            # which in forward() is the sequence length, not the channels.
            # This option therefore only runs when input_size == hidden_size
            # and input_size // 8 == hidden_size3.
            self.Norm1 = nn.LayerNorm(hidden_size)
            self.Norm2 = nn.LayerNorm(hidden_size3)
        else:
            # Fail at construction instead of with an AttributeError in forward().
            raise ValueError(
                "norm_type must be 'Batch' or 'Layer', got {!r}".format(norm_type)
            )

        self.Conv1 = nn.Conv1d(in_channels=1,
                               out_channels=hidden_size,
                               kernel_size=5,
                               padding='same')
        self.Conv2 = nn.Conv1d(in_channels=hidden_size,
                               out_channels=hidden_size2,
                               kernel_size=5,
                               padding='same')
        # Registered as a submodule so that .eval()/.train() toggle it.
        # Dropout owns no parameters or buffers, so state_dict keys are unchanged
        # and existing checkpoints still load.
        self.dropout = nn.Dropout(p=drop_out)
        self.maxP = nn.MaxPool1d(kernel_size=8, stride=8, padding=0)
        self.Conv3 = nn.Conv1d(in_channels=hidden_size2,
                               out_channels=hidden_size3,
                               kernel_size=5,
                               padding='same')

        # The pooling layer divides the sequence length by 8 before flattening.
        flat_features = hidden_size3 * (input_size//8)
        self.gender_linear = nn.Linear(flat_features, 2)
        self.emo_linear = nn.Linear(flat_features, emo_size)

    def forward(self,x):
        """Run the network on a batch of feature vectors.

        Args:
            x: Array-like or tensor of shape ``(batch, input_size)``. It is
                converted to a tensor and moved to the device and dtype of the
                model's own parameters.

        Returns:
            Tuple ``(emo, gender)`` of softmax outputs with shapes
            ``(batch, emo_size)`` and ``(batch, 2)``. These are probabilities,
            not logits.
        """
        # Follow the model's own placement rather than a global device, so the
        # input and the weights can never end up on different devices.
        reference = next(self.parameters())
        x = torch.as_tensor(x).to(device=reference.device, dtype=reference.dtype)

        x = x.unsqueeze(1)  # (batch, input_size) -> (batch, 1, input_size)
        x = torch.relu(self.Norm1(self.Conv1(x)))
        x = self.dropout(self.Conv2(x))
        x = self.maxP(x)
        x = torch.relu(self.Norm2(self.Conv3(x)))
        x = torch.flatten(x, start_dim=1)

        gender = torch.softmax(self.gender_linear(x), dim=1)
        emo = torch.softmax(self.emo_linear(x), dim=1)
        return emo,gender

In [None]:
import tensorflow_hub as hub
def init(yamnet_model_handle='https://tfhub.dev/google/yamnet/1',
         checkpoint_path="/content/drive/MyDrive/CSCI-534/best_model.pth",
         input_n=1024):
    """Load the YAMNet feature extractor and the trained classifier.

    Args:
        yamnet_model_handle: TensorFlow Hub handle passed to ``hub.load``.
        checkpoint_path: File holding the ``state_dict`` for
            ``multiple_class_model``.
        input_n: Input feature length the classifier is built with.

    Returns:
        Tuple ``(pre_model, model)``: the loaded TF Hub model and the
        classifier, placed on the notebook's ``device`` and in eval mode.
    """
    pre_model = hub.load(yamnet_model_handle)

    model = multiple_class_model(input_n)
    # Map to CPU first so a checkpoint saved on a GPU also loads on CPU-only runtimes.
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)

    # Keep the weights on the same device the notebook selected for inputs.
    model.to(device)
    # Inference mode: normalization uses its stored statistics, not per-batch ones.
    model.eval()
    return pre_model, model

In [None]:
def predict(path):
    """Predict emotion and gender probabilities for one audio file.

    The clip is loaded with ``load_audio``, passed through the global
    ``pre_model`` to obtain embeddings, averaged over axis 0 into a single
    feature vector, and classified by the global ``model``.

    Args:
        path: Path of the audio file to classify.

    Returns:
        Tuple ``(emo, gender)`` of tensors as returned by ``model``,
        each with a leading batch dimension of 1.
    """
    # Use the caller's path; the global sample path must not leak in here.
    waveform, _ = load_audio(path)
    _, embeddings, _ = pre_model(waveform)

    # Collapse the per-frame embeddings into one row vector (batch of 1).
    features = np.mean(embeddings, axis=0).reshape(1, -1)

    model.eval()
    with torch.no_grad():  # inference only, no autograd graph needed
        emo, gender = model(features)
    return emo, gender

In [None]:
# Build both models once, then classify the sample clip and show each head's
# output followed by its shape.
pre_model, model = init()
emo, gender = predict(sample_audio_file_path)
print(emo, emo.shape, sep="\n")
print(gender, gender.shape, sep="\n")