# VGG Model for Gender / Age Group prediction

# 0 Init

In [1]:
# a set of switches
preprocessing = False

In [2]:
pwd = './'
# pwd = '/content/drive/MyDrive/UCB/21Fall/225D/'

In [3]:
if not pwd == './':
    from google.colab import drive
    drive.mount('/content/drive')

In [4]:
import numpy as np
from PIL import Image
from matplotlib import cm
import os
from torch.utils import data
from torch.utils.data import Dataset, dataset
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile
from pathlib import Path 
from tqdm import tqdm
from numpy import asarray
from numpy import save
import json
import torch

import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
import librosa
import torch.nn as nn



## Hyper parameters

In [6]:
LR = 1e-3
MOMENTUM = 0.9
LR_DECAY = 0.95
EPOCHS = 100
BATCH_SIZE = 32
CKPT_DIR = pwd + "ckpt/"

# 1 Preprocessing
- waveform --> mel-spectrogram
- (.wav)   --> (.npy)

## 1.1 Convert to spectrogram and save

In [7]:
# lst = []
# for wav in tqdm(os.listdir(wav_dir)):
#   p1 = os.path.join(wav_dir, wav)
#   sample_rate, samples = wavfile.read(p1)
#   frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)
#   lst.append(spectrogram.shape[1])
# sum(lst)//len(lst)

In [8]:
def wav2spec(filename, length = 300, mx = 50):
  # sample_rate, samples = wavfile.read(file_path)
  # frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)
  waveform, sample_rate = torchaudio.load(filename)
  print(filename)
  transform = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, n_fft=1000, n_mels=200)
  spectrogram = transform(waveform)
  # transform & convert
  array = librosa.power_to_db(spectrogram[0])  # power --> db
  # plt.imshow(array, origin='lower')
  if array.shape[1] >= length:
    # crop
    return array[:,:length]/mx
  else:
    # repeat
    times = length // array.shape[1] + 1
    res = array
    for i in range(times):
        res = np.concatenate((res, array), axis=1)
    res = res[:,:length]
    return res/mx

In [10]:
wav_path = "example/IMG-9322.wav"
spec_path = "example/spec.npy"
arr = wav2spec(wav_path, 200)
save(spec_path, arr)
npy_data = arr

example/IMG-9322.wav


## 3.1 model definition

In [11]:
import torchvision.models as models
model = models.efficientnet_b1(pretrained=False)
model.features[0] = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
model.classifier[-1] = nn.Linear(in_features=1280, out_features=2, bias=True)

In [12]:
# check
x = torch.rand(1, 1, 128, 400)
model(x).shape

torch.Size([1, 2])

In [13]:
ckpt_path = "ckpt/gender spectrogram/ckpt_005.ckpt"
model.load_state_dict(torch.load(ckpt_path)['state_dict'])

<All keys matched successfully>

In [18]:
wav_data = torch.tensor(arr)[None, None, :, :]

In [19]:
pred = model(wav_data)

In [20]:
pred

tensor([[ 1.3556, -0.7831]], grad_fn=<AddmmBackward0>)

In [25]:
pred_dest_path = "confusion_matrix_source_data/gender_spec_pred.npy"
label_dest_path = "confusion_matrix_source_data/gender_spec_label.npy"
pred_list = []
label_list = []
for batch in tqdm(validation_loader):
    pred = torch.argmax(model(batch['x']), axis = 1)
    label = batch['gender']
    pred_list += pred.tolist()
    label_list += label.tolist()

data_list = [pred_list, label_list]
np.save(pred_dest_path,np.array(pred_list))
np.save(label_dest_path,np.array(label_list))
data = np.load(pred_dest_path)
data

100%|██████████| 171/171 [01:54<00:00,  1.49it/s]


array([0, 0, 1, ..., 1, 1, 0])

In [None]:
y = batch['emotion']

In [None]:
torch.argmax(res, dim = 1)

In [None]:
res[:10]

## 5.3 Accuracy on Datasets with GT

In [None]:
wav_dir_gt = "225D_DataSet_GT/CREMA/CREMA_wav/"
npy_dir_gt = "225D_DataSet_GT/CREMA/npy/"
label_dir_gt = "225D_DataSet_GT/CREMA/CREMA_label/"

### 5.3.1 GT Preprocessing

In [None]:
preprocessing = False

In [None]:
def wav2spec(filename, length = 300, mx = 50):
  # sample_rate, samples = wavfile.read(file_path)
  # frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)
  waveform, sample_rate = torchaudio.load(filename)
  transform = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate, n_fft=1000, n_mels=200)
  spectrogram = transform(waveform)
  # transform & convert
  array = librosa.power_to_db(spectrogram[0])  # power --> db
  if array.shape[1] >= length:
    # crop
    return array[:,:length]/mx
  else:
    # repeat
    times = length // array.shape[1] + 1
    res = array
    for i in range(times):
        res = np.concatenate((res, array), axis=1)
    res = res[:,:length]
    return res/mx

if preprocessing:
    # convert
    for wav in tqdm(os.listdir(wav_dir_gt)):
      p1 = os.path.join(wav_dir_gt, wav)
      p2 = os.path.join(npy_dir_gt, wav)
      arr = wav2spec(p1, 200)
      # print(np.max(arr))
      save(p2[:-3] + "npy", arr)

### 5.3.2 Dataloader

In [None]:
class GTDataset(Dataset):
    def __init__(self, npy_dir_gt, label_dir_gt):
        self._npy_paths = sorted(os.listdir(npy_dir_gt))
        self._label_paths = sorted(os.listdir(label_dir_gt))
        for i in range(len(self._npy_paths)):
            self._npy_paths[i] = npy_dir_gt + self._npy_paths[i]
        for i in range(len(self._label_paths)):
            self._label_paths[i] = label_dir_gt + self._label_paths[i]

    def __len__(self):
        return len(self._npy_paths)

    def __getitem__(self, idx):
        npy_path = self._npy_paths[idx]
        label_path  = self._label_paths[idx]
        # normalize to [0,1]
        npy = np.load(npy_path)/2000
        with open(label_path) as f:
            sample = json.load(f)
        sample['x'] = npy[None,:,:]
        return sample


In [None]:
gt_dataset = GTDataset(npy_dir_gt, label_dir_gt)
gt_dataloader = torch.utils.data.DataLoader(gt_dataset, batch_size=1, shuffle=False)

In [None]:
# check
for sample in gt_dataloader:
    print(sample['x'].shape)
    break

### 5.3.2 Calculate Acc

In [None]:
ckpt_path = "ckpt/age spectrogram/ckpt_005.ckpt"
model.load_state_dict(torch.load(ckpt_path)['state_dict'])

In [None]:
correct = 0
total = 0
for sample in tqdm(gt_dataloader):
    total += 1
    pred = model(sample['x'])
    label = sample['gender']
    print(pred, label)
    if pred == label:
        correct += 1
        

In [None]:
print(correct/total)

# Scratch

In [None]:
a = torch.tensor([1,2,3])
b = torch.tensor([1,3,3])
sum(a == b)

In [None]:
sample_rate, samples = wavfile.read('/content/drive/MyDrive/UCB/21Fall/225D/smallset/wav/0LbtndiXJC0-00003.wav')
frequencies, times, spectrogram = signal.spectrogram(samples, sample_rate)

plt.pcolormesh(times, frequencies, np.log(spectrogram))
# plt.imshow(np.log(spectrogram))
plt.ylabel('Frequency [Hz]')
plt.xlabel('Time [sec]')
plt.show()

In [None]:
print(np.max(spectrogram))

In [None]:
spectrogram

In [None]:
np.unique(spectrogram.astype(np.uint8))

In [None]:
(spectrogram*255).astype(np.uint8)

In [None]:
spectrogram

In [None]:
np.unique(spectrogram.astype(np.uint8))

In [None]:
Image.fromarray(spectrogram.astype(np.uint8))

In [None]:
type(spectrogram),spectrogram.shape

In [None]:
frequencies

In [None]:

p1 = '/content/drive/MyDrive/UCB/21Fall/225D/smallset/wav/0LbtndiXJC0-00003.wav'

arr = wav2spec(p1, 600)
print(np.max(arr))

In [None]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform(m.weight)
        m.bias.data.fill_(0.01)

net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
net.apply(init_weights)

In [None]:
arr = sig2spec(samples, sample_rate)

In [None]:
filename = "/content/drive/MyDrive/UCB/21Fall/225D/225D_DataSet/wav_files/0yeElGrMVpI-00003.wav"

In [None]:
waveform, sample_rate = torchaudio.load(filename, normalize=True)
transform = torchaudio.transforms.MelSpectrogram(n_fft=800)
spectrogram = transform(waveform)
# transform & convert
array = librosa.power_to_db(spectrogram[0])  # power --> db
# # store
# target_name = npy_path + file_name[:-4] + '.npy'
# with open(target_name, 'wb') as f:
#   np.save(f, array)

In [None]:
type(spectrogram)

In [None]:
spectrogram.shape

In [None]:
plt.imshow(array, origin='lower')

In [None]:
array.shape

In [None]:
np.max(array)

In [None]:
mx = 0
length = []
i = 0
for file_name in tqdm(os.listdir(wav_dir)):
  if i % 100 == 0:
    # read & compute mel-spectrogram
    path_name = wav_dir + file_name
    waveform, sample_rate = torchaudio.load(path_name, normalize=True)
    transform = torchaudio.transforms.MelSpectrogram(n_fft=800)
    spectrogram = transform(waveform)
    # transform & convert
    array = librosa.power_to_db(spectrogram[0])  # power --> db
    mx = max(np.max(array), mx)
    length.append(array.shape[1])
  i += 1

In [None]:
mx

In [None]:
sum(length)/len(length)

In [None]:
% cd /content/drive/MyDrive/UCB/21Fall/225D/225D_DataSet

In [None]:
import os
len(os.listdir("json"))

In [None]:
! unzip 225D_DataSet.zip 