<a href="https://colab.research.google.com/github/bryanjiang1204/aurora/blob/main/AURORA_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aurora Embeddings

## Video

In [None]:
!pip install fvcore
!pip install av
!pip install pytorchvideo

In [None]:
import torch

In [None]:
# Hub entry-point name of the video backbone; the feature-extractor class below reads it too.
model_name = 'x3d_s'
# Fetch the pretrained network from the PyTorchVideo hub repository.
model = torch.hub.load(repo_or_dir='facebookresearch/pytorchvideo', model=model_name, pretrained=True)

Downloading: "https://github.com/facebookresearch/pytorchvideo/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/X3D_S.pyth" to /root/.cache/torch/hub/checkpoints/X3D_S.pyth
100%|██████████| 29.4M/29.4M [00:00<00:00, 136MB/s]


In [None]:
import urllib.request

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import CenterCropVideo, NormalizeVideo
from pytorchvideo.transforms import ApplyTransformToKey,ShortSideScale, UniformTemporalSubsample
from pytorchvideo.data.encoded_video import EncodedVideo

# Per-channel normalisation statistics, applied after the pixel values are divided by 255.
# NOTE(review): the PyTorchVideo X3D hub example uses 0.45 / 0.225 — confirm 0.5 / 0.25 is intended.
mean = [0.5, 0.5, 0.5]
std = [0.25, 0.25, 0.25]

# Spatial size used for both the short-side resize and the centre crop.
side_size = 182
crop_size = 182

# Temporal sampling: num_frames frames, one every sampling_rate source frames.
num_frames = 13
sampling_rate = 6
frames_per_second = 30

# Only the "video" entry of the clip dict is transformed; other keys pass through.
transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(size=side_size),
            CenterCropVideo(
                crop_size=(crop_size, crop_size)
            )
        ]
    ),
)

# Seconds of video spanned by num_frames frames taken every sampling_rate frames.
clip_duration = (num_frames * sampling_rate)/frames_per_second

url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'
# urllib.URLopener does not exist in Python 3; call urlretrieve directly so a
# genuine download failure surfaces instead of being swallowed by a bare except.
urllib.request.urlretrieve(url_link, video_path)

video = EncodedVideo.from_path(video_path)

# Cut a clip whose length matches the sampling configuration above instead of
# a hard-coded 3 seconds, so the subsampled frames keep the intended spacing.
start_sec = 0
end_sec = start_sec + clip_duration
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)
video_data = transform(video_data)

inputs = video_data["video"]



In [None]:
from torchvision.models.feature_extraction import create_feature_extractor

class Video_Embeddings(torch.nn.Module):
    """Expose an intermediate activation of the pretrained hub video model.

    The wrapped extractor returns a dict whose single key, 'embeddings',
    holds the output of the 'blocks.5.pool.post_conv' node.
    """

    def __init__(self):
        super().__init__()
        # The backbone is put in eval mode before the feature extractor is built from it.
        backbone = torch.hub.load('facebookresearch/pytorchvideo', model_name, pretrained=True)
        backbone = backbone.eval()
        tapped_nodes = {'blocks.5.pool.post_conv': 'embeddings'}
        self.body = create_feature_extractor(backbone, return_nodes=tapped_nodes)

    def forward(self, x):
        """Run the extractor on `x` and return its dict of tapped activations."""
        return self.body(x)

extractor = Video_Embeddings()
# Inference only: disable autograd so no graph is built and `y` does not carry
# gradient history into later cells. inputs[None, ...] adds a leading batch axis.
with torch.no_grad():
    flat_layer = extractor(inputs[None,...])

# Collapse the tapped activation into a single 1-D video embedding.
y = torch.flatten(flat_layer['embeddings'])
y.shape

Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main


torch.Size([8192])

Sanity check: cosine similarity

## Text

In [None]:
!pip install transformers

In [None]:
from transformers import BertTokenizer, BertModel

# Tokenizer and encoder are both loaded from the same pretrained checkpoint name.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
import torch

input_text = "Wise Gurus chant"

# Encode the text to token ids and add a leading batch axis of size one.
input_ids = torch.tensor(tokenizer.encode(input_text)).unsqueeze(0)
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    model_output = model(input_ids)
# The first element of the model output is used as the embedding tensor.
bert_embeddings = model_output[0]

# Collapse everything into one 1-D text embedding.
x = bert_embeddings.flatten()
x.shape
# TODO(padding): the flattened length presumably grows with the number of tokens
# in input_text — pad or truncate to a fixed token count if a constant size is needed.

torch.Size([4608])

## Audio

In [None]:
!pip install torchopenl3

In [None]:
!pip install openl3

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the audio and ESC-50 paths used below live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import openl3
import soundfile as sf

# Read the example clip from the mounted Drive; `sr` is returned alongside the samples.
audio, sr = sf.read('/content/drive/My Drive/Archive/pock_wash.mp3')
# Compute the OpenL3 embedding with the library defaults.
# `ts` is presumably the per-frame timestamps — TODO confirm; it is not used afterwards.
emb, ts = openl3.get_audio_embedding(audio, sr)




In [None]:
# Inspect the raw OpenL3 embedding array before it is flattened.
emb.shape

In [None]:
# Copy the OpenL3 array into a tensor and collapse it into one 1-D audio embedding.
emb_f = torch.tensor(emb).flatten()
emb_f.shape

Training our own audio model

In [None]:
import numpy as np
import librosa
def get_melspectrogram_db(file_path, sr=None, n_fft=2048, hop_length=512, n_mels=128, fmin=20, fmax=8300, top_db=80, duration=5):
  """Load an audio file and return a fixed-length mel spectrogram in decibels.

  The waveform is forced to exactly `duration` seconds before the spectrogram
  is computed: shorter clips are reflect-padded on both sides, longer clips
  are truncated from the end.

  Args:
    file_path: Path of the audio file, passed to `librosa.load`.
    sr: Target sample rate for `librosa.load`; None is passed through unchanged.
    n_fft, hop_length, n_mels, fmin, fmax: Forwarded to
      `librosa.feature.melspectrogram`.
    top_db: Forwarded to `librosa.power_to_db`.
    duration: Clip length in seconds that every waveform is fitted to.

  Returns:
    The dB-scaled mel spectrogram array produced by `librosa.power_to_db`.
  """
  wav,sr = librosa.load(file_path,sr=sr)
  target_len = int(duration*sr)
  missing = target_len - wav.shape[0]
  if missing > 0:
    # Padding ceil(missing/2) on BOTH sides overshoots by one sample when
    # `missing` is odd; give the right side the remainder so the padded
    # length is exactly target_len, matching the truncation branch.
    left = (missing + 1)//2
    wav=np.pad(wav,(left, missing - left),mode='reflect')
  else:
    wav=wav[:target_len]
  spec=librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=n_fft,
              hop_length=hop_length,n_mels=n_mels,fmin=fmin,fmax=fmax)
  spec_db=librosa.power_to_db(spec,top_db=top_db)
  return spec_db

# Try the helper on a single clip and look at the resulting spectrogram shape.
mels = get_melspectrogram_db('/content/drive/My Drive/Archive/pock_wash.mp3')
mels.shape

(128, 431)

In [None]:
def spec_to_image(spec, eps=1e-6):
  """Rescale a spectrogram array to an 8-bit image of the same shape.

  The array is standardised (zero mean, unit variance; `eps` guards a zero
  standard deviation) and then min-max scaled to the 0..255 range.

  Args:
    spec: Numeric numpy array (e.g. a dB mel spectrogram).
    eps: Small constant added to the standard deviation.

  Returns:
    A `np.uint8` array with the same shape as `spec`. A constant input, which
    has no dynamic range to stretch, maps to all zeros.
  """
  mean = spec.mean()
  std = spec.std()
  spec_norm = (spec - mean) / (std + eps)
  spec_min, spec_max = spec_norm.min(), spec_norm.max()
  span = spec_max - spec_min
  if span == 0:
    # Min-max scaling would compute 0/0 here and cast NaN to uint8.
    return np.zeros(spec_norm.shape, dtype=np.uint8)
  spec_scaled = 255 * (spec_norm - spec_min) / span
  spec_scaled = spec_scaled.astype(np.uint8)
  return spec_scaled

# Convert the example spectrogram to an 8-bit image and check its shape.
audio_img = spec_to_image(mels)
audio_img.shape

(128, 431)

In [None]:
from torchvision.models import resnet34
import torch
import torch.nn as nn
import torch.optim as optim
# Train on the first GPU when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# ESC-50 has 50 sound categories. The classifier needs one logit per category
# because it is trained with CrossEntropyLoss on category indices and scored
# with argmax over the logits; a single-output head cannot do either.
num_classes = 50

resnet_model = resnet34(pretrained=True)
resnet_model.fc = nn.Linear(resnet_model.fc.in_features, num_classes)
# The dataset yields single-channel spectrogram images, so the 3-channel stem
# is replaced; this new first layer starts from random weights.
resnet_model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
resnet_model = resnet_model.to(device)



In [None]:

import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import librosa
from tqdm import tqdm
import os
class ESC50Data(Dataset):
  """In-memory dataset of spectrogram images with integer class labels.

  Every file listed in `df` is loaded and converted to a spectrogram image at
  construction time, so all samples are held in memory.

  Args:
    base: Directory that the file names in `in_col` are relative to.
    df: DataFrame with one row per audio file.
    in_col: Name of the column holding the file names.
    out_col: Name of the column holding the class names.

  Attributes:
    categories: Sorted unique class names found in `df[out_col]`.
    c2i / i2c: Class-name -> index and index -> class-name lookups.
    data: List of image arrays with a leading channel axis of size one.
    labels: List of integer class indices aligned with `data`.
  """
  def __init__(self, base, df, in_col, out_col):
    self.df = df
    self.data = []
    self.labels = []
    self.categories = sorted(df[out_col].unique())
    self.c2i = {category: i for i, category in enumerate(self.categories)}
    self.i2c = {i: category for i, category in enumerate(self.categories)}
    for ind in tqdm(range(len(df))):
      row = df.iloc[ind]
      file_path = os.path.join(base,row[in_col])
      # np.newaxis adds the channel axis that the convolutional model expects.
      self.data.append(spec_to_image(get_melspectrogram_db(file_path))[np.newaxis,...])
      # Read the label from the caller-supplied column, the same one the
      # category lookup was built from, rather than a hard-coded 'category'.
      self.labels.append(self.c2i[row[out_col]])
  def __len__(self):
    """Number of loaded samples."""
    return len(self.data)
  def __getitem__(self, idx):
    """Return the (image, label index) pair at position `idx`."""
    return self.data[idx], self.labels[idx]

esc50_meta = pd.read_csv('/content/drive/My Drive/ESC-50-master/meta/esc50.csv')
# Building the validation set from the same rows as the training set makes the
# reported validation accuracy a training accuracy. ESC-50 ships a predefined
# 5-fold split in the `fold` column: hold out fold 5 and train on folds 1-4.
train = esc50_meta[esc50_meta['fold'] != 5]
valid = esc50_meta[esc50_meta['fold'] == 5]
train_data = ESC50Data('/content/drive/My Drive/ESC-50-master/audio', train, 'filename', 'category')
valid_data = ESC50Data('/content/drive/My Drive/ESC-50-master/audio', valid, 'filename', 'category')
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
# Shuffling does not affect the validation metrics, so keep the order fixed.
valid_loader = DataLoader(valid_data, batch_size=16, shuffle=False)

100%|██████████| 2000/2000 [00:51<00:00, 38.47it/s]
100%|██████████| 2000/2000 [00:52<00:00, 38.30it/s]


Basics of a Model

In [None]:
# Generic training setup used to illustrate the basic loop below.
# NOTE(review): NeuralNet is not defined or imported anywhere in the visible
# notebook, so this line raises NameError on a fresh kernel — TODO define it
# or substitute the intended model.
model = NeuralNet()
optimizer = optim.Adam(model.parameters(),lr=2e-4)
epochs = 10
loss_function = nn.CrossEntropyLoss()

def train(model, loss_function, training_set, epochs, optimizer):
  """Minimal training loop: one optimizer step per batch for `epochs` passes.

  Args:
    model: Module to train; it is switched to training mode every epoch.
    loss_function: Callable taking (predictions, targets) and returning a loss.
    training_set: Iterable of (inputs, targets) batches.
    epochs: Number of passes over `training_set`.
    optimizer: Optimizer that holds the parameters of `model`.
  """
  for e in range(epochs):
    model.train()
    for x, y in training_set:
      optimizer.zero_grad()
      y_pred = model(x)
      loss = loss_function(y_pred,y)
      loss.backward()
      optimizer.step()




In [None]:
# Hyperparameters for fine-tuning the ResNet on the spectrogram images.
learning_rate = 2e-4
epochs = 50
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=resnet_model.parameters(), lr=learning_rate)
# Per-epoch lists of batch losses; the training function appends to these.
resnet_train_losses, resnet_valid_losses = [], []
def lr_decay(optimizer, epoch, base_lr=None):
  """Step schedule: divide the base learning rate by 10 every 10 epochs.

  On epochs that are multiples of 10 the learning rate of every parameter
  group is set to `base_lr / 10**(epoch // 10)`; on other epochs the
  optimizer is left untouched.

  Args:
    optimizer: Optimizer whose `param_groups` are updated in place.
    epoch: Current epoch number.
    base_lr: Starting learning rate. Defaults to the notebook-level
      `learning_rate` when omitted.

  Returns:
    The same optimizer object, so the call can be used as
    `optimizer = lr_decay(optimizer, epoch)`.
  """
  if epoch%10==0:
    if base_lr is None:
      base_lr = learning_rate
    new_lr = base_lr / (10**(epoch//10))
    # Set the rate directly on the param groups; the `setlr` helper this used
    # to call is not defined anywhere in the notebook.
    for param_group in optimizer.param_groups:
      param_group['lr'] = new_lr
    print(f'Changed learning rate to {new_lr}')
  return optimizer
def train(model, loss_fn, train_loader, valid_loader, epochs, optimizer, train_losses, valid_losses, change_lr=None):
  """Train `model` for `epochs` epochs, validating after every epoch.

  Batches are moved to the notebook-level `device` before the forward pass.
  Mean training loss, mean validation loss and validation accuracy are
  printed once per epoch.

  Args:
    model: Module to train; must already live on `device`.
    loss_fn: Callable taking (logits, targets) and returning a scalar loss.
    train_loader: Iterable of (inputs, targets) training batches.
    valid_loader: Iterable of (inputs, targets) validation batches.
    epochs: Number of epochs; epochs are numbered from 1.
    optimizer: Optimizer that holds the parameters of `model`.
    train_losses: List that receives one list of batch losses per epoch.
    valid_losses: List that receives one list of batch losses per epoch.
    change_lr: Optional callable `(optimizer, epoch) -> optimizer`, invoked at
      the start of every epoch to adjust the learning rate.
  """
  for epoch in tqdm(range(1,epochs+1)):
    model.train()
    batch_losses=[]
    if change_lr:
      optimizer = change_lr(optimizer, epoch)
    for x, y in train_loader:
      optimizer.zero_grad()
      x = x.to(device, dtype=torch.float32)
      y = y.to(device, dtype=torch.long)
      y_hat = model(x)
      loss = loss_fn(y_hat, y)
      loss.backward()
      batch_losses.append(loss.item())
      optimizer.step()
    train_losses.append(batch_losses)
    print(f'Epoch - {epoch} Train-Loss : {np.mean(train_losses[-1])}')
    model.eval()
    batch_losses=[]
    trace_y = []
    trace_yhat = []
    # No gradients are needed for validation; skipping autograd avoids building
    # a graph for every batch.
    with torch.no_grad():
      for x, y in valid_loader:
        x = x.to(device, dtype=torch.float32)
        y = y.to(device, dtype=torch.long)
        y_hat = model(x)
        loss = loss_fn(y_hat, y)
        trace_y.append(y.cpu().detach().numpy())
        trace_yhat.append(y_hat.cpu().detach().numpy())
        batch_losses.append(loss.item())
    valid_losses.append(batch_losses)
    trace_y = np.concatenate(trace_y)
    trace_yhat = np.concatenate(trace_yhat)
    # Predicted class is the index of the largest logit in each row.
    accuracy = np.mean(trace_yhat.argmax(axis=1)==trace_y)
    print(f'Epoch - {epoch} Valid-Loss : {np.mean(valid_losses[-1])} Valid-Accuracy : {accuracy}')
# Kick off fine-tuning; losses accumulate in the two resnet_* lists.
train(
  model=resnet_model,
  loss_fn=loss_fn,
  train_loader=train_loader,
  valid_loader=valid_loader,
  epochs=epochs,
  optimizer=optimizer,
  train_losses=resnet_train_losses,
  valid_losses=resnet_valid_losses,
  change_lr=lr_decay,
)

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch - 1 Train-Loss : 1.6206254019737243


  2%|▏         | 1/50 [15:45<12:52:16, 945.64s/it]

Epoch - 1 Valid-Loss : 0.9045860996246338 Valid-Accuracy : 0.7465
Epoch - 2 Train-Loss : 0.8795834827423096


  4%|▍         | 2/50 [31:25<12:33:51, 942.32s/it]

Epoch - 2 Valid-Loss : 0.467736787378788 Valid-Accuracy : 0.87
Epoch - 3 Train-Loss : 0.5614530577659607


  6%|▌         | 3/50 [46:52<12:12:40, 935.34s/it]

Epoch - 3 Valid-Loss : 0.2699013015031815 Valid-Accuracy : 0.9205
Epoch - 4 Train-Loss : 0.4094550713300705


## Concatenate

In [None]:
import numpy as np

# Join the text (x), video (y) and audio (emb_f) vectors end to end into one 1-D feature vector.
z = torch.cat([x, y, emb_f], dim=0)
z.shape

torch.Size([154112])