In [None]:
# Mount Google Drive into the Colab runtime; the dataset paths used further down
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install einops



In [None]:

import torch
from torch import nn, einsum
import torch.nn.functional as F
from einops import rearrange, repeat
from google.colab.patches import cv2_imshow
class PreNorm(nn.Module):
    """Layer-normalise the input and then delegate to a wrapped callable."""

    def __init__(self, dim, fn):
        """
        Args:
            dim: size of the trailing feature axis handed to ``nn.LayerNorm``.
            fn: module (or any callable) that receives the normalised tensor.
        """
        super().__init__()
        # Attribute names and their registration order are part of the state-dict
        # layout, so they are kept exactly as ``fn`` followed by ``norm``.
        self.fn = fn
        self.norm = nn.LayerNorm(dim)

    def forward(self, x, *args, **kwargs):
        """Return ``fn(norm(x), *args, **kwargs)``; extra arguments pass straight through."""
        return self.fn(self.norm(x), *args, **kwargs)
class GEGLU(nn.Module):
    """GELU-gated linear unit: one half of the last axis gates the other half."""

    def forward(self, x):
        """Split ``x`` in two along the last axis and return ``first * gelu(second)``.

        The output's last axis is therefore half as wide as the input's.
        """
        value, gate = x.chunk(2, dim = -1)
        activated_gate = F.gelu(gate)
        return value * activated_gate

class FeedForward(nn.Module):
    """Token-wise MLP: expand, gate with GEGLU, apply dropout, project back to ``dim``."""

    def __init__(self, dim, mult = 4, dropout = 0.):
        """
        Args:
            dim: input and output feature size.
            mult: hidden width expressed as a multiple of ``dim``.
            dropout: dropout probability applied right after the gate.
        """
        super().__init__()
        hidden_dim = dim * mult

        # The expansion is twice the hidden width because GEGLU consumes half of its
        # input as gates. Layers are created in the same order as they are applied.
        expand = nn.Linear(dim, hidden_dim * 2)
        gate = GEGLU()
        drop = nn.Dropout(dropout)
        project = nn.Linear(hidden_dim, dim)

        self.net = nn.Sequential(expand, gate, drop, project)

    def forward(self, x):
        """Run ``x`` through the MLP; the trailing axis keeps size ``dim``."""
        return self.net(x)
# attention

def attn(q, k, v):
    """Plain dot-product attention over batched sequences.

    ``q`` is indexed ``(b, i, d)`` and ``k`` / ``v`` are indexed ``(b, j, d)``; the
    result is indexed ``(b, i, d)``. No scaling is applied here, so callers are
    expected to pre-scale the queries if they want scaled attention.
    """
    scores = einsum('b i d, b j d -> b i j', q, k)
    weights = scores.softmax(dim = -1)
    return einsum('b i j, b j d -> b i d', weights, v)

class Attention(nn.Module):
    """Multi-head self-attention with a separately handled classification (CLS) token.

    The first token of the input sequence is treated as the CLS token: it attends
    to every token, while the remaining tokens are regrouped with an einops
    pattern (so attention runs only along one axis, e.g. time or space) and
    attend to their own group plus the CLS key/value.
    """

    def __init__(
        self,
        dim,
        dim_head = 64,
        heads = 8,
        dropout = 0.
    ):
        """
        Args:
            dim: feature size of the incoming and outgoing tokens.
            dim_head: feature size of each attention head.
            heads: number of attention heads.
            dropout: dropout probability applied after the output projection.
        """
        super().__init__()
        self.heads = heads
        self.scale = dim_head ** -0.5
        inner_dim = dim_head * heads

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, einops_from, einops_to, **einops_dims):
        """Apply axis-restricted attention.

        Args:
            x: tensor indexed ``(batch, tokens, dim)`` whose token 0 is the CLS token.
            einops_from: einops pattern describing the non-CLS tokens as they are laid out.
            einops_to: einops pattern describing the grouping attention should run over.
            **einops_dims: axis sizes needed to resolve the two patterns.

        Returns:
            Tensor with the same ``(batch, tokens, dim)`` layout as ``x``.
        """
        h = self.heads
        q, k, v = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h = h), (q, k, v))

        # Scale out-of-place: with heads == 1 the rearrange above is a pure view of a
        # chunk() output, and autograd rejects in-place edits of such multi-output views.
        q = q * self.scale

        # splice out classification token at index 0
        (cls_q, q_), (cls_k, k_), (cls_v, v_) = map(lambda t: (t[:, 0:1], t[:, 1:]), (q, k, v))

        # let classification token attend to key / values of all patches across time and space
        cls_out = attn(cls_q, k, v)

        # rearrange across time or space
        q_, k_, v_ = map(lambda t: rearrange(t, f'{einops_from} -> {einops_to}', **einops_dims), (q_, k_, v_))

        # expand cls token keys and values across time or space and concat
        r = q_.shape[0] // cls_k.shape[0]
        cls_k, cls_v = map(lambda t: repeat(t, 'b () d -> (b r) () d', r = r), (cls_k, cls_v))

        k_ = torch.cat((cls_k, k_), dim = 1)
        v_ = torch.cat((cls_v, v_), dim = 1)

        # attention
        out = attn(q_, k_, v_)

        # merge back time or space
        out = rearrange(out, f'{einops_to} -> {einops_from}', **einops_dims)

        # concat back the cls token
        out = torch.cat((cls_out, out), dim = 1)

        # merge back the heads
        out = rearrange(out, '(b h) n d -> b n (h d)', h = h)

        # combine heads out
        return self.to_out(out)
# main classes
class TimeSformeraudio(nn.Module):
    """TimeSformer-style classifier with divided time / space attention.

    The input is cut into square patches, each patch is linearly embedded, a
    learned CLS token is prepended, and every layer applies attention across
    frames, attention across patches within a frame, and a feed-forward block
    (each with a residual connection). The CLS token is finally mapped to
    ``num_classes`` outputs.
    """

    def __init__(
        self,
        *,
        dim,
        num_frames,
        num_classes,
        image_size = 224,
        patch_size = 16,
        channels = 1,
        depth = 12,
        heads = 8,
        dim_head = 64,
        attn_dropout = 0.,
        ff_dropout = 0.
    ):
        """
        Args:
            dim: token embedding size.
            num_frames: number of frames the positional embedding is sized for.
            num_classes: size of the output vector.
            image_size: frame height/width used to size the positional embedding.
            patch_size: side length of each square patch.
            channels: number of channels per frame.
            depth: number of (time attention, spatial attention, feed-forward) layers.
            heads: attention heads per attention block.
            dim_head: feature size per attention head.
            attn_dropout: dropout probability inside the attention blocks.
            ff_dropout: dropout probability inside the feed-forward blocks.
        """
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_size // patch_size) ** 2
        num_positions = num_frames * num_patches
        patch_dim = channels * patch_size ** 2

        self.patch_size = patch_size
        self.to_patch_embedding = nn.Linear(patch_dim, dim)
        # One extra position for the CLS token.
        self.pos_emb = nn.Embedding(num_positions + 1, dim)
        self.cls_token = nn.Parameter(torch.randn(1, dim))

        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, dim_head = dim_head, heads = heads, dropout = attn_dropout)), # Time attention
                PreNorm(dim, Attention(dim, dim_head = dim_head, heads = heads, dropout = attn_dropout)), # Spatial attention
                PreNorm(dim, FeedForward(dim, dropout = ff_dropout)) # Feed Forward
            ]))

        self.to_out = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, video):
        """Classify a batch of frame sequences.

        Args:
            video: tensor indexed ``(batch, frames, channels, height, width)``;
                height and width must be multiples of the patch size, and the total
                number of patches must fit the positional embedding built in
                ``__init__``.

        Returns:
            Tensor indexed ``(batch, num_classes)`` computed from the CLS token.
        """
        b, f, _, h, w, *_, device, p = *video.shape, video.device, self.patch_size
        assert h % p == 0 and w % p == 0, f'height {h} and width {w} of video must be divisible by the patch size {p}'

        n = (h // p) * (w // p)

        # Fail early with a readable message instead of an out-of-range embedding
        # lookup (which surfaces as an opaque index error, or a device-side assert on GPU).
        max_positions = self.pos_emb.num_embeddings
        assert f * n + 1 <= max_positions, (
            f'input produces {f * n} patch tokens ({f} frames x {n} patches) but the '
            f'positional embedding only covers {max_positions - 1}'
        )

        video = rearrange(video, 'b f c (h p1) (w p2) -> b (f h w) (p1 p2 c)', p1 = p, p2 = p)

        tokens = self.to_patch_embedding(video)

        cls_token = repeat(self.cls_token, 'n d -> b n d', b = b)
        x =  torch.cat((cls_token, tokens), dim = 1)
        x += self.pos_emb(torch.arange(x.shape[1], device = device))

        for (time_attn, spatial_attn, ff) in self.layers:
            # Attention across frames for each patch position, then across patches
            # within each frame, each with a residual connection.
            x = time_attn(x, 'b (f n) d', '(b n) f d', n = n) + x
            x = spatial_attn(x, 'b (f n) d', '(b f) n d', f = f) + x
            x = ff(x) + x

        cls_token = x[:, 0]

        return self.to_out(cls_token)

In [None]:
import torch
import cv2
import numpy as np
import os
import math
import torch.optim as optim
import moviepy.editor as mp
import scipy.io
from scipy.io import wavfile
import matplotlib.pyplot as plt
import numpy as np
import os
import cv2
def audiospectogram(pathname):
  """Turn the audio track of a video file into square log-magnitude spectrogram frames.

  The audio is written to a temporary WAV file, down-mixed to mono, analysed with
  a Hamming-windowed FFT every 10 ms, rescaled to 224 frequency bins and cut
  along time into consecutive blocks of 224 analysis steps.

  Args:
    pathname: path of the video file to read.

  Returns:
    ``numpy.ndarray`` of shape ``(num_frames, 1, 224, 224)``; each frame is laid
    out as ``(frequency, time)``. ``num_frames`` is 0 when the audio is shorter
    than one full frame.

  Raises:
    ValueError: if the video has no audio track.
  """
  size = 224  # frequency bins per frame and analysis steps per frame
  # Insert Local Audio File Path
  audiopath = "/content/drive/MyDrive/dataset/temp.wav"

  clip = mp.VideoFileClip(pathname)
  print(pathname[:-4])
  try:
    if clip.audio is None:
      raise ValueError(f"{pathname} has no audio track")
    clip.audio.write_audiofile(audiopath)
    sr, x = scipy.io.wavfile.read(audiopath)
  finally:
    # Always release the video reader and the temporary WAV, even on failure.
    clip.close()
    if os.path.exists(audiopath):
      os.remove(audiopath)

  # Work in floating point and down-mix multi-channel audio to mono so each
  # analysis window is a 1-D signal.
  x = np.asarray(x, dtype=np.float64)
  if x.ndim > 1:
    x = x.mean(axis=1)

  ## Parameters: 10ms step, 10ms window (windows do not overlap)
  nstep = int(sr * 0.01)
  nwin  = int(sr * 0.01)
  nfft = nwin

  window = np.hamming(nwin)

  ## will take windows x[n1:n2].  generate
  ## and loop over n2 such that all frames
  ## fit within the waveform
  window_ends = range(nwin, len(x), nstep)

  X = np.zeros( (len(window_ends), nfft//2) )
  eps = 1e-10  # keeps log() finite for completely silent windows

  for i, n in enumerate(window_ends):
      xseg = x[n-nwin:n]
      # Elementwise product applies the window; a matrix product would collapse
      # the whole segment into a single number.
      z = np.fft.fft(window * xseg, nfft)
      X[i,:] = np.log(np.abs(z[:nfft//2]) + eps)

  if X.shape[0] < size:
    # Not enough analysis steps for even one full frame.
    return np.zeros((0, 1, size, size))

  # cv2 takes dsize as (width, height): only the frequency axis is rescaled.
  ab = cv2.resize(X, (size, X.shape[0]))

  valframe = ab.shape[0] // size  # number of frames of sound
  dim1 = valframe * size

  # Cut along time first, then transpose each block to (frequency, time), so
  # every frame holds one contiguous stretch of audio.
  Y = ab[:dim1, 0:size].reshape(valframe, size, size).transpose(0, 2, 1)[:, np.newaxis]
  return np.ascontiguousarray(Y)
def audioextract(path,start,end):
  """Load spectrogram frames and labels for a window of the files in ``path``.

  Files are taken in sorted name order so the ``[start, end)`` window selects
  the same files on every run.

  Args:
    path: directory containing the video files (trailing separator optional).
    start: index of the first file to load (inclusive).
    end: index one past the last file to load (exclusive).

  Returns:
    ``(audio, labels)`` where ``audio`` is a float32 tensor stacking the
    per-file spectrogram arrays and ``labels`` is a list with ``[1, 0]`` for
    files whose name has ``'Pain'`` at characters 5-8 and ``[0, 1]`` otherwise.

  Raises:
    ValueError: if the selected files yield spectrogram arrays of different
      shapes and therefore cannot be stacked into one tensor.
  """
  # Clamp to zero so negative bounds behave like the counter-based original
  # (negative start -> begin at 0, non-positive end -> nothing selected).
  filenames = sorted(os.listdir(path))[max(start, 0):max(end, 0)]

  audios = []
  labels = []
  for filename in filenames:
    specframe = audiospectogram(os.path.join(path, filename))
    audios.append(specframe)
    if filename[5:9] == 'Pain':
      labels.append([1,0])
    else:
      labels.append([0,1])

  shapes = {np.shape(spec) for spec in audios}
  if len(shapes) > 1:
    raise ValueError(
      "cannot stack spectrograms with differing shapes: " + str(sorted(shapes))
    )

  audio = torch.tensor(np.asarray(audios)).float()
  print(audio.shape)
  return audio,labels


In [None]:
# Hyper-parameters forwarded to TimeSformeraudio below.
DIM = 224
IMAGE_SIZE = 224
PATCH_SIZE = 16
NUM_CLASSES = 2
NUM_FRAMES = 8
DEPTH = 12
HEADS = 8
DIM_HEAD = 64
ATTN_DROPOUT = 0.1
FF_DROPOUT = 0.1
ITERATIONS = 20

# Classifier followed by a softmax over the class axis, so the model emits
# per-class probabilities.
model = torch.nn.Sequential(
    TimeSformeraudio(
        dim = DIM,
        image_size = IMAGE_SIZE,
        patch_size = PATCH_SIZE,
        num_frames = NUM_FRAMES,
        num_classes = NUM_CLASSES,
        depth = DEPTH,
        heads = HEADS,
        dim_head = DIM_HEAD,
        attn_dropout = ATTN_DROPOUT,
        ff_dropout = FF_DROPOUT,
    ),
    nn.Softmax(dim=1),
)

In [None]:
# Load the first 98 clips and their one-hot labels ([1, 0] or [0, 1]).
video , audlabel = audioextract('/content/drive/MyDrive/dataset/mix data/',0,98)
loss_fn = torch.nn.BCELoss()
labels = torch.FloatTensor(audlabel)

learning_rate = 1e-4
num_epochs = 15

# Plain SGD (no momentum / weight decay) performs the same update as
# `param -= learning_rate * param.grad`, and skips parameters without a gradient.
optimizer = optim.SGD(model.parameters(), lr = learning_rate)

for t in range(num_epochs):
  # Full-batch forward pass over every loaded clip.
  y_pred = model(video)

  # `labels` is already a float tensor; re-wrapping it in torch.tensor() only
  # copied it and raised a UserWarning on every iteration.
  loss = loss_fn(y_pred, labels)
  print("#" + str(t), " loss:" + str(loss.item()))

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()