#start

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pip install einops



In [3]:
BASE_DIR = '/content/drive/My Drive/'
DATA_DIR = BASE_DIR + "data/"
FRAMES_INTERVAL = 10

In [4]:

from torch import nn, einsum
import torch.nn.functional as F
from einops import rearrange, repeat
from google.colab.patches import cv2_imshow
class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.fn = fn
        self.norm = nn.LayerNorm(dim)

    def forward(self, x, *args, **kwargs):
        x = self.norm(x)
        return self.fn(x, *args, **kwargs)
class GEGLU(nn.Module):
    def forward(self, x):
        x, gates = x.chunk(2, dim = -1)
        return x * F.gelu(gates)

class FeedForward(nn.Module):
    def __init__(self, dim, mult = 4, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, dim * mult * 2),
            GEGLU(),
            nn.Dropout(dropout),
            nn.Linear(dim * mult, dim)
        )

    def forward(self, x):
        return self.net(x)
# attention

def attn(q, k, v):
    sim = einsum('b i d, b j d -> b i j', q, k)
    attn = sim.softmax(dim = -1)
    out = einsum('b i j, b j d -> b i d', attn, v)
    return out

class Attention(nn.Module):
    def __init__(
        self,
        dim,
        dim_head = 64,
        heads = 8,
        dropout = 0.
    ):
        super().__init__()
        self.heads = heads
        self.scale = dim_head ** -0.5
        inner_dim = dim_head * heads

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, einops_from, einops_to, **einops_dims):
        h = self.heads
        q, k, v = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h = h), (q, k, v))

        q *= self.scale

        # splice out classification token at index 1
        (cls_q, q_), (cls_k, k_), (cls_v, v_) = map(lambda t: (t[:, 0:1], t[:, 1:]), (q, k, v))

        # let classification token attend to key / values of all patches across time and space
        cls_out = attn(cls_q, k, v)

        # rearrange across time or space
        q_, k_, v_ = map(lambda t: rearrange(t, f'{einops_from} -> {einops_to}', **einops_dims), (q_, k_, v_))

        # expand cls token keys and values across time or space and concat
        r = q_.shape[0] // cls_k.shape[0]
        cls_k, cls_v = map(lambda t: repeat(t, 'b () d -> (b r) () d', r = r), (cls_k, cls_v))

        k_ = torch.cat((cls_k, k_), dim = 1)
        v_ = torch.cat((cls_v, v_), dim = 1)

        # attention
        out = attn(q_, k_, v_)

        # merge back time or space
        out = rearrange(out, f'{einops_to} -> {einops_from}', **einops_dims)

        # concat back the cls token
        out = torch.cat((cls_out, out), dim = 1)

        # merge back the heads
        out = rearrange(out, '(b h) n d -> b n (h d)', h = h)

        # combine heads out
        return self.to_out(out)
# main classes
class TimeSformer(nn.Module):
    def __init__(
        self,
        *,
        dim,
        num_frames,
        num_classes,
        image_size = 224,
        patch_size = 16,
        channels = 3,
        depth = 12,
        heads = 8,
        dim_head = 64,
        attn_dropout = 0.,
        ff_dropout = 0.
    ):
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_size // patch_size) ** 2
        num_positions = num_frames * num_patches
        patch_dim = channels * patch_size ** 2

        self.patch_size = patch_size
        self.to_patch_embedding = nn.Linear(patch_dim, dim)
        self.pos_emb = nn.Embedding(num_positions + 1, dim)
        self.cls_token = nn.Parameter(torch.randn(1, dim))

        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, dim_head = dim_head, heads = heads, dropout = attn_dropout)), # Time attention
                PreNorm(dim, Attention(dim, dim_head = dim_head, heads = heads, dropout = attn_dropout)), # Spatial attention
                PreNorm(dim, FeedForward(dim, dropout = ff_dropout)) # Feed Forward
            ]))

        self.to_out = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, video):
        b, f, _, h, w, *_, device, p = *video.shape, video.device, self.patch_size
        assert h % p == 0 and w % p == 0, f'height {h} and width {w} of video must be divisible by the patch size {p}'

        n = (h // p) * (w // p)
        video = rearrange(video, 'b f c (h p1) (w p2) -> b (f h w) (p1 p2 c)', p1 = p, p2 = p)
        
        tokens = self.to_patch_embedding(video)

        cls_token = repeat(self.cls_token, 'n d -> b n d', b = b)
        x =  torch.cat((cls_token, tokens), dim = 1)
        x += self.pos_emb(torch.arange(x.shape[1], device = device))

        for (time_attn, spatial_attn, ff) in self.layers:
            x = time_attn(x, 'b (f n) d', '(b n) f d', n = n) + x
            x = spatial_attn(x, 'b (f n) d', '(b f) n d', f = f) + x
            x = ff(x) + x

        cls_token = x[:, 0]
        
        return self.to_out(cls_token)

In [5]:
import os
import cv2
import numpy as np
import torch
framelabel = []
paframes = []
# def frameextract():
framepath = '/content/drive/MyDrive/processed/'
for frame_name in os.listdir(framepath):
  # print(frame_name[4:7])
  if frame_name[4:7] == 'non':
    framelabel.append([0,1])
  else:
    framelabel.append([1,0])
  image = cv2.imread(framepath+frame_name)
  # print(image, frame_name)
  image = np.transpose(np.asarray(cv2.resize(image, (224,224))), (2, 0, 1))
  paframes.append(image)
painless = paframes[:50]
# video = torch.tensor(np.asarray(paframes).reshape(204,1,3,224,224)).float()

In [6]:
video = torch.tensor(np.asarray(painless).reshape(50,1,3,224,224)).float()
labelless = framelabel[:50]
video.shape

torch.Size([50, 1, 3, 224, 224])

In [6]:
len(framelabel)

204

In [7]:
DIM = 224
IMAGE_SIZE = 224
PATCH_SIZE = 16
NUM_CLASSES = 2
NUM_FRAMES = 1
DEPTH = 12
HEADS = 8
DIM_HEAD = 64
ATTN_DROPOUT = 0.1
FF_DROPOUT = 0.1
ITERATIONS = 20
model = torch.nn.Sequential(
    TimeSformer(dim = DIM, image_size = IMAGE_SIZE, patch_size = PATCH_SIZE, num_frames = NUM_FRAMES, num_classes = NUM_CLASSES, depth = DEPTH, heads = HEADS, dim_head = DIM_HEAD, attn_dropout = ATTN_DROPOUT, ff_dropout = FF_DROPOUT),
    nn.Softmax(dim=1)
)

loss_fn = torch.nn.BCELoss()
labels = torch.FloatTensor(labelless)#[[0.4, 0.6] for i in range(len(video))]) # Add here your own labels

learning_rate = 1e-4
for t in range(ITERATIONS):
  y_pred = model(torch.FloatTensor(np.asarray(video)))

  loss = loss_fn(y_pred, torch.tensor(labels))
  print("#" + str(t), " loss:" + str(loss.item()))
  
  model.zero_grad()
  loss.backward()
  with torch.no_grad():
      for param in model.parameters():
          param -= learning_rate * param.grad

pred = model(video) # (batch x classes)



#0  loss:0.36317938566207886
#1  loss:0.3863992989063263
#2  loss:0.3552611470222473
#3  loss:0.3513811528682709
#4  loss:0.3417350649833679
#5  loss:0.3430057764053345
#6  loss:0.32745009660720825
#7  loss:0.3202332556247711
#8  loss:0.3221796751022339
#9  loss:0.34400302171707153
#10  loss:0.32600003480911255
#11  loss:0.30187729001045227
#12  loss:0.29524773359298706
#13  loss:0.29629334807395935
#14  loss:0.2759099006652832
#15  loss:0.2979535758495331
#16  loss:0.29341599345207214
#17  loss:0.29895851016044617
#18  loss:0.2890051305294037
#19  loss:0.2848440110683441


#2n part images

In [6]:
DIM = 224
IMAGE_SIZE = 224
PATCH_SIZE = 16
NUM_CLASSES = 2
NUM_FRAMES = 1
DEPTH = 12
HEADS = 8
DIM_HEAD = 64
ATTN_DROPOUT = 0.1
FF_DROPOUT = 0.1
ITERATIONS = 20
PATH = '/content/drive/MyDrive/models/painimg1.pth'
model = torch.nn.Sequential(
    TimeSformer(dim = DIM, image_size = IMAGE_SIZE, patch_size = PATCH_SIZE, num_frames = NUM_FRAMES, num_classes = NUM_CLASSES, depth = DEPTH, heads = HEADS, dim_head = DIM_HEAD, attn_dropout = ATTN_DROPOUT, ff_dropout = FF_DROPOUT),
    nn.Softmax(dim=1)
)
# torch.save(model.state_dict(), PATH)
chkpnt = torch.load(PATH)
model.load_state_dict(chkpnt)

<All keys matched successfully>

In [7]:
# del video
video = torch.tensor(np.asarray(paframes[50:100]).reshape(50,1,3,224,224)).float()
labelless = framelabel[50:100]
video.shape

torch.Size([50, 1, 3, 224, 224])

In [8]:
loss_fn = torch.nn.BCELoss()
labels = torch.FloatTensor(labelless)#[[0.4, 0.6] for i in range(len(video))]) # Add here your own labels

learning_rate = 1e-4
for t in range(10):#ITERATIONS):
  y_pred = model(video)

  loss = loss_fn(y_pred, torch.tensor(labels))
  print("#" + str(t), " loss:" + str(loss.item()))
  
  model.zero_grad()
  loss.backward()
  with torch.no_grad():
      for param in model.parameters():
          param -= learning_rate * param.grad

pred = model(video) # (batch x classes)

  


#0  loss:1.0097123384475708
#1  loss:1.057417869567871
#2  loss:0.9792995452880859
#3  loss:0.98966383934021
#4  loss:0.9770166873931885
#5  loss:0.9365754723548889
#6  loss:0.9193651676177979
#7  loss:0.8632614016532898
#8  loss:0.8908348679542542
#9  loss:0.8753108382225037


In [9]:
PATH = '/content/drive/MyDrive/models/painimg2.pth'
torch.save(model.state_dict(), PATH)

#3rd part images

In [6]:
DIM = 224
IMAGE_SIZE = 224
PATCH_SIZE = 16
NUM_CLASSES = 2
NUM_FRAMES = 1
DEPTH = 12
HEADS = 8
DIM_HEAD = 64
ATTN_DROPOUT = 0.1
FF_DROPOUT = 0.1
ITERATIONS = 20
PATH = '/content/drive/MyDrive/models/painimg2.pth'
model = torch.nn.Sequential(
    TimeSformer(dim = DIM, image_size = IMAGE_SIZE, patch_size = PATCH_SIZE, num_frames = NUM_FRAMES, num_classes = NUM_CLASSES, depth = DEPTH, heads = HEADS, dim_head = DIM_HEAD, attn_dropout = ATTN_DROPOUT, ff_dropout = FF_DROPOUT),
    nn.Softmax(dim=1)
)
# torch.save(model.state_dict(), PATH)
chkpnt = torch.load(PATH)
model.load_state_dict(chkpnt)
#3rd batch of frames
video = torch.tensor(np.asarray(paframes[100:150]).reshape(50,1,3,224,224)).float()
labelless = framelabel[100:150]
video.shape

torch.Size([50, 1, 3, 224, 224])

In [7]:
loss_fn = torch.nn.BCELoss()
labels = torch.FloatTensor(labelless)#[[0.4, 0.6] for i in range(len(video))]) # Add here your own labels

learning_rate = 1e-4
for t in range(15):#ITERATIONS):
  y_pred = model(video)

  loss = loss_fn(y_pred, torch.tensor(labels))
  print("#" + str(t), " loss:" + str(loss.item()))
  
  model.zero_grad()
  loss.backward()
  with torch.no_grad():
      for param in model.parameters():
          param -= learning_rate * param.grad

pred = model(video) # (batch x classes)

  


#0  loss:0.4867725670337677
#1  loss:0.5075350999832153
#2  loss:0.5013892650604248
#3  loss:0.5014815330505371
#4  loss:0.49819305539131165
#5  loss:0.498831570148468
#6  loss:0.4764912724494934
#7  loss:0.5002437233924866
#8  loss:0.5031458139419556
#9  loss:0.5193483829498291
#10  loss:0.49719512462615967
#11  loss:0.48009976744651794
#12  loss:0.4841354489326477
#13  loss:0.5032749772071838
#14  loss:0.47818148136138916


In [8]:
PATH = '/content/drive/MyDrive/models/painimg3.pth'
torch.save(model.state_dict(), PATH)

#4th part

In [7]:
DIM = 224
IMAGE_SIZE = 224
PATCH_SIZE = 16
NUM_CLASSES = 2
NUM_FRAMES = 1
DEPTH = 12
HEADS = 8
DIM_HEAD = 64
ATTN_DROPOUT = 0.1
FF_DROPOUT = 0.1
ITERATIONS = 20


model = torch.nn.Sequential(
    TimeSformer(dim = DIM, image_size = IMAGE_SIZE, patch_size = PATCH_SIZE, num_frames = NUM_FRAMES, num_classes = NUM_CLASSES, depth = DEPTH, heads = HEADS, dim_head = DIM_HEAD, attn_dropout = ATTN_DROPOUT, ff_dropout = FF_DROPOUT),
    nn.Softmax(dim=1)
)
# torch.save(model.state_dict(), PATH)
PATH = '/content/drive/MyDrive/models/painimg3.pth'
chkpnt = torch.load(PATH)
model.load_state_dict(chkpnt)
#3rd batch of frames
video = torch.tensor(np.asarray(paframes[150:200]).reshape(50,1,3,224,224)).float()
labelless = framelabel[150:200]
video.shape

torch.Size([50, 1, 3, 224, 224])

In [8]:
loss_fn = torch.nn.BCELoss()
labels = torch.FloatTensor(labelless)#[[0.4, 0.6] for i in range(len(video))]) # Add here your own labels

learning_rate = 1e-4
for t in range(15):#ITERATIONS):
  y_pred = model(video)

  loss = loss_fn(y_pred, torch.tensor(labels))
  print("#" + str(t), " loss:" + str(loss.item()))
  
  model.zero_grad()
  loss.backward()
  with torch.no_grad():
      for param in model.parameters():
          param -= learning_rate * param.grad

pred = model(video) # (batch x classes)

  


#0  loss:0.6799436211585999
#1  loss:0.6803609728813171
#2  loss:0.712419867515564
#3  loss:0.6961427330970764
#4  loss:0.7151193022727966
#5  loss:0.6885093450546265
#6  loss:0.6862586140632629
#7  loss:0.6727443933486938
#8  loss:0.6699869632720947
#9  loss:0.6946759819984436
#10  loss:0.6839250922203064
#11  loss:0.6662608981132507
#12  loss:0.6803510189056396
#13  loss:0.6709305047988892
#14  loss:0.6865936517715454


In [9]:
PATH = '/content/drive/MyDrive/models/painimg4.pth'
torch.save(model.state_dict(), PATH)

#Eval

In [18]:
outGT = np.asarray(labelless)
outPRED = np.asarray(torch.Tensor.detach(pred))

In [44]:
outGT = np.asarray(labelless)
outPRED = np.asarray(torch.Tensor.detach(pred))
from sklearn.metrics import *
print('F1: {}'.format(f1_score(outGT, outPRED>.5, average="samples")))
print('Precision: {}'.format(precision_score(outGT, outPRED>.5, average="samples")))
print('Recall: {}'.format(recall_score(outGT, outPRED >.5, average="samples")))
print('Accuracy: {}'.format(accuracy_score(outGT, outPRED>.5)))

F1: 0.64
Precision: 0.64
Recall: 0.64
Accuracy: 0.64


In [40]:
for i in range(len(outGT)//2):
  print(outGT[i], outPRED[i])

[1 0] [0.22223349 0.77776647]
[1 0] [0.29166296 0.708337  ]
[0 1] [0.22004147 0.7799585 ]
[1 0] [0.27216145 0.7278385 ]
[0 1] [0.32832697 0.67167306]
[1 0] [0.28189862 0.71810144]
[0 1] [0.33535546 0.66464454]
[1 0] [0.30840933 0.6915907 ]
[0 1] [0.29247585 0.70752424]
[0 1] [0.2254198  0.77458024]
[0 1] [0.24262531 0.7573747 ]
[0 1] [0.252408   0.74759203]
[1 0] [0.3281416 0.6718584]
[0 1] [0.29749554 0.7025044 ]
[0 1] [0.3383802  0.66161984]
[0 1] [0.3041264  0.69587356]
[0 1] [0.28226125 0.7177388 ]
[0 1] [0.29282227 0.7071778 ]
[0 1] [0.36710197 0.63289803]
[1 0] [0.30377892 0.69622105]
[0 1] [0.37927523 0.6207248 ]
[0 1] [0.26974627 0.73025376]
[0 1] [0.37568036 0.6243197 ]
[0 1] [0.30206382 0.6979362 ]
[1 0] [0.30575326 0.69424677]
