## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import timm
from torchvision.models import video
import time

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

## Hyperparameter Setting

In [3]:
class CFG:
    model_name= "mvit_v2_s"
    n_folds = 5
    n_classes = 13
    video_length=16
    img_size=224
    epochs=50
    lr=3e-4
    batch_size=16
    seed=41
        

## Fixed RandomSeed

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG.seed) # Seed 고정

## Data Load

In [5]:
df = pd.read_csv('./train.csv')

## Train / Validation Split

In [6]:
train, val, _, _ = train_test_split(df, df['label'], test_size=0.2, random_state=CFG.seed)

## CustomDataset

In [7]:
#class CustomDataset(Dataset):
#    def __init__(self, video_path_list, label_list):
#        self.video_path_list = video_path_list
#        self.label_list = label_list
#        
#    def __getitem__(self, index):
#        frames = self.get_video(self.video_path_list[index])
#        
#        if self.label_list is not None:
#            label = self.label_list[index]
#            return frames, label
#        else:
#            return frames
#        
#    def __len__(self):
#        return len(self.video_path_list)
#    
#    def get_video(self, path):
#        frames = []
#        cap = cv2.VideoCapture(path)
#        for _ in range(CFG.video_length):
#            _, img = cap.read()
#            img = cv2.resize(img, (CFG.img_size, CFG.img_size))
#            img = img / 255.
#            frames.append(img)
#        return torch.FloatTensor(np.array(frames)).permute(3, 0, 1, 2)

In [8]:
class CustomDataset(Dataset): # get last 16 images 
    def __init__(self, video_path_list, label_list):
        self.video_path_list = video_path_list
        self.label_list = label_list
        
    def __getitem__(self, index):
        frames = self.get_video(self.video_path_list[index])
        
        if self.label_list is not None:
            label = self.label_list[index]
            return frames, label
        else:
            return frames
        
    def __len__(self):
        return len(self.video_path_list)
    
    def get_video(self, path):
        frames = []
        cap = cv2.VideoCapture(path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        start_frame = total_frames - 16
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
        
        for _ in range(CFG.video_length):
            _, img = cap.read()
            img = cv2.resize(img, (CFG.img_size, CFG.img_size))
            img = img / 255.
            frames.append(img)
        return torch.FloatTensor(np.array(frames)).permute(3, 0, 1, 2)

In [9]:
train_dataset = CustomDataset(train['video_path'].values, train['label'].values)
train_loader = DataLoader(train_dataset, batch_size = CFG.batch_size, shuffle=True, num_workers=0)

val_dataset = CustomDataset(val['video_path'].values, val['label'].values)
val_loader = DataLoader(val_dataset, batch_size = CFG.batch_size, shuffle=False, num_workers=0)

## 비디오 길이 테스트

In [None]:
class CustomDataset50(Dataset):
    def __init__(self, video_path_list, label_list):
        self.video_path_list = video_path_list
        self.label_list = label_list
        
    def __getitem__(self, index):
        frames = self.get_video(self.video_path_list[index])
        
        if self.label_list is not None:
            label = self.label_list[index]
            return frames, label
        else:
            return frames
        
    def __len__(self):
        return len(self.video_path_list)
    
    def get_video(self, path):
        frames = []
        cap = cv2.VideoCapture(path)
        for _ in range(50):
            _, img = cap.read()
            img = cv2.resize(img, (CFG.img_size, CFG.img_size))
            img = img / 255.
            frames.append(img)
        return torch.FloatTensor(np.array(frames)).permute(3, 0, 1, 2)
    
    
class CustomDataset16(Dataset): # get la
    def __init__(self, video_path_list, label_list):
        self.video_path_list = video_path_list
        self.label_list = label_list
        
    def __getitem__(self, index):
        frames = self.get_video(self.video_path_list[index])
        
        if self.label_list is not None:
            label = self.label_list[index]
            return frames, label
        else:
            return frames
        
    def __len__(self):
        return len(self.video_path_list)
    
    def get_video(self, path):
        frames = []
        prev_time = 0
        cap = cv2.VideoCapture(path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        start_frame = total_frames - 16
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
        
        for _ in range(CFG.video_length):
            #time.sleep(200)
            _, img = cap.read()
            img = cv2.resize(img, (CFG.img_size, CFG.img_size))
            img = img / 255.
            #cap.set(cv2.CAP_PROP_POS_FRAMES, cap.get(cv2.CAP_PROP_POS_FRAMES) + 2)
            frames.append(img)
            
        return torch.FloatTensor(np.array(frames)).permute(3, 0, 1, 2)
    
train_dataset50 = CustomDataset50(train['video_path'].values, train['label'].values)
train_dataset16 = CustomDataset16(train['video_path'].values, train['label'].values)

In [None]:
sample50=train_dataset50[322][0].permute(1,0,2,3)
sample16=train_dataset16[322][0].permute(1,0,2,3)
print(sample16.shape)
from matplotlib import pyplot as plt 


plt.figure(figsize=(50, 50))
for i in range(0,16):
    #print(i)
    plt.subplot(1, 16, i + 1)
    plt.imshow(sample16[i].permute(1,2,0))
    plt.axis("off")

In [None]:
plt.figure(figsize=(50, 50))
for i in range(0,50,10):
    #print(i)
    plt.subplot(1, 5, i//10 + 1)
    plt.imshow(sample50[i].permute(1,2,0))
    plt.axis("off")

In [None]:
#train_dataset[0][0].shape
next(iter(train_loader))[0].shape

## Model Define

In [10]:
class BaseModel(nn.Module):
    def __init__(self, num_classes=CFG.n_classes, fc_type='shallow'):
        super(BaseModel, self).__init__()
        self.fc_type=fc_type
        self.num_classes=num_classes
        
        #get backbone
        self.backbone=getattr(video,CFG.model_name)(pretrained=True)
        self.backbone.head[1]=self.get_fc()

        
    def get_fc(self):
        
        if self.fc_type == 'shallow':
            fc = nn.Linear(self.backbone.head[1].in_features, self.num_classes)
        else:
            raise ValueError(f"Wrong fc-type input {self.fc_type}")
        return fc
    
    def forward(self, x):
        batch_size = x.size(0)
        x = self.backbone(x)
        return x

In [11]:
#model=BaseModel(fc_type='shallow')
#model

## Train

In [12]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
    
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)
    
    criterion = nn.CrossEntropyLoss().to(device)
    
    best_val_score = 0
    best_model = None
    cnt=0
    for epoch in range(1, CFG.epochs+1):
        model.train()
        train_loss = []
        for videos, labels in tqdm(iter(train_loader)):
            videos = videos.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            
            output = model(videos)
            loss = criterion(output, labels)
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
                    
        _val_loss, _val_score = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val F1 : [{_val_score:.5f}]')
        
        if scheduler is not None:
            scheduler.step(_val_score)
            
        if best_val_score < _val_score:
            best_val_score = _val_score
            best_model = model
            torch.save(model.state_dict(), './weights/mvitv2.pt')
            cnt=0
        else:
            print("early stopping count : {}".format(cnt))
            cnt+=1
        
        if cnt==5:
            print("early stopping done")
            break
            
    return best_model

In [13]:
def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss = []
    preds, trues = [], []
    
    with torch.no_grad():
        for videos, labels in tqdm(iter(val_loader)):
            videos = videos.to(device)
            labels = labels.to(device)
            
            logit = model(videos)
            
            loss = criterion(logit, labels)
            
            val_loss.append(loss.item())
            
            preds += logit.argmax(1).detach().cpu().numpy().tolist()
            trues += labels.detach().cpu().numpy().tolist()
        
        _val_loss = np.mean(val_loss)
    
    _val_score = f1_score(trues, preds, average='macro')
    return _val_loss, _val_score

## Run!!

In [None]:
model = BaseModel(fc_type='shallow')
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG.lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=0.00001)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

Let's use 2 GPUs!


  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

Epoch [1], Train Loss : [0.75908] Val Loss : [0.60692] Val F1 : [0.23276]


  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

Epoch [2], Train Loss : [0.44411] Val Loss : [0.50383] Val F1 : [0.23316]


  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

Epoch [3], Train Loss : [0.36023] Val Loss : [0.42847] Val F1 : [0.36177]


  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

Epoch [4], Train Loss : [0.28611] Val Loss : [0.32464] Val F1 : [0.42425]


  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

Epoch [5], Train Loss : [0.20293] Val Loss : [0.30500] Val F1 : [0.41613]
early stopping count : 0


  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

Epoch [6], Train Loss : [0.16929] Val Loss : [0.42505] Val F1 : [0.39771]
early stopping count : 1


  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

Epoch [7], Train Loss : [0.13384] Val Loss : [0.79459] Val F1 : [0.27931]
early stopping count : 2


  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

## Inference

In [None]:
test = pd.read_csv('./test.csv')

In [None]:
test_dataset = CustomDataset(test['video_path'].values, None)
test_loader = DataLoader(test_dataset, batch_size = CFG.batch_size, shuffle=False, num_workers=0)

In [None]:
def inference(model, test_loader, device):
    model.to(device)
    model.eval()
    preds = []
    with torch.no_grad():
        for videos in tqdm(iter(test_loader)):
            videos = videos.to(device)
            
            logit = model(videos)

            preds += logit.argmax(1).detach().cpu().numpy().tolist()
    return preds

In [None]:
preds = inference(model, test_loader, device)

## Submission

In [None]:
submit = pd.read_csv('./sample_submission.csv')

In [None]:
submit['label'] = preds
submit.head()

In [None]:
submit.to_csv('./mvitv2_submit.csv', index=False)