In [1]:
from dataclasses import dataclass
import torch
import torch.utils.data
import torch.backends.cudnn
import random
import numpy as np
import pandas as pd
import os
import sklearn.model_selection
import sklearn.metrics
import cv2 as cv
import time
import logging
from tqdm import tqdm
import matplotlib.pyplot as plt
import ipywidgets
import torchvision.transforms
import pytorchvideo.transforms
import wandb

@dataclass
class Config:
    """Experiment settings, read throughout this file as ``Config.<name>``.

    Every setting is annotated so that ``@dataclass`` registers it as a real
    field; ``dataclasses`` ignores class attributes without an annotation,
    which previously left the decorator with nothing to work on. The defaults
    are still ordinary class attributes, so class-level access keeps working,
    and ``Config(batch_size=8)`` can now build an overridden copy.
    """

    # Runtime / reproducibility
    seed: int = 42
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    num_workers: int = 0
    batch_size: int = 4
    pin_memory: bool = False

    # Optimisation schedule and clip geometry
    lr: float = 1e-3
    epochs: int = 15
    image_size: int = 256  # frames are resized to image_size x image_size
    num_frame: int = 8     # frames kept per clip after temporal subsampling

    # Entry point name passed to torch.hub.load('facebookresearch/pytorchvideo', ...)
    model_name: str = 'slow_r50'
    # model_name = 'slowfast_r101'
    # model_name = 'slowfast_16x8_r101_50_50'

    data_path: str = '/datasets/dacon-hand-recog'
    # Checkpoint restored before training/testing; set to None to skip loading.
    load_path: 'str | None' = 'slow_r50(3).pt'
    # Where the best checkpoint of a training run is written.
    save_path: str = 'slow_r50(5).pt'
    # save_path = 'slowfast_r101(0).pt'
    # save_path = 'slow_16x8_r101(1).pt'

def seed_everything(seed=42):
    """Seed every random number generator used here and make cuDNN deterministic.

    Args:
        seed: Value applied to ``PYTHONHASHSEED``, ``random``, NumPy and
            PyTorch (CPU and every CUDA device).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current one.
    torch.cuda.manual_seed_all(seed)
    # The cuDNN autotuner may select different kernels from run to run, so it
    # has to be off for the deterministic flag below to give repeatable results.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def train_loop(data_loader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``data_loader`` and log the mean batch loss.

    Args:
        data_loader: Iterable yielding ``(inputs, targets)`` batches.
        model: Network to train; switched to training mode here.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar loss.
        optimizer: Optimizer stepped once per batch.
    """
    device = Config.device
    model.train()
    batch_losses = []
    for inputs, targets in tqdm(data_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        batch_loss = loss_fn(model(inputs), targets)

        # Standard update: clear stale gradients, backpropagate, step.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    mean_loss = np.mean(batch_losses)
    logging.info(f'loss: {mean_loss:7.3f}')

def valid_loop(data_loader, model, loss_fn):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        data_loader: Iterable yielding ``(inputs, targets)`` batches; its
            ``dataset`` length is used in the log line.
        model: Network to evaluate; switched to eval mode here.
        loss_fn: Callable mapping ``(predictions, targets)`` to a scalar loss.

    Returns:
        ``(valid_loss, valid_score)``: the mean of the per-batch losses and the
        macro-averaged F1 score over all samples.
    """
    model.eval()
    losses = []
    preds = []
    trues = []
    correct = 0
    with torch.no_grad():
        for X, y in tqdm(data_loader):
            X = X.to(Config.device)
            y = y.to(Config.device)
            pred = model(X)
            losses.append(loss_fn(pred, y).item())

            # Compute the predicted classes once and reuse them below.
            pred_labels = pred.argmax(1)
            correct += (pred_labels == y).sum().item()
            preds.extend(pred_labels.cpu().numpy())
            # Store labels as numpy values too, so both lists handed to
            # scikit-learn hold the same kind of element.
            trues.extend(y.cpu().numpy())

    valid_loss = np.mean(losses)
    valid_score = sklearn.metrics.f1_score(trues, preds, average='macro')
    logging.info(f'correct: {correct:3} / {len(data_loader.dataset)}, validation_loss: {valid_loss:7.3f}, validation_score: {valid_score:7.3f}')
    return valid_loss, valid_score

def test_loop(data_loader, model):
    """Predict a class for every sample in ``data_loader`` and write a submission CSV.

    Reads ``./sample_submission.csv``, replaces its ``label`` column with the
    argmax predictions and writes the result to ``./resnet_submission.csv``.

    Args:
        data_loader: Iterable yielding unlabeled input batches.
        model: Network used for inference; switched to eval mode here.
    """
    model.eval()
    preds = []
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for X in tqdm(data_loader):
            X = X.to(Config.device)
            pred = model(X)
            preds.extend(pred.argmax(1).cpu().numpy())
    submit = pd.read_csv('./sample_submission.csv')
    submit['label'] = preds
    submit.to_csv('./resnet_submission.csv', index=False)

def save_state_dict(path, model, optimizer, scheduler):
    """Write the states of ``model``, ``optimizer`` and ``scheduler`` to ``path``.

    The checkpoint is a single dict with the keys ``'model'``, ``'optimizer'``
    and ``'scheduler'``, each holding that object's ``state_dict()``.
    """
    components = (
        ('model', model),
        ('optimizer', optimizer),
        ('scheduler', scheduler),
    )
    checkpoint = {key: component.state_dict() for key, component in components}
    torch.save(checkpoint, path)

def load_state_dict(path, model, optimizer, scheduler):
    """Restore ``model``, ``optimizer`` and ``scheduler`` from a checkpoint.

    Args:
        path: File (or file-like object) holding a dict with the keys
            ``'model'``, ``'optimizer'`` and ``'scheduler'``, as written by
            ``save_state_dict``.
        model: Module whose parameters are overwritten in place.
        optimizer: Optimizer whose state is overwritten.
        scheduler: LR scheduler whose state is overwritten.
    """
    # Deserialise onto the CPU so a checkpoint written during a GPU run can
    # still be opened on a host without CUDA. The load_state_dict calls below
    # copy the values into the objects' existing tensors, so the model stays
    # on whatever device it was already moved to.
    data = torch.load(path, map_location='cpu')
    model.load_state_dict(data['model'])
    optimizer.load_state_dict(data['optimizer'])
    scheduler.load_state_dict(data['scheduler'])

class VideoDataset(torch.utils.data.Dataset):
    """Dataset that decodes one video file per item into a stacked frame tensor.

    Args:
        paths: Sequence of video paths; only the file name of each entry is
            used, resolved under ``Config.data_path``.
        labels: Optional sequence of labels aligned with ``paths``. When
            omitted, items are returned without a label.
        frame_transforms: Optional callable applied to every decoded frame.
            It must return a tensor, because frames are stacked with
            ``torch.stack``.
        video_transforms: Optional callable applied to the stacked clip.
    """

    def __init__(self, paths, labels=None, frame_transforms=None, video_transforms=None):
        self.paths = paths
        self.labels = labels
        self.frame_transforms = frame_transforms
        self.video_transforms = video_transforms

    def __getitem__(self, index):
        """Return ``video`` or ``(video, label)`` for the item at ``index``."""
        file_name = os.path.basename(self.paths[index])
        video = self.get_video(os.path.join(Config.data_path, file_name))
        if self.labels is None:
            return video
        return video, self.labels[index]

    def __len__(self):
        """Return the number of videos."""
        return len(self.paths)

    def get_video(self, path):
        """Decode every frame of the video at ``path`` into a single tensor.

        Frames are stacked along a new dimension 1.
        NOTE(review): with the (C, H, W) output of the frame transforms used in
        this file that presumably yields (C, T, H, W) - confirm.

        Raises:
            RuntimeError: If no frame could be read from ``path``.
        """
        cap = cv.VideoCapture(path)
        frames = []
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                if self.frame_transforms is not None:
                    frame = self.frame_transforms(frame)
                frames.append(frame)
        finally:
            # Always free the decoder handle, even if a transform raises.
            cap.release()

        if not frames:
            # Name the offending file instead of failing inside torch.stack.
            raise RuntimeError(f'no frames could be read from {path!r}')

        video = torch.stack(frames, dim=1)

        if self.video_transforms is not None:
            video = self.video_transforms(video)
        return video

class PackPathway(torch.nn.Module):
    """Turn one clip tensor into the ``[slow, fast]`` pair of sub-sampled clips.

    Args:
        alpha: Integer divisor applied to ``Config.num_frame`` to get the
            number of frames selected for the slow clip.
    """

    def __init__(self, alpha):
        super().__init__()
        self.alpha = alpha

    def forward(self, x):
        """Select evenly spaced frames along dimension 1 of ``x``.

        Returns:
            ``[slow, fast]`` where ``fast`` holds ``Config.num_frame`` frames
            and ``slow`` holds ``Config.num_frame // alpha`` frames.
        """
        last_index = x.shape[1] - 1
        # Build the index tensors on x's device; index_select requires the
        # index to live on the same device as the input.
        fast_index = torch.linspace(0, last_index, Config.num_frame, device=x.device).long()
        slow_index = torch.linspace(0, last_index, Config.num_frame // self.alpha, device=x.device).long()
        fast = torch.index_select(x, 1, fast_index)
        slow = torch.index_select(x, 1, slow_index)
        return [slow, fast]

def get_dataloader():
    """Build the train, validation and test loaders from one split of ``train.csv``.

    ``train.csv`` is split 80/20 with ``Config.seed``; ``test.csv`` supplies
    the unlabeled test paths. All three datasets share the same transforms.

    Returns:
        ``(train_loader, valid_loader, test_loader)``. Only the training
        loader shuffles.
    """
    def read_table(file_name):
        return pd.read_csv(os.path.join(Config.data_path, file_name))

    def make_loader(dataset, shuffle):
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=Config.batch_size,
            shuffle=shuffle,
            num_workers=Config.num_workers,
        )

    train_df, valid_df = sklearn.model_selection.train_test_split(
        read_table('train.csv'), test_size=0.2, random_state=Config.seed)
    test_df = read_table('test.csv')

    target_size = (Config.image_size, Config.image_size)
    per_frame = torchvision.transforms.Compose([
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.Resize(target_size),
        torchvision.transforms.ToTensor(),
        # NOTE(review): ToTensor presumably already rescales to [0, 1], so
        # this divides a second time - confirm before changing, since saved
        # checkpoints were presumably trained with this scaling.
        torchvision.transforms.Lambda(lambda image: image / 255),
    ])
    per_video = pytorchvideo.transforms.UniformTemporalSubsample(Config.num_frame)
    # per_video = PackPathway(4)

    def make_dataset(table, labeled):
        paths = table['path'].to_numpy()
        if labeled:
            return VideoDataset(paths, table['label'].to_numpy(),
                                frame_transforms=per_frame, video_transforms=per_video)
        return VideoDataset(paths, frame_transforms=per_frame, video_transforms=per_video)

    train_loader = make_loader(make_dataset(train_df, labeled=True), shuffle=True)
    valid_loader = make_loader(make_dataset(valid_df, labeled=True), shuffle=False)
    test_loader = make_loader(make_dataset(test_df, labeled=False), shuffle=False)
    return train_loader, valid_loader, test_loader

def get_dataloader_kfold(k=5):
    """Build per-fold train/validation loaders plus one shared test loader.

    Args:
        k: Number of folds for the shuffled ``KFold`` split of ``train.csv``
            (seeded with ``Config.seed``).

    Returns:
        ``(train_loaders, valid_loaders, test_loader)``: two lists of ``k``
        loaders, indexed by fold, and a single loader over ``test.csv``.
    """
    df = pd.read_csv(os.path.join(Config.data_path, 'train.csv'))
    kf = sklearn.model_selection.KFold(k, shuffle=True, random_state=Config.seed)

    train_df, valid_df = [], []
    for train_idx, valid_idx in kf.split(df):
        # KFold.split yields positional indices, so select rows with iloc;
        # loc would look them up as index labels.
        train_df.append(df.iloc[train_idx])
        valid_df.append(df.iloc[valid_idx])
    test_df = pd.read_csv(os.path.join(Config.data_path, 'test.csv'))

    train_paths = [fold_df['path'].to_numpy() for fold_df in train_df]
    valid_paths = [fold_df['path'].to_numpy() for fold_df in valid_df]
    test_paths: np.ndarray = test_df['path'].to_numpy()
    train_labels = [fold_df['label'].to_numpy() for fold_df in train_df]
    valid_labels = [fold_df['label'].to_numpy() for fold_df in valid_df]

    frame_transforms = torchvision.transforms.Compose([
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.Resize((Config.image_size, Config.image_size)),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Lambda(lambda x: x / 255),
    ])
    # video_transforms = PackPathway(4)
    video_transforms = pytorchvideo.transforms.UniformTemporalSubsample(Config.num_frame)

    train_dataset = [VideoDataset(paths, labels, frame_transforms=frame_transforms, video_transforms=video_transforms) for paths, labels in zip(train_paths, train_labels)]
    valid_dataset = [VideoDataset(paths, labels, frame_transforms=frame_transforms, video_transforms=video_transforms) for paths, labels in zip(valid_paths, valid_labels)]
    test_dataset = VideoDataset(test_paths, frame_transforms=frame_transforms, video_transforms=video_transforms)

    train_loader = [torch.utils.data.DataLoader(dataset, batch_size=Config.batch_size, shuffle=True, num_workers=Config.num_workers) for dataset in train_dataset]
    valid_loader = [torch.utils.data.DataLoader(dataset, batch_size=Config.batch_size, shuffle=False, num_workers=Config.num_workers) for dataset in valid_dataset]
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=Config.batch_size, shuffle=False, num_workers=Config.num_workers)
    return train_loader, valid_loader, test_loader

def train_model():
    """Fine-tune the configured pytorchvideo model on the single train/valid split.

    A pretrained ``Config.model_name`` network is followed by a 400 -> 5
    linear layer. After every epoch the validation loss, score and learning
    rate are logged to Weights & Biases, and the checkpoint is written to
    ``Config.save_path`` whenever the validation score improves.
    """
    # Pass the hyper-parameters through init so they are recorded with the
    # run; rebinding wandb.config to a plain dict replaces the config object.
    wandb.init(
        project="video-hand-gesture-classification",
        entity="seokjin",
        config={
            'model_name': Config.model_name,
            'num_frame': Config.num_frame,
            'image_size': Config.image_size,
        },
    )
    train_loader, valid_loader, _ = get_dataloader()

    model = torch.hub.load('facebookresearch/pytorchvideo', Config.model_name, pretrained=True)
    model = torch.nn.Sequential(model, torch.nn.Linear(400, 5, bias=True))

    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=Config.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)
    model.to(Config.device)
    if Config.load_path:
        load_state_dict(Config.load_path, model, optimizer, scheduler)

    # Register the model hooks once, not once per epoch.
    wandb.watch(model)

    best_score = 0
    for _ in range(Config.epochs):
        train_loop(train_loader, model, loss_fn, optimizer)
        loss, score = valid_loop(valid_loader, model, loss_fn)
        # The scheduler runs in 'max' mode, so it must track the score (higher
        # is better). Feeding it the loss would cut the learning rate while
        # the loss is still falling.
        scheduler.step(score)
        if score > best_score:
            save_state_dict(Config.save_path, model, optimizer, scheduler)
            best_score = score

        wandb.log({
            'loss': loss,
            'score': score,
            # Read the current LR from the optimizer instead of the
            # scheduler's private _last_lr attribute.
            'lr': optimizer.param_groups[0]['lr'],
        })

def train_model_kfold():
    """Fine-tune one model per fold of a 5-fold split, from the last fold to the first.

    Each fold starts from a freshly loaded pretrained network (optionally
    restored from ``Config.load_path``), gets its own Weights & Biases run,
    and saves its best checkpoint to ``f'{Config.save_path}_{fold}fold'``.
    """
    k = 5
    train_loader, valid_loader, _ = get_dataloader_kfold(k)
    for fold in reversed(range(k)):
        kfold_message = f'{fold}fold'
        logging.info(f'{kfold_message:-^20}')
        # Pass the hyper-parameters through init so they are recorded with the
        # run; rebinding wandb.config to a plain dict replaces the config object.
        wandb.init(
            project="video-hand-gesture-classification",
            entity="seokjin",
            config={
                'model_name': Config.model_name,
                'num_frame': Config.num_frame,
                'image_size': Config.image_size,
            },
        )

        model = torch.hub.load('facebookresearch/pytorchvideo', Config.model_name, pretrained=True)
        model = torch.nn.Sequential(model, torch.nn.Linear(400, 5, bias=True))

        loss_fn = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=Config.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)
        model.to(Config.device)
        if Config.load_path:
            load_state_dict(Config.load_path, model, optimizer, scheduler)

        # Register the model hooks once per fold, not once per epoch.
        wandb.watch(model)

        best_score = 0
        for _ in range(Config.epochs):
            train_loop(train_loader[fold], model, loss_fn, optimizer)
            loss, score = valid_loop(valid_loader[fold], model, loss_fn)
            # The scheduler runs in 'max' mode, so it must track the score;
            # feeding it the loss would cut the LR while the loss still falls.
            scheduler.step(score)
            if score > best_score:
                save_state_dict(f'{Config.save_path}_{fold}fold', model, optimizer, scheduler)
                best_score = score

            wandb.log({
                'loss': loss,
                'score': score,
                # Read the current LR from the optimizer instead of the
                # scheduler's private _last_lr attribute.
                'lr': optimizer.param_groups[0]['lr'],
            })

        # Close this fold's run so the next wandb.init starts a separate one.
        wandb.finish()

def test_model():
    """Predict test-set labels with one checkpoint and write a submission CSV.

    The network is rebuilt the same way as for training, restored from
    ``Config.load_path`` when that is set, and run over the test loader. The
    argmax predictions replace the ``label`` column of
    ``sample_submission.csv`` and are written to
    ``f'./{Config.load_path}_submission.csv'``.
    """
    _, _, test_loader = get_dataloader()

    model = torch.hub.load('facebookresearch/pytorchvideo', Config.model_name, pretrained=True)
    model = torch.nn.Sequential(model, torch.nn.Linear(400, 5, bias=True))

    # load_state_dict restores optimizer and scheduler state as well, so both
    # objects have to exist even though no training happens here.
    optimizer = torch.optim.SGD(model.parameters(), lr=Config.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)
    model.to(Config.device)
    if Config.load_path:
        load_state_dict(Config.load_path, model, optimizer, scheduler)

    model.eval()
    preds = []
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for X in tqdm(test_loader):
            X = X.to(Config.device)
            pred = model(X)
            preds.extend(pred.argmax(1).cpu().numpy())
    submit = pd.read_csv(os.path.join(Config.data_path, 'sample_submission.csv'))
    submit['label'] = preds
    submit.to_csv(f'./{Config.load_path}_submission.csv', index=False)

def test_model_kfold():
    """Ensemble the five per-fold checkpoints on the test set and write a submission CSV.

    For every fold the network is rebuilt, restored from
    ``f'{Config.load_path}_{fold}fold'`` when ``Config.load_path`` is set, and
    run over the test loader. The raw model outputs of all folds are summed
    and the argmax of the sum becomes the predicted label, written to
    ``f'./{Config.load_path}_submission.csv'``.
    """
    k = 5
    _, _, test_loader = get_dataloader()

    soft_vote = []

    for fold in range(k):
        model = torch.hub.load('facebookresearch/pytorchvideo', Config.model_name, pretrained=True)
        model = torch.nn.Sequential(model, torch.nn.Linear(400, 5, bias=True))

        # load_state_dict restores optimizer and scheduler state as well, so
        # both objects have to exist even though no training happens here.
        optimizer = torch.optim.SGD(model.parameters(), lr=Config.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)
        model.to(Config.device)
        if Config.load_path:
            load_state_dict(f'{Config.load_path}_{fold}fold', model, optimizer, scheduler)

        model.eval()
        preds = []
        # Without no_grad the outputs track gradients and .numpy() raises on
        # such tensors; disabling autograd also saves memory.
        with torch.no_grad():
            for X in tqdm(test_loader):
                X = X.to(Config.device)
                pred = model(X)
                preds.extend(pred.cpu().numpy())
        soft_vote.append(np.array(preds))

    # Sum the per-fold outputs sample by sample, then pick the best class.
    ensembled = np.sum(soft_vote, 0)
    ensembled = np.argmax(ensembled, 1)

    submit = pd.read_csv(os.path.join(Config.data_path, 'sample_submission.csv'))
    submit['label'] = ensembled
    submit.to_csv(f'./{Config.load_path}_submission.csv', index=False)

if __name__ == '__main__':
    # Script entry point: log to a file, seed the RNGs, then run the selected
    # stage. Uncomment one of the calls below to train instead of predict.
    logging.basicConfig(
        filename='logging.log',
        encoding='utf8',
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
    )
    run_banner = f'{"Run":-^20}'
    logging.info(run_banner)
    seed_everything(Config.seed)
    # train_model_kfold()
    # train_model()
    test_model()

Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main
100%|██████████| 39/39 [00:16<00:00,  2.39it/s]
