In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
from model import clip_vit_base_patch32_multimodal_adapter12x384
from video_dataset import VideoDataset
from video_dataset.random_erasing import RandomErasing
import logging
import os
from tqdm import tqdm
from utils import text_prompt, load_word_index_mapping, get_masked_sample, gen_label
import math

  from .autonotebook import tqdm as notebook_tqdm


In [53]:
# Hyper-parameters and file locations read by the dataset / loader cells below.
# Key order and values are unchanged; only the literal form differs.
config = dict(
    num_classes=400,  # Number of Kinetics-400 classes
    num_frames=8,  # Number of frames per video
    # mlm_head_len=30522,  # BERT vocabulary size
    learning_rate=1e-4,
    min_lr=1e-5,
    weight_decay=0.01,
    num_epochs=12,
    sampling_rate=16,
    num_temporal_views=3,
    num_spatial_views=4,
    warmup_epochs=2,  # Number of warmup epochs
    mlm_loss_weight=1.0,
    max_grad_norm=1.0,
    save_frequency=1,
    scale_range=(1.0, 1.15),
    save_dir='checkpoints',
    batch_size=32,
    masked_rate=0.15,  # MLM masking rate
    mlm_labels_path='k400_mlm_lables.txt',
    classes_path='kinetics_400_labels.csv',  # kinetics_400_labels
)

# Random-erasing augmentation object; it is handed to the training dataset
# below via its `random_erasing` argument.
random_erasing = RandomErasing(probability=0.25, mode='pixel', max_count=1,
                               num_splits=0, device='cpu')

# Training split. random_sample=True together with the scale-jitter resize and
# the random-erasing object configure the training-time sampling/augmentation
# (exact semantics are defined by VideoDataset).
train_dataset = VideoDataset(
    list_path='k400_train.txt',  # Replace with your train list path
    # Build the root with os.path.join so the separator matches the host OS;
    # a hard-coded backslash only resolves to a sub-directory on Windows.
    data_root=os.path.join('Kinetics-400', 'videos_train'),  # Replace with your data root path
    num_frames=config['num_frames'],
    sampling_rate=config['sampling_rate'],
    spatial_size=224,
    random_sample=True,
    random_erasing=random_erasing,
    resize_type='random_short_side_scale_jitter',
    # auto_augment='rand-m7-n4-mstd0.5-inc1',
    scale_range=config['scale_range']
)

# Validation split: random_sample=False and no augmentation arguments are
# passed, so VideoDataset's defaults apply for everything else.
val_dataset = VideoDataset(
    list_path='k400_val.txt',  # Using the provided validation list
    # Build the root with os.path.join so the separator matches the host OS;
    # a hard-coded backslash only resolves to a sub-directory on Windows.
    data_root=os.path.join('Kinetics-400', 'videos_val'),  # Replace with your data root path
    num_frames=config['num_frames'],
    sampling_rate=config['sampling_rate'],
    spatial_size=224,
    random_sample=False,
    # Multi-view evaluation arguments are currently disabled:
    # num_temporal_views=config['num_temporal_views'],
    # num_spatial_views=config['num_spatial_views']
)
# Create data loaders. Both splits share batch size, worker count and pinned
# memory; the only difference is that the training loader shuffles.
train_loader, val_loader = (
    DataLoader(
        split,
        batch_size=config['batch_size'],
        shuffle=do_shuffle,
        num_workers=4,
        pin_memory=True,
    )
    for split, do_shuffle in ((train_dataset, True), (val_dataset, False))
)


In [54]:
# Fetch the first validation sample straight from the dataset (no DataLoader
# batching involved) and unpack it into the clip tensor and its label.
imaged, label = val_dataset[0]

In [55]:
# Show the sample's shape after removing dimension 0 (squeeze(0) only drops it
# when that dimension has size 1).
imaged.squeeze(0).shape

torch.Size([3, 8, 224, 224])

In [56]:
# Same shape check for the first training sample, reached through the loader's
# underlying dataset rather than through a collated batch.
train_loader.dataset[0][0].shape

torch.Size([3, 8, 224, 224])

In [57]:
# Create one iterator per loader; the cells below pull single batches from
# them with next() to print the shapes of the data and labels.
val_loader_iter, train_loader_iter = iter(val_loader), iter(train_loader)

In [58]:

# Pull the next validation batch and show the shape of its element 0.
# Every next() call advances the iterator, so re-running this cell inspects a
# different batch each time.
next(val_loader_iter)[0].shape

torch.Size([32, 1, 3, 8, 224, 224])

In [63]:
# Pull another validation batch and display element 0 with every size-1
# dimension removed.
# NOTE(review): this renders the tensor's values, not just its shape; appending
# `.shape` would keep the cell output short. Also, squeeze() without a dim
# argument removes *all* size-1 dimensions, so a batch holding a single sample
# would lose its batch dimension too - squeeze(1) may be what is intended;
# confirm.
next(val_loader_iter)[0].squeeze()

tensor([[[[[-1.2717e+00, -1.2820e+00, -1.2829e+00,  ..., -1.3847e+00,
            -1.3683e+00, -1.4013e+00],
           [-1.2717e+00, -1.2820e+00, -1.2829e+00,  ..., -1.3847e+00,
            -1.3683e+00, -1.4013e+00],
           [-1.2717e+00, -1.2820e+00, -1.2829e+00,  ..., -1.3847e+00,
            -1.3683e+00, -1.4013e+00],
           ...,
           [-1.7193e+00, -1.7193e+00, -1.7193e+00,  ..., -1.7354e+00,
            -1.7329e+00, -1.7494e+00],
           [-1.7193e+00, -1.7193e+00, -1.7193e+00,  ..., -1.7354e+00,
            -1.7329e+00, -1.7494e+00],
           [-1.7193e+00, -1.7193e+00, -1.7193e+00,  ..., -1.7354e+00,
            -1.7329e+00, -1.7494e+00]],

          [[-1.4496e+00, -1.4275e+00, -1.4117e+00,  ..., -1.0544e+00,
            -1.0441e+00, -1.0523e+00],
           [-1.4496e+00, -1.4273e+00, -1.4132e+00,  ..., -1.0593e+00,
            -1.0373e+00, -1.0519e+00],
           [-1.4496e+00, -1.4292e+00, -1.3970e+00,  ..., -1.1240e+00,
            -1.1031e+00, -1.0902e+00],
 

In [64]:
# Pull the next training batch and display its element 0.
# NOTE(review): this renders the tensor's values, not just its shape; `.shape`
# is usually sufficient for a sanity check. Each next() call also consumes a
# batch from the iterator.
next(train_loader_iter)[0]


tensor([[[[[-5.6612e-01, -5.7117e-01, -5.8119e-01,  ..., -1.4845e+00,
            -1.5017e+00, -1.5003e+00],
           [-5.6780e-01, -5.7279e-01, -5.8224e-01,  ..., -1.4862e+00,
            -1.5026e+00, -1.5013e+00],
           [-5.4522e-01, -5.5502e-01, -5.7270e-01,  ..., -1.4648e+00,
            -1.4916e+00, -1.4892e+00],
           ...,
           [-1.5033e+00, -1.4725e+00, -1.4661e+00,  ..., -3.8677e-01,
            -4.8549e-01, -4.1477e-01],
           [-1.4700e+00, -1.4714e+00, -1.4684e+00,  ..., -5.3607e-01,
            -4.7689e-01, -2.6034e-01],
           [-1.4582e+00, -1.4620e+00, -1.4617e+00,  ..., -4.7028e-01,
            -3.5685e-01, -1.4500e-01]],

          [[-3.1837e-01, -3.5277e-01, -4.1321e-01,  ..., -1.4964e+00,
            -1.4981e+00, -1.4733e+00],
           [-3.1338e-01, -3.6291e-01, -3.8825e-01,  ..., -1.4969e+00,
            -1.4777e+00, -1.4724e+00],
           [-3.0513e-01, -3.3694e-01, -3.8481e-01,  ..., -1.4863e+00,
            -1.4691e+00, -1.4667e+00],
 

In [26]:

# Shape of element 1 of the next training batch (presumably the labels, given
# the (data, label) unpacking of a dataset sample earlier - confirm).
# NOTE(review): this cell's execution count (26) is lower than that of the cell
# that creates the iterators (57), so its recorded output comes from an
# earlier, out-of-order run; re-run the notebook top to bottom to refresh it.
next(train_loader_iter)[1].shape


torch.Size([32])

In [27]:

# Number of elements in one training batch as returned by the loader.
# NOTE(review): executed out of order (count 27, before the iterator cell at
# 57) and it consumes one more batch from the iterator.
len(next(train_loader_iter))


2