# PHẦN 1

In [1]:
import os

import matplotlib
matplotlib.use('AGG')

import matplotlib.pyplot as plt
import numpy as np

import keras

from keras.datasets import cifar10
from keras.layers import (Activation, Conv3D, Dense, Dropout, Flatten,
                          MaxPooling3D, MaxPooling2D)

from keras.layers import LeakyReLU
from keras.losses import categorical_crossentropy
from keras.models import Sequential
from keras.optimizers import Adam
#from keras.utils import np_utils
#from keras.utils.vis_utils import plot_model
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split

#import videoto3d
from tqdm import tqdm

from keras.callbacks import ModelCheckpoint
from keras.models import Model
from keras.layers import Input, Dense

##
import tensorflow as tf
from tensorflow.python.keras.backend import set_session
#from keras.backend.tensorflow_backend import set_session

2025-09-30 15:00:54.677822: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759244454.889537      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759244454.948846      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Processing data

In [2]:
import cv2


class Videoto3D:
    def __init__(self, width, height, depth):
        self.width = width
        self.height = height
        self.depth = depth

    def video3d(self, filename, color=False, skip=True):
        
        cap = cv2.VideoCapture(filename)
        
        nframe = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        
        if skip:
            frames = [x * nframe / self.depth for x in range(self.depth)]
        else:
            frames = [x for x in range(self.depth)]
        
        framearray = []

        for i in range(self.depth):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frames[i])
            ret, frame = cap.read()
        
            frame = cv2.resize(frame, (self.height, self.width))
        
            if color:
                framearray.append(frame)
            else:
                framearray.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))

        cap.release()
        
        return np.array(framearray)

    def get_UCF_classname(self, filename):
        x =  filename[filename.find('_') + 1:filename.find('_', 2)]
        return x

In [3]:

def plot_history(history, result_dir):
    plt.plot(history.history['accuracy'], marker='.')
    plt.plot(history.history['val_accuracy'], marker='.')
    plt.title('model accuracy')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.grid()
    plt.legend(['accuracy', 'val_accuracy'], loc='lower right')
    # plt.savefig(os.path.join(result_dir, 'model_accuracy.png'))
    plt.close()

    plt.plot(history.history['loss'], marker='.')
    plt.plot(history.history['val_loss'], marker='.')
    plt.title('model loss')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.grid()
    plt.legend(['loss', 'val_loss'], loc='upper right')
    # plt.savefig(os.path.join(result_dir, 'model_loss.png'))
    plt.close()


def save_history(history, result_dir):
    loss = history.history['loss']
    acc = history.history['accuracy']
    val_loss = history.history['val_loss']
    val_acc = history.history['val_accuracy']
    nb_epoch = len(acc)

    with open(os.path.join(result_dir, 'result.txt'), 'w') as fp:
        fp.write('epoch\tloss\tacc\tval_loss\tval_acc\n')
        for i in range(nb_epoch):
            fp.write('{}\t{}\t{}\t{}\t{}\n'.format(
                i, loss[i], acc[i], val_loss[i], val_acc[i]))
    


def loaddata(video_dir, vid3d, nclass, result_dir, color=False, skip=True):
    
    files = os.listdir(video_dir)    
    X = []
    labels = []
    labellist = []

    pbar = tqdm(total=len(files))

    for filename in files:
    
        pbar.update(1)
        
        if filename == '.DS_Store':
            continue
        
        name = os.path.join(video_dir, filename)

        for v_files in os.listdir(name):

            v_file_path = os.path.join(name, v_files)

            label = vid3d.get_UCF_classname(filename)
            
            if label not in labellist:
                
                if len(labellist) >= nclass:
                    continue

                labellist.append(label)
            
            labels.append(label)
            
            X.append(vid3d.video3d(v_file_path, color=color, skip=skip))

    pbar.close()

    os.makedirs(result_dir, exist_ok=True)
    
    with open(os.path.join(result_dir, 'classes.txt'), 'w') as fp:
        for i in range(len(labellist)):
            fp.write('{}\n'.format(labellist[i]))

    for num, label in enumerate(labellist):
        for i in range(len(labels)):
            if label == labels[i]:
                labels[i] = num
                
    if color:
        return np.array(X).transpose((0, 2, 3, 4, 1)), labels
    else:
        return np.array(X).transpose((0, 2, 3, 1)), labels

# Chạy với dataset UCF50

In [4]:
def main():

    print('simple 3D convolution for action recognition')
    batch = 128
    # training epoch
    epoch = 100
    # directory where videos are stored
    videos= '/kaggle/input/ucf50-latest-version/UCF50'
    # Number of classes
    nclass = 50
    output = 'results'
    color  = False
    skip   = True
    depth  = 10

    
    img_rows, img_cols, frames = 32, 32, depth
    channel = 3 if color else 1
    fname_npz = 'dataset_{}_{}_{}.npz'.format(nclass, depth, skip)

    vid3d = Videoto3D(img_rows, img_cols, frames)
    nb_classes = nclass
    if os.path.exists(fname_npz):
        loadeddata = np.load(fname_npz)
        X, Y = loadeddata["X"], loadeddata["Y"]
    else:
        x, y = loaddata(videos, vid3d, nclass,
                        output, color, skip)
        X = x.reshape((x.shape[0], img_rows, img_cols, frames, channel))
        Y = keras.utils.to_categorical(y, nb_classes)

        X = X.astype('float32')
        np.savez(fname_npz, X=X, Y=Y)
        print('Saved dataset to dataset.npz.')
    print('X_shape:{}\nY_shape:{}'.format(X.shape, Y.shape))

    # Define model
    # model = Sequential()


    # model.add(Conv3D(16, kernel_size=(3, 3, 3), input_shape=(
    #     X.shape[1:]), padding='same'))
    # model.add(LeakyReLU(alpha=.001)) 


    # model.add(Conv3D(32, kernel_size=(3, 3, 3), input_shape=(
    #     X.shape[1:]), padding='same'))
    # model.add(LeakyReLU(alpha=.001)) 

    ###########################

    input_x = Input(shape = (32, 32, depth, 1))

    initial_conv = Conv3D(16, kernel_size= (3, 3, 3), padding='same')(input_x)
    initial_conv = LeakyReLU(alpha=.001)(initial_conv)

    initial_conv = Conv3D(32, kernel_size= (3, 3, 3), padding='same')(initial_conv)
    initial_conv = LeakyReLU(alpha=.001)(initial_conv)


    ###########################
    # PARALLEL 1

    conv1 = Conv3D(16, kernel_size=(1, 1, 1),padding='same')(initial_conv)
    conv1 = LeakyReLU(alpha=.001)(conv1)
    conv1 = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(conv1)


    # conv1 = Conv3D(16, kernel_size=(3, 3, 3),padding='same')(conv1)
    # conv1 = LeakyReLU(alpha=.001)(conv1)

    conv1 = Conv3D(16, kernel_size=(1, 1, 1),padding='same')(conv1)
    conv1 = LeakyReLU(alpha=.001)(conv1)

    
    conv1 = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(conv1)

    # conv1 = Conv3D(8, kernel_size=(1, 1, 1),padding='same')(conv1)

    #check it    
    # conv1 = LeakyReLU(alpha=.001)(conv1)

    # conv1 = Conv3D(1, kernel_size=(1, 1, 1),padding='same')(conv1)

    ##############################

    ##############################

    #Parallel 2

    conv2 = Conv3D(8, kernel_size=(1, 1, 1),padding='same')(initial_conv)
    conv2 = LeakyReLU(alpha=.001)(conv2)
    conv2 = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(conv2)


    # conv2 = Conv3D(8, kernel_size=(3, 3, 3),padding='same')(conv2)
    # conv2 = LeakyReLU(alpha=.001)(conv2)

    conv2 = Conv3D(16, kernel_size=(1, 1, 1),padding='same')(conv2)
    conv2 = LeakyReLU(alpha=.001)(conv2)
    

    conv2 = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(conv2)

    # conv2 = Conv3D(4, kernel_size=(1, 1, 1),padding='same')(conv2)

    # #check it    
    # conv2 = LeakyReLU(alpha=.001)(conv2)

    # conv2 = Conv3D(1, kernel_size=(1, 1, 1),padding='same')(conv2)

    ###################################


    ##############################

    #Parallel 3

    conv3 = Conv3D(4, kernel_size=(1, 1, 1),padding='same')(initial_conv)
    conv3 = LeakyReLU(alpha=.001)(conv3)
    conv3 = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(conv3)


    # conv3 = Conv3D(4, kernel_size=(3, 3, 3),padding='same')(conv3)
    # conv3 = LeakyReLU(alpha=.001)(conv3)

    conv3 = Conv3D(16, kernel_size=(1, 1, 1),padding='same')(conv3)
    conv3 = LeakyReLU(alpha=.001)(conv3)
    

    conv3 = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(conv3)

    # conv3 = Conv3D(4, kernel_size=(1, 1, 1),padding='same')(conv3)

    # #check it    
    # conv3 = LeakyReLU(alpha=.001)(conv3)

    # conv3 = Conv3D(1, kernel_size=(1, 1, 1),padding='same')(conv3)

    ###################################

    added = keras.layers.Add()([conv1, conv2, conv3])

    added = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(added)

    added = Flatten()(added)

    dense_1 = Dense(256)(added)

    dense_2 = Dense(nb_classes, activation = 'softmax')(dense_1)

    model = Model(input_x, dense_2)

    model.compile(loss=categorical_crossentropy,
                  optimizer=Adam(), metrics=['accuracy']) 

    model.summary()
    # plot_model(model, show_shapes=True,
    #            to_file=os.path.join(args.output, 'model.png'))

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=43)
    print(X_train.shape)

    ####################

    # 1
    filepath="d_3dcnnmodel-{epoch:02d}-{val_accuracy:.2f}.weights.hd5.keras"
    
    checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
    
    callbacks_list = [checkpoint]

    # 2 

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    set_session(tf.compat.v1.Session(config=config))

    ###############

    # Train model
    history = model.fit(X_train, Y_train, validation_data=(X_test, Y_test), batch_size=batch,
                        epochs=epoch, verbose=1, shuffle=True, callbacks=callbacks_list)
    
    model.evaluate(X_test, Y_test, verbose=0)
    
    model_json = model.to_json()
    
    if not os.path.isdir(output):
        os.makedirs(output)
    with open(os.path.join(output, 'ucf50_3dcnnmodel.json'), 'w') as json_file:
        json_file.write(model_json)
    
    model.save_weights(os.path.join(output, 'ucf50_3dcnnmodel-gpu.weights.h5'))

    loss, acc = model.evaluate(X_test, Y_test, verbose=0)
    
    print('Test loss:', loss)
    print('Test accuracy:', acc)

    plot_history(history, output)
    save_history(history, output)

In [5]:
if __name__ == '__main__':
    main()

simple 3D convolution for action recognition


100%|██████████| 50/50 [04:48<00:00,  5.77s/it]


Saved dataset to dataset.npz.
X_shape:(6669, 32, 32, 10, 1)
Y_shape:(6669, 50)


I0000 00:00:1759244786.336050      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


(5335, 32, 32, 10, 1)


I0000 00:00:1759244788.076585      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Epoch 1/100


I0000 00:00:1759244793.909231   26771 service.cc:148] XLA service 0x788898056770 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1759244793.910131   26771 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1759244794.476155   26771 cuda_dnn.cc:529] Loaded cuDNN version 90300
2025-09-30 15:06:38.304398: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng0{} for conv (f32[32,16,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[128,16,32,32,10]{4,3,2,1,0}, f32[128,32,32,32,10]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"leakyrelu_alpha":0,"side_input_scale":0},"force_earliest_schedule":false,"operation_queue_id":"0","wait_on_operation_queues":[]} is taking a while...
2025-09-30 15:06:38

[1m 2/42[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 71ms/step - accuracy: 0.0410 - loss: 54.3444 

I0000 00:00:1759244801.159168   26771 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step - accuracy: 0.0255 - loss: 25.2741
Epoch 1: val_accuracy improved from -inf to 0.02624, saving model to d_3dcnnmodel-01-0.03.weights.hd5.keras
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 250ms/step - accuracy: 0.0255 - loss: 25.0057 - val_accuracy: 0.0262 - val_loss: 4.4456
Epoch 2/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.0361 - loss: 4.2541
Epoch 2: val_accuracy improved from 0.02624 to 0.03823, saving model to d_3dcnnmodel-02-0.04.weights.hd5.keras
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 75ms/step - accuracy: 0.0361 - loss: 4.2507 - val_accuracy: 0.0382 - val_loss: 3.9612
Epoch 3/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.0471 - loss: 3.9124
Epoch 3: val_accuracy improved from 0.03823 to 0.04723, saving model to d_3dcnnmodel-03-0.05.weights.hd5.keras
[1m42/42[0m [3

# Sử dụng tích chập với filter 5x5x5.

In [6]:
def main():

    print('simple 3D convolution for action recognition')
    batch = 128
    # training epoch
    epoch = 100
    # directory where videos are stored
    videos= '/kaggle/input/ucf50-latest-version/UCF50'
    # Number of classes
    nclass = 50
    output = 'results_1'
    color  = False
    skip   = True
    depth  = 10

    
    img_rows, img_cols, frames = 32, 32, depth
    channel = 3 if color else 1
    fname_npz = 'dataset_{}_{}_{}.npz'.format(nclass, depth, skip)

    vid3d = Videoto3D(img_rows, img_cols, frames)
    nb_classes = nclass
    if os.path.exists(fname_npz):
        loadeddata = np.load(fname_npz)
        X, Y = loadeddata["X"], loadeddata["Y"]
    else:
        x, y = loaddata(videos, vid3d, nclass,
                        output, color, skip)
        X = x.reshape((x.shape[0], img_rows, img_cols, frames, channel))
        Y = keras.utils.to_categorical(y, nb_classes)

        X = X.astype('float32')
        np.savez(fname_npz, X=X, Y=Y)
        print('Saved dataset to dataset.npz.')
    print('X_shape:{}\nY_shape:{}'.format(X.shape, Y.shape))

    # Define model
    # model = Sequential()


    # model.add(Conv3D(16, kernel_size=(3, 3, 3), input_shape=(
    #     X.shape[1:]), padding='same'))
    # model.add(LeakyReLU(alpha=.001)) 


    # model.add(Conv3D(32, kernel_size=(3, 3, 3), input_shape=(
    #     X.shape[1:]), padding='same'))
    # model.add(LeakyReLU(alpha=.001)) 

    ###########################

    input_x = Input(shape = (32, 32, depth, 1))

    initial_conv = Conv3D(16, kernel_size= (3, 3, 3), padding='same')(input_x)
    initial_conv = LeakyReLU(alpha=.001)(initial_conv)

    initial_conv = Conv3D(32, kernel_size= (3, 3, 3), padding='same')(initial_conv)
    initial_conv = LeakyReLU(alpha=.001)(initial_conv)


    ###########################
    # PARALLEL 1

    conv1 = Conv3D(16, kernel_size=(5, 5, 5),padding='same')(initial_conv)
    conv1 = LeakyReLU(alpha=.001)(conv1)
    conv1 = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(conv1)


    # conv1 = Conv3D(16, kernel_size=(3, 3, 3),padding='same')(conv1)
    # conv1 = LeakyReLU(alpha=.001)(conv1)

    conv1 = Conv3D(16, kernel_size=(1, 1, 1),padding='same')(conv1)
    conv1 = LeakyReLU(alpha=.001)(conv1)

    
    conv1 = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(conv1)

    # conv1 = Conv3D(8, kernel_size=(1, 1, 1),padding='same')(conv1)

    #check it    
    # conv1 = LeakyReLU(alpha=.001)(conv1)

    # conv1 = Conv3D(1, kernel_size=(1, 1, 1),padding='same')(conv1)

    ##############################

    ##############################

    #Parallel 2

    conv2 = Conv3D(8, kernel_size=(5, 5, 5),padding='same')(initial_conv)
    conv2 = LeakyReLU(alpha=.001)(conv2)
    conv2 = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(conv2)


    # conv2 = Conv3D(8, kernel_size=(3, 3, 3),padding='same')(conv2)
    # conv2 = LeakyReLU(alpha=.001)(conv2)

    conv2 = Conv3D(16, kernel_size=(1, 1, 1),padding='same')(conv2)
    conv2 = LeakyReLU(alpha=.001)(conv2)
    

    conv2 = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(conv2)

    # conv2 = Conv3D(4, kernel_size=(1, 1, 1),padding='same')(conv2)

    # #check it    
    # conv2 = LeakyReLU(alpha=.001)(conv2)

    # conv2 = Conv3D(1, kernel_size=(1, 1, 1),padding='same')(conv2)

    ###################################


    ##############################

    #Parallel 3

    conv3 = Conv3D(4, kernel_size=(5, 5, 5),padding='same')(initial_conv)
    conv3 = LeakyReLU(alpha=.001)(conv3)
    conv3 = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(conv3)


    # conv3 = Conv3D(4, kernel_size=(3, 3, 3),padding='same')(conv3)
    # conv3 = LeakyReLU(alpha=.001)(conv3)

    conv3 = Conv3D(16, kernel_size=(1, 1, 1),padding='same')(conv3)
    conv3 = LeakyReLU(alpha=.001)(conv3)
    

    conv3 = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(conv3)

    # conv3 = Conv3D(4, kernel_size=(1, 1, 1),padding='same')(conv3)

    # #check it    
    # conv3 = LeakyReLU(alpha=.001)(conv3)

    # conv3 = Conv3D(1, kernel_size=(1, 1, 1),padding='same')(conv3)

    ###################################

    added = keras.layers.Add()([conv1, conv2, conv3])

    added = MaxPooling3D(pool_size=(2, 2, 2), padding='same')(added)

    added = Flatten()(added)

    dense_1 = Dense(256)(added)

    dense_2 = Dense(nb_classes, activation = 'softmax')(dense_1)

    model = Model(input_x, dense_2)

    model.compile(loss=categorical_crossentropy,
                  optimizer=Adam(), metrics=['accuracy']) 

    model.summary()
    # plot_model(model, show_shapes=True,
    #            to_file=os.path.join(args.output, 'model.png'))

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=43)
    print(X_train.shape)

    ####################

    # 1
    filepath="d_3dcnnmodel-{epoch:02d}-{val_accuracy:.2f}.weights.hd5.keras"
    
    checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
    
    callbacks_list = [checkpoint]

    # 2 

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    set_session(tf.compat.v1.Session(config=config))

    ###############

    # Train model
    history = model.fit(X_train, Y_train, validation_data=(X_test, Y_test), batch_size=batch,
                        epochs=epoch, verbose=1, shuffle=True, callbacks=callbacks_list)
    
    model.evaluate(X_test, Y_test, verbose=0)
    
    model_json = model.to_json()
    
    if not os.path.isdir(output):
        os.makedirs(output)
    with open(os.path.join(output, 'ucf50_3dcnnmodel.json'), 'w') as json_file:
        json_file.write(model_json)
    
    model.save_weights(os.path.join(output, 'ucf50_3dcnnmodel-gpu.weights.h5'))

    loss, acc = model.evaluate(X_test, Y_test, verbose=0)
    
    print('Test loss:', loss)
    print('Test accuracy:', acc)

    plot_history(history, output)
    save_history(history, output)

In [7]:
if __name__ == '__main__':
    main()

simple 3D convolution for action recognition
X_shape:(6669, 32, 32, 10, 1)
Y_shape:(6669, 50)


(5335, 32, 32, 10, 1)


I0000 00:00:1759245680.362265      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Epoch 1/100


2025-09-30 15:21:28.256740: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng0{} for conv (f32[128,16,32,32,10]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[128,32,32,32,10]{4,3,2,1,0}, f32[16,32,5,5,5]{4,3,2,1,0}, f32[16]{0}), window={size=5x5x5 pad=2_2x2_2x2_2}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBiasActivationForward", backend_config={"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"leakyrelu_alpha":0,"side_input_scale":0},"force_earliest_schedule":false,"operation_queue_id":"0","wait_on_operation_queues":[]} is taking a while...
2025-09-30 15:21:28.714657: E external/local_xla/xla/service/slow_operation_alarm.cc:133] The operation took 1.45807s
Trying algorithm eng0{} for conv (f32[128,16,32,32,10]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[128,32,32,32,10]{4,3,2,1,0}, f32[16,32,5,5,5]{4,3,2,1,0}, f32[16]{0}), window={size=5x5x5 pad=2_2x2_2x2_2}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$c

[1m41/42[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 261ms/step - accuracy: 0.0199 - loss: 32.0493

2025-09-30 15:22:25.630036: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng0{} for conv (f32[8,32,5,5,5]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[87,32,32,32,10]{4,3,2,1,0}, f32[87,8,32,32,10]{4,3,2,1,0}), window={size=5x5x5 pad=2_2x2_2x2_2}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"leakyrelu_alpha":0,"side_input_scale":0},"force_earliest_schedule":false,"operation_queue_id":"0","wait_on_operation_queues":[]} is taking a while...
2025-09-30 15:22:26.267645: E external/local_xla/xla/service/slow_operation_alarm.cc:133] The operation took 1.637836544s
Trying algorithm eng0{} for conv (f32[8,32,5,5,5]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[87,32,32,32,10]{4,3,2,1,0}, f32[87,8,32,32,10]{4,3,2,1,0}), window={size=5x5x5 pad=2_2x2_2x2_2}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config=

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 880ms/step - accuracy: 0.0198 - loss: 31.5965
Epoch 1: val_accuracy improved from -inf to 0.03148, saving model to d_3dcnnmodel-01-0.03.weights.hd5.keras
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 980ms/step - accuracy: 0.0198 - loss: 31.1649 - val_accuracy: 0.0315 - val_loss: 4.1856
Epoch 2/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 260ms/step - accuracy: 0.0204 - loss: 4.1291
Epoch 2: val_accuracy did not improve from 0.03148
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 273ms/step - accuracy: 0.0204 - loss: 4.1269 - val_accuracy: 0.0277 - val_loss: 3.9064
Epoch 3/100
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 260ms/step - accuracy: 0.0376 - loss: 3.8996
Epoch 3: val_accuracy improved from 0.03148 to 0.05622, saving model to d_3dcnnmodel-03-0.06.weights.hd5.keras
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 276ms/st

# Xây dựng model dạng sequential

In [16]:
from tensorflow.keras import layers, models

def main():

    print('simple 3D convolution for action recognition')
    batch = 128
    # training epoch
    epoch = 50
    # directory where videos are stored
    videos= '/kaggle/input/ucf50-latest-version/UCF50'
    # Number of classes
    nclass = 50
    output = 'results_1'
    color  = False
    skip   = True
    depth  = 10

    
    img_rows, img_cols, frames = 32, 32, depth
    channel = 3 if color else 1
    fname_npz = 'dataset_{}_{}_{}.npz'.format(nclass, depth, skip)

    vid3d = Videoto3D(img_rows, img_cols, frames)
    nb_classes = nclass
    if os.path.exists(fname_npz):
        loadeddata = np.load(fname_npz)
        X, Y = loadeddata["X"], loadeddata["Y"]
    else:
        x, y = loaddata(videos, vid3d, nclass,
                        output, color, skip)
        X = x.reshape((x.shape[0], img_rows, img_cols, frames, channel))
        Y = keras.utils.to_categorical(y, nb_classes)

        X = X.astype('float32')
        np.savez(fname_npz, X=X, Y=Y)
        print('Saved dataset to dataset.npz.')
    print('X_shape:{}\nY_shape:{}'.format(X.shape, Y.shape))

    # Define model
    # model = Sequential()


    # model.add(Conv3D(16, kernel_size=(3, 3, 3), input_shape=(
    #     X.shape[1:]), padding='same'))
    # model.add(LeakyReLU(alpha=.001)) 


    # model.add(Conv3D(32, kernel_size=(3, 3, 3), input_shape=(
    #     X.shape[1:]), padding='same'))
    # model.add(LeakyReLU(alpha=.001)) 

    ###########################
    input_shape=(32, 32, 10, 1)
    num_classes=50
    model = models.Sequential([
        # Convolutional Layer 1
        layers.Conv3D(32, (3, 3, 3), activation='relu', input_shape=input_shape, padding='same'),
        layers.BatchNormalization(),
        layers.Conv3D(32, (3, 3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling3D((2, 2, 2)),
        layers.Dropout(0.25),
        
        # Convolutional Layer 2
        layers.Conv3D(64, (3, 3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.Conv3D(64, (3, 3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling3D((2, 2, 2)),
        layers.Dropout(0.25),
        
        # Convolutional Layer 3
        layers.Conv3D(128, (3, 3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling3D((2, 2, 2)),
        layers.Dropout(0.25),
        
        # Flatten and Dense Layers
        layers.Flatten(),
        layers.Dense(512, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])

    # Compile model
    model.compile(optimizer='adam',
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])


    model.summary()
    # plot_model(model, show_shapes=True,
    #            to_file=os.path.join(args.output, 'model.png'))

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=43)
    print(X_train.shape)

    ####################

    # 1
    filepath="d_3dcnnmodel_squen-{epoch:02d}-{val_accuracy:.2f}.weights.hd5.keras"
    
    checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
    
    callbacks_list = [checkpoint]

    # 2 

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    set_session(tf.compat.v1.Session(config=config))

    ###############

    # Train model
    history = model.fit(X_train, Y_train, validation_data=(X_test, Y_test), batch_size=batch,
                        epochs=epoch, verbose=1, shuffle=True, callbacks=callbacks_list)
    
    model.evaluate(X_test, Y_test, verbose=0)
    
    model_json = model.to_json()
    
    if not os.path.isdir(output):
        os.makedirs(output)
    with open(os.path.join(output, 'ucf50_3dcnnmodelsequential.json'), 'w') as json_file:
        json_file.write(model_json)
    
    model.save_weights(os.path.join(output, 'ucf50_3dcnnmodelsequential-gpu.weights.h5'))

    loss, acc = model.evaluate(X_test, Y_test, verbose=0)
    
    print('Test loss:', loss)
    print('Test accuracy:', acc)

    plot_history(history, output)
    save_history(history, output)

In [17]:
if __name__ == '__main__':
    main()

simple 3D convolution for action recognition
X_shape:(6669, 32, 32, 10, 1)
Y_shape:(6669, 50)


(5335, 32, 32, 10, 1)


I0000 00:00:1759247299.917183      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Epoch 1/50


2025-09-30 15:48:34.379907: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng0{} for conv (f32[32,32,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[128,32,32,32,10]{4,3,2,1,0}, f32[128,32,32,32,10]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"leakyrelu_alpha":0,"side_input_scale":0},"force_earliest_schedule":false,"operation_queue_id":"0","wait_on_operation_queues":[]} is taking a while...
2025-09-30 15:48:35.457133: E external/local_xla/xla/service/slow_operation_alarm.cc:133] The operation took 2.077380968s
Trying algorithm eng0{} for conv (f32[32,32,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[128,32,32,32,10]{4,3,2,1,0}, f32[128,32,32,32,10]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend

[1m41/42[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 102ms/step - accuracy: 0.0599 - loss: 4.6396

2025-09-30 15:48:50.368443: E external/local_xla/xla/service/slow_operation_alarm.cc:65] Trying algorithm eng0{} for conv (f32[32,32,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[87,32,32,32,10]{4,3,2,1,0}, f32[87,32,32,32,10]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_config={"cudnn_conv_backend_config":{"activation_mode":"kNone","conv_result_scale":1,"leakyrelu_alpha":0,"side_input_scale":0},"force_earliest_schedule":false,"operation_queue_id":"0","wait_on_operation_queues":[]} is taking a while...
2025-09-30 15:48:50.786523: E external/local_xla/xla/service/slow_operation_alarm.cc:133] The operation took 1.418231049s
Trying algorithm eng0{} for conv (f32[32,32,3,3,3]{4,3,2,1,0}, u8[0]{0}) custom-call(f32[87,32,32,32,10]{4,3,2,1,0}, f32[87,32,32,32,10]{4,3,2,1,0}), window={size=3x3x3 pad=1_1x1_1x1_1}, dim_labels=bf012_oi012->bf012, custom_call_target="__cudnn$convBackwardFilter", backend_con

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 369ms/step - accuracy: 0.0608 - loss: 4.6273
Epoch 1: val_accuracy improved from -inf to 0.04498, saving model to d_3dcnnmodel_squen-01-0.04.weights.hd5.keras
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 454ms/step - accuracy: 0.0616 - loss: 4.6156 - val_accuracy: 0.0450 - val_loss: 13.0842
Epoch 2/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step - accuracy: 0.2261 - loss: 3.1538
Epoch 2: val_accuracy improved from 0.04498 to 0.24138, saving model to d_3dcnnmodel_squen-02-0.24.weights.hd5.keras
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 120ms/step - accuracy: 0.2267 - loss: 3.1499 - val_accuracy: 0.2414 - val_loss: 2.8783
Epoch 3/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step - accuracy: 0.3549 - loss: 2.3916
Epoch 3: val_accuracy improved from 0.24138 to 0.33733, saving model to d_3dcnnmodel_squen-03-0.34.weights.hd5.keras

# PHẦN 2

In [29]:

# (Tùy chọn) Cài đặt thư viện nếu môi trường thiếu.
# Bạn có thể comment nếu đã có sẵn.
# !pip install opencv-python tqdm

import os
import math
import json
import time
import random
import shutil
from pathlib import Path
from dataclasses import dataclass, asdict

import cv2
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import transforms
import torchvision.models as models

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report


# Configuration

In [19]:
@dataclass
class Config:
    # Đường dẫn & IO
    DATA_ROOT: str = "/kaggle/input/ucf50-latest-version/UCF50"   # Sửa theo nơi lưu dữ liệu (VD: local)
    OUTPUT_DIR: str = "./outputs_ucf50_fusion"
    LABELS_JSON: str = "labels_map.json"
    FLOW_CACHE_DIR: str = "./flow_cache"           # Cache *.npy cho optical flow (tùy chọn)

    # Dataloader
    IMG_SIZE: int = 224
    BATCH_SIZE: int = 16
    NUM_WORKERS: int = 2
    VAL_SPLIT: float = 0.2
    SEED: int = 1337

    # Lấy mẫu video
    NUM_SEGMENTS: int = 1          # số frame mẫu (cho RGB). Two-Stream 2D thường 1 frame/clip
    FLOW_STACK: int = 5            # số cặp (u,v) liên tiếp => 2*FLOW_STACK kênh
    FLOW_METHOD: str = "farneback" # "farneback" | "tvl1"
    FLOW_PRECOMPUTE: bool = True   # bật cache optical flow ra đĩa

    # Huấn luyện
    NUM_EPOCHS: int = 10
    LEARNING_RATE: float = 1e-3
    WEIGHT_DECAY: float = 1e-4
    SCHEDULER: str = "cosine"      # "cosine" | "step" | "plateau" | "none"
    STEP_SIZE: int = 5
    GAMMA: float = 0.1
    T_MAX: int = 10                # cho CosineAnnealingLR
    EARLY_STOP_PATIENCE: int = 5

    # Fusion
    FUSION: str = "late"           # "late" | "early_feature" | "early_channel"
    LATE_FUSION_WEIGHTS: tuple = (0.5, 0.5)  # (w_rgb, w_flow)
    # Nếu early_channel: in_channels = 3 + 2*FLOW_STACK
    # Nếu early_feature: concat features: [feat_rgb ; feat_flow]

    # Lưu & Log
    SAVE_BEST: bool = True
    BEST_MODEL_PATH: str = "best_model.pth"
    LOG_JSON: str = "train_log.json"

cfg = Config()

# Tạo thư mục xuất
os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
os.makedirs(cfg.FLOW_CACHE_DIR, exist_ok=True)

# Cố định seed cho tái lập
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(cfg.SEED)
print(cfg)


Config(DATA_ROOT='/kaggle/input/ucf50-latest-version/UCF50', OUTPUT_DIR='./outputs_ucf50_fusion', LABELS_JSON='labels_map.json', FLOW_CACHE_DIR='./flow_cache', IMG_SIZE=224, BATCH_SIZE=16, NUM_WORKERS=2, VAL_SPLIT=0.2, SEED=1337, NUM_SEGMENTS=1, FLOW_STACK=5, FLOW_METHOD='farneback', FLOW_PRECOMPUTE=True, NUM_EPOCHS=10, LEARNING_RATE=0.001, WEIGHT_DECAY=0.0001, SCHEDULER='cosine', STEP_SIZE=5, GAMMA=0.1, T_MAX=10, EARLY_STOP_PATIENCE=5, FUSION='late', LATE_FUSION_WEIGHTS=(0.5, 0.5), SAVE_BEST=True, BEST_MODEL_PATH='best_model.pth', LOG_JSON='train_log.json')


# Processing_data

In [20]:
def scan_ucf50(root: str):
    root = Path(root)
    classes = sorted([p.name for p in root.iterdir() if p.is_dir()])
    items = []
    for ci, cname in enumerate(classes):
        for v in (root/cname).glob("*.avi"):
            items.append({"path": str(v), "label_name": cname, "label": ci})
    return classes, items

classes, items = scan_ucf50(cfg.DATA_ROOT)
num_classes = len(classes)
print(f"Found {len(items)} videos across {num_classes} classes.")

# Lưu nhãn
labels_map = {i: c for i, c in enumerate(classes)}
with open(os.path.join(cfg.OUTPUT_DIR, cfg.LABELS_JSON), "w", encoding="utf-8") as f:
    json.dump(labels_map, f, ensure_ascii=False, indent=2)

# Stratified split theo lớp
from collections import defaultdict
by_class = defaultdict(list)
for it in items:
    by_class[it["label"]].append(it)

train_list, val_list = [], []
for k, vids in by_class.items():
    n = len(vids)
    idx = list(range(n))
    random.shuffle(idx)
    cut = int(n * (1.0 - cfg.VAL_SPLIT))
    for j in idx[:cut]:
        train_list.append(vids[j])
    for j in idx[cut:]:
        val_list.append(vids[j])

print(f"Train videos: {len(train_list)}, Val videos: {len(val_list)}")


Found 6669 videos across 50 classes.
Train videos: 5316, Val videos: 1353


In [38]:

def read_video_frames(path, target_indices, resize_hw=None):
    '''Đọc một số frame theo chỉ số trong video. Trả về list ảnh BGR (np.uint8).'''
    cap = cv2.VideoCapture(path)
    frames = []
    if not cap.isOpened():
        cap.release()
        raise RuntimeError(f"Cannot open video: {path}")
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    for ti in target_indices:
        idx = min(max(int(ti), 0), max(total-1, 0))
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = cap.read()
        if not ok:
            ok2, frame2 = cap.read()
            if not ok2:
                frame = np.zeros((resize_hw[0], resize_hw[1], 3), dtype=np.uint8) if resize_hw else None
            else:
                frame = frame2
        if resize_hw is not None and frame is not None:
            frame = cv2.resize(frame, (resize_hw[1], resize_hw[0]), interpolation=cv2.INTER_LINEAR)
        frames.append(frame)
    cap.release()
    return frames

def compute_flow_pair(prev_gray, next_gray, method="farneback"):
    if method == "farneback":
        flow = cv2.calcOpticalFlowFarneback(prev_gray, next_gray,
                                            None, 0.5, 3, 15, 3, 5, 1.2, 0)
        return flow  # HxWx2 (u,v)
    elif method == "tvl1":
        tvl1 = cv2.optflow.DualTVL1OpticalFlow_create()
        flow = tvl1.calc(prev_gray, next_gray, None)
        return flow
    else:
        raise ValueError("FLOW_METHOD must be 'farneback' or 'tvl1'")

def flow_to_uv_stack(frames_gray, flow_stack=5, method="farneback"):
    '''Tính (u,v) cho flow_stack cặp khung hình liên tiếp quanh frame trung tâm.
       Trả về array shape (2*flow_stack, H, W), dtype float32.'''
    T = len(frames_gray)
    if T < flow_stack + 1:
        while len(frames_gray) < flow_stack + 1:
            frames_gray.append(frames_gray[-1])
        T = len(frames_gray)
    uv_list = []
    for i in range(flow_stack):
        f0 = frames_gray[i]
        f1 = frames_gray[i+1]
        flow = compute_flow_pair(f0, f1, method=method)  # HxWx2
        u, v = flow[..., 0], flow[..., 1]
        uv_list.append(u.astype(np.float32))
        uv_list.append(v.astype(np.float32))
    uv = np.stack(uv_list, axis=0)  # (2*flow_stack, H, W)
    uv = np.clip(uv, -20.0, 20.0) / 20.0  # [-1,1]
    return uv

def load_or_compute_flow_stack(video_path, center_idx, resize_hw, flow_stack, method, cache_dir):
    '''Tải từ cache hoặc tính flow quanh center_idx. Cache theo tên file.'''
    from pathlib import Path
    key = f"{Path(video_path).stem}_f{center_idx}_s{flow_stack}_{method}_{resize_hw[0]}x{resize_hw[1]}.npy"
    cache_path = Path(cache_dir)/key
    if cache_path.exists():
        return np.load(str(cache_path))
    frame_indices = [center_idx + i for i in range(flow_stack+1)]
    frames = read_video_frames(video_path, frame_indices, resize_hw)
    grays = [cv2.cvtColor(f, cv2.COLOR_BGR2GRAY) for f in frames]
    uv = flow_to_uv_stack(grays, flow_stack=flow_stack, method=method)
    # np.save(str(cache_path), uv)
    return uv


In [39]:

class UCF50TwoStreamDataset(Dataset):
    def __init__(self, items, img_size=224, num_segments=1, flow_stack=5,
                 flow_method="farneback", flow_cache_dir=None, mode="train"):
        self.items = items
        self.H = img_size
        self.W = img_size
        self.num_segments = num_segments
        self.flow_stack = flow_stack
        self.flow_method = flow_method
        self.flow_cache_dir = flow_cache_dir
        self.mode = mode

        self.rgb_train_tf = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((self.H, self.W)),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])
        self.rgb_val_tf = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((self.H, self.W)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

        self.to_tensor = transforms.ToTensor()

    def __len__(self):
        return len(self.items)

    def _sample_center(self, vpath):
        cap = cv2.VideoCapture(vpath)
        if not cap.isOpened():
            return 0, 1
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        cap.release()
        if total <= 1:
            return 0, 1
        ci = random.randint(1, max(total-2, 1))
        return ci, total

    def __getitem__(self, idx):
        rec = self.items[idx]
        vpath = rec["path"]
        label = rec["label"]

        center_idx, total = self._sample_center(vpath)

        rgb_frame = read_video_frames(vpath, [center_idx], resize_hw=(self.H, self.W))[0]  # BGR
        rgb = cv2.cvtColor(rgb_frame, cv2.COLOR_BGR2RGB)
        if self.mode == "train":
            rgb_t = self.rgb_train_tf(rgb)
        else:
            rgb_t = self.rgb_val_tf(rgb)

        if self.flow_cache_dir is not None:
            uv = load_or_compute_flow_stack(
                vpath, center_idx, (self.H, self.W),
                self.flow_stack, self.flow_method, self.flow_cache_dir
            )
        else:
            frames = read_video_frames(vpath, [center_idx + i for i in range(self.flow_stack+1)],
                                       resize_hw=(self.H, self.W))
            grays = [cv2.cvtColor(f, cv2.COLOR_BGR2GRAY) for f in frames]
            uv = flow_to_uv_stack(grays, self.flow_stack, self.flow_method)
        flow_t = torch.from_numpy(uv)  # (2*flow_stack, H, W)

        return rgb_t, flow_t, label


In [40]:

train_ds = UCF50TwoStreamDataset(
    train_list, img_size=cfg.IMG_SIZE, num_segments=cfg.NUM_SEGMENTS,
    flow_stack=cfg.FLOW_STACK, flow_method=cfg.FLOW_METHOD,
    flow_cache_dir=(cfg.FLOW_CACHE_DIR if cfg.FLOW_PRECOMPUTE else None),
    mode="train"
)
val_ds = UCF50TwoStreamDataset(
    val_list, img_size=cfg.IMG_SIZE, num_segments=cfg.NUM_SEGMENTS,
    flow_stack=cfg.FLOW_STACK, flow_method=cfg.FLOW_METHOD,
    flow_cache_dir=(cfg.FLOW_CACHE_DIR if cfg.FLOW_PRECOMPUTE else None),
    mode="val"
)

train_loader = DataLoader(train_ds, batch_size=cfg.BATCH_SIZE, shuffle=True,
                          num_workers=cfg.NUM_WORKERS, pin_memory=True)
val_loader = DataLoader(val_ds, batch_size=cfg.BATCH_SIZE, shuffle=False,
                        num_workers=cfg.NUM_WORKERS, pin_memory=True)

len(train_ds), len(val_ds)


(5316, 1353)

# Types of Fusion

## Thay backbone Resnet18 -> Mobile Net v3 small

In [41]:
# def inflate_conv1_weight(conv1: nn.Conv2d, new_in_channels: int):
#     '''Mở rộng conv1 để nhận nhiều kênh > 3 bằng cách lặp/avg trọng số ban đầu.'''
#     out_c, in_c, kh, kw = old_w.shape
#     if new_in_channels == in_c:
#         return conv1
#     new_w = torch.zeros((out_c, new_in_channels, kh, kw))
#     for oc in range(out_c):
#         for ic in range(new_in_channels):
#             new_w[oc, ic] = old_w[oc, ic % in_c]
#     conv1.in_channels = new_in_channels
#     conv1.weight = nn.Parameter(new_w)
#     return conv1

# class EarlyChannelMobileNet(nn.Module):
#     '''Early-Channel Fusion: ghép RGB + Flow theo kênh => một backbone duy nhất.'''
#     def __init__(self, num_classes, in_channels):
#         super().__init__()
#         self.backbone = mobilenet_v3_small(weights='IMAGENET1K_V1')
#         self.backbone.conv1 = inflate_conv1_weight(self.backbone.conv1, in_channels)
#         in_feat = self.backbone.fc.in_features
#         self.backbone.fc = nn.Linear(in_feat, num_classes)

#     def forward(self, x):  # x: (B, C, H, W)
#         return self.backbone(x)

# class TwoBackboneEarlyFeature(nn.Module):
#     '''Early-Feature Fusion: 2 backbone riêng -> concat features -> classifier.'''
#     def __init__(self, num_classes, flow_in_channels):
#         super().__init__()
#         self.rgb_net = mobilenet_v3_small(weights='IMAGENET1K_V1')
#         self.flow_net = mobilenet_v3_small(weights='IMAGENET1K_V1')
#         self.flow_net.conv1 = inflate_conv1_weight(self.flow_net.conv1, flow_in_channels)
#         self.rgb_net.fc = nn.Identity()
#         self.flow_net.fc = nn.Identity()
#         in_feat = 512 + 512
#         self.fc = nn.Linear(in_feat, num_classes)

#     def forward(self, rgb, flow):
#         fr = self.rgb_net(rgb)   # (B,512)
#         ff = self.flow_net(flow) # (B,512)
#         f = torch.cat([fr, ff], dim=1)
#         logits = self.fc(f)
#         return logits

# class LateFusionModel(nn.Module):
#     '''Late Fusion: 2 backbone riêng -> logits riêng -> trộn theo trọng số.'''
#     def __init__(self, num_classes, flow_in_channels, w_rgb=0.5, w_flow=0.5):
#         super().__init__()
#         self.rgb_net = mobilenet_v3_small(weights='IMAGENET1K_V1')
#         self.flow_net = mobilenet_v3_small(weights='IMAGENET1K_V1')
#         self.flow_net.conv1 = inflate_conv1_weight(self.flow_net.conv1, flow_in_channels)
#         self.rgb_head = nn.Linear(self.rgb_net.fc.in_features, num_classes)
#         self.flow_head = nn.Linear(self.flow_net.fc.in_features, num_classes)
#         self.w_rgb = w_rgb
#         self.w_flow = w_flow
#         self.rgb_net.fc = nn.Identity()
#         self.flow_net.fc = nn.Identity()

#     def forward(self, rgb, flow):
#         fr = self.rgb_net(rgb)    # (B,512)
#         ff = self.flow_net(flow)  # (B,512)
#         lr = self.rgb_head(fr)    # (B,C)
#         lf = self.flow_head(ff)   # (B,C)
#         logits = self.w_rgb * lr + self.w_flow * lf
#         return logits, lr, lf


def inflate_conv_weight(conv: nn.Conv2d, new_in_channels: int):
    '''Mở rộng lớp Conv2d để nhận nhiều kênh > 3 bằng cách lặp/avg trọng số ban đầu.'''
    old_w = conv.weight.data  # (out_c, in_c, k, k)
    out_c, in_c, kh, kw = old_w.shape
    if new_in_channels == in_c:
        return conv
    new_w = torch.zeros((out_c, new_in_channels, kh, kw))
    for oc in range(out_c):
        for ic in range(new_in_channels):
            new_w[oc, ic] = old_w[oc, ic % in_c]
    new_conv = nn.Conv2d(new_in_channels, out_c, kernel_size=(kh, kw), 
                         stride=conv.stride, padding=conv.padding, bias=conv.bias is not None)
    new_conv.weight = nn.Parameter(new_w)
    if conv.bias is not None:
        new_conv.bias = nn.Parameter(conv.bias.data.clone())
    return new_conv

class EarlyChannelMobileNet(nn.Module):
    '''Early-Channel Fusion: ghép RGB + Flow theo kênh => một backbone duy nhất.'''
    def __init__(self, num_classes, in_channels):
        super().__init__()
        self.backbone = models.mobilenet_v3_small(weights='IMAGENET1K_V1')
        self.backbone.features[0][0] = inflate_conv_weight(self.backbone.features[0][0], in_channels)
        in_feat = self.backbone.classifier[-1].in_features
        self.backbone.classifier[-1] = nn.Linear(in_feat, num_classes)

    def forward(self, x):  # x: (B, C, H, W)
        return self.backbone(x)

class TwoBackboneEarlyFeature(nn.Module):
    '''Early-Feature Fusion: 2 backbone riêng -> concat features -> classifier.'''
    def __init__(self, num_classes, flow_in_channels):
        super().__init__()
        self.rgb_net = models.mobilenet_v3_small(weights='IMAGENET1K_V1')
        self.flow_net = models.mobilenet_v3_small(weights='IMAGENET1K_V1')
        self.flow_net.features[0][0] = inflate_conv_weight(self.flow_net.features[0][0], flow_in_channels)
        self.rgb_net.classifier = nn.Identity()
        self.flow_net.classifier = nn.Identity()
        in_feat = 576 + 576
        self.fc = nn.Linear(in_feat, num_classes)

    def forward(self, rgb, flow):
        fr = self.rgb_net(rgb)   # (B, 576)
        ff = self.flow_net(flow) # (B, 576)
        f = torch.cat([fr, ff], dim=1)
        logits = self.fc(f)
        return logits

class LateFusionModel(nn.Module):
    '''Late Fusion: 2 backbone riêng -> logits riêng -> trộn theo trọng số.'''
    def __init__(self, num_classes, flow_in_channels, w_rgb=0.5, w_flow=0.5):
        super().__init__()
        self.rgb_net = models.mobilenet_v3_small(weights='IMAGENET1K_V1')
        self.flow_net = models.mobilenet_v3_small(weights='IMAGENET1K_V1')
        # Điều chỉnh lớp convolution đầu tiên của flow_net
        self.flow_net.features[0][0] = inflate_conv_weight(self.flow_net.features[0][0], flow_in_channels)
        # Lấy số đặc trưng từ classifier (MobileNetV3-Small có in_features=576)
        in_feat = 576  # Hard-coded vì MobileNetV3-Small luôn có 576
        self.rgb_head = nn.Linear(in_feat, num_classes)
        self.flow_head = nn.Linear(in_feat, num_classes)
        self.w_rgb = w_rgb
        self.w_flow = w_flow
        # Loại bỏ classifier gốc
        self.rgb_net.classifier = nn.Identity()
        self.flow_net.classifier = nn.Identity()

    def forward(self, rgb, flow):
        fr = self.rgb_net(rgb)    # (B, 576)
        ff = self.flow_net(flow)  # (B, 576)
        lr = self.rgb_head(fr)    # (B, num_classes)
        lf = self.flow_head(ff)   # (B, num_classes)
        logits = self.w_rgb * lr + self.w_flow * lf
        return logits, lr, lf

In [42]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

flow_in_ch = 2 * cfg.FLOW_STACK

if cfg.FUSION == "early_channel":
    model = EarlyChannelMobileNet(num_classes=num_classes, in_channels=3 + flow_in_ch).to(device)
elif cfg.FUSION == "early_feature":
    model = TwoBackboneEarlyFeature(num_classes=num_classes, flow_in_channels=flow_in_ch).to(device)
elif cfg.FUSION == "late":
    w_rgb, w_flow = cfg.LATE_FUSION_WEIGHTS
    model = LateFusionModel(num_classes=num_classes, flow_in_channels=flow_in_ch,
                            w_rgb=w_rgb, w_flow=w_flow).to(device)
else:
    raise ValueError("cfg.FUSION must be one of: early_channel | early_feature | late")

optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.LEARNING_RATE, weight_decay=cfg.WEIGHT_DECAY)
if cfg.SCHEDULER == "cosine":
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=cfg.T_MAX)
elif cfg.SCHEDULER == "step":
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=cfg.STEP_SIZE, gamma=cfg.GAMMA)
elif cfg.SCHEDULER == "plateau":
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=2, factor=0.5)
else:
    scheduler = None

criterion = nn.CrossEntropyLoss()

class EarlyStopper:
    def __init__(self, patience=5):
        self.patience = patience
        self.best = -1.0
        self.count = 0
    def step(self, metric):
        if metric > self.best:
            self.best = metric
            self.count = 0
            return True
        else:
            self.count += 1
            return False
    def should_stop(self):
        return self.count >= self.patience

early_stopper = EarlyStopper(cfg.EARLY_STOP_PATIENCE)


Device: cuda


In [43]:

def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total, correct, loss_sum = 0, 0, 0.0
    for rgb, flow, y in tqdm(loader, desc="Train", leave=False):
        rgb = rgb.to(device)
        flow = flow.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        if isinstance(model, EarlyChannelMobileNet):
            x = torch.cat([rgb, flow], dim=1)  # (B, 3+2*flow_stack, H, W)
            logits = model(x)
        elif isinstance(model, TwoBackboneEarlyFeature):
            logits = model(rgb, flow)
        elif isinstance(model, LateFusionModel):
            logits, _, _ = model(rgb, flow)
        else:
            raise RuntimeError("Unknown model type")

        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            pred = logits.argmax(dim=1)
            correct += (pred == y).sum().item()
            total += y.size(0)
            loss_sum += loss.item() * y.size(0)

    return loss_sum/total, correct/total

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    total, correct, loss_sum = 0, 0, 0.0
    all_y, all_pred = [], []
    for rgb, flow, y in tqdm(loader, desc="Val", leave=False):
        rgb = rgb.to(device)
        flow = flow.to(device)
        y = y.to(device)
        if isinstance(model, EarlyChannelMobileNet):
            x = torch.cat([rgb, flow], dim=1)
            logits = model(x)
        elif isinstance(model, TwoBackboneEarlyFeature):
            logits = model(rgb, flow)
        elif isinstance(model, LateFusionModel):
            logits, _, _ = model(rgb, flow)
        else:
            raise RuntimeError("Unknown model type")

        loss = criterion(logits, y)
        pred = logits.argmax(dim=1)

        correct += (pred == y).sum().item()
        total += y.size(0)
        loss_sum += loss.item() * y.size(0)

        all_y.extend(y.cpu().numpy().tolist())
        all_pred.extend(pred.cpu().numpy().tolist())

    acc = correct/total if total > 0 else 0.0
    return loss_sum/max(total,1), acc, np.array(all_y), np.array(all_pred)


In [47]:

best_acc = -1.0
log_hist = []

for epoch in range(1, cfg.NUM_EPOCHS+1):
    t0 = time.time()
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc, y_true, y_pred = evaluate(model, val_loader, criterion, device)

    if cfg.SCHEDULER == "plateau":
        scheduler.step(val_acc)
    elif cfg.SCHEDULER in ["cosine", "step"] and scheduler is not None:
        scheduler.step()

    rec = {
        "epoch": epoch,
        "train_loss": train_loss,
        "train_acc": train_acc,
        "val_loss": val_loss,
        "val_acc": val_acc,
        "lr": optimizer.param_groups[0]["lr"],
        "time_sec": round(time.time()-t0, 2),
    }
    log_hist.append(rec)
    print(f"[Epoch {epoch:02d}] "
          f"train_loss={train_loss:.4f} acc={train_acc:.4f} | "
          f"val_loss={val_loss:.4f} acc={val_acc:.4f} | lr={rec['lr']:.2e} | "
          f"time={rec['time_sec']}s")

    improved = val_acc > best_acc
    if improved and cfg.SAVE_BEST:
        best_acc = val_acc
        # torch.save(model.state_dict(), os.path.join(cfg.OUTPUT_DIR, cfg.BEST_MODEL_PATH))
        # with open(os.path.join(cfg.OUTPUT_DIR, "best_epoch.txt"), "w") as f:
        #     f.write(f"best_epoch={epoch}\nval_acc={val_acc:.4f}\n")

    if early_stopper.step(val_acc) is False and early_stopper.should_stop():
        print("Early stopping triggered.")
        break

# with open(os.path.join(cfg.OUTPUT_DIR, cfg.LOG_JSON), "w") as f:
#     json.dump(log_hist, f, indent=2)

print("Training done. Best val acc:", best_acc)


                                                        

[Epoch 01] train_loss=0.7190 acc=0.7912 | val_loss=0.6714 acc=0.7975 | lr=9.05e-04 | time=490.92s


                                                        

[Epoch 02] train_loss=0.4506 acc=0.8681 | val_loss=0.5652 acc=0.8411 | lr=7.94e-04 | time=491.63s


                                                        

[Epoch 03] train_loss=0.3078 acc=0.9093 | val_loss=0.4882 acc=0.8477 | lr=6.55e-04 | time=497.59s


                                                        

[Epoch 04] train_loss=0.2093 acc=0.9398 | val_loss=0.2677 acc=0.9165 | lr=5.00e-04 | time=504.93s


                                                        

[Epoch 05] train_loss=0.1669 acc=0.9505 | val_loss=0.1719 acc=0.9549 | lr=3.45e-04 | time=503.4s


                                                        

[Epoch 06] train_loss=0.1069 acc=0.9701 | val_loss=0.1968 acc=0.9424 | lr=2.06e-04 | time=492.73s


                                                        

[Epoch 07] train_loss=0.0956 acc=0.9740 | val_loss=0.1023 acc=0.9734 | lr=9.55e-05 | time=486.91s


                                                        

[Epoch 08] train_loss=0.0628 acc=0.9853 | val_loss=0.1007 acc=0.9719 | lr=2.45e-05 | time=483.04s


                                                        

[Epoch 09] train_loss=0.0597 acc=0.9850 | val_loss=0.0948 acc=0.9704 | lr=0.00e+00 | time=485.82s


                                                        

[Epoch 10] train_loss=0.0631 acc=0.9844 | val_loss=0.1029 acc=0.9704 | lr=2.45e-05 | time=486.9s
Training done. Best val acc: 0.9733924611973392




In [48]:
@torch.no_grad()
def predict_single_video(model, video_path, cfg):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.eval()
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) if cap.isOpened() else 2
    cap.release()
    ci = min(max(total//2, 1), max(total-2, 1))

    rgb_frame = read_video_frames(video_path, [ci], resize_hw=(cfg.IMG_SIZE, cfg.IMG_SIZE))[0]
    rgb = cv2.cvtColor(rgb_frame, cv2.COLOR_BGR2RGB)
    tf = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((cfg.IMG_SIZE, cfg.IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    rgb_t = tf(rgb).unsqueeze(0).to(device)

    uv = load_or_compute_flow_stack(
        video_path, ci, (cfg.IMG_SIZE, cfg.IMG_SIZE), cfg.FLOW_STACK, cfg.FLOW_METHOD, cfg.FLOW_CACHE_DIR
    )
    flow_t = torch.from_numpy(uv).unsqueeze(0).to(device)  # (1,2*stack,H,W)

    if isinstance(model, EarlyChannelMobileNet):
        x = torch.cat([rgb_t, flow_t], dim=1)
        logits = model(x)
    elif isinstance(model, TwoBackboneEarlyFeature):
        logits = model(rgb_t, flow_t)
    elif isinstance(model, LateFusionModel):
        logits, _, _ = model(rgb_t, flow_t)

    prob = F.softmax(logits, dim=1)[0].cpu().numpy()
    pred_idx = int(np.argmax(prob))
    return pred_idx, prob

# Ví dụ sử dụng (sau khi train):
pred_idx, prob = predict_single_video(model, "/kaggle/input/ucf50-latest-version/UCF50/Basketball/v_Basketball_g01_c01.avi", cfg)
print("Dự đoán:", classes[pred_idx])


Dự đoán: Basketball
