In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

import os
import sys
sys.path.insert(0, os.path.dirname('../'))

from data_utils import video_to_frames
from data_utils import metadata_loader
from data_utils.kth_dataset_builder import DatasetBuilder

from models.IMAGENET import Imagenet, Video_Feature_Extractor 
from models.IMAGENET import AVG_Video_Classifier, LSTM_Video_Classifier

# Load Dataset

In [2]:
# Setup builder
video_path = '../data/kth-actions/video'
frame_path = '../data/kth-actions/frame'
builder = DatasetBuilder(video_path, frame_path, img_width=84, img_height=84, ms_per_frame=1000, max_frames=16)

# Convert videos and generate metadata
#builder.convert_videos_to_frames()
metadata = builder.generate_metadata()

# Build datasets
train_ds = builder.make_video_dataset(metadata=metadata['train'])
valid_ds = builder.make_video_dataset(metadata=metadata['valid'])

# Preprocess dataset
IMG_SIZE = 160 # All images will be resized to 160x160
IMG_SHAPE = (IMG_SIZE, IMG_SIZE, 3)

def format_example(image, label):
    image = tf.repeat(image,3,axis=3)   
    image = tf.image.resize(image, (IMG_SIZE, IMG_SIZE))
    return image, label

train_ds = train_ds.map(format_example)
valid_ds = valid_ds.map(format_example)

# Print
for x, lab in valid_ds.take(1):
    print(x.shape, lab.shape)
train_ds

(16, 160, 160, 3) (6,)


<MapDataset shapes: ((None, 160, 160, None), (6,)), types: (tf.float32, tf.int32)>

In [48]:
# Training set
a = np.zeros(6)
for _, label in train_ds.as_numpy_iterator():
    a=a+label
print("Training set, cases for each class:",a)

# Valid Set
a = np.zeros(6)
for _, label in valid_ds.as_numpy_iterator():
    a=a+label
print("Validation set, cases for each class:",a)

Training set, cases for each class: [70. 70. 70. 70. 70. 70.]
Validation set, cases for each class: [15. 15. 15. 15. 15. 15.]


# Transfer learning 
### For videos
Below we show to ways how to do transfer learning based on a pretrained base model.
The only part that should be changed is the one comming after video_fature_extractor. Below we show to ways how one can use an RNN(LSTM) or a simple MLP to do the job.

### For images
If we want to train with frames as input there is no feature_extractor necessary. We can put a classifier directly on top of the base model.
In order to see how we do fine tuning chacke the **Transfer_learning.ipynb**

## 1)RNN(LSTM) based classifier with Inception backbone

In [3]:
from tensorflow.keras.layers import Input, Activation, Dense, Conv3D, MaxPool3D, Flatten, Dropout, BatchNormalization, LSTM
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalAveragePooling2D, TimeDistributed
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import CategoricalCrossentropy

def My_Video_Classifier(features, class_nr, optimizer='adam'):
    # model
    full_model = tf.keras.Sequential([
        features,
        Dense(128, kernel_initializer="he_normal"),
        LSTM(512, input_shape=(None,128)),
        #Dense(512, kernel_initializer="he_normal"),
        Dropout(rate=0.4),
        Dense(class_nr)
        ])
    
    #compile model
    full_model.compile(
        optimizer=optimizer,
        loss=CategoricalCrossentropy(from_logits=True),
        metrics=['accuracy']
        )
    return full_model

In [4]:
# Base model (returns pretrained frozen base model trained on Imagenet)
inception = Imagenet(input_shape=IMG_SHAPE, name='inception')

# Feature Extractor (Has output (NR_FRAME x D) where D is feature dimension)
featuer_ex = Video_Feature_Extractor(inception)

# LSTM Clasifier
model = My_Video_Classifier(features=featuer_ex, class_nr=6)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_1 (Sequential)    (None, None, 2048)        21802784  
_________________________________________________________________
dense (Dense)                (None, None, 128)         262272    
_________________________________________________________________
lstm (LSTM)                  (None, 512)               1312768   
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 3078      
Total params: 23,380,902
Trainable params: 1,578,118
Non-trainable params: 21,802,784
_________________________________________________________________


In [5]:
model.fit(train_ds.shuffle(100).batch(25).prefetch(1), validation_data=valid_ds.batch(1), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
 1/17 [>.............................] - ETA: 0s - loss: 0.9349 - accuracy: 0.5600

KeyboardInterrupt: 

In [None]:
model.evaluate(valid_ds.batch(1))

## 2)MLP classifier with Inception backbone

In [8]:
from tensorflow.keras.layers import Input, Activation, Dense, Conv3D, MaxPool3D, Flatten, Dropout, BatchNormalization, LSTM
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalAveragePooling2D, TimeDistributed
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.regularizers import l2

# svm classifier
def AVG_Video_Classifier(features, class_nr, optimizer='adam'):
    # model
    full_model = tf.keras.Sequential([
        features,
        GlobalAveragePooling1D(),
        #Dense(2048, kernel_initializer="he_normal"),
        #Dense(class_nr, kernel_initializer="he_normal"),
        Dense(class_nr, kernel_regularizer=l2(0.001)),
        ])
    
    #compile model
    # full_model.compile(
    #     optimizer=optimizer,
    #     # loss=CategoricalCrossentropy(from_logits=True),
    #     loss='categorical_hinge',
    #     metrics=['accuracy'])


    full_model.compile(loss='hinge',
                optimizer='adam',
                metrics=['accuracy'])



    return full_model


In [9]:
# Base model (returns pretrained frozen base model trained on Imagenet)
inception = Imagenet(name='inception')

# Feature Extractor (Has output (NR_FRAME x D) where D is feature dimension)
featuer_ex = Video_Feature_Extractor(inception)

# MLP Clasifier
model = AVG_Video_Classifier(features=featuer_ex, class_nr=6)

In [10]:
model.fit(train_ds.shuffle(100).batch(25).prefetch(1),validation_data=valid_ds.batch(1), epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f747ba52c18>

In [11]:
model.evaluate(valid_ds.batch(1))



[0.29953745007514954, 0.6222222447395325]