In [1]:
import os
import util
import model_builder
import numpy as np
import pandas as pd
import tensorflow as tf

Working plan: MobileNetV2 + GRU gave reasonably good results, so I want to try that architecture (either the single best configuration or the top 5) on the following variations:
1. on the same videos decomposed into a larger number of frames (I used 16; try 32 and 64)
2. on a different dataset (I used DFD, try CelebDF)
3. using efficientnet or some other pretrained model (so adjust image sizes accordingly)

Make generic functions so that any data, any number of frames, and any pretrained model can be used. Save all the best ones.

In [2]:
# Root directory; each dataset is read from a sub-folder of it.
base_dir = 'data'

# Names of the dataset sub-folders.
data_sources = [
    'DFD',
    'CelebDF',
]

# Candidate frames-per-video settings: powers of two from 16 up to 256.
num_frames = [2 ** exponent for exponent in range(4, 9)]

In [3]:
seed = 42
# One call seeds Python's `random`, NumPy and TensorFlow, so helpers that draw
# from any of the three generators are repeatable across runs.
tf.keras.utils.set_random_seed(seed)

# DFD More Frames

In [4]:
# DFD with 64 frames per video; every other setting is left at its default.
# The third return value unpacks into the (loss, accuracy) pair printed later.
labels1, classifier1, (classifier_loss1, classifier_acc1) = (
    model_builder.train_test_classifier(
        data_dir=os.path.join(base_dir, 'DFD'),
        num_frames=64,
    )
)

Image Model: MobileNetV2, Image Size: (224, 224)
TRAIN set: 140 videos
VAL set: 30 videos
TEST set: 30 videos
TRAIN set: 140 videos
VAL set: 30 videos
TEST set: 30 videos
Video data extraction and splitting completed.

Shapes:
X_train: (280, 64, 224, 224, 3)
X_val: (60, 64, 224, 224, 3)
X_test: (60, 64, 224, 224, 3)
y_train: (280,)
y_val: (60,)
X_test: (60,)
[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 240ms/step
Embeddings shape: (17920, 1280)
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 243ms/step
Embeddings shape: (3840, 1280)
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 243ms/step
Embeddings shape: (3840, 1280)
Temporal model defined.


None
Epoch 1/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 242ms/step - accuracy: 0.5071 - loss: 0.7692 - val_accuracy: 0.4000 - val_loss: 0.7579
Epoch 2/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 169ms/step - accuracy: 0.4643 - loss: 0.7439 - val_accuracy: 0.5000 - val_loss: 0.7462
Epoch 3/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 169ms/step - accuracy: 0.4893 - loss: 0.7417 - val_accuracy: 0.5167 - val_loss: 0.7417
Epoch 4/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 167ms/step - accuracy: 0.5036 - loss: 0.7068 - val_accuracy: 0.5000 - val_loss: 0.7386
Epoch 5/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 168ms/step - accuracy: 0.4750 - loss: 0.7421 - val_accuracy: 0.4833 - val_loss: 0.7359
Epoch 6/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 169ms/step - accuracy: 0.5107 - loss: 0.7100 - val_accuracy: 0.4667 - val_loss: 0.7331
Epoch 7/500
[1m9/9[0m [32m

In [5]:
# Display the labels returned by the 64-frame DFD run (first return value of train_test_classifier).
labels1

array(['real', 'fake'], dtype='<U4')

In [6]:
# Report the loss/accuracy pair returned by the 64-frame DFD run.
print(
    "DFD with more Frames-",
    f"Loss: {classifier_loss1}",
    f"Accuracy: {classifier_acc1}",
    sep="\n",
)

DFD with more Frames-
Loss: 0.746757447719574
Accuracy: 0.5166666507720947


* The same architecture (2 GRUs) that gave 66.7% accuracy for DFD with 16 frames gives only **51.7%** when applied to DFD videos split into 64 frames. This is probably because there are more temporal dependencies to capture here and 2 GRUs aren't sufficient for that.

# DFD More Frames GRUs & Dense Layers

In [10]:
# Scratch check: split the dataset path into its parent folder and the
# lower-cased dataset folder name.
data_dir = os.path.join(base_dir, 'DFD')
os.path.split(data_dir)[0], os.path.split(data_dir)[1].lower()

('data', 'dfd')

In [4]:
# DFD with 64 frames per video, using the 2 GRU + 1 Dense + batch-normalisation
# variant of the temporal model.
labels2, classifier2, (classifier_loss2, classifier_acc2) = (
    model_builder.train_test_classifier(
        data_dir=os.path.join(base_dir, 'DFD'),
        num_frames=64,
        num_gru=2,
        num_dense=1,
        batchnorm=True,
    )
)

Image Model: MobileNetV2, Image Size: (224, 224)
Video data extraction and splitting completed.

Shapes:
X_train: (280, 64, 224, 224, 3)
X_val: (60, 64, 224, 224, 3)
X_test: (60, 64, 224, 224, 3)
y_train: (280,)
y_val: (60,)
X_test: (60,)
[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 237ms/step
Embeddings shape: (17920, 1280)
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 236ms/step
Embeddings shape: (3840, 1280)
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 246ms/step
Embeddings shape: (3840, 1280)


Temporal model defined.


Epoch 1/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 182ms/step - accuracy: 0.4786 - loss: 0.8555 - val_accuracy: 0.5167 - val_loss: 0.7156
Epoch 2/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 142ms/step - accuracy: 0.5036 - loss: 0.8782 - val_accuracy: 0.5167 - val_loss: 0.7125
Epoch 3/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 149ms/step - accuracy: 0.5286 - loss: 0.8430 - val_accuracy: 0.5000 - val_loss: 0.7095
Epoch 4/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 150ms/step - accuracy: 0.5000 - loss: 0.8823 - val_accuracy: 0.5000 - val_loss: 0.7070
Epoch 5/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 150ms/step - accuracy: 0.5000 - loss: 0.8448 - val_accuracy: 0.5167 - val_loss: 0.7049
Epoch 6/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 147ms/step - accuracy: 0.5143 - loss: 0.8433 - val_accuracy: 0.5167 - val_loss: 0.7032
Epoch 7/500
[1m9/9[0m [32m━━━━━

In [5]:
# Display the labels returned by the 64-frame DFD run with the extra Dense/BatchNorm layers.
labels2

array([0, 1], dtype=int64)

In [6]:
# The header names the configuration actually passed to train_test_classifier
# for this run (num_gru=2, num_dense=1, batchnorm=True).
print(f"DFD with more Frames, 2 GRUs + 1 Dense + BatchNorm-\nLoss: {classifier_loss2}\nAccuracy: {classifier_acc2}")

DFD with more Frames and 4 GRUs-
Loss: 0.7296609878540039
Accuracy: 0.5


* With both 4 GRUs and 2 GRUs + 1 Dense + BatchNormalization, only **50%** accuracy is obtained. Perhaps the additional temporal dependencies from the larger number of frames put more load on the already small dataset. So, it appears 16 frames was the best case.

# DFD Xception

In [5]:
# DFD with 16 frames per video, swapping the image model to Xception.
labels3, classifier3, (classifier_loss3, classifier_acc3) = (
    model_builder.train_test_classifier(
        data_dir=os.path.join(base_dir, 'DFD'),
        num_frames=16,
        img_model_name='Xception',
    )
)

Image Model: Xception, Image Size: (299, 299)
TRAIN set: 140 videos
VAL set: 30 videos
TEST set: 30 videos
TRAIN set: 140 videos
VAL set: 30 videos
TEST set: 30 videos
Video data extraction and splitting completed.

Shapes:
X_train: (280, 16, 299, 299, 3)
X_val: (60, 16, 299, 299, 3)
X_test: (60, 16, 299, 299, 3)
y_train: (280,)
y_val: (60,)
X_test: (60,)
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 2s/step
Embeddings shape: (4480, 2048)
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 2s/step 
Embeddings shape: (960, 2048)
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 2s/step 
Embeddings shape: (960, 2048)


Temporal model defined.


Epoch 1/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 112ms/step - accuracy: 0.4857 - loss: 0.7206 - val_accuracy: 0.5000 - val_loss: 0.6888
Epoch 2/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step - accuracy: 0.5500 - loss: 0.7069 - val_accuracy: 0.5000 - val_loss: 0.6871
Epoch 3/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step - accuracy: 0.5393 - loss: 0.6992 - val_accuracy: 0.5000 - val_loss: 0.6855
Epoch 4/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - accuracy: 0.5357 - loss: 0.6942 - val_accuracy: 0.5000 - val_loss: 0.6843
Epoch 5/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - accuracy: 0.5321 - loss: 0.6998 - val_accuracy: 0.5167 - val_loss: 0.6832
Epoch 6/500
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - accuracy: 0.5679 - loss: 0.6794 - val_accuracy: 0.4833 - val_loss: 0.6823
Epoch 7/500
[1m9/9[0m [32m━━━━━━━━━━

In [6]:
# Display the labels returned by the DFD + Xception run.
labels3

array(['real', 'fake'], dtype='<U4')

In [7]:
# Report the loss/accuracy pair returned by the DFD + Xception run.
print(
    "DFD using Xception-",
    f"Loss: {classifier_loss3}",
    f"Accuracy: {classifier_acc3}",
    sep="\n",
)

DFD using Xception-
Loss: 0.6473307013511658
Accuracy: 0.6166666746139526


* The exact same temporal model that gave 66.7% accuracy with MobileNetV2 was only able to achieve **61.7%** with Xception. Hence, it appears that MobileNetV2 could capture more spatial dependencies.

# CelebDF (best pretrained model) & (best num frames)

In [4]:
# CelebDF with 16 frames per video and Xception as the image model.
labels4, classifier4, (classifier_loss4, classifier_acc4) = (
    model_builder.train_test_classifier(
        data_dir=os.path.join(base_dir, 'CelebDF'),
        num_frames=16,
        img_model_name='Xception',
    )
)

Image Model: Xception, Image Size: (299, 299)
Video data extraction and splitting completed.

Shapes:
X_train: (812, 16, 299, 299, 3)
X_val: (174, 16, 299, 299, 3)
X_test: (174, 16, 299, 299, 3)
y_train: (812,)
y_val: (174,)
X_test: (174,)
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m688s[0m 2s/step 
Embeddings shape: (12992, 2048)
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 2s/step
Embeddings shape: (2784, 2048)
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 2s/step
Embeddings shape: (2784, 2048)


Temporal model defined.


Epoch 1/500
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 69ms/step - accuracy: 0.4926 - loss: 0.7103 - val_accuracy: 0.5345 - val_loss: 0.6915
Epoch 2/500
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - accuracy: 0.5382 - loss: 0.6959 - val_accuracy: 0.5230 - val_loss: 0.6917
Epoch 3/500
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - accuracy: 0.5012 - loss: 0.7012 - val_accuracy: 0.5402 - val_loss: 0.6911
Epoch 4/500
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - accuracy: 0.5074 - loss: 0.6982 - val_accuracy: 0.5460 - val_loss: 0.6908
Epoch 5/500
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.5431 - loss: 0.6957 - val_accuracy: 0.5230 - val_loss: 0.6910
Epoch 6/500
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - accuracy: 0.5345 - loss: 0.6898 - val_accuracy: 0.5230 - val_loss: 0.6906
Epoch 7/500
[1m26/26[0m [

In [5]:
# Display the labels returned by the CelebDF + Xception run.
labels4

array([0, 1], dtype=int64)

In [7]:
# Report the loss/accuracy pair returned by the CelebDF + Xception run.
print(
    "CelebDF with 16 Frames, Xception, 2GRUs -",
    f"Loss: {classifier_loss4}",
    f"Accuracy: {classifier_acc4}",
    sep="\n",
)

CelebDF with 16 Frames, Xception, 2GRUs -
Loss: 0.692363440990448
Accuracy: 0.5287356376647949


* The same architecture that gave 61.7% accuracy for DFD gave only **52.9%** accuracy for CelebDF. This dataset is larger than DFD (812 vs. 280 training samples), but perhaps 16 frames & 2 GRUs aren't sufficient to capture the temporal dependencies, or Xception isn't good at capturing the spatial dependencies of this dataset.

In [9]:
# CelebDF with 16 frames per video and MobileNetV2 as the image model.
labels5, classifier5, (classifier_loss5, classifier_acc5) = (
    model_builder.train_test_classifier(
        data_dir=os.path.join(base_dir, 'CelebDF'),
        num_frames=16,
        img_model_name='MobileNetV2',
    )
)

Image Model: MobileNetV2, Image Size: (224, 224)
TRAIN set: 406 videos
VAL set: 87 videos
TEST set: 87 videos
TRAIN set: 406 videos
VAL set: 87 videos
TEST set: 87 videos
Video data extraction and splitting completed.

Shapes:
X_train: (812, 16, 224, 224, 3)
X_val: (174, 16, 224, 224, 3)
X_test: (174, 16, 224, 224, 3)
y_train: (812,)
y_val: (174,)
X_test: (174,)
[1m406/406[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 246ms/step
Embeddings shape: (12992, 1280)
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 236ms/step
Embeddings shape: (2784, 1280)
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 244ms/step
Embeddings shape: (2784, 1280)


Temporal model defined.


Epoch 1/500
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 62ms/step - accuracy: 0.4914 - loss: 0.7423 - val_accuracy: 0.5230 - val_loss: 0.7068
Epoch 2/500
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.5246 - loss: 0.7183 - val_accuracy: 0.5115 - val_loss: 0.7057
Epoch 3/500
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.5012 - loss: 0.7225 - val_accuracy: 0.5172 - val_loss: 0.7048
Epoch 4/500
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.5259 - loss: 0.7126 - val_accuracy: 0.5172 - val_loss: 0.7043
Epoch 5/500
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - accuracy: 0.5012 - loss: 0.7258 - val_accuracy: 0.5287 - val_loss: 0.7033
Epoch 6/500
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.5049 - loss: 0.7180 - val_accuracy: 0.5345 - val_loss: 0.7026
Epoch 7/500
[1m26/26[0m [

In [11]:
# Display the labels returned by the CelebDF + MobileNetV2 run.
labels5

array(['real', 'fake'], dtype='<U4')

In [10]:
# Report the loss/accuracy pair returned by the CelebDF + MobileNetV2 run.
print(
    "CelebDF with 16 Frames, MobileNetV2, 2GRUs -",
    f"Loss: {classifier_loss5}",
    f"Accuracy: {classifier_acc5}",
    sep="\n",
)

CelebDF with 16 Frames, MobileNetV2, 2GRUs -
Loss: 0.7166829705238342
Accuracy: 0.4482758641242981


* The best performing model and setup for DFD dataset gave just **44.8%** accuracy for the CelebDF dataset.