In [1]:
import os
import util
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Input, Rescaling, Conv2D, MaxPool2D, Flatten, Dense, Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score

In [2]:
base_dir = 'artifacts'
data = os.path.join(base_dir, 'split_3d_data.pkl')
batch_size = 32

# Load Data

In [7]:
X_train, X_val, X_test, y_train, y_val, y_test = util.load_split_3d_data(data)

In [8]:
X_train.shape, X_test.shape, X_val.shape

((280, 16, 224, 224, 3), (60, 16, 224, 224, 3), (60, 16, 224, 224, 3))

In [9]:
num_frames, img_size = X_train.shape[1], X_train.shape[2:4]
print(num_frames, img_size)

16 (224, 224)


In [10]:
y_train.shape, y_test.shape, y_val.shape

((280,), (60,), (60,))

## One-Hot Encoding of Target

In [11]:
y_val

array(['real', 'real', 'real', 'real', 'real', 'real', 'real', 'real',
       'real', 'real', 'real', 'real', 'real', 'real', 'real', 'real',
       'real', 'real', 'real', 'real', 'real', 'real', 'real', 'real',
       'real', 'real', 'real', 'real', 'real', 'real', 'fake', 'fake',
       'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake',
       'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake',
       'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake',
       'fake', 'fake', 'fake', 'fake'], dtype='<U4')

In [12]:
pd.factorize(y_val)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64),
 array(['real', 'fake'], dtype='<U4'))

In [13]:
labels = pd.factorize(y_val)[1]
print(labels)

['real' 'fake']


In [14]:
y_train, y_val, y_test = util.one_hot_encoding(num_categories=len(labels),
                                               split=3,
                                               y_train=pd.factorize(y_train)[0], 
                                               y_val=pd.factorize(y_val)[0], 
                                               y_test=pd.factorize(y_test)[0])

In [15]:
y_train.shape, y_test.shape, y_val.shape

((280, 2), (60, 2), (60, 2))

## Converting 3D to 2D Data

In [16]:
num_train = X_train.shape[0]
num_val = X_val.shape[0]
num_test = X_test.shape[0]

In [17]:
train_frames, val_frames, test_frames = util.convert_3d_to_2d(split=3,
                                                         train=(X_train, y_train),
                                                         val=(X_val,y_val),
                                                         test=(X_test, y_test))
X_train_frames, y_train_frames = train_frames
X_val_frames, y_val_frames = val_frames
X_test_frames, y_test_frames = test_frames

In [18]:
X_train_frames.shape, X_val_frames.shape, X_test_frames.shape

((4480, 224, 224, 3), (960, 224, 224, 3), (960, 224, 224, 3))

In [19]:
y_train_frames.shape, y_val_frames.shape, y_test_frames.shape

((4480, 2), (960, 2), (960, 2))

# Simple CNN Model

In [20]:
model = Sequential()
model.add(Input(shape=img_size+(3,)))
model.add(Rescaling(1./255))
model.add(Conv2D(32, 3, strides=1, padding='valid', activation='relu'))
model.add(MaxPool2D(pool_size=(2,2), strides=2, padding='valid'))
model.add(Conv2D(64, 3, strides=1, padding='valid', activation='relu'))
model.add(MaxPool2D(pool_size=(2,2), strides=2, padding='valid'))
model.add(Conv2D(128, 3, strides=1, padding='valid', activation='relu'))
model.add(MaxPool2D(pool_size=(2,2), strides=2, padding='valid'))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.summary()

In [21]:
model.compile(loss='categorical_crossentropy', 
              optimizer=Adam(learning_rate=0.01), 
              metrics=['accuracy'])

In [22]:
estop = EarlyStopping(monitor='val_loss', mode='min',
                      min_delta=1e-5, patience=10,
                      restore_best_weights=True,
                      verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5,
                              min_delta=1e-3, min_lr=1e-5, verbose=1)

In [23]:
model.fit(X_train_frames, y_train_frames, 
          validation_data=(X_val_frames, y_val_frames),
          epochs=500, batch_size=batch_size,
          callbacks=[estop, reduce_lr],
          verbose=1)

Epoch 1/500
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 282ms/step - accuracy: 0.4911 - loss: 3.8660 - val_accuracy: 0.5000 - val_loss: 0.6933 - learning_rate: 0.0100
Epoch 2/500
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 277ms/step - accuracy: 0.4951 - loss: 0.6936 - val_accuracy: 0.5000 - val_loss: 0.6939 - learning_rate: 0.0100
Epoch 3/500
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 276ms/step - accuracy: 0.4982 - loss: 0.6935 - val_accuracy: 0.5000 - val_loss: 0.6934 - learning_rate: 0.0100
Epoch 4/500
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 277ms/step - accuracy: 0.4897 - loss: 0.6935 - val_accuracy: 0.5000 - val_loss: 0.6938 - learning_rate: 0.0100
Epoch 5/500
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 280ms/step - accuracy: 0.5000 - loss: 0.6938 - val_accuracy: 0.5000 - val_loss: 0.6933 - learning_rate: 0.0100
Epoch 6/500
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x18048d07740>

In [24]:
test_loss, test_accuracy = model.evaluate(X_test_frames, y_test_frames, verbose=0)
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

Test loss: 0.6931480169296265
Test accuracy: 0.5


**Simple CNN model gave a (frame-level) test accuracy of 50%.**

In [25]:
model.save('artifacts/baseline_cnn.keras')

In [27]:
frame_pred_probs = model.predict(X_test_frames, verbose=0).reshape(num_test, num_frames, -1)
frame_pred_labels = np.argmax(frame_pred_probs, axis=2)
print(frame_pred_labels.shape)

(60, 16)


In [28]:
video_pred_labels = (np.sum(frame_pred_labels, axis=1) > (num_frames/2)).astype(int)
print(video_pred_labels.shape)

(60,)


In [31]:
video_test_accuracy = accuracy_score(np.argmax(y_test, axis=1), video_pred_labels)
print("Video-level Test accuracy:", video_test_accuracy)

Video-level Test accuracy: 0.5


**With majority voting, frame predictions are combined to obtain video predictions. The video-level test accuracy achieved is 50%.\
This serves as the baseline model which appears to be working no better than random guessing.**