<a href="https://colab.research.google.com/github/anaaaraujoo/deepfake-detection/blob/main/HybridDeepfakeDetector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Hybrid deepfake detection (CNN-LSTM-Transformer Approach)**

In [1]:
# Installing all the necessary libraries and frameworks
!pip install -q tensorflow matplotlib seaborn scikit-learn opencv-python numpy pandas tqdm pytorch-lightning timm gdown

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.9/73.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.7/620.7 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m832.4/832.4 kB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.5/57.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.5/24.5 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Importing all the necessary libraries
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torchvision
import tensorflow as tf
import keras
from keras import layers
from tensorflow.keras.layers import Input, TimeDistributed, Dense, LSTM, Dropout, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.optimizers import Adam
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Set the global Keras dtype policy to 'mixed_float16'. The model defined further down
# creates its final Dense layer with dtype='float32' explicitly.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

In [None]:
# Downloading and unzipping the dataset - CelebDF
from google.colab import drive
import zipfile
drive.mount('/content/drive')
dataset_path = '/content/drive/MyDrive/Celeb-DF.zip'
extract_path = '/content'
# The context manager closes the archive even when extraction fails part-way
with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
  zip_ref.extractall(extract_path)

In [None]:
# Data analysis
# Plotting some samples for visual comprehension
real_path = '/content/Celeb-real'
yt_path = '/content/YouTube-real'
fake_path = '/content/Celeb-synthesis'

def list_mp4_files(folder):
  """Return the .mp4 files directly inside `folder` as full paths, sorted by file name."""
  return [os.path.join(folder, f) for f in sorted(os.listdir(folder)) if f.endswith(".mp4")]

# os.listdir returns entries in arbitrary order; sorting keeps the previewed
# samples identical from one run to the next
real_videos_samples = list_mp4_files(real_path)
fake_videos_samples = list_mp4_files(fake_path)
yt_videos_samples = list_mp4_files(yt_path)

print(f'Number of real videos on the training set: {len(real_videos_samples) + len(yt_videos_samples)}')
print(f'Number of fake videos on the training set: {len(fake_videos_samples)}')

n = 3 # Samples to plot per class

def extract_first_frame(video_path):
  """Open `video_path` with OpenCV, read one frame and release the capture.

  Returns the frame object produced by the single `read()` call; the success
  flag that comes with it is discarded (presumably the frame is None when the
  read fails - callers should check).
  """
  capture = cv2.VideoCapture(video_path)
  _, first_frame = capture.read()
  capture.release()
  return first_frame

def show_first_frames(video_paths, heading, count):
  """Print `heading`, then draw the first frame of up to `count` videos in a single row.

  Args:
    video_paths: list of video file paths; only the first `count` are used.
    heading: text printed above the figure.
    count: number of columns in the row (must be >= 1).
  """
  print(heading)
  # Each class gets its own one-row figure, so subplot positions never carry over
  # from one figure to the next.
  fig, axes = plt.subplots(1, count, figsize=(9, 3), squeeze=False)
  row = axes[0]
  selected = video_paths[:count]
  for ax, video in zip(row, selected):
    frame = extract_first_frame(video)
    if frame is None:
      # Defensive: skip drawing instead of letting cvtColor fail on a missing frame
      ax.set_title('unreadable')
    else:
      ax.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
  # Hide the columns left empty when fewer than `count` videos are available
  for ax in row[len(selected):]:
    ax.set_visible(False)
  plt.show()

# Kept from the original cell: default figure size for figures created afterwards
plt.rcParams['figure.figsize'] = [9, 9]
show_first_frames(real_videos_samples, 'Real Samples: ', n)
show_first_frames(fake_videos_samples, 'Fake Samples: ', n)

In [None]:
# Dataset class
class CelebDF:
  """Video loader that turns Celeb-DF clips into fixed-size frame stacks.

  Every video is decoded with OpenCV, cut to at most `max_frames` leading frames,
  optionally cropped to the largest detected face, and returned as a float32
  tensor in [0, 1] with shape (frames, img_size, img_size, 3) (channels last).

  Label convention: 1 = real, 0 = fake.

  Args:
    video_paths: sequence of video file paths (str or Path).
    labels: optional sequence of 0/1 labels aligned with `video_paths`. When it is
      omitted the label is derived from the path: 1 if the path contains "real",
      otherwise 0.
    detect_faces_flag: when True, each frame is replaced by its largest face crop.
    max_frames: maximum number of frames read from the start of each video.
    img_size: side length in pixels of the square output frames. Defaults to 224,
      the size frames were always resized to and the size declared by the
      tf.data output signatures in this notebook.

  Raises:
    ValueError: if `labels` is given and its length differs from `video_paths`.
    RuntimeError: if the Haar cascade file cannot be loaded.
  """

  HAAR_CASCADE_URL = 'https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml'
  HAAR_CASCADE_FILE = 'haarcascade_frontalface_default.xml'

  def __init__(self, video_paths, labels=None, detect_faces_flag=True, max_frames=15, img_size=224):
    if labels is not None and len(labels) != len(video_paths):
      raise ValueError(
          f'labels has {len(labels)} entries but video_paths has {len(video_paths)}')
    self.videos = video_paths
    self.labels = labels
    self.detect_faces_flag = detect_faces_flag
    self.img_size = img_size
    self.max_frames = max_frames
    self.face_cascade = self._load_face_cascade()

  @classmethod
  def _load_face_cascade(cls):
    """Return the frontal-face Haar classifier, downloading the XML only if needed."""
    # Prefer the cascade shipped inside the opencv-python wheel (no network access).
    cv2_data = getattr(cv2, 'data', None)
    bundled_path = os.path.join(cv2_data.haarcascades, cls.HAAR_CASCADE_FILE) if cv2_data is not None else ''
    if bundled_path and os.path.exists(bundled_path):
      cascade_path = bundled_path
    else:
      cascade_path = cls.HAAR_CASCADE_FILE
      if not os.path.exists(cascade_path):
        import urllib.request
        urllib.request.urlretrieve(cls.HAAR_CASCADE_URL, cascade_path)
    classifier = cv2.CascadeClassifier(cascade_path)
    # A missing or corrupt XML yields an empty classifier; fail here with a clear
    # message instead of later inside detectMultiScale.
    if classifier.empty():
      raise RuntimeError(f'Could not load Haar cascade from {cascade_path}')
    return classifier

  def __len__(self):
    """Number of videos in the dataset."""
    return len(self.videos)

  def _empty_frames(self):
    """Zero-length frame stack used to signal 'nothing could be decoded'."""
    return np.empty((0, self.img_size, self.img_size, 3), dtype=np.uint8)

  # Frame extractor
  def frame_extractor(self, video_path):
    """Decode up to `max_frames` leading frames of `video_path`.

    Returns:
      uint8 RGB array of shape (n, img_size, img_size, 3) with n <= max_frames.
      n is 0 when the file cannot be opened or contains no readable frame.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
      print(f'Error opening video file: {video_path}')
      cap.release()
      return self._empty_frames()
    frames = []
    try:
      while len(frames) < self.max_frames:
        ret, frame = cap.read()
        if not ret:
          break
        frame = cv2.resize(frame, (self.img_size, self.img_size))
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
      # Release the capture even if resizing/conversion raises
      cap.release()
    if not frames:
      return self._empty_frames()
    return np.array(frames)

  # Face recognition
  def detect_faces(self, frames):
    """Replace each frame by a crop of its largest detected face.

    Args:
      frames: iterable of RGB uint8 frames, as produced by `frame_extractor`.

    Returns:
      uint8 array of shape (n, img_size, img_size, 3). Frames in which no face is
      found become all-zero images so the sequence length is preserved. An empty
      array is returned when `frames` is empty.
    """
    out_size = (self.img_size, self.img_size)
    processed_frames = []
    for frame in frames:
      # frame_extractor already converted the frames to RGB, hence RGB2GRAY
      gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
      faces = self.face_cascade.detectMultiScale(gray, 1.1, 4)
      if len(faces) > 0:
        # Keep the detection with the largest area (w * h)
        x, y, w, h = max(faces, key=lambda box: box[2] * box[3])
        processed_frames.append(cv2.resize(frame[y:y+h, x:x+w], out_size))
      else:
        processed_frames.append(np.zeros(out_size + (3,), dtype=np.uint8))
    return np.array(processed_frames) if processed_frames else np.array([])

  def _label_for(self, idx, video_path):
    """Return the integer label (1 = real, 0 = fake) of sample `idx`."""
    if self.labels is not None:
      return int(self.labels[idx])
    return 1 if "real" in str(video_path) else 0

  def _load_sample(self, idx):
    """Load sample `idx` as (float32 frames tensor, int32 scalar label tensor)."""
    video_path = self.videos[idx]
    label = tf.constant(self._label_for(idx, video_path), dtype=tf.int32)
    frames = self.frame_extractor(str(video_path))
    if len(frames) > 0 and self.detect_faces_flag:
      frames = self.detect_faces(frames)
    if len(frames) == 0:
      # Single blank channels-last frame, carrying the sample's real label
      return tf.zeros((1, self.img_size, self.img_size, 3), dtype=tf.float32), label
    frames = tf.convert_to_tensor(np.asarray(frames, dtype=np.float32) / 255.0)
    return frames, label

  # Each output is in TensorFlow format
  def __getitem__(self, idx):
    """Return (frames, label) for the video at position `idx`."""
    return self._load_sample(idx)

  def generator(self):
    """Yield (frames, label) for every video, in order; usable with tf.data.Dataset.from_generator."""
    for idx in range(len(self.videos)):
      yield self._load_sample(idx)

In [None]:
# Dividing the data in the train/val/test split
# Extracting videos from the .txt file
def read_split_list(txt_file):
    """Return the non-empty lines of `txt_file`, stripped of surrounding whitespace.

    Blank lines (for example a trailing newline at the end of the list) are
    skipped so they do not end up as empty entries.
    """
    with open(txt_file, "r") as f:
        stripped = (line.strip() for line in f)
        return [entry for entry in stripped if entry]

real_videos_root = Path("/content/Celeb-real")
yt_videos_root = Path("/content/YouTube-real")
fake_videos_root = Path("/content/Celeb-synthesis")
test_videos_root = Path("/content/List_of_testing_videos.txt")

# Sorted so the seeded split below is reproducible regardless of listing order
real_videos = sorted(real_videos_root.rglob("*.mp4"))
yt_videos = sorted(yt_videos_root.rglob("*.mp4"))
fake_videos = sorted(fake_videos_root.rglob("*.mp4"))

def parse_test_entry(entry):
    """Turn one line of the test list into (video path, label).

    Accepts both '<0|1> <relative path>' and a bare '<relative path>'. Without an
    explicit label, the same rule as CelebDF is used ("real" in the path -> 1).
    NOTE(review): assumes paths are relative to the folder holding the list and
    that an explicit 1 means real - confirm against the dataset README.
    """
    parts = entry.split(maxsplit=1)
    if len(parts) == 2 and parts[0] in ("0", "1"):
        label, rel_path = int(parts[0]), parts[1]
    else:
        label, rel_path = (1 if "real" in entry else 0), entry
    return test_videos_root.parent / rel_path, label

test_entries = [parse_test_entry(line) for line in read_split_list(test_videos_root) if line.strip()]
test_videos = [video for video, _ in test_entries]
test_labels = [label for _, label in test_entries]

# Create labels for the videos: 1 -> real, 0 -> fake
# Each label is paired with its video so the two lists cannot drift out of order
labeled_videos = [(video, 1) for video in real_videos + yt_videos] + [(video, 0) for video in fake_videos]
all_videos = [video for video, _ in labeled_videos]
print(f"Total number of videos: {len(all_videos)}")

# Keep the held-out test videos out of the train/validation pool
# (a no-op if the test list points outside the three folders above)
held_out = set(test_videos)
trainval_videos = [video for video, _ in labeled_videos if video not in held_out]
trainval_labels = [label for video, label in labeled_videos if video not in held_out]

train_videos, val_videos, train_labels, val_labels = train_test_split(
    trainval_videos, trainval_labels,
    test_size = 0.15,
    random_state = 42,
    stratify = trainval_labels
)

train_data = CelebDF(train_videos, train_labels, detect_faces_flag=True)
val_data   = CelebDF(val_videos, val_labels, detect_faces_flag=True)
test_data  = CelebDF(test_videos, test_labels, detect_faces_flag=True)

BATCH_SIZE = 4
SHUFFLE_BUFFER = 100

def make_dataset(data, shuffle=False):
    """Wrap a CelebDF instance in a cached, batched and prefetched tf.data pipeline.

    Order matters: cache the decoded videos first, shuffle individual videos
    (reshuffled every epoch), then batch. Shuffling before cache() would freeze
    the first epoch's order; shuffling after batching only reorders whole batches.
    Every batch is padded along the time axis to `data.max_frames` frames.
    """
    ds = tf.data.Dataset.from_generator(
        data.generator,
        output_signature=(
            tf.TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32),
            tf.TensorSpec(shape=(), dtype=tf.int32),
        )
    )
    # NOTE: in-memory cache of decoded frames; pass a file path to cache() if the
    # decoded dataset does not fit in RAM.
    ds = ds.cache()
    if shuffle:
        ds = ds.shuffle(SHUFFLE_BUFFER)
    ds = ds.padded_batch(BATCH_SIZE, padded_shapes=([data.max_frames, 224, 224, 3], []))
    return ds.prefetch(tf.data.AUTOTUNE)

train_ds = make_dataset(train_data, shuffle=True)
val_ds = make_dataset(val_data)
test_ds = make_dataset(test_data)

print(f"Number of training videos: {len(train_data)}")
print(f"Number of validation videos: {len(val_data)}")
print(f"Number of testing videos: {len(test_data)}")

In [None]:
# Testing the frame extractor and the face recognition functions
max_display = 5
frames, label = train_data[1]
print(f'Video; {train_data.videos[1]}')
print(f'Label: {label}')
print(f'Frames: {frames.shape}')
# A sample may hold fewer than max_display frames (e.g. the one-frame placeholder
# returned for unreadable videos), so never index past what is available
n_shown = min(max_display, int(frames.shape[0]))
# squeeze=False keeps `axes` two-dimensional even when a single frame is shown
fig, axes = plt.subplots(1, n_shown, figsize=(4 * n_shown, 4), squeeze=False)
for i in range(n_shown):
  ax = axes[0, i]
  ax.imshow(frames[i].numpy().astype('float32')) # Convert to float32 for display
  ax.set_title(f'Frame nº{i+1}')
  ax.axis('off')
plt.tight_layout()
plt.show()

In [None]:
# CNN-LSTM Model
# The MobileNetV2 backbone is used only as a feature extractor, that is why the fully
# connected top layer (that predicts classes scores) is left out
def cnn_lstm_model(max_frames=15, img_size=224, learning_rate=0.001):
  """Build and compile the CNN-LSTM video classifier.

  A MobileNetV2 backbone (ImageNet weights, all but the last 20 layers frozen)
  is applied to every frame; the pooled per-frame features go through two
  stacked LSTMs and a single sigmoid unit.

  Args:
    max_frames: number of frames per video the model expects. Pass None to accept
      sequences of any length.
    img_size: side length in pixels of the square input frames.
    learning_rate: learning rate of the Adam optimizer.

  Returns:
    A compiled Keras Model mapping (batch, max_frames, img_size, img_size, 3)
    inputs to one probability per video.
  """
  cnn_backbone = tf.keras.applications.MobileNetV2(
      weights='imagenet',
      include_top=False,
      input_shape=(img_size, img_size, 3)
  )

  for layer in cnn_backbone.layers[:-20]:
    layer.trainable = False # Some layers are not trainable to reduce memory usage

  video_input = Input(shape=(max_frames, img_size, img_size, 3))
  # The dataset delivers frames scaled to [0, 1]; MobileNetV2's ImageNet weights
  # were trained on inputs scaled to [-1, 1]
  scaled_frames = tf.keras.layers.Rescaling(scale=2.0, offset=-1.0)(video_input)
  cnn_features = TimeDistributed(cnn_backbone)(scaled_frames)
  cnn_features = TimeDistributed(tf.keras.layers.GlobalAveragePooling2D())(cnn_features)
  cnn_features = Dropout(0.3)(cnn_features)

  # The first LSTM must return the whole sequence so the second one can consume it
  lstm_out = LSTM(64, return_sequences=True)(cnn_features)
  lstm_out = Dropout(0.4)(lstm_out)
  lstm_out = LSTM(32)(lstm_out)
  lstm_features = Dropout(0.4)(lstm_out)

  # Output layer pinned to float32 regardless of the global dtype policy
  output = Dense(1, activation='sigmoid', dtype='float32')(lstm_features)
  model = Model(inputs=video_input, outputs=output)

  optimizer = Adam(learning_rate=learning_rate)
  model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# Training the model
# Build the CNN-LSTM, then fit it on the training batches for 10 epochs,
# evaluating on the validation batches after each one
model = cnn_lstm_model()
model.fit(train_ds, validation_data=val_ds, epochs=10, verbose=1)  # verbose=1 -> progress bar