Activating the virtual environment: source new_env/bin/activate

In [1]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from transformers import ViTForImageClassification, ViTFeatureExtractor
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from PIL import Image
import os

def get_available_device():
    """Automatically selects multiple GPUs if available, otherwise falls back to CPU."""
    if torch.cuda.is_available():
        try:
            # Get the number of available GPUs
            num_gpus = torch.cuda.device_count()
            device = torch.device("cuda")  # Use all available GPUs
            
            print(f"Using {num_gpus} GPUs: {[torch.cuda.get_device_name(i) for i in range(num_gpus)]}")
            return device
        except Exception as e:
            print(f"GPU error: {str(e)}, falling back to CPU")

    return torch.device("cpu")  # Default to CPU if no GPU is available

# Example Usage
device = get_available_device()


  from .autonotebook import tqdm as notebook_tqdm
2025-05-15 16:23:06.888312: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-15 16:23:06.902414: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747326186.918998   44230 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747326186.923989   44230 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-15 16:23:06.941926: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

Using 5 GPUs: ['Tesla T4', 'Tesla T4', 'Tesla T4', 'Tesla T4', 'Tesla T4']


In [2]:
import torch
print(torch.version.cuda)       # Should show a version like 12.1
print(torch.backends.cudnn.version())  # Should print an integer version
print(torch.cuda.is_available())       # Should be True


12.1
8902
True


In [3]:
device = get_available_device()
print(f"Running on: {device}")


Using 5 GPUs: ['Tesla T4', 'Tesla T4', 'Tesla T4', 'Tesla T4', 'Tesla T4']
Running on: cuda


This code defines a custom PyTorch Dataset class called DeepfakeDataset — used for loading and preprocessing image data (e.g., for a deepfake detection task).

This class enables your DataLoader to efficiently:

Load each image

Apply transformations (like resizing, normalization, etc.)

Pair the image with its corresponding label (real or fake)


__init__()	Initializes dataset with image paths, labels, and transforms

__len__()	Returns number of total samples

__getitem__()	Loads, transforms, and returns an image-label pair

In [4]:
class DeepfakeDataset(Dataset):
    def __init__(self, image_paths, labels, transform):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        image = self.transform(image)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return image, label


In [5]:
import os
from sklearn.model_selection import train_test_split

# Paths to training and test datasets
train_dir = "/home/ghulam/FF++/cropped_face_mtcnn/train"
test_dir = "/home/ghulam/FF++/cropped_face_mtcnn/test"

# Collect training images
real_train_images = [os.path.join(train_dir, "real", img) for img in os.listdir(os.path.join(train_dir, "real"))]
fake_train_images = [os.path.join(train_dir, "fake", img) for img in os.listdir(os.path.join(train_dir, "fake"))]

# Combine images and assign labels (0 for real, 1 for fake)
train_image_paths = real_train_images + fake_train_images
train_labels = [0] * len(real_train_images) + [1] * len(fake_train_images)

# Split training data into train and validation
train_paths, val_paths, train_labels, val_labels = train_test_split(
    train_image_paths, train_labels, test_size=0.2, stratify=train_labels
)

# Collect test images (not split, used separately)
real_test_images = [os.path.join(test_dir, "real", img) for img in os.listdir(os.path.join(test_dir, "real"))]
fake_test_images = [os.path.join(test_dir, "fake", img) for img in os.listdir(os.path.join(test_dir, "fake"))]

test_paths = real_test_images + fake_test_images
test_labels = [0] * len(real_test_images) + [1] * len(fake_test_images)

print(f"Train samples: {len(train_paths)}, Validation samples: {len(val_paths)}, Test samples: {len(test_paths)}")


Train samples: 576, Validation samples: 144, Test samples: 140


Transformation	Before	After
Resize (224, 224)	Any size (e.g., 512x512)	224 x 224
ToTensor	PIL Image / NumPy array	PyTorch tensor (C x H x W)
Normalize	Values in [0, 1]	Centered around 0 with small std

In [6]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images for both models
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # average pixel values from the ImageNet dataset, which pretrained models were trained on
])

Component	Purpose

__init__	Collects image paths and assigns numeric labels for each subfolder

__len__	Returns total number of images

__getitem__	Opens an image, applies transforms, returns image + label

transform	Standard preprocessing for model compatibility
convert("RGB")	Ensures all images have 3 channels (even grayscale ones)
labeling logic	Auto-assigns 0, 1, ... to class folders using enumerate()

In [7]:
import os
from torch.utils.data import Dataset
from PIL import Image

class CustomImageDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_paths = []
        self.labels = []

        # Traverse the nested folder structure
        for label, subdir in enumerate(os.listdir(root_dir)):
            subdir_path = os.path.join(root_dir, subdir)
            if os.path.isdir(subdir_path):
                for img_name in os.listdir(subdir_path):
                    img_path = os.path.join(subdir_path, img_name)
                    if os.path.isfile(img_path):
                        self.image_paths.append(img_path)
                        self.labels.append(label)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        label = self.labels[idx]
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image, label


Loading Pertrained efficientnet B3

Section	                                           Purpose
Load Pretrained Model	                Use efficientnet_b3 trained on ImageNet
Modify Classifier	                    Change output layer to fit binary classification
Set Device	                            Automatically use GPU if available
Use Multi-GPU	                        Wrap with DataParallel if multiple GPUs are found
Move Model to Device	                Ensures model is ready to train on selected device

In [8]:
import torch
import torch.nn as nn
from torchvision import models

# Load EfficientNet-B3
efficientnet = models.efficientnet_b3(pretrained=True)

# Modify classifier for binary classification
efficientnet.classifier[1] = nn.Linear(efficientnet.classifier[1].in_features, 2)

# Check if multiple GPUs are available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    efficientnet = nn.DataParallel(efficientnet)  # Distribute model across multiple GPUs

# Move model to GPU(s)
efficientnet.to(device)




Using 5 GPUs!


DataParallel(
  (module): EfficientNet(
    (features): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): SiLU(inplace=True)
      )
      (1): Sequential(
        (0): MBConv(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
              (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (2): SiLU(inplace=True)
            )
            (1): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(40, 10, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(10, 40, kernel_size=(1, 1), stride=(1, 1))
              (activation): SiLU(inplace=True)
              (scale

Step	                                            Purpose

Load ViTConfig	                       Configure the model structure with binary classification (2 labels)

Load Pretrained ViT	                   Load ImageNet-pretrained ViT model, ignoring classifier size mismatch

Replace Classifier	                   Manually override the classifier to fit your use case

Setup GPU Device	                   Auto-detects and uses CUDA GPU

Enable Multi-GPU	                   Parallelizes training if more than one GPU is available

Move to Device	                       Deploys model on CPU or GPU(s)

In [9]:
import torch
import torch.nn as nn
from transformers import ViTForImageClassification, ViTConfig

# Step 1: Load the model configuration
config = ViTConfig.from_pretrained("google/vit-base-patch16-224")
config.num_labels = 2  # Set the number of output labels for binary classification

# Step 2: Load the pretrained model
vit = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    config=config,
    ignore_mismatched_sizes=True,  # Ignore the size mismatch for the classifier
)

# Step 3: Replace the classifier
vit.classifier = nn.Linear(vit.config.hidden_size, config.num_labels)

# Step 4: Enable Multi-GPU training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    vit = nn.DataParallel(vit)  # Distributes model across multiple GPUs

# Move model to GPU(s)
vit.to(device)


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 5 GPUs!


DataParallel(
  (module): ViTForImageClassification(
    (vit): ViTModel(
      (embeddings): ViTEmbeddings(
        (patch_embeddings): ViTPatchEmbeddings(
          (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
        )
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (encoder): ViTEncoder(
        (layer): ModuleList(
          (0-11): 12 x ViTLayer(
            (attention): ViTAttention(
              (attention): ViTSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
              )
              (output): ViTSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
              )
            )
            (i

Phase	                                   What Happens
Setup	                         Move model to device and wrap with DataParallel if needed
Training	                     Forward pass, loss calculation, backward pass, optimizer step
Evaluation	                     Inference only, no gradient update
Metrics	                         Track loss and accuracy for both training and validation

In [10]:
import torch
import torch.nn as nn

def train_model(model, optimizer, criterion, train_loader, val_loader, epochs, device):
    model.to(device)  # Move model to GPU(s)

    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs for training!")
        model = nn.DataParallel(model)  # Distributes model across multiple GPUs

    for epoch in range(epochs):
        model.train()
        train_loss, correct, total = 0.0, 0, 0
        
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad() #Clears out the old gradients before a new backward pass
            outputs = model(images)  # DataParallel automatically distributes inputs
            logits = outputs.logits if isinstance(outputs, dict) else outputs  # handles both dict outputs (ViT) and normal outputs (e.g., EfficientNet)
            
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * images.size(0)
            correct += (logits.argmax(1) == labels).sum().item()
            total += labels.size(0)

        train_loss /= total
        train_accuracy = correct / total
        print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

        # Validation
        model.eval()
        val_loss, correct, total = 0.0, 0, 0

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                logits = outputs.logits if isinstance(outputs, dict) else outputs  # handles both dict outputs (ViT) and normal outputs (e.g., EfficientNet)
                
                loss = criterion(logits, labels)
                val_loss += loss.item() * images.size(0)
                correct += (logits.argmax(1) == labels).sum().item()
                total += labels.size(0)

        val_loss /= total
        val_accuracy = correct / total
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")


Line	                             Action	                               Why it's important
torch.device(...)	                Set device	                       Portability across CPU/GPU
nn.DataParallel(...)	            Enable multi-GPU training	       Speeds up training, handles large batches
model.to(device)	                Move model to hardware	           Ensures computations run on chosen device
CrossEntropyLoss()	                Loss function for classification   Handles multi-class or binary logits
torch.optim.Adam(...)	            Optimizer	                       Adaptive and efficient for transformer models

In [11]:
# Set device for multiple GPUs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wrap the model with DataParallel for multi-GPU usage
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    vit = nn.DataParallel(vit)  # Distributes model across all available GPUs

# Move model to GPU(s)
vit.to(device)

# Define loss function
criterion = nn.CrossEntropyLoss()

# Optimizer
vit_optimizer = torch.optim.Adam(vit.parameters(), lr=1e-4)


Using 5 GPUs!


forward(self, x): The core of the model. This method defines how the input data flows through the network.

x is the input tensor (batch of images).

EfficientNet Feature Extraction: The input is passed through the EfficientNet backbone (excluding the classifier) to extract features.

Adaptive AvgPool: After feature extraction, an adaptive average pooling layer is used to pool the output into a fixed size (1x1), regardless of the input image size.

eff_features.view(...): Flattens the pooled features into a 1D vector.





This code defines a custom combined model that uses both EfficientNet-B3 (for feature extraction) and ViT (Vision Transformer).

EfficientNet processes the image to generate feature maps, while ViT extracts global features via its CLS token.

The two feature sets are concatenated and passed through a fully connected layer for binary classification.

The model supports multi-GPU training via DataParallel and moves the model to the GPU using .to(device).

In [12]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torchvision.models as models
from transformers import ViTForImageClassification

class CombinedModel(nn.Module):
    def __init__(self, efficientnet, vit):
        super(CombinedModel, self).__init__()
        self.efficientnet = nn.Sequential(*list(efficientnet.children())[:-2])  # Remove classifier
        
        # Store the full ViT model
        self.vit = vit
        
        # Extract correct feature sizes
        self.efficientnet_avgpool = nn.AdaptiveAvgPool2d(1)
        eff_features_dim = 1536  # EfficientNet-B3 output size
        vit_features_dim = vit.config.hidden_size  # ViT hidden size (768)
        
        # Fully connected layer
        self.fc = nn.Linear(eff_features_dim + vit_features_dim, 2)
        
        # Print expected dimensions
        print(f"EfficientNet expected features: {eff_features_dim}")
        print(f"ViT expected features: {vit_features_dim}")
        print(f"Combined features expected: {eff_features_dim + vit_features_dim}")
        
    def forward(self, x):
        # Print input shape
        print(f"Input shape: {x.shape}")
        
        # EfficientNet feature extraction
        eff_features = self.efficientnet(x)
        print(f"EfficientNet features before pooling: {eff_features.shape}")
        
        eff_features = self.efficientnet_avgpool(eff_features)
        print(f"EfficientNet features after pooling: {eff_features.shape}")
        
        eff_features = eff_features.view(eff_features.size(0), -1)  # (batch_size, 1536)
        print(f"EfficientNet features after flattening: {eff_features.shape}")
        
        # Process input for ViT if needed
        if x.shape[-1] != 224 or x.shape[-2] != 224:
            vit_input = F.interpolate(x, size=(224, 224), mode="bilinear", align_corners=False)
            print(f"Resized input for ViT: {vit_input.shape}")
        else:
            vit_input = x
            print(f"Original input for ViT: {vit_input.shape}")
            
        # Get ViT features using output_hidden_states
        vit_outputs = self.vit(vit_input, output_hidden_states=True)
        
        # Get the CLS token from the last hidden state
        vit_features = vit_outputs.hidden_states[-1][:, 0]
        print(f"ViT features shape: {vit_features.shape}")
        
        # Concatenate features
        combined_features = torch.cat((eff_features, vit_features), dim=1)
        print(f"Combined features shape: {combined_features.shape}")
        
        # Final classification
        output = self.fc(combined_features)
        print(f"Output shape: {output.shape}")
        
        return output





# Multi-GPU Support
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models
efficientnet = models.efficientnet_b3(pretrained=True)  # Load EfficientNet-B3
efficientnet.classifier[1] = nn.Linear(efficientnet.classifier[1].in_features, 2)  # Modify for binary classification

from transformers import ViTForImageClassification, ViTConfig
vit = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
vit.classifier = nn.Linear(vit.config.hidden_size, 2)  # Modify classifier for binary classification

model = CombinedModel(efficientnet, vit)

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)  

model.to(device)  


EfficientNet expected features: 1536
ViT expected features: 768
Combined features expected: 2304
Using 5 GPUs!


DataParallel(
  (module): CombinedModel(
    (efficientnet): Sequential(
      (0): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): SiLU(inplace=True)
        )
        (1): Sequential(
          (0): MBConv(
            (block): Sequential(
              (0): Conv2dNormActivation(
                (0): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
                (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                (2): SiLU(inplace=True)
              )
              (1): SqueezeExcitation(
                (avgpool): AdaptiveAvgPool2d(output_size=1)
                (fc1): Conv2d(40, 10, kernel_size=(1, 1), stride=(1, 1))
                (fc2): Conv2d(10, 40, kernel_size=(1, 1), stride=(1, 1))
     

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForImageClassification, AutoConfig
from torchvision import models

# Load EfficientNet-B3
efficientnet = models.efficientnet_b3(pretrained=True)

# Load Vision Transformer (ViT) with Custom Config
config = AutoConfig.from_pretrained("google/vit-base-patch16-224")
config.num_labels = 2  # Update for binary classification
vit = AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224", 
    config=config, 
    ignore_mismatched_sizes=True
)

# Define Combined Model
class CombinedModel(nn.Module):
    def __init__(self, efficientnet, vit):
        super(CombinedModel, self).__init__()
        self.efficientnet = nn.Sequential(*list(efficientnet.children())[:-2])  # Remove classifier
        self.vit = vit  # Store the full ViT model
        
        # Extract correct feature sizes
        self.efficientnet_avgpool = nn.AdaptiveAvgPool2d(1)  # Pool to (batch_size, 1536, 1, 1)
        eff_features_dim = 1536  # EfficientNet-B3 output size
        vit_features_dim = vit.config.hidden_size  # ViT hidden size (768)
        
        # Print dimensions for debugging
        print(f"EfficientNet feature dim: {eff_features_dim}")
        print(f"ViT feature dim: {vit_features_dim}")
        print(f"Combined features dim: {eff_features_dim + vit_features_dim}")
        
        # Fully connected layer
        self.fc = nn.Linear(eff_features_dim + vit_features_dim, 2)

    def forward(self, x):
        print(f"Input shape: {x.shape}")
        # EfficientNet feature extraction
        eff_features = self.efficientnet(x)
        print(f"EfficientNet features before pooling: {eff_features.shape}")
        eff_features = self.efficientnet_avgpool(eff_features)
        print(f"EfficientNet features after pooling: {eff_features.shape}")
        eff_features = eff_features.view(eff_features.size(0), -1)  # (batch_size, 1536)
        print(f"EfficientNet features after flattening: {eff_features.shape}")
        
        # Resize input for ViT (224x224)
        vit_input = F.interpolate(x, size=(224, 224), mode="bilinear", align_corners=False)
        print(f"ViT input shape: {vit_input.shape}")
        
        # Use the ViT model with return_dict=True to ensure consistent output structure
        vit_outputs = self.vit(vit_input, output_hidden_states=True, return_dict=True)
        
        # Access hidden states correctly - the structure depends on the model version
        if hasattr(vit_outputs, 'hidden_states') and vit_outputs.hidden_states is not None:
            # Get the CLS token from the last hidden state
            vit_features = vit_outputs.hidden_states[-1][:, 0]
        else:
            # For some versions, we might need to access it differently
            # Use the pooler output as fallback
            vit_features = vit_outputs.pooler_output
            
        # Concatenate features
        combined_features = torch.cat((eff_features, vit_features), dim=1)
        
        # Final classification
        output = self.fc(combined_features)
        print(f"ViT features shape: {vit_features.shape}")
        print(f"Combined features shape: {combined_features.shape}")
        print(f"Output shape: {output.shape}")
        return output

# Device & Multi-GPU Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CombinedModel(efficientnet, vit)

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)  

model.to(device) 


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EfficientNet feature dim: 1536
ViT feature dim: 768
Combined features dim: 2304
Using 5 GPUs!


DataParallel(
  (module): CombinedModel(
    (efficientnet): Sequential(
      (0): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): SiLU(inplace=True)
        )
        (1): Sequential(
          (0): MBConv(
            (block): Sequential(
              (0): Conv2dNormActivation(
                (0): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
                (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                (2): SiLU(inplace=True)
              )
              (1): SqueezeExcitation(
                (avgpool): AdaptiveAvgPool2d(output_size=1)
                (fc1): Conv2d(40, 10, kernel_size=(1, 1), stride=(1, 1))
                (fc2): Conv2d(10, 40, kernel_size=(1, 1), stride=(1, 1))
     

You're not repeating anything unnecessarily. The flow is actually proper:

Load pretrained EfficientNet and ViT → done at the top.

Define your CombinedModel class → done in the middle.

Instantiate the CombinedModel with the loaded backbones.

Prepare it for training with device & multi-GPU setup → done at the end.

It might seem like a re-init, but it's a sequential and logical progression.

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define the combined model
class CombinedModel(nn.Module):
    def __init__(self, efficientnet, vit):
        super(CombinedModel, self).__init__()
        self.efficientnet = nn.Sequential(*list(efficientnet.children())[:-2])  # Remove classifier
        self.vit = vit  # Store the full ViT model
        
        # Extract correct feature sizes
        self.efficientnet_avgpool = nn.AdaptiveAvgPool2d(1)
        eff_features_dim = 1536  # EfficientNet-B3 output size
        vit_features_dim = vit.config.hidden_size  # ViT hidden size
        
        # Fully connected layer
        self.fc = nn.Linear(eff_features_dim + vit_features_dim, 2)
        
    def forward(self, x):
        # EfficientNet feature extraction
        eff_features = self.efficientnet(x)
        eff_features = self.efficientnet_avgpool(eff_features)
        eff_features = eff_features.view(eff_features.size(0), -1)  # (batch_size, 1536)
        
        # Resize input for ViT if needed
        vit_input = F.interpolate(x, size=(224, 224), mode="bilinear", align_corners=False)
        
        # Get ViT features using the full model with return_dict=True for consistent output structure
        vit_outputs = self.vit(vit_input, output_hidden_states=True, return_dict=True)
        
        # Access hidden states correctly - handle different model versions
        if hasattr(vit_outputs, 'hidden_states') and vit_outputs.hidden_states is not None:
            # Get the CLS token from the last hidden state
            vit_features = vit_outputs.hidden_states[-1][:, 0]  # Shape: (batch_size, 768)
        else:
            # Fallback to pooler output if hidden_states not available
            vit_features = vit_outputs.pooler_output
        
        # Concatenate features
        combined_features = torch.cat((eff_features, vit_features), dim=1)  # (batch_size, 2304)
        
        # Final classification
        output = self.fc(combined_features)
        return output


# Instantiate the combined model
combined_model = CombinedModel(efficientnet, vit)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    combined_model = nn.DataParallel(combined_model)  
combined_model.to(device)  

Using 5 GPUs!


DataParallel(
  (module): CombinedModel(
    (efficientnet): Sequential(
      (0): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): SiLU(inplace=True)
        )
        (1): Sequential(
          (0): MBConv(
            (block): Sequential(
              (0): Conv2dNormActivation(
                (0): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
                (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                (2): SiLU(inplace=True)
              )
              (1): SqueezeExcitation(
                (avgpool): AdaptiveAvgPool2d(output_size=1)
                (fc1): Conv2d(40, 10, kernel_size=(1, 1), stride=(1, 1))
                (fc2): Conv2d(10, 40, kernel_size=(1, 1), stride=(1, 1))
     

Code	                                         Purpose
nn.CrossEntropyLoss()	              Used to calculate classification loss based on predicted logits and true labels
torch.optim.Adam(...)	              Optimizes model weights during training
.parameters()	                      Tells the optimizer which tensors to update
.module.parameters()	              Needed if model is wrapped in nn.DataParallel
lr=1e-4	                              Learning rate for controlling how fast the optimizer updates weights

In [15]:
# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer (Handle DataParallel case)
optimizer = torch.optim.Adam(combined_model.module.parameters() if isinstance(combined_model, nn.DataParallel) else combined_model.parameters(), lr=1e-4)


Component	                                                Purpose
CustomImageDataset	               A custom PyTorch dataset to load images and labels from a nested folder structure
__init__()	                       Traverses folder structure, assigns labels, and stores paths
__getitem__()	                   Loads a single image, applies transforms, and returns it with label
get_data_loader()	               Utility to create DataLoader with batching, shuffling, multiprocessing
DataLoader	                       Feeds your model batches of (image, label) pairs efficiently, with multi-core support

In [16]:
import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader

class CustomImageDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.image_paths = []
        self.labels = []
        self.transform = transform

        # Assign labels based on top-level directories
        label_mapping = { "fake": 0, "real": 1 }

        # Traverse the nested folder structure
        for label_name, label in label_mapping.items():
            label_dir = os.path.join(root_dir, label_name)
            if os.path.isdir(label_dir):
                for subdir in os.listdir(label_dir):
                    subdir_path = os.path.join(label_dir, subdir)
                    if os.path.isdir(subdir_path):
                        for file_name in os.listdir(subdir_path):
                            file_path = os.path.join(subdir_path, file_name)
                            if os.path.isfile(file_path):
                                self.image_paths.append(file_path)
                                self.labels.append(label)

        print(f"Loaded {len(self.image_paths)} images from {root_dir}")

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        label = self.labels[idx]
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, label

# Use DataLoader for Multi-GPU training
def get_data_loader(root_dir, transform, batch_size=32, num_workers=4):
    dataset = CustomImageDataset(root_dir, transform=transform)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True)
    return data_loader


Block	                                  Purpose
Transform	                  Resize & normalize images before feeding to model
CustomImageDataset	          Load image-label pairs from folder structure
DataLoader	                  Efficiently batch and load training/validation images
DataParallel	              Enable training across multiple GPUs (if available)
.to(device)	                  Move model to GPU(s) for actual training

In [17]:
import torch
from torch.utils.data import DataLoader
from torchvision import transforms

# Update Transformations for EfficientNet-B3 (300x300 input size)
transform = transforms.Compose([
    transforms.Resize((224, 224)),  
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load Datasets
train_dataset = CustomImageDataset(root_dir='/home/ghulam/FF++/cropped_face_mtcnn/train', transform=transform)
val_dataset = CustomImageDataset(root_dir='/home/ghulam/FF++/cropped_face_mtcnn/val', transform=transform)

# Optimize DataLoader for Multi-GPU Training
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=8, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=8, pin_memory=True)

# Move Model to GPUs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)  
model.to(device)  

Loaded 21595 images from /home/ghulam/FF++/cropped_face_mtcnn/train
Loaded 4196 images from /home/ghulam/FF++/cropped_face_mtcnn/val
Using 5 GPUs!


DataParallel(
  (module): DataParallel(
    (module): CombinedModel(
      (efficientnet): Sequential(
        (0): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
            (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): Sequential(
            (0): MBConv(
              (block): Sequential(
                (0): Conv2dNormActivation(
                  (0): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
                  (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
                  (2): SiLU(inplace=True)
                )
                (1): SqueezeExcitation(
                  (avgpool): AdaptiveAvgPool2d(output_size=1)
                  (fc1): Conv2d(40, 10, kernel_size=(1, 1), stride=(1, 1))
              

Feature	                                                Purpose
GradScaler + autocast()	                    Mixed precision training (fast + memory efficient)
model.train() / model.eval()	            Switch between training/eval modes
DataParallel	                            Use multiple GPUs automatically
no_grad()	                                Save memory during validation
argmax(1)	                                Get predicted class labels
.item()	                                    Convert tensors to Python scalars
to(device)	                                Move everything to GPU (or CPU)

In [18]:
import torch
import torch.nn as nn

# Use Mixed Precision for Faster Training
scaler = torch.cuda.amp.GradScaler()  

def train_combined_model(model, train_loader, val_loader, criterion, optimizer, epochs, device):
    model.to(device)  # Move model to GPU(s)
    
    for epoch in range(epochs):
        # Training Phase
        model.train()
        train_loss, correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)  # Move data to GPU(s)

            optimizer.zero_grad()

            # Use AMP for Efficient Training
            with torch.cuda.amp.autocast():
                outputs = model(images)
                loss = criterion(outputs, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            train_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()

        train_accuracy = correct / len(train_loader.dataset)
        print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

        # Validation Phase
        model.eval()
        val_loss, correct = 0, 0

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)  # Move data to GPU(s)
                outputs = model(images)
                loss = criterion(outputs, labels)

                val_loss += loss.item()
                correct += (outputs.argmax(1) == labels).sum().item()

        val_accuracy = correct / len(val_loader.dataset)
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

# Multi-GPU Training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)  # Enables training on all available GPUs


Using 5 GPUs!


In [None]:
# Use Multi-GPU and Mixed Precision
train_combined_model(
    model=combined_model.to(device),  # Move model to GPU(s)
    train_loader=train_loader, 
    val_loader=val_loader,          
    criterion=criterion, 
    optimizer=optimizer, 
    epochs=5,
    device=device  
)

  with torch.cuda.amp.autocast():


Epoch 1, Train Loss: 33.6293, Train Accuracy: 0.9607
Validation Loss: 16.4292, Validation Accuracy: 0.9261
Epoch 2, Train Loss: 3.4864, Train Accuracy: 0.9968
Validation Loss: 4.7842, Validation Accuracy: 0.9771
Epoch 3, Train Loss: 1.2266, Train Accuracy: 0.9989
Validation Loss: 11.9484, Validation Accuracy: 0.9659
Epoch 4, Train Loss: 5.7996, Train Accuracy: 0.9939
Validation Loss: 3.2974, Validation Accuracy: 0.9843
Epoch 5, Train Loss: 2.4096, Train Accuracy: 0.9975
Validation Loss: 5.5837, Validation Accuracy: 0.9781


In [49]:
# Save your best model (e.g., after training)
torch.save(combined_model.state_dict(), 'best_combined_model.pth')


Component	                                                   Purpose
transforms	                                           Resize and normalize images
DataLoader	                                           Loads data efficiently in batches
model.eval()	                                       Turns off dropout and batchnorm during inference
torch.no_grad()	                                       Saves memory by disabling gradient calculations
torch.cuda.amp.autocast()	                           Mixed precision → faster and more efficient inference
torch.max()	                                           Gets predicted class from output logits
classification_report	                               Gives detailed performance metrics (Precision, Recall, F1)
confusion_matrix	                                   Shows where the model made right or wrong predictions

In [19]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms

# Define test transformation (same as training)
test_transform = transforms.Compose([
    transforms.Resize((300, 300)),  # EfficientNet-B3 input size
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load test dataset
test_dataset = CustomImageDataset(root_dir='/home/ghulam/FF++/cropped_face_mtcnn/test', 
                                 transform=test_transform)

# Create test DataLoader with optimal settings for multi-GPU
test_loader = DataLoader(test_dataset, 
                        batch_size=64,  
                        shuffle=False,  
                        num_workers=8,  
                        pin_memory=True)  
def test_combined_model(model, test_loader, device):
    
    model.eval()
    
    
    correct = 0
    total = 0
    all_preds = []
    all_labels = []
    
    # Disable gradient computation for inference
    with torch.no_grad():
        for images, labels in test_loader:
            # Move data to device
            images = images.to(device)
            labels = labels.to(device)
            
            # Forward pass with mixed precision
            with torch.cuda.amp.autocast():
                outputs = model(images)
            
            # Get predictions
            _, predicted = torch.max(outputs.data, 1)
            
            # Update metrics
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            # Store predictions and labels for additional metrics
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Calculate accuracy
    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')
    
    return accuracy, all_preds, all_labels

# Load your trained model
combined_model = CombinedModel(efficientnet, vit)
combined_model.load_state_dict(torch.load('/home/ghulam/FF++/DeepfakeImageDetectionApp_ver1.0/best_combined_model.pth'))

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Enable multi-GPU testing
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs for testing!")
    combined_model = nn.DataParallel(combined_model)

# Move model to device
combined_model.to(device)

# Run testing
accuracy, predictions, true_labels = test_combined_model(combined_model, test_loader, device)

# Optional: Calculate additional metrics
from sklearn.metrics import classification_report, confusion_matrix

print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=['Real', 'Fake']))

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, predictions))


Loaded 4200 images from /home/ghulam/FF++/cropped_face_mtcnn/test
Using 5 GPUs for testing!
Test Accuracy: 81.60%

Classification Report:
              precision    recall  f1-score   support

        Real       0.74      0.97      0.84      2100
        Fake       0.96      0.66      0.78      2100

    accuracy                           0.82      4200
   macro avg       0.85      0.82      0.81      4200
weighted avg       0.85      0.82      0.81      4200


Confusion Matrix:
[[2045   55]
 [ 718 1382]]


In [20]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms

# Define test transformation (same as training)
test_transform = transforms.Compose([
    transforms.Resize((300, 300)),  # EfficientNet-B3 input size
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load test dataset
test_dataset = CustomImageDataset(root_dir='/home/ghulam/Celeb-DF/test', 
                                 transform=test_transform)

# Create test DataLoader with optimal settings for multi-GPU
test_loader = DataLoader(test_dataset, 
                        batch_size=64,  
                        shuffle=False,  
                        num_workers=8,  
                        pin_memory=True)  
def test_combined_model(model, test_loader, device):
    
    model.eval()
    
    
    correct = 0
    total = 0
    all_preds = []
    all_labels = []
    
    # Disable gradient computation for inference
    with torch.no_grad():
        for images, labels in test_loader:
            # Move data to device
            images = images.to(device)
            labels = labels.to(device)
            
            # Forward pass with mixed precision
            with torch.cuda.amp.autocast():
                outputs = model(images)
            
            # Get predictions
            _, predicted = torch.max(outputs.data, 1)
            
            # Update metrics
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            # Store predictions and labels for additional metrics
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Calculate accuracy
    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')
    
    return accuracy, all_preds, all_labels

# Load your trained model
combined_model = CombinedModel(efficientnet, vit)
combined_model.load_state_dict(torch.load('/home/ghulam/FF++/DeepfakeImageDetectionApp_ver1.0/best_combined_model.pth'))

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Enable multi-GPU testing
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs for testing!")
    combined_model = nn.DataParallel(combined_model)

# Move model to device
combined_model.to(device)

# Run testing
accuracy, predictions, true_labels = test_combined_model(combined_model, test_loader, device)

# Optional: Calculate additional metrics
from sklearn.metrics import classification_report, confusion_matrix

print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=['Real', 'Fake']))

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, predictions))


Loaded 15457 images from /home/ghulam/Celeb-DF/test
Using 5 GPUs for testing!
Test Accuracy: 65.17%

Classification Report:
              precision    recall  f1-score   support

        Real       0.66      0.99      0.79     10133
        Fake       0.35      0.01      0.03      5324

    accuracy                           0.65     15457
   macro avg       0.50      0.50      0.41     15457
weighted avg       0.55      0.65      0.53     15457


Confusion Matrix:
[[10001   132]
 [ 5252    72]]
