In [2]:
import os
import cv2

def extract_faces_from_video(video_path, output_folder, face_counter):
    """Detect faces in every frame of a video and save each crop as a JPEG.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory that receives the ``face_<n>.jpg`` files.
            It is not created here.
        face_counter: One-element list holding the next file number. It is
            advanced in place so numbering continues across calls.

    Returns:
        Number of face crops written for this video (0 if the video cannot
        be opened).

    Raises:
        RuntimeError: If the Haar cascade file cannot be loaded.
    """
    video_name = os.path.basename(video_path)

    # Load the pre-trained face detection model (Haar Cascade)
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    if face_cascade.empty():
        raise RuntimeError("Could not load haarcascade_frontalface_default.xml")

    # Load the video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        print(f"Could not open video: {video_path}")
        return 0

    frame_count = 0
    face_count = 0

    try:
        while True:
            # Read a frame from the video; a failed read marks the end of the stream
            ret, frame = cap.read()
            if not ret:
                break

            # Convert frame to grayscale for face detection
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # Detect faces in the frame
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

            # Process each detected face
            for (x, y, w, h) in faces:
                # Extract the face region from the colour frame
                face = frame[y:y+h, x:x+w]

                # Save the face image with a unique number
                face_filename = os.path.join(output_folder, f"face_{face_counter[0]}.jpg")
                if cv2.imwrite(face_filename, face):
                    # Only count and consume a number when the file was written
                    face_count += 1
                    face_counter[0] += 1
                else:
                    print(f"Failed to write {face_filename}")

            frame_count += 1
            if frame_count % 100 == 0:
                print(f"Processed {frame_count} frames from {video_name}, found {face_count} faces so far")
    finally:
        # Release the capture even if detection or writing raised
        cap.release()

    print(f"Finished processing {video_name}. Total faces extracted: {face_count}")
    return face_count

def process_videos_in_folder(input_folder, output_folder):
    """Extract faces from every video file directly inside ``input_folder``.

    Videos are processed in sorted filename order so the ``face_<n>.jpg``
    numbering is the same on every run. Sub-directories are ignored even if
    their name ends with a video extension.

    Args:
        input_folder: Directory that contains the video files.
        output_folder: Directory for the face crops; created if missing.

    Returns:
        None. Progress and totals are printed.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Get all video files in the input folder
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv', '.flv')
    video_files = sorted(
        f for f in os.listdir(input_folder)
        if f.lower().endswith(video_extensions)
        and os.path.isfile(os.path.join(input_folder, f))
    )

    if not video_files:
        print(f"No video files found in {input_folder}")
        return

    # Use a list with single element to maintain counter across function calls
    face_counter = [0]
    total_faces = 0

    for video_file in video_files:
        video_path = os.path.join(input_folder, video_file)

        print(f"\nProcessing video: {video_file}")
        faces_extracted = extract_faces_from_video(video_path, output_folder, face_counter)
        total_faces += faces_extracted

    print(f"\nAll videos processed! Total faces extracted: {total_faces}")

if __name__ == "__main__":
    # Source directory of real videos and destination for the face crops
    input_folder, output_folder = "Real", "imgreal"

    process_videos_in_folder(input_folder, output_folder)


Processing video: id0_0000.mp4
Processed 100 frames from id0_0000.mp4, found 100 faces so far
Processed 200 frames from id0_0000.mp4, found 201 faces so far
Processed 300 frames from id0_0000.mp4, found 302 faces so far
Processed 400 frames from id0_0000.mp4, found 402 faces so far
Finished processing id0_0000.mp4. Total faces extracted: 471

Processing video: id0_0001.mp4
Processed 100 frames from id0_0001.mp4, found 105 faces so far
Processed 200 frames from id0_0001.mp4, found 205 faces so far
Processed 300 frames from id0_0001.mp4, found 305 faces so far
Finished processing id0_0001.mp4. Total faces extracted: 308

Processing video: id0_0002.mp4
Processed 100 frames from id0_0002.mp4, found 100 faces so far
Processed 200 frames from id0_0002.mp4, found 200 faces so far
Processed 300 frames from id0_0002.mp4, found 300 faces so far
Finished processing id0_0002.mp4. Total faces extracted: 351

Processing video: id0_0003.mp4
Processed 100 frames from id0_0003.mp4, found 85 faces so f

In [3]:
import os
import cv2

def extract_faces_from_video(video_path, output_folder, face_counter):
    """Detect faces in every frame of a video and save each crop as a JPEG.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory that receives the ``face_<n>.jpg`` files.
            It is not created here.
        face_counter: One-element list holding the next file number. It is
            advanced in place so numbering continues across calls.

    Returns:
        Number of face crops written for this video (0 if the video cannot
        be opened).

    Raises:
        RuntimeError: If the Haar cascade file cannot be loaded.
    """
    video_name = os.path.basename(video_path)

    # Load the pre-trained face detection model (Haar Cascade)
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    if face_cascade.empty():
        raise RuntimeError("Could not load haarcascade_frontalface_default.xml")

    # Load the video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        print(f"Could not open video: {video_path}")
        return 0

    frame_count = 0
    face_count = 0

    try:
        while True:
            # Read a frame from the video; a failed read marks the end of the stream
            ret, frame = cap.read()
            if not ret:
                break

            # Convert frame to grayscale for face detection
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # Detect faces in the frame
            faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

            # Process each detected face
            for (x, y, w, h) in faces:
                # Extract the face region from the colour frame
                face = frame[y:y+h, x:x+w]

                # Save the face image with a unique number
                face_filename = os.path.join(output_folder, f"face_{face_counter[0]}.jpg")
                if cv2.imwrite(face_filename, face):
                    # Only count and consume a number when the file was written
                    face_count += 1
                    face_counter[0] += 1
                else:
                    print(f"Failed to write {face_filename}")

            frame_count += 1
            if frame_count % 100 == 0:
                print(f"Processed {frame_count} frames from {video_name}, found {face_count} faces so far")
    finally:
        # Release the capture even if detection or writing raised
        cap.release()

    print(f"Finished processing {video_name}. Total faces extracted: {face_count}")
    return face_count

def process_videos_in_folder(input_folder, output_folder):
    """Extract faces from every video file directly inside ``input_folder``.

    Videos are processed in sorted filename order so the ``face_<n>.jpg``
    numbering is the same on every run. Sub-directories are ignored even if
    their name ends with a video extension.

    Args:
        input_folder: Directory that contains the video files.
        output_folder: Directory for the face crops; created if missing.

    Returns:
        None. Progress and totals are printed.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Get all video files in the input folder
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv', '.flv')
    video_files = sorted(
        f for f in os.listdir(input_folder)
        if f.lower().endswith(video_extensions)
        and os.path.isfile(os.path.join(input_folder, f))
    )

    if not video_files:
        print(f"No video files found in {input_folder}")
        return

    # Use a list with single element to maintain counter across function calls
    face_counter = [0]
    total_faces = 0

    for video_file in video_files:
        video_path = os.path.join(input_folder, video_file)

        print(f"\nProcessing video: {video_file}")
        faces_extracted = extract_faces_from_video(video_path, output_folder, face_counter)
        total_faces += faces_extracted

    print(f"\nAll videos processed! Total faces extracted: {total_faces}")

if __name__ == "__main__":
    input_folder = "Fake"      # directory holding the source videos
    output_folder = "imgfake"  # directory that receives the face crops

    process_videos_in_folder(input_folder=input_folder, output_folder=output_folder)


Processing video: id0_id1_0000.mp4
Processed 100 frames from id0_id1_0000.mp4, found 100 faces so far
Processed 200 frames from id0_id1_0000.mp4, found 200 faces so far
Processed 300 frames from id0_id1_0000.mp4, found 300 faces so far
Processed 400 frames from id0_id1_0000.mp4, found 400 faces so far
Finished processing id0_id1_0000.mp4. Total faces extracted: 469

Processing video: id0_id1_0001.mp4
Processed 100 frames from id0_id1_0001.mp4, found 104 faces so far
Processed 200 frames from id0_id1_0001.mp4, found 204 faces so far
Processed 300 frames from id0_id1_0001.mp4, found 304 faces so far
Finished processing id0_id1_0001.mp4. Total faces extracted: 307

Processing video: id0_id1_0002.mp4
Processed 100 frames from id0_id1_0002.mp4, found 100 faces so far
Processed 200 frames from id0_id1_0002.mp4, found 200 faces so far
Processed 300 frames from id0_id1_0002.mp4, found 308 faces so far
Finished processing id0_id1_0002.mp4. Total faces extracted: 360

Processing video: id0_id1_

In [1]:
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.model_selection import train_test_split
from mtcnn import MTCNN  # For face detection
from transformers import ViTModel, ViTConfig

# ---- Configuration constants ----
# Image geometry
IMAGE_SIZE = (200, 200)    # target size passed to cv2.resize for every face crop

# Sequence handling
SEQUENCE_LENGTH = 10       # Number of frames to process as a sequence
FRAMES_TO_CAPTURE = 100    # upper bound of the frame counter in process_video

# Training
BATCH_SIZE = 32
EPOCHS = 2

# Compute device: CUDA when available, otherwise CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Face detector: a single module-level MTCNN instance used by detect_face().
# NOTE(review): the recorded output of this cell ends with
# "ModuleNotFoundError: No module named 'tensorflow'" — presumably raised while
# importing mtcnn; confirm TensorFlow is installed in the kernel environment.
detector = MTCNN()

class FaceSequenceDataset(Dataset):
    """Dataset that pairs each frame sequence with a single float32 label.

    Args:
        sequences: Indexable collection of frame sequences.
        labels: Indexable collection of labels, aligned with ``sequences``.
        transform: Optional callable applied to every frame of a sequence;
            the transformed frames are stacked into one tensor.
    """

    def __init__(self, sequences, labels, transform=None):
        self.sequences, self.labels = sequences, labels
        self.transform = transform

    def __len__(self):
        """Return the number of sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` for ``idx``; the label is a float32 tensor."""
        frames = self.sequences[idx]
        target = self.labels[idx]

        if not self.transform:
            # No transform configured: hand the stored sequence back unchanged
            return frames, torch.tensor(target, dtype=torch.float32)

        stacked = torch.stack(list(map(self.transform, frames)))
        return stacked, torch.tensor(target, dtype=torch.float32)

def detect_face(image):
    """Detect and crop a face from an image using MTCNN.

    Args:
        image: Image array indexed as ``[row, column, ...]``.

    Returns:
        The crop of the first detection, resized to ``IMAGE_SIZE``, or ``None``
        when nothing is detected or the detected box does not overlap the image.
    """
    # NOTE(review): callers in this file pass OpenCV images (BGR channel order);
    # confirm whether the MTCNN detector expects RGB input.
    results = detector.detect_faces(image)
    if len(results) == 0:
        return None

    x, y, width, height = results[0]['box']
    img_h, img_w = image.shape[:2]

    # Clamp both corners of the box to the image. The far corner is computed
    # from the unclamped origin so that a box starting at a negative coordinate
    # is trimmed rather than shifted into the image.
    x1, y1 = max(0, x), max(0, y)
    x2, y2 = min(img_w, x + width), min(img_h, y + height)
    if x2 <= x1 or y2 <= y1:
        # An empty crop would make cv2.resize raise
        return None

    face = image[y1:y2, x1:x2]
    return cv2.resize(face, IMAGE_SIZE)

def load_images_from_folder(folder, label):
    """Load images from a folder, crop the detected face, and assign labels.

    Files are visited in natural filename order (``face_2`` before
    ``face_10``) so the result is the same on every run and follows the
    numbering in the file names.

    Args:
        folder: Directory to scan for ``.png`` / ``.jpg`` / ``.jpeg`` files.
        label: Label stored once for every image in which a face is found.

    Returns:
        Tuple ``(images, labels)`` of NumPy arrays with one entry per image
        that could be read and in which ``detect_face`` found a face.
    """
    import re  # local import: only needed for the natural-order sort key

    def natural_key(name):
        # re.split with a capturing group alternates text and digit-run tokens,
        # so tokens at odd positions are always digit runs.
        tokens = re.split(r'(\d+)', name.lower())
        return [int(tok) if i % 2 else tok for i, tok in enumerate(tokens)]

    images = []
    labels = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility
    filenames = sorted(os.listdir(folder), key=lambda n: (natural_key(n), n))

    for filename in filenames:
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        image = cv2.imread(os.path.join(folder, filename))
        if image is None:
            # Unreadable or corrupt file: skip it
            continue

        face = detect_face(image)
        if face is not None:
            images.append(face)
            labels.append(label)

    return np.array(images), np.array(labels)

class VideoTransformer(nn.Module):
    """Per-frame ViT encoder followed by a temporal transformer and a sigmoid head.

    Each frame of a sequence is encoded by a ViT; the per-frame [CLS] features
    are passed through a transformer encoder along the sequence axis, and the
    features at the middle position are classified into a single probability.
    """

    def __init__(self):
        super(VideoTransformer, self).__init__()

        # Configuration for Vision Transformer
        config = ViTConfig(
            image_size=IMAGE_SIZE[0],
            patch_size=32,
            num_channels=3,
            hidden_size=768,
            num_hidden_layers=4,
            num_attention_heads=4,
            intermediate_size=3072,
            hidden_act='gelu',
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            initializer_range=0.02,
            layer_norm_eps=1e-12
        )

        self.vit = ViTModel(config)

        # Transformer for temporal sequence.
        # batch_first=True: forward() feeds (batch, seq, hidden). With the
        # default layout (seq, batch, hidden) attention would run across the
        # samples of a batch instead of across the frames of a sequence.
        self.temporal_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=768,
                nhead=4,
                dim_feedforward=3072,
                dropout=0.1,
                batch_first=True
            ),
            num_layers=2
        )

        # Classifier
        self.classifier = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Classify a batch of frame sequences.

        Args:
            x: Tensor of shape (batch_size, sequence_length, channels, height, width).

        Returns:
            Tensor of shape (batch_size,) holding the sigmoid output per sequence.
        """
        seq_len = x.shape[1]

        # Process each frame in the sequence through ViT
        vit_features = []
        for i in range(seq_len):
            frame = x[:, i, :, :, :]
            outputs = self.vit(frame)
            vit_features.append(outputs.last_hidden_state[:, 0, :])  # Take [CLS] token

        # Stack features along sequence dimension
        vit_features = torch.stack(vit_features, dim=1)  # (batch_size, seq_len, hidden_size)

        # Process temporal sequence
        temporal_features = self.temporal_transformer(vit_features)

        # Take features from the middle frame
        middle_idx = seq_len // 2
        middle_features = temporal_features[:, middle_idx, :]

        # Classify. Squeeze only the trailing size-1 dimension so that a batch
        # of one still yields shape (1,) rather than a 0-d tensor.
        output = self.classifier(middle_features)
        return output.squeeze(-1)

def create_sequences(images, labels, sequence_length=None):
    """Group consecutive images into fixed-length sequences for the transformer.

    Trailing images that do not fill a whole sequence are dropped. Each
    sequence takes the label of its middle frame.

    Args:
        images: Array-like of images; the first axis indexes images.
        labels: Array-like of labels aligned with ``images``.
        sequence_length: Frames per sequence. Defaults to ``SEQUENCE_LENGTH``.

    Returns:
        Tuple ``(sequences, sequence_labels)``. ``sequences`` has shape
        ``(num_sequences, sequence_length, *image_shape)``, also when
        ``num_sequences`` is 0. The arrays may share memory with the inputs.

    Raises:
        ValueError: If ``sequence_length`` is not positive.
    """
    if sequence_length is None:
        sequence_length = SEQUENCE_LENGTH
    if sequence_length <= 0:
        raise ValueError("sequence_length must be a positive integer")

    images = np.asarray(images)
    labels = np.asarray(labels)

    num_sequences = len(images) // sequence_length
    used = num_sequences * sequence_length  # images that fit into whole sequences

    # Consecutive blocks of `sequence_length` images become one sequence each
    sequences = images[:used].reshape((num_sequences, sequence_length) + images.shape[1:])

    # Use middle frame label: indices L//2, L + L//2, 2L + L//2, ...
    sequence_labels = labels[sequence_length // 2:used:sequence_length]

    return sequences, sequence_labels

def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader``; train when ``optimizer`` is given, else evaluate.

    Returns:
        Tuple ``(mean_batch_loss, accuracy)`` for the pass.
    """
    training = optimizer is not None
    model.train(training)

    running_loss = 0.0
    correct = 0
    total = 0

    with torch.set_grad_enabled(training):
        for sequences, labels in loader:
            sequences = sequences.to(DEVICE)
            labels = labels.to(DEVICE)

            if training:
                optimizer.zero_grad()

            # reshape(-1) keeps outputs 1-D even for a batch of one, so the
            # shapes always match `labels` for BCELoss.
            outputs = model(sequences).reshape(-1)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            running_loss += loss.item()
            predicted = (outputs > 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    return running_loss / len(loader), correct / total


def train_model():
    """Train a VideoTransformer on the face crops in ``imgreal`` and ``imgfake``.

    Real crops are labelled 1 and fake crops 0. Sequences are built separately
    for each class before the train/test split, so a sequence never mixes real
    and fake frames and its label matches every frame in it.

    Returns:
        The trained model, left in evaluation mode.

    Raises:
        ValueError: If either folder yields no complete sequence.
    """
    # Load real and fake images
    real_images, real_labels = load_images_from_folder('imgreal', 1)
    fake_images, fake_labels = load_images_from_folder('imgfake', 0)

    # Create sequences per class BEFORE splitting. Splitting individual frames
    # first shuffles them, which would mix real and fake frames in one sequence.
    real_seq, real_seq_labels = create_sequences(real_images, real_labels)
    fake_seq, fake_seq_labels = create_sequences(fake_images, fake_labels)
    if len(real_seq) == 0 or len(fake_seq) == 0:
        raise ValueError(
            "Need at least one full sequence of real and of fake faces "
            f"(got {len(real_seq)} real, {len(fake_seq)} fake)"
        )

    # Combine datasets
    X = np.concatenate((real_seq, fake_seq))
    y = np.concatenate((real_seq_labels, fake_seq_labels))

    # Split whole sequences into train and test sets
    X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Data augmentation and transformations
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    train_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.RandomAffine(0, translate=(0.1, 0.1)),
        transforms.RandomResizedCrop(IMAGE_SIZE, scale=(0.9, 1.0)),
        transforms.ToTensor(),
        normalize
    ])

    test_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.ToTensor(),
        normalize
    ])

    # Create datasets
    train_dataset = FaceSequenceDataset(X_train_seq, y_train_seq, train_transform)
    test_dataset = FaceSequenceDataset(X_test_seq, y_test_seq, test_transform)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Create model
    model = VideoTransformer().to(DEVICE)
    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr=0.0001)

    # Training loop: one training pass and one validation pass per epoch
    for epoch in range(EPOCHS):
        train_loss, train_acc = _run_epoch(model, train_loader, criterion, optimizer)
        val_loss, val_acc = _run_epoch(model, test_loader, criterion)

        print(f"Epoch {epoch+1}/{EPOCHS}: "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    return model

def process_video(video_path, model):
    """Process a video and report whether its faces look real or fake.

    Frames are read until ``FRAMES_TO_CAPTURE`` frames with a detected face
    have been scored or the video ends. Each detected face is repeated
    ``SEQUENCE_LENGTH`` times to form the sequence the model expects.

    Args:
        video_path: Path of the video to analyse.
        model: Model called with a (1, SEQUENCE_LENGTH, C, H, W) tensor and
            returning a single value; outputs above 0.5 are reported as real.

    Returns:
        Mean of the per-frame predictions, or 0.5 when the video cannot be
        opened or no face is detected.
    """
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        print(f"Could not open video: {video_path}")
        return 0.5  # Same neutral value as when no faces are found

    # Inference only: make sure dropout is disabled
    model.eval()

    frame_count = 0
    predictions = []

    try:
        while frame_count < FRAMES_TO_CAPTURE and cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Detect face; frames without one are skipped and not counted
            face = detect_face(frame)
            if face is None:
                continue

            # Preprocess face for model: (C, H, W)
            face_tensor = transform(face).to(DEVICE)

            # Create sequence by repeating the same frame: (1, seq, C, H, W)
            sequence = face_tensor.unsqueeze(0).repeat(SEQUENCE_LENGTH, 1, 1, 1).unsqueeze(0)

            # Predict
            with torch.no_grad():
                prediction = model(sequence).item()

            predictions.append(prediction)
            frame_count += 1

            print(f"Frame {frame_count}: {'Real' if prediction > 0.5 else 'Fake'} ({prediction:.4f})")
    finally:
        # Release the capture even if detection or inference raised
        cap.release()

    if not predictions:
        print("No faces detected in video.")
        return 0.5  # Neutral value if no faces found

    avg_prediction = np.mean(predictions)
    print(f"\nAverage prediction: {avg_prediction:.4f}")
    print(f"Final determination: {'REAL' if avg_prediction > 0.5 else 'FAKE'}")

    return avg_prediction

if __name__ == "__main__":
    # Train or load model
    model_path = "face_authenticity_transformer.pth"

    if os.path.exists(model_path):
        model = VideoTransformer().to(DEVICE)
        # map_location lets a checkpoint saved on a GPU load on a CPU-only machine
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.eval()
        print("Loaded pre-trained model")
    else:
        print("Training new model...")
        model = train_model()
        torch.save(model.state_dict(), model_path)
        print("Model saved to", model_path)

    # Test on video.
    # os.path.join replaces the literal "Real\id0_0000.mp4", whose "\i" is an
    # invalid escape sequence, and builds a path that works on any OS.
    video_path = os.path.join("Real", "id0_0000.mp4")
    result = process_video(video_path, model)

  video_path = "Real\id0_0000.mp4"
  video_path = "Real\id0_0000.mp4"


ModuleNotFoundError: No module named 'tensorflow'

In [2]:
pip install mtcnn

Defaulting to user installation because normal site-packages is not writeable
Collecting mtcnn
  Using cached mtcnn-1.0.0-py3-none-any.whl.metadata (5.8 kB)
Collecting lz4>=4.3.3 (from mtcnn)
  Downloading lz4-4.4.4-cp313-cp313-win_amd64.whl.metadata (3.9 kB)
Using cached mtcnn-1.0.0-py3-none-any.whl (1.9 MB)
Downloading lz4-4.4.4-cp313-cp313-win_amd64.whl (99 kB)
Installing collected packages: lz4, mtcnn
Successfully installed lz4-4.4.4 mtcnn-1.0.0
Note: you may need to restart the kernel to use updated packages.
