In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries.
# Each entry is split on ':' and the URL part is percent-decoded before use.
# NOTE(review): this is a signed URL whose own query string carries
# X-Goog-Date=20240430T195037Z and X-Goog-Expires=259200 (seconds), so it
# stops working after that window and must be regenerated.
DATA_SOURCE_MAPPING = 'automathon-deepfake:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F77151%2F8401844%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240430%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240430T195037Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2f1af147451b61b9b37c274c0838d06d4c37cd281c5fa932291056fddf077c8fa77d3518e94bb97ad48aed973721d1c4fae5dae770addcfe382a284dccc6617038998dbdb970ced483d1211740cdd97b00437d32ab487747da390dc816925e5cda1708eec85ff3549d435b9da2590d13c3554a428dc26d9ab732ef373ca2c6657760eed8d98395ec796a37b42924f44cf6a135ecce303af6e41f497ece2def08abdf7e704190288ba3ed15f62a58156e593f3b65559f326c584d8884d30a865dd01ca1c8dff2726778adab2b12fd7076663d1708d70e1f5e1cb9e9fdd921db0d4adf99b3763c6398b5c6ad71261d33f8f1b1c0bf5e24f051cf3e5ddb91b2e2fc'

# Directory under which each data source is extracted (one sub-directory each).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created alongside the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible notebook.
KAGGLE_SYMLINK='kaggle'

# Start from a clean input directory: detach anything mounted there (stderr is
# discarded, so "not mounted" is silent), then remove leftovers from a
# previous run. The paths come from the constants above so that they cannot
# drift apart from the download destination.
!umount {KAGGLE_INPUT_PATH}/ 2> /dev/null
shutil.rmtree(KAGGLE_INPUT_PATH, ignore_errors=True)
for kaggle_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
    os.makedirs(kaggle_dir, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working relative to the current
# working directory. A link left over from an earlier run is kept as-is.
for link_target, link_name in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every configured data source into a temporary file and unpack it
# under KAGGLE_INPUT_PATH/<directory>. A failed download is reported and
# skipped so the remaining sources are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # maxsplit=1: only the first ':' separates the directory from the URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be absent (e.g. chunked responses); in that case
            # only the running byte count is shown instead of a progress bar.
            if total_length and total_length.isdigit():
                total_bytes = int(total_length)
            else:
                total_bytes = 0

            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    done = int(50 * dl / total_bytes)
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()

            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise risk reading a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind to a fresh name; reusing `tarfile` here would replace
                # the imported module for the rest of the notebook.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled before OSError (its base class) so the more
        # specific "likely expired" hint is shown for HTTP failures.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading automathon-deepfake, 3919757882 bytes compressed
Downloaded and uncompressed: automathon-deepfake
Data source import complete.


In [None]:
import os
import fnmatch
import pandas as pd
import json
import numpy as np
import cv2
import pyarrow as pa
import pyarrow.parquet as pq
import asyncio
from torchvision import transforms
from PIL import Image
import io
import cv2
import glob
import torch
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset



In [None]:
# Directory holding the videos and, next to them, the metadata file.
dataset_video_path = "/kaggle/input/automathon-deepfake/dataset/experimental_dataset"
dataset_metadata_path = os.path.join(dataset_video_path, "metadata.json")

# glob only expands wildcards: passing the bare directory returns the
# directory itself, not its contents. Match the video files explicitly and
# sort them so the listing is deterministic.
video_files = sorted(glob.glob(os.path.join(dataset_video_path, "*.mp4")))

# metadata.json is read with orient='index', so its keys become the index;
# resetting the index turns them into the 'Filename' column.
df_labels = pd.read_json(dataset_metadata_path, orient='index').reset_index()
df_labels.columns = ['Filename', 'Label']
# Numeric target: 1 where Label == 'real', 0 for every other value.
# NOTE(review): assumes the metadata values are strings such as 'real' --
# confirm against metadata.json, otherwise every row is labelled 0.
df_labels['label_value'] = np.where(df_labels['Label'] == 'real', 1, 0)

In [None]:
class video_dataset(Dataset):
    """Dataset returning the leading frames of a video together with its label.

    Each item is a ``(frames, label)`` pair where ``frames`` stacks at most
    ``sequence_length`` frames along a new first dimension.

    NOTE(review): a video with fewer than ``sequence_length`` readable frames
    yields a shorter stack, which the default DataLoader collation cannot
    batch with full-length items -- confirm all videos are long enough.
    """

    def __init__(self, df, base_video_path, transform=None, sequence_length=60):
        """
        Initializes the dataset.
        :param df: DataFrame containing 'Filename' and 'label_value'
        :param base_video_path: Directory the 'Filename' entries are relative to
        :param transform: Transformations to be applied to each frame; when
            omitted, raw frames are converted to tensors unchanged
        :param sequence_length: Number of frames to extract from each video
        """
        self.df = df
        self.transform = transform
        self.sequence_length = sequence_length
        self.base_path = base_video_path

    def __len__(self):
        """Returns the number of videos, i.e. the number of rows in ``df``."""
        return len(self.df)

    def __getitem__(self, idx):
        """
        Loads the frames and label of the video at row position ``idx``.
        :param idx: Positional row index into the DataFrame
        :return: Tuple ``(frames, label)``; ``frames`` is the stacked tensor of
            the first ``sequence_length`` frames
        :raises RuntimeError: If no frame could be read from the video
        """
        # Fetch the video path and label from the DataFrame
        row = self.df.iloc[idx]
        video_path = os.path.join(self.base_path, row['Filename'])
        label = row['label_value']

        # Extract frames from the video, stopping once enough were collected
        frames = []
        extractor = self.frame_extract(video_path)
        try:
            for frame in extractor:
                if self.transform:
                    frame = self.transform(frame)
                else:
                    # Without a transform the frame is still a numpy array,
                    # which torch.stack cannot handle.
                    frame = torch.as_tensor(frame)
                frames.append(frame)
                if len(frames) >= self.sequence_length:
                    break
        finally:
            # Close the generator explicitly so the capture is released right
            # away, including when the loop stops early.
            extractor.close()

        if not frames:
            raise RuntimeError(f"No frames could be read from video: {video_path}")

        # Stack the frames
        return torch.stack(frames)[:self.sequence_length], label

    def frame_extract(self, path):
        """
        Generator to extract frames from a given video path.

        Frames are yielded exactly as returned by ``cv2.VideoCapture.read``.
        NOTE(review): OpenCV decodes frames in BGR channel order; transforms
        that assume RGB (e.g. RGB mean/std normalization) see swapped
        channels -- confirm whether a BGR->RGB conversion is wanted.
        """
        print("Path of video", path)
        vidObj = cv2.VideoCapture(path)
        try:
            success, image = vidObj.read()
            while success:
                yield image
                success, image = vidObj.read()
        finally:
            # Runs on exhaustion and when the consumer closes the generator.
            vidObj.release()

In [None]:
# Per-frame preprocessing, applied in order: array -> PIL image, resize to
# 112x112, convert to a tensor, then normalize each channel with the given
# mean and standard deviation.
frame_transforms = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((112, 112)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Dataset over every labelled video, served in shuffled batches of four.
train_data = video_dataset(
    df_labels,
    dataset_video_path,
    transform=frame_transforms,
)
train_loader = DataLoader(
    train_data,
    batch_size=4,
    shuffle=True,
)

In [None]:
# Quick verification: pull only the first batch and report its shapes.
# Nothing is printed when the loader yields no batches.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    frames, labels = first_batch
    print("Batch shape:", frames.shape)  # Expected: (batch_size, sequence_length, channels, height, width)
    print("Labels shape:", labels.shape)  # Expected: (batch_size,)

Path of video /kaggle/input/automathon-deepfake/dataset/experimental_dataset/ozgsdfnksv.mp4
Path of video /kaggle/input/automathon-deepfake/dataset/experimental_dataset/mshibrgvlv.mp4
Path of video /kaggle/input/automathon-deepfake/dataset/experimental_dataset/oaaoicrccb.mp4
Path of video /kaggle/input/automathon-deepfake/dataset/experimental_dataset/iclluvbprk.mp4
Batch shape: torch.Size([4, 60, 3, 112, 112])
Labels shape: torch.Size([4])
