In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Phase 1: ff-c23 implementation

In [2]:
import os
import json
import pandas as pd
from collections import defaultdict
from pathlib import Path
import random

random.seed(42)

In [26]:
DATA_ROOT = Path("/kaggle/input/ff-c23/FaceForensics++_C23")  # adjust if needed
CELEBDF_ROOT = Path("/kaggle/input/celeb-df-v2")
DFDC_ROOT    = Path("/kaggle/input/deepfake-detection-challenge")

folders = [f.name for f in DATA_ROOT.iterdir() if f.is_dir()]
folders

['Face2Face',
 'csv',
 'Deepfakes',
 'DeepFakeDetection',
 'original',
 'NeuralTextures',
 'FaceShifter',
 'FaceSwap']

In [5]:
CSV_DIR = DATA_ROOT / "csv"
csv_files = list(CSV_DIR.glob("*.csv"))
csv_files


[PosixPath('/kaggle/input/ff-c23/FaceForensics++_C23/csv/FaceSwap.csv'),
 PosixPath('/kaggle/input/ff-c23/FaceForensics++_C23/csv/FaceShifter.csv'),
 PosixPath('/kaggle/input/ff-c23/FaceForensics++_C23/csv/DeepFakeDetection.csv'),
 PosixPath('/kaggle/input/ff-c23/FaceForensics++_C23/csv/Face2Face.csv'),
 PosixPath('/kaggle/input/ff-c23/FaceForensics++_C23/csv/Deepfakes.csv'),
 PosixPath('/kaggle/input/ff-c23/FaceForensics++_C23/csv/Mean_Data.csv'),
 PosixPath('/kaggle/input/ff-c23/FaceForensics++_C23/csv/original.csv'),
 PosixPath('/kaggle/input/ff-c23/FaceForensics++_C23/csv/NeuralTextures.csv'),
 PosixPath('/kaggle/input/ff-c23/FaceForensics++_C23/csv/FF++_Metadata_Shuffled.csv'),
 PosixPath('/kaggle/input/ff-c23/FaceForensics++_C23/csv/FF++_Metadata.csv')]

In [6]:
metadata = []

for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    df["source_csv"] = csv_file.name
    metadata.append(df)

metadata_df = pd.concat(metadata, ignore_index=True)
metadata_df.head()

Unnamed: 0.1,Unnamed: 0,File Path,Label,Frame Count,Width,Height,Codec,File Size(MB),source_csv,Frame Count Mean,Width Mean,Height Mean,File Size(MB) Mean
0,0,FaceSwap/000_003.mp4,FAKE,303.0,640.0,480.0,h264,0.69,FaceSwap.csv,,,,
1,1,FaceSwap/001_870.mp4,FAKE,460.0,1280.0,720.0,h264,2.77,FaceSwap.csv,,,,
2,2,FaceSwap/002_006.mp4,FAKE,310.0,1280.0,720.0,h264,0.96,FaceSwap.csv,,,,
3,3,FaceSwap/003_000.mp4,FAKE,303.0,640.0,480.0,h264,0.62,FaceSwap.csv,,,,
4,4,FaceSwap/004_982.mp4,FAKE,309.0,1280.0,720.0,h264,1.21,FaceSwap.csv,,,,


In [9]:
video_index = []

# REAL videos
for vid in (DATA_ROOT / "original").glob("*.mp4"):
    video_index.append({
        "video_path": str(vid),
        "label": "real",
        "manipulation": "original",
        "identity": vid.stem
    })

# FAKE videos
manipulation_folders = [
    "DeepFakeDetection", "Deepfakes", "Face2Face",
    "FaceShifter", "FaceSwap", "NeuralTextures"
]

for manip in manipulation_folders:
    for vid in (DATA_ROOT / manip).glob("*.mp4"):
        identity = vid.stem.split("_")[0]
        video_index.append({
            "video_path": str(vid),
            "label": "fake",
            "manipulation": manip,
            "identity": identity
        })

videos_df = pd.DataFrame(video_index)


In [10]:
videos_df.head()

Unnamed: 0,video_path,label,manipulation,identity
0,/kaggle/input/ff-c23/FaceForensics++_C23/origi...,real,original,123
1,/kaggle/input/ff-c23/FaceForensics++_C23/origi...,real,original,738
2,/kaggle/input/ff-c23/FaceForensics++_C23/origi...,real,original,479
3,/kaggle/input/ff-c23/FaceForensics++_C23/origi...,real,original,660
4,/kaggle/input/ff-c23/FaceForensics++_C23/origi...,real,original,565


In [11]:
videos_df["manipulation"].value_counts()

manipulation
original             1000
DeepFakeDetection    1000
Deepfakes            1000
Face2Face            1000
FaceShifter          1000
FaceSwap             1000
NeuralTextures       1000
Name: count, dtype: int64

In [12]:
videos_df["identity"].nunique()

1028

In [16]:
videos_df["manipulation"].value_counts()

manipulation
original             1000
DeepFakeDetection    1000
Deepfakes            1000
Face2Face            1000
FaceShifter          1000
FaceSwap             1000
NeuralTextures       1000
Name: count, dtype: int64

In [19]:
videos_df["label"].value_counts()

label
fake    6000
real    1000
Name: count, dtype: int64

In [20]:
import random

random.seed(42)

identities = videos_df["identity"].unique().tolist()
random.shuffle(identities)

n = len(identities)

train_ids = identities[:int(0.6 * n)]
val_ids   = identities[int(0.6 * n):int(0.8 * n)]
test_ids  = identities[int(0.8 * n):]

print(len(train_ids), len(val_ids), len(test_ids))


616 206 206


In [22]:
train_df = videos_df[videos_df["identity"].isin(train_ids)]
val_df   = videos_df[videos_df["identity"].isin(val_ids)]
test_df  = videos_df[videos_df["identity"].isin(test_ids)]

In [23]:
assert set(train_ids).isdisjoint(val_ids)
assert set(train_ids).isdisjoint(test_ids)
assert set(val_ids).isdisjoint(test_ids)

print("✅ Identity-disjoint splits verified")

✅ Identity-disjoint splits verified


In [24]:
from collections import defaultdict

def build_task_pools(df):
    tasks = defaultdict(list)
    for _, row in df.iterrows():
        task_name = f"FFPP_{row['manipulation']}"
        tasks[task_name].append({
            "video_path": row["video_path"],
            "label": row["label"],
            "identity": row["identity"]
        })
    return tasks

meta_train_tasks = build_task_pools(train_df)
meta_val_tasks   = build_task_pools(val_df)
meta_test_tasks  = build_task_pools(test_df)

meta_train_tasks.keys()


dict_keys(['FFPP_original', 'FFPP_DeepFakeDetection', 'FFPP_Deepfakes', 'FFPP_Face2Face', 'FFPP_FaceShifter', 'FFPP_FaceSwap', 'FFPP_NeuralTextures'])

In [34]:
import json
from pathlib import Path

OUTPUT_DIR = Path("/kaggle/working/phase1_tasks/ffpp")
OUTPUT_DIR.mkdir(exist_ok=True)

with open(OUTPUT_DIR / "meta_train_tasks.json", "w") as f:
    json.dump(meta_train_tasks, f, indent=2)

with open(OUTPUT_DIR / "meta_val_tasks.json", "w") as f:
    json.dump(meta_val_tasks, f, indent=2)

with open(OUTPUT_DIR / "meta_test_tasks.json", "w") as f:
    json.dump(meta_test_tasks, f, indent=2)

print("✅ Phase 1 FF++ task files saved")


✅ Phase 1 FF++ task files saved


# CelebDf implementation

In [27]:
celebdf_index = []

# Real videos
real_dir = CELEBDF_ROOT / "Celeb-real"
for vid in real_dir.glob("*.mp4"):
    celebdf_index.append({
        "video_path": str(vid),
        "label": "real",
        "manipulation": "CelebDF",
        "identity": vid.stem   # celebrity name / ID
    })

# Fake videos
fake_dir = CELEBDF_ROOT / "Celeb-synthesis"
for vid in fake_dir.glob("*.mp4"):
    identity = vid.stem.split("_")[0]
    celebdf_index.append({
        "video_path": str(vid),
        "label": "fake",
        "manipulation": "CelebDF",
        "identity": identity
    })

celebdf_df = pd.DataFrame(celebdf_index)

print("Celeb-DF identities:", celebdf_df["identity"].nunique())
print(celebdf_df["label"].value_counts())


Celeb-DF identities: 648
label
fake    5639
real     590
Name: count, dtype: int64


In [28]:
def build_tasks(df, prefix):
    tasks = defaultdict(list)
    for _, row in df.iterrows():
        task_name = f"{prefix}_{row['manipulation']}"
        tasks[task_name].append({
            "video_path": row["video_path"],
            "label": row["label"],
            "identity": row["identity"]
        })
    return tasks

celebdf_tasks = build_tasks(celebdf_df, "CELEBDF")


In [29]:
OUTPUT_DIR = Path("/kaggle/working/phase1_tasks/celebdf")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

with open(OUTPUT_DIR / "meta_test_tasks.json", "w") as f:
    json.dump(celebdf_tasks, f, indent=2)

print("✅ Celeb-DF Phase-1 tasks saved")


✅ Celeb-DF Phase-1 tasks saved
