In [None]:
import os
import json
import glob
import shutil
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from google.colab import files

# --- 1. SETUP AND DOWNLOAD FROM KAGGLE ---
# Credentials are deliberately NOT hard-coded in this cell. They are taken
# from the KAGGLE_USERNAME / KAGGLE_KEY environment variables, or from a
# kaggle.json that is already present at KAGGLE_JSON_PATH.
# NOTE(review): an API key used to be embedded in this cell; treat that key
# as leaked and revoke it from the Kaggle account settings.
print("Setting up Kaggle credentials...")
KAGGLE_JSON_PATH = '/root/.kaggle/kaggle.json'
kaggle_creds = {
    "username": os.environ.get("KAGGLE_USERNAME"),
    "key": os.environ.get("KAGGLE_KEY"),
}

if all(kaggle_creds.values()):
    os.makedirs('/root/.kaggle', exist_ok=True)
    # Create the file owner-only from the start so the key is never readable
    # by others, not even briefly between creation and chmod.
    fd = os.open(KAGGLE_JSON_PATH, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'w') as f:
        json.dump(kaggle_creds, f)
    # os.open's mode is ignored when the file already existed, so enforce it.
    os.chmod(KAGGLE_JSON_PATH, 0o600)
elif not os.path.exists(KAGGLE_JSON_PATH):
    raise RuntimeError(
        "Kaggle credentials not found. Set the KAGGLE_USERNAME and KAGGLE_KEY "
        f"environment variables, or place a kaggle.json at {KAGGLE_JSON_PATH}."
    )

print("Downloading dataset from Kaggle...")
exit_code = os.system('kaggle datasets download -d manjilkarki/deepfake-and-real-images')

if exit_code != 0:
    raise RuntimeError("Failed to download dataset.")

print("Unzipping dataset...")
unzip_exit_code = os.system('unzip -q -o deepfake-and-real-images.zip -d /content/raw_data')

# Stop here on a failed extraction instead of silently building splits from a
# partial (or empty) /content/raw_data tree.
if unzip_exit_code != 0:
    raise RuntimeError("Failed to unzip dataset.")

# --- 2. CONFIGURATION ---
INPUT_ROOT = "/content/raw_data"
OUTPUT_DIR = "/content/cleaned_dataset"
SEED = 123
# Extensions treated as images; compared case-insensitively.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")


def find_images(root):
    """Return a sorted list of image file paths found anywhere under *root*.

    A file counts as an image when its name ends with one of
    IMAGE_EXTENSIONS, ignoring case (so ``.JPG`` is picked up too). Files and
    directories whose name starts with a dot are skipped, as a default
    recursive ``glob`` pattern would. Symbolic links to directories are not
    followed (``os.walk`` default).

    The result is sorted so that anything derived from it with a fixed random
    seed does not depend on the order the filesystem lists entries in.
    """
    found = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Prune hidden directories in place so os.walk does not descend.
        dirnames[:] = [d for d in dirnames if not d.startswith(".")]
        for filename in filenames:
            if filename.startswith("."):
                continue
            if filename.lower().endswith(IMAGE_EXTENSIONS):
                found.append(os.path.join(dirpath, filename))
    found.sort()
    return found


def label_for(path, root):
    """Return ``"real"``, ``"fake"`` or ``None`` for an image *path*.

    Only the part of *path* below *root* is inspected, so the name of the
    root directory itself can never influence the label. A path is labelled
    when exactly one of the words "real" / "fake" occurs in it
    (case-insensitive); if both or neither occur the result is ``None``.
    """
    relative = os.path.relpath(path, root).lower()
    has_real = "real" in relative
    has_fake = "fake" in relative
    if has_real == has_fake:
        return None
    return "real" if has_real else "fake"


# --- 3. SCAN AND SORT FILES ---
print("Scanning for images...")
all_files = find_images(INPUT_ROOT)

real_paths = []
fake_paths = []
unlabelled_count = 0
for image_path in all_files:
    label = label_for(image_path, INPUT_ROOT)
    if label == "real":
        real_paths.append(image_path)
    elif label == "fake":
        fake_paths.append(image_path)
    else:
        unlabelled_count += 1

print(f"Real images found: {len(real_paths)}")
print(f"Fake images found: {len(fake_paths)}")
# Ambiguous files are still left out, but no longer without a trace.
if unlabelled_count:
    print(f"Skipped (could not be labelled real/fake): {unlabelled_count}")

# --- 4. CREATE SPLITS ---
def get_splits(paths):
    """Split *paths* into (train, validation, test) lists, roughly 70/15/15.

    The paths are sorted first, so for a given set of paths and a given SEED
    the result is the same no matter what order the caller collected them in.

    Args:
        paths: sequence of file paths belonging to one class.

    Returns:
        A ``(train, val, test)`` tuple of lists. An empty input yields three
        empty lists. With fewer than 3 paths there are not enough items for
        three non-empty parts (``train_test_split`` raises ``ValueError`` in
        that case), so every path is placed in ``train`` and the other two
        lists are empty.
    """
    if not paths:
        return [], [], []

    ordered = sorted(paths)
    if len(ordered) < 3:
        return ordered, [], []

    # First carve off 15% for test ...
    train_val, test = train_test_split(ordered, test_size=0.15, random_state=SEED)
    # ... then 0.1765 of the remaining 85% (about 0.15 / 0.85) for validation,
    # which is ~15% of the original total.
    train, val = train_test_split(train_val, test_size=0.1765, random_state=SEED)
    return train, val, test

# Each class is split on its own by get_splits, real first and then fake.
(real_train, real_val, real_test), (fake_train, fake_val, fake_test) = (
    get_splits(real_paths),
    get_splits(fake_paths),
)

# Layout: split name -> class name -> list of source paths. The insertion
# order here is the order the folders are written in later.
splits = dict(
    train=dict(real=real_train, fake=fake_train),
    validation=dict(real=real_val, fake=fake_val),
    test=dict(real=real_test, fake=fake_test),
)

# --- 5. COPY FILES (NO PROCESSING) ---
# Copies every path in `splits` to OUTPUT_DIR/<split>/<class>/ unchanged.
print("Organizing files into Train/Validation/Test folders...")
# Start from a clean output tree so files from an earlier run cannot linger.
if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)

file_count = 0
failed_copies = []  # (source path, error) for every copy that did not succeed

for split_name, categories in splits.items():
    for class_name, paths in categories.items():
        save_dir = os.path.join(OUTPUT_DIR, split_name, class_name)
        os.makedirs(save_dir, exist_ok=True)
        # Names already handed out in this folder. Sources are gathered from
        # many sub-directories, so two of them can share a basename; without
        # this the later copy would silently overwrite the earlier one while
        # file_count still went up.
        used_names = set()

        for src_path in tqdm(paths, desc=f"{split_name}/{class_name}"):
            fname = os.path.basename(src_path)
            stem, ext = os.path.splitext(fname)
            unique_name = fname
            duplicate_index = 1
            # Keep the original name when it is free; otherwise append
            # _1, _2, ... before the extension until it is unique.
            while unique_name in used_names:
                unique_name = f"{stem}_{duplicate_index}{ext}"
                duplicate_index += 1
            used_names.add(unique_name)

            dst_path = os.path.join(save_dir, unique_name)
            try:
                # Direct file copy (Fastest method)
                shutil.copy2(src_path, dst_path)
            except OSError as err:
                # Best effort: keep going, but remember what failed so it can
                # be reported instead of vanishing.
                failed_copies.append((src_path, err))
                continue
            file_count += 1

if failed_copies:
    print(f"WARNING: {len(failed_copies)} file(s) could not be copied. First few:")
    for failed_src, failed_err in failed_copies[:10]:
        print(f"  {failed_src}: {failed_err}")

# --- 6. ZIP AND DOWNLOAD ---
# Pack the contents of OUTPUT_DIR into a single zip archive; make_archive
# appends the ".zip" extension to the base name given here.
print("Zipping the organized dataset...")
shutil.make_archive(
    base_name="/content/final_dataset_organized",
    format='zip',
    root_dir=OUTPUT_DIR,
)

# Hand the archive to the browser via the Colab files helper.
print("Starting download to local machine...")
files.download("/content/final_dataset_organized.zip")

# file_count is the tally kept by the copy step.
print(f"Operation complete. Total files organized: {file_count}")

Setting up Kaggle credentials...
Downloading dataset from Kaggle...
Unzipping dataset...
Scanning for images...
Real images found: 95201
Fake images found: 95134
Organizing files into Train/Validation/Test folders...


train/real: 100%|██████████| 66637/66637 [00:11<00:00, 5905.60it/s]
train/fake: 100%|██████████| 66590/66590 [00:11<00:00, 5846.15it/s]
validation/real: 100%|██████████| 14283/14283 [00:02<00:00, 6994.39it/s]
validation/fake: 100%|██████████| 14273/14273 [00:02<00:00, 5119.67it/s]
test/real: 100%|██████████| 14281/14281 [00:02<00:00, 5195.20it/s]
test/fake: 100%|██████████| 14271/14271 [00:03<00:00, 3757.25it/s]


Zipping the organized dataset...
Starting download to local machine...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Operation complete. Total files organized: 190335
