# Libraries

In [2]:
import torch
import os
import shutil
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
import random
from glob import glob


# Preparation:
Delete the whole processed_dir structure

In [3]:
if os.path.exists(PROCESSED_DIR):
        shutil.rmtree(PROCESSED_DIR )

Some global variables

In [4]:
RAW_DIR = "data/raw/140k face"
PROCESSED_DIR = "data/processed"

CATEGORIES = ["fake", "real"]
TRAIN_DIR = os.path.join(PROCESSED_DIR, "train")
TEST_DIR = os.path.join(PROCESSED_DIR, "test")

GABOR_TRAIN_DIR = os.path.join(PROCESSED_DIR, "train")
GABOR_TEST_DIR = os.path.join(PROCESSED_DIR, "test")

SHARPEN_TRAIN_DIR = os.path.join(PROCESSED_DIR, "train")
SHARPEN_TEST_DIR = os.path.join(PROCESSED_DIR, "test")

EDGE_TRAIN_DIR = os.path.join(PROCESSED_DIR, "train")
EDGE_TEST_DIR = os.path.join(PROCESSED_DIR, "test")

split_ratio = 0.9
random.seed(12347556)

BATCH_SIZE = 100
IMAGE_SIZE = (32,32)

# Transformation

The transformation structures

In [18]:
resize_transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE)
])

augmentation_transforms = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomRotation(degrees=30),
    transforms.ColorJitter(brightness=0.3, contrast=0.4, saturation=0.3, hue=0.2),
    transforms.ToTensor()
])

save_transform = transforms.ToPILImage()



Processing normal images

In [19]:
def process_images(category):    
    raw_path = os.path.join(RAW_DIR, category)
    processed_path = PROCESSED_DIR
    os.makedirs(processed_path, exist_ok=True)

    images = [os.path.join(raw_path, f) for f in os.listdir(raw_path) if f.endswith((".png", ".jpg", ".jpeg"))]

    image_counter = 0

    def get_unique_filename(base_path, base_name, counter):
        while True:
            filename = os.path.join(base_path, f"{base_name}_{counter}.jpg")
            if not os.path.exists(filename):
                return filename
            counter += 1

    for i in tqdm(range(0, 20000, BATCH_SIZE), desc=f"Processing {category}"): #len(images)
        batch = images[i:i + BATCH_SIZE]
        for img_path in batch:
            try:
                img = Image.open(img_path).convert("RGB")  # Kép betöltése

                # Save a simple resized version
                resized_img = resize_transform(img)
                resized_save_path = get_unique_filename(processed_path, category, image_counter)
                resized_img.save(resized_save_path)
                image_counter += 1

                # Save two augmented versions
                for aug_idx in range(0):
                    augmented_img = augmentation_transforms(img)  # Apply augmentation
                    augmented_img = save_transform(augmented_img)  # Convert tensor to PIL image
                    aug_save_path = get_unique_filename(processed_path, category, image_counter)
                    augmented_img.save(aug_save_path)
                    image_counter += 1

            except Exception as e:
                print(f"Error processing {img_path}: {e}")

 move data into respective folder

In [20]:
def split_and_move_data(split_ratio):
    # Get all image paths from the processed directory
    image_paths = [os.path.join(PROCESSED_DIR, f) for f in os.listdir(PROCESSED_DIR) if f.endswith((".png", ".jpg", ".jpeg"))]
    
    # Shuffle the image paths for random split
    random.shuffle(image_paths)
    
    # Calculate the split index
    split_index = int(len(image_paths) * split_ratio)
    
    # Split the images into train and test sets
    train_images = image_paths[:split_index]
    test_images = image_paths[split_index:]
    
    # Move images to the 'train' directory
    for img_path in train_images:
        shutil.move(img_path, os.path.join(TRAIN_DIR, os.path.basename(img_path)))
    
    # Move images to the 'test' directory
    for img_path in test_images:
        shutil.move(img_path, os.path.join(TEST_DIR, os.path.basename(img_path)))

    print(f"Total images: {len(image_paths)}")
    print(f"Training set: {len(train_images)}")
    print(f"Test set: {len(test_images)}")

In [21]:
os.makedirs(TRAIN_DIR, exist_ok=True)
os.makedirs(TEST_DIR, exist_ok=True)

In [6]:
os.makedirs(TRAIN_DIR, exist_ok=True)
os.makedirs(TEST_DIR, exist_ok=True)

for category in CATEGORIES:
    process_images(category)

split_and_move_data(split_ratio)


Total images: 2910
Training set: 2619
Test set: 291


In [22]:
RAW_DIR = "data/raw/140k face"

In [23]:
for category in CATEGORIES:
    process_images(category)

Processing fake: 100%|██████████| 200/200 [00:51<00:00,  3.89it/s]
Processing real: 100%|██████████| 200/200 [01:35<00:00,  2.10it/s]


In [24]:
split_and_move_data(split_ratio)

Total images: 40000
Training set: 36000
Test set: 4000


In [13]:
RAW_DIR = "data/raw/archive (2)/train"

In [14]:
for category in CATEGORIES:
    process_images(category)

Processing fake:   0%|          | 3/3125 [05:48<100:39:41, 116.07s/it]


KeyboardInterrupt: 

In [15]:
split_and_move_data(split_ratio)

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed\\train\\real_27936.jpg'