In [None]:
# ============================================================
# STAGE 1 ‚Äì INSTALLS
# ============================================================
!pip install -q torch torchvision scikit-learn opencv-python-headless


In [None]:
# ============================================================
# 1) IMPORTS + SEED
# ============================================================
import os
import json
import random
import numpy as np

import torch
from torchvision.datasets import ImageFolder
from torchvision import transforms
from PIL import Image
import cv2

from google.colab import drive

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and
            ``torch``; when CUDA is available, every CUDA device is seeded too.

    Returns:
        None.
    """
    # CPU-side generators all take the seed the same way.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing more to do on a CPU-only runtime.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed the Python, NumPy and PyTorch RNGs with 42 and report it.
set_seed(42)
print("‚úÖ Seed set to 42")


‚úÖ Seed set to 42


In [None]:
# ============================================================
# 2) PATHS (Drive + Dataset + PREPROC ROOT)
# ============================================================
# Mount Google Drive under /content/drive so the dataset folders are reachable.
drive.mount('/content/drive')

# Dataset root; the three split folders are derived from it in one step.
BASE_PATH = '/content/drive/MyDrive/MultiBanFake/Dataset'
TRAIN_DIR, VAL_DIR, TEST_DIR = (
    os.path.join(BASE_PATH, split_folder)
    for split_folder in ('Train', 'Validation', 'Test')
)

print("Train:", TRAIN_DIR)
print("Val  :", VAL_DIR)
print("Test :", TEST_DIR)

# Output folder for the preprocessed arrays; created if it does not exist yet.
PREPROC_ROOT = "/content/drive/MyDrive/MultiBanFake/preprocessed"
os.makedirs(PREPROC_ROOT, exist_ok=True)
print("PREPROC_ROOT:", PREPROC_ROOT)


Mounted at /content/drive
Train: /content/drive/MyDrive/MultiBanFake/Dataset/Train
Val  : /content/drive/MyDrive/MultiBanFake/Dataset/Validation
Test : /content/drive/MyDrive/MultiBanFake/Dataset/Test
PREPROC_ROOT: /content/drive/MyDrive/MultiBanFake/preprocessed


In [None]:
# ============================================================
# 3) BLUR ANALYSIS ON TRAIN (percentile-based threshold)
# ============================================================
def laplacian_var(pil_img):
    """Return the variance of the Laplacian of ``pil_img``.

    The image is converted to single-channel ("L") mode, the Laplacian is
    computed with 64-bit float output, and the variance of that response is
    returned.

    Args:
        pil_img: Image object exposing ``convert("L")`` (a PIL image here).

    Returns:
        The variance of the Laplacian response.
    """
    gray = np.array(pil_img.convert("L"))
    response = cv2.Laplacian(gray, cv2.CV_64F)
    return response.var()

def compute_blur_values(root):
    """Compute the Laplacian-variance score of every readable image under ``root``.

    ``root`` is scanned with ``ImageFolder``; each image is opened, converted
    to RGB and scored with ``laplacian_var``. The number of scored images, the
    min/max score and the 1st/2nd/5th/10th percentiles are printed.

    Args:
        root: Directory in the layout expected by ``ImageFolder``.

    Returns:
        1-D float64 ``np.ndarray`` with one score per image that could be
        opened, in ``ImageFolder.samples`` order.

    Raises:
        ValueError: If no image under ``root`` could be opened.
    """
    ds = ImageFolder(root=root)
    vals = []
    failed = 0
    for path, _ in ds.samples:
        try:
            # The context manager closes the file handle promptly; convert()
            # returns an independent in-memory image that outlives the handle.
            with Image.open(path) as raw:
                img = raw.convert("RGB")
        except Exception as e:
            # Report unreadable files instead of hiding them, and do not
            # swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
            failed += 1
            print("WARNING: failed to open:", path, "|", e)
            continue
        vals.append(laplacian_var(img))

    if not vals:
        # Fail with a clear message rather than an opaque NumPy reduction
        # error from min()/max() on an empty array.
        raise ValueError(f"No readable images found under {root!r}")

    vals = np.array(vals, dtype=np.float64)
    print("Total images:", len(vals))
    if failed:
        print("Unreadable images skipped:", failed)
    print("min:", vals.min(), "max:", vals.max())
    for p in [1, 2, 5, 10]:
        print(f"p{p}:", np.percentile(vals, p))
    return vals

# Score every training image, then use the 1st percentile of those scores as
# the blur cutoff for the training split.
blur_vals_train = compute_blur_values(TRAIN_DIR)
BLUR_THRESH_TRAIN = float(np.percentile(blur_vals_train, 1))  # bottom 1% extreme blur
print("üëâ Using BLUR_THRESH_TRAIN =", BLUR_THRESH_TRAIN)


Total images: 7680
min: 1.2552251173109121 max: 41068.54664945558
p1: 32.72085731788798
p2: 48.9287317643156
p5: 79.87211807113705
p10: 122.56347494283445
üëâ Using BLUR_THRESH_TRAIN = 32.72085731788798


In [None]:
# ============================================================
# 4) PREPROCESS SPLITS -> SAVE AS NPZ (Train/Val/Test)
# ============================================================
IMG_SIZE = 224  # common image size shared by all models

def preprocess_split(root, apply_blur_filter=False, blur_thresh=None, img_size=None):
    """Load, optionally blur-filter, and resize every image of one dataset split.

    Args:
        root: Split directory in the layout expected by ``ImageFolder``.
        apply_blur_filter: When True and ``blur_thresh`` is not None, images
            whose ``laplacian_var`` score is below ``blur_thresh`` are dropped.
        blur_thresh: Score cutoff for the blur filter; ignored when None.
        img_size: Side length of the square output images. Defaults to the
            module-level ``IMG_SIZE`` when None.

    Returns:
        Tuple ``(images, labels, classes)``:
        ``images`` is a uint8 array of shape [N, H, W, 3],
        ``labels`` is an int64 array of shape [N],
        ``classes`` is ``ImageFolder.classes`` for ``root``.

    Raises:
        ValueError: If no image survives loading and filtering.
    """
    side = IMG_SIZE if img_size is None else img_size
    ds = ImageFolder(root=root)
    resize_tf = transforms.Resize((side, side))
    # Loop-invariant: the filter is active only with both the flag and a cutoff.
    use_blur_filter = apply_blur_filter and (blur_thresh is not None)

    images = []
    labels = []
    skipped = 0
    failed = 0

    print(f"\nüîç Preprocessing: {root}")
    for path, lbl in ds.samples:
        try:
            # The context manager closes the file handle promptly; convert()
            # returns an independent in-memory image that outlives the handle.
            with Image.open(path) as raw:
                img = raw.convert("RGB")
        except Exception as e:
            failed += 1
            print("‚ö†Ô∏è Failed to open:", path, "|", e)
            continue

        # Blur handling: the score is taken on the full-resolution image,
        # before resizing.
        if use_blur_filter and laplacian_var(img) < blur_thresh:
            skipped += 1
            continue

        images.append(np.array(resize_tf(img), dtype=np.uint8))  # HWC, uint8
        labels.append(lbl)

    kept = len(images)
    if kept == 0:
        # np.stack([]) would raise an unhelpful "need at least one array"
        # error; raise the same exception type with an actionable message.
        raise ValueError(
            f"No images kept for {root!r} "
            f"(skipped_extreme_blur={skipped}, failed_to_open={failed})"
        )

    images = np.stack(images, axis=0)   # [N, H, W, C]
    labels = np.array(labels, dtype=np.int64)

    print(f"{root}: kept={kept}, skipped_extreme_blur={skipped}")
    if failed:
        # Unreadable files are otherwise invisible in the summary line.
        print(f"{root}: failed_to_open={failed}")
    return images, labels, ds.classes

# Train: drop the extremely blurry images (score below BLUR_THRESH_TRAIN).
train_imgs, train_labels, classes = preprocess_split(
    TRAIN_DIR, apply_blur_filter=True, blur_thresh=BLUR_THRESH_TRAIN
)

# Val/Test: the blur filter is normally off (it can be turned on if desired).
val_imgs, val_labels, val_classes = preprocess_split(
    VAL_DIR, apply_blur_filter=False, blur_thresh=None
)
test_imgs, test_labels, test_classes = preprocess_split(
    TEST_DIR, apply_blur_filter=False, blur_thresh=None
)

# Each split gets its label indices from its own ImageFolder scan, but only the
# Train class list is saved below. If a split has a different class list, its
# integer labels would not mean the same thing, so fail loudly here.
for split_name, split_classes in (("Validation", val_classes), ("Test", test_classes)):
    if split_classes != classes:
        raise ValueError(
            f"{split_name} classes {split_classes} do not match Train classes {classes}"
        )

print("\nTrain shape:", train_imgs.shape, train_labels.shape)
print("Val   shape:", val_imgs.shape, val_labels.shape)
print("Test  shape:", test_imgs.shape, test_labels.shape)

# Save the NPZ files directly to Drive (run once, reuse for every model).
for npz_name, split_imgs, split_labels in (
    ("train_npz.npz", train_imgs, train_labels),
    ("val_npz.npz", val_imgs, val_labels),
    ("test_npz.npz", test_imgs, test_labels),
):
    np.savez_compressed(
        os.path.join(PREPROC_ROOT, npz_name),
        images=split_imgs, labels=split_labels
    )

with open(os.path.join(PREPROC_ROOT, "classes.json"), "w") as f:
    json.dump(classes, f, indent=4)

print("\n‚úÖ Saved preprocessed numpy datasets to", PREPROC_ROOT)
print("   - train_npz.npz")
print("   - val_npz.npz")
print("   - test_npz.npz")
print("   - classes.json")



üîç Preprocessing: /content/drive/MyDrive/MultiBanFake/Dataset/Train
/content/drive/MyDrive/MultiBanFake/Dataset/Train: kept=7603, skipped_extreme_blur=77

üîç Preprocessing: /content/drive/MyDrive/MultiBanFake/Dataset/Validation
/content/drive/MyDrive/MultiBanFake/Dataset/Validation: kept=957, skipped_extreme_blur=0

üîç Preprocessing: /content/drive/MyDrive/MultiBanFake/Dataset/Test
/content/drive/MyDrive/MultiBanFake/Dataset/Test: kept=957, skipped_extreme_blur=0

Train shape: (7603, 224, 224, 3) (7603,)
Val   shape: (957, 224, 224, 3) (957,)
Test  shape: (957, 224, 224, 3) (957,)

‚úÖ Saved preprocessed numpy datasets to /content/drive/MyDrive/MultiBanFake/preprocessed
   - train_npz.npz
   - val_npz.npz
   - test_npz.npz
   - classes.json
