In [10]:
!pip install -q datasets torch torchvision pillow timm

In [11]:
import os
import random
from collections import defaultdict
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
import timm
from datasets import load_dataset

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


Using device: cuda


In [12]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive

drive.mount('/content/drive')

# Root folder on the mounted Drive; the <split>/<class> image folders are
# created underneath it by a later cell.
BASE_DIR = "/content/drive/MyDrive/ai_image_dataset"
os.makedirs(BASE_DIR, exist_ok=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Open the "train" split in streaming mode so samples are iterated one at a
# time rather than materialising the whole dataset first.
dataset = load_dataset(
    "Parveshiiii/AI-vs-Real",
    split="train",
    streaming=True,
)


In [14]:
# Dataset partitions and target class names; both double as folder names.
splits = ["train", "val", "test"]
classes = ["real", "ai_generated"]

# Create one output folder per (split, class) pair: <BASE_DIR>/<split>/<class>.
for split in splits:
    for cls in classes:
        class_dir = f"{BASE_DIR}/{split}/{cls}"
        os.makedirs(class_dir, exist_ok=True)


In [None]:
import os
import random
from collections import defaultdict
from PIL import Image

# ==========================
# Parameters
# ==========================
# Number of images to save for each class (~1000 images total).
counts_per_class = {
    "real": 500,
    "ai_generated": 500,
}
# Fraction of every class that is routed to each split.
split_ratio = {
    "train": 0.7,
    "val": 0.15,
    "test": 0.15,
}
# Size handed to Image.resize before an image is written to disk.
image_size = (224, 224)
# saved_counts[split][label] -> images written so far (missing keys read as 0).
saved_counts = defaultdict(lambda: defaultdict(int))

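# Prerequisites (defined in earlier cells): the <BASE_DIR>/<split>/<class>
# folder structure must already exist, built from
#   splits  = ["train", "val", "test"]
#   classes = ["real", "ai_generated"]
# BASE_DIR may be a local path such as "/content/ai_image_dataset" or a path
# on the mounted Drive.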

# ==========================
# Helper function to save images
# ==========================
def save_image(img, path, size=None, quality=85):
    """Convert an image to RGB, resize it, and write it to ``path`` as a JPEG.

    Args:
        img: Image object providing ``convert`` and ``resize`` (the sampling
            loop passes ``sample["image"]``; presumably a PIL image — confirm
            against the dataset schema).
        path: Destination file path.
        size: Target size forwarded to ``resize``. When omitted, the
            module-level ``image_size`` is used, which matches the previous
            behaviour.
        quality: JPEG quality forwarded to ``save`` (default 85, as before).
    """
    if size is None:
        # Resolve the default at call time so later changes to image_size apply.
        size = image_size
    resized = img.convert("RGB").resize(size)
    resized.save(path, format="JPEG", quality=quality)

# ==========================
# Sample and save images
# ==========================
# Private, seeded RNG: split assignment is reproducible across re-runs and the
# global `random` state is left untouched.
split_rng = random.Random(42)

for sample in dataset:
    # The class id lives under the 'class' key. Per the original author's note
    # 1 = real and 0 = AI-generated — confirm against the dataset card.
    img_class = sample.get("class")
    if img_class is None:
        continue  # skip samples without a class id

    label = "real" if img_class == 1 else "ai_generated"

    # Nothing to do once this class has reached its overall target.
    total_saved = sum(saved_counts[split][label] for split in splits)
    if total_saved >= counts_per_class[label]:
        continue

    # Only consider splits that still have room for this class. Drawing among
    # the open splits (weighted by their ratios) yields the same assignment
    # distribution as "draw, then reject when full", but never throws away a
    # streamed sample just because the drawn split happened to be full.
    open_splits = [
        name
        for name in splits
        if saved_counts[name][label] < counts_per_class[label] * split_ratio[name]
    ]
    if not open_splits:
        continue  # every per-split quota is met (only possible if ratios sum to < 1)

    split = split_rng.choices(
        open_splits,
        weights=[split_ratio[name] for name in open_splits],
        k=1,
    )[0]

    # Files are numbered by the running per-split, per-class count.
    out_path = f"{BASE_DIR}/{split}/{label}/{saved_counts[split][label]}.jpg"
    save_image(sample["image"], out_path)
    saved_counts[split][label] += 1

    # Stop streaming as soon as every class has reached its target.
    if all(sum(saved_counts[s][c] for s in splits) >= counts_per_class[c] for c in classes):
        break

# ==========================
# Verify counts
# ==========================
# Report how many files ended up in each <split>/<class> folder.
for split in splits:
    for cls in classes:
        path = f"{BASE_DIR}/{split}/{cls}"
        n_files = len(os.listdir(path))
        print(f"{split} / {cls}: {n_files} images")


In [None]:
class AIImageDataset(Dataset):
    """Image-classification dataset read from ``<root_dir>/<split>/<class>/`` folders.

    Each class folder contributes its files as samples; the label of a sample
    is the index of its class in the class list.

    Args:
        root_dir: Directory containing one sub-directory per split.
        split: Name of the split sub-directory to read (e.g. ``"train"``).
        transform: Optional callable applied to the loaded RGB image.
        class_names: Ordered class folder names. Defaults to the module-level
            ``classes`` list, which matches the previous behaviour.
    """

    def __init__(self, root_dir, split, transform=None, class_names=None):
        if class_names is None:
            class_names = classes
        self.classes = list(class_names)
        self.transform = transform
        self.images = []
        self.labels = []
        for cls_idx, cls in enumerate(self.classes):
            folder = os.path.join(root_dir, split, cls)
            # Sort so the sample order does not depend on filesystem listing order.
            for img_name in sorted(os.listdir(folder)):
                img_path = os.path.join(folder, img_name)
                # Skip hidden entries and sub-directories (e.g. the
                # ".ipynb_checkpoints" folders notebooks tend to create),
                # which would otherwise fail in Image.open.
                if img_name.startswith(".") or not os.path.isfile(img_path):
                    continue
                self.images.append(img_path)
                self.labels.append(cls_idx)

    def __len__(self):
        """Return the number of indexed image files."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The image is loaded as RGB and passed through ``transform`` when one
        was supplied; the label is the integer class index.
        """
        # The context manager closes the underlying file once the pixel data
        # has been copied by convert().
        with Image.open(self.images[idx]) as raw:
            img = raw.convert("RGB")
        if self.transform:
            img = self.transform(img)
        label = self.labels[idx]
        return img, label

# Preprocessing shared by all splits: resize to 224x224, convert to a tensor,
# then normalise each channel (the values are the commonly used ImageNet
# channel mean / std).
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            [0.485, 0.456, 0.406],
            [0.229, 0.224, 0.225],
        ),
    ]
)

# One dataset per split, all reading from BASE_DIR with the same transform.
train_dataset, val_dataset, test_dataset = (
    AIImageDataset(BASE_DIR, split_name, transform=transform)
    for split_name in ("train", "val", "test")
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [None]:
# Pretrained EfficientNet-B0 with a fresh 2-way classification head.
model = timm.create_model('efficientnet_b0', pretrained=True, num_classes=2)
model = model.to(device)

# Freeze the backbone. timm's EfficientNet has no `.features` attribute (that
# is the torchvision layout), so freeze every parameter first and then
# re-enable only the classifier head.
for param in model.parameters():
    param.requires_grad = False
for param in model.get_classifier().parameters():
    param.requires_grad = True

# NOTE(review): the training loop calls model.train(), which presumably still
# lets normalisation layers in the frozen backbone update their running
# statistics — confirm whether that is intended.

criterion = nn.CrossEntropyLoss()
# Hand the optimizer only the parameters that will actually be updated.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = optim.Adam(trainable_params, lr=1e-3)


In [None]:
# Number of full passes over the training set.
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    # Per-epoch accumulators: summed loss, correct predictions, samples seen.
    running_loss, correct, total = 0.0, 0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Standard optimisation step: reset grads, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size to
        # obtain a per-sample average at the end of the epoch.
        running_loss += images.size(0) * loss.item()
        predicted = outputs.max(1).indices
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    train_acc = 100 * correct / total
    train_loss = running_loss / total

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {train_loss:.4f}, Accuracy: {train_acc:.2f}%")


In [None]:
# Evaluate on the held-out test split: eval mode, no gradient tracking.
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.max(1).indices
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

test_acc = 100 * correct / total
print(f"Test Accuracy: {test_acc:.2f}%")
