In [1]:
%cd ..

/home/adrian/synthetic-image-detection


New Mapping

In [14]:
import os
import pandas as pd

mappings = {
    '__deprecated__/annotations/train_annotations.txt': 'data/train.csv',
    '__deprecated__/annotations/test_all.txt': 'data/test.csv',
}

for src, dest in mappings.items():
    with open(src, 'r') as f:
        lines = f.readlines()
        
    ann = []

    for line in lines:
        values = line.strip().split(',')
        ann.append({
            'image_path': os.path.join('data/images/', values[0][2:]), # [2:] to remove the leading './'
            'label': values[1],
            'content_type': values[2],
            'model': values[3],
            'specific_model': values[4],
        })

    pd.DataFrame(ann).to_csv(dest, index=False)

Data

In [None]:
# AUGMENTATIONS
import albumentations as A

def default_augmentation():
    return A.Compose([
            A.PadIfNeeded(96, 96),
            A.RandomCrop(96, 96),
            A.OneOf([
                A.OneOf([
                    A.ImageCompression(quality_lower=50, quality_upper=95, compression_type=0, p=1),
                    A.ImageCompression(quality_lower=50, quality_upper=95, compression_type=1, p=1),
                ], p=1),
                A.GaussianBlur(blur_limit=(3, 7), p=1.0),
                A.GaussNoise(var_limit=(3.0, 10.0), p=1.0),
            ], p=0.5),
            A.RandomRotate90(p=0.33),
            A.OneOf([
                A.HorizontalFlip(p=0.5),
                A.VerticalFlip(p=0.5),
            ], p=0.33),
        ], p=1.0)

# TRANSFORMATIONS
from torchvision import transforms as T

IMAGINET_MEAN = (0.485, 0.456, 0.406)
IMAGINET_STD = (0.229, 0.224, 0.225)
IMAGINET_SIZE = 256

class TwoViewsTransform:
    """Create two views of the same image"""
    def __init__(self, transform):
        self.transform = transform

    def __call__(self, x):
        return [self.transform(x), self.transform(x)]

def default_imaginet_transform(randaug=False, multiview=False):
    transform = T.Compose([
        T.ToTensor(),
        T.Normalize(mean=IMAGINET_MEAN, std=IMAGINET_STD)
    ])

    if randaug:
        transform.transforms.insert(0, T.RandAugment(num_ops=2, magnitude=9))
    
    if multiview: # Create two views of the same image for contrastive learning
        transform = TwoViewsTransform(transform)
    
    return transform
    
# DATASETS
import torch
import numpy as np
import pandas as pd
from PIL import Image, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True
Image.MAX_IMAGE_PIXELS = 933120000

class ImagiNet(torch.utils.data.Dataset):
    def __init__(self, split='train', task='all', transform=None, augmentation=None):
        self.split = split
        self.data = pd.read_csv(f'data/{split}.csv')

        # Define label with respect to the task
        if task == 'all':
            self.task = ['label', 'content_type', 'model', 'specific_model']
        elif task in ['label', 'content_type', 'model', 'specific_model']:
            self.task = [task]
        else:
            raise ValueError(f"Unknown task: {task}")
        
        # Augmentations
        self.augmentation = augmentation if augmentation is not None else default_augmentation()

        # Transformations
        self.transform = transform if transform is not None else default_imaginet_transform()

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        # IMAGE LOADING
        image = np.asarray(Image.open(row['image_path']).convert('RGB')) # Load as a numpy array

        if self.split in ['train', 'val']: # Training/Validation set
            image = self.augmentation(image=image)['image']

        image = self.transform(image)

        # LABEL FORMAT
        label = torch.tensor([row[t] for t in self.task], dtype=torch.long)

        return image, label  



(tensor([[[ 0.2796,  0.4166,  0.4851,  ..., -1.3644, -1.3815, -1.5528],
          [ 0.3481,  0.4166,  0.3309,  ..., -1.5870, -1.5699, -1.5014],
          [ 0.0912,  0.2282,  0.2967,  ..., -1.5357, -1.3815, -1.2103],
          ...,
          [ 0.2111,  0.1597,  0.1426,  ...,  0.2624,  0.3309,  0.3994],
          [ 0.1768,  0.2796,  0.2796,  ...,  0.3138,  0.2796,  0.2624],
          [ 0.2796,  0.3481,  0.2453,  ...,  0.4679,  0.3652,  0.3481]],
 
         [[ 0.2402,  0.3978,  0.4153,  ..., -1.1954, -1.2304, -1.3354],
          [ 0.3102,  0.3978,  0.2227,  ..., -1.4230, -1.4580, -1.3704],
          [-0.0049,  0.1352,  0.1702,  ..., -1.3529, -1.3004, -1.1078],
          ...,
          [ 0.2402,  0.1877,  0.1702,  ...,  0.2927,  0.4503,  0.4678],
          [ 0.1877,  0.2402,  0.2402,  ...,  0.2577,  0.2402,  0.2752],
          [ 0.2752,  0.2927,  0.1877,  ...,  0.3978,  0.3627,  0.3452]],
 
         [[-0.1661, -0.0615, -0.0615,  ..., -0.9853, -1.1247, -1.2467],
          [-0.0964, -0.0615,

Model

In [27]:
from transformers import AutoImageProcessor, ResNetForImageClassification
import torch

processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
model = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [32]:
model

ResNetForImageClassification(
  (resnet): ResNetModel(
    (embedder): ResNetEmbeddings(
      (embedder): ResNetConvLayer(
        (convolution): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        (normalization): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (activation): ReLU()
      )
      (pooler): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    )
    (encoder): ResNetEncoder(
      (stages): ModuleList(
        (0): ResNetStage(
          (layers): Sequential(
            (0): ResNetBottleNeckLayer(
              (shortcut): ResNetShortCut(
                (convolution): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
                (normalization): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              )
              (layer): Sequential(
                (0): ResNetConvLayer(
                  (convolution): Conv2d(64

__deprecated__

In [2]:
import os
import torch
from PIL import Image, ImageFile
from torch.utils.data import Dataset
import albumentations as A
import numpy as np
import random

ImageFile.LOAD_TRUNCATED_IMAGES = True
Image.MAX_IMAGE_PIXELS = 933120000

# COMPRESION TYPES
# 0: JPEG
# 1: WEBP
    
class ImagiNet(Dataset):
    def __init__(self, root_dir, annotations_file, track="all", load_pil=False, get_only_sd=False, train=True, resize=False, default_aug_album=None, anchor=False,  test_aug=False, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.train = train
        self.anchor = anchor
        self.resize = resize
        self.test_aug = test_aug
        self.load_pil = load_pil
        with open(annotations_file) as f:
            lines = f.readlines()
        tracks = ["origin", "content_type", "model", "specific_model", "all"]
        if track not in tracks:
            raise Exception("Not valid track")
        ann = [{'image': line.strip().split(",")[0], 'label': line.strip().split(",")[1:]} for line in lines]
        if track == tracks[0]:
            self.annotations = [{'image': a["image"], 'label': a["label"][0]} for a in ann]
        elif track == tracks[1]:
            self.annotations = [{'image': a["image"], 'label': a["label"][1]} for a in ann if int(a["label"][0]) == 1]
        elif track == tracks[2]:
            self.annotations = [{'image': a["image"], 'label': a["label"][2]} for a in ann if int(a["label"][0]) == 1]
        elif track == tracks[3]:
            self.annotations = [{'image': a["image"], 'label': a["label"][3]} for a in ann if int(a["label"][0]) == 1]
        else:
            self.annotations = [{'image': a["image"], 'label': a["label"]} for a in ann]
        
        self.albu_transform = A.Compose([
            A.PadIfNeeded(96, 96),
            A.RandomCrop(96, 96),
            A.OneOf([
                A.OneOf([
                    A.ImageCompression(quality_lower=50, quality_upper=95, compression_type=0, p=1),
                    A.ImageCompression(quality_lower=50, quality_upper=95, compression_type=1, p=1),
                ], p=1),
                A.GaussianBlur(blur_limit=(3, 7), p=1.0),
                A.GaussNoise(var_limit=(3.0, 10.0), p=1.0),
            ], p=0.5),
            A.RandomRotate90(p=0.33),
            A.OneOf([
                A.HorizontalFlip(p=0.5),
                A.VerticalFlip(p=0.5),
            ], p=0.33),
        ], p=1.0)
        default_transform = [A.PadIfNeeded(256, 256), A.CenterCrop(256, 256)]
        self.albu_default = A.Compose(default_transform, p=1) if default_aug_album is None else default_aug_album
        self.label_transform = lambda data: torch.tensor(data, dtype=torch.long)

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        img_path = os.path.join(self.root_dir, self.annotations[idx]['image'])
        image_o = Image.open(img_path).convert('RGB')
        label = self.annotations[idx]['label']
        image = None
        if self.transform:
            if self.load_pil:
                image = self.transform(image_o)
            else:
                image_o =  np.asarray(image_o)
                if self.train == True:
                    if self.resize:
                        h = random.randint(160, max(min(image_o.shape[0:2]), 160))
                        interp = 4 if h > 256 else 2
                        resize =  A.Compose([
                                A.PadIfNeeded(h, h),
                                A.RandomCrop(h,h),
                                A.Resize(256, 256, interpolation=interp)], p=0.5)
                        image_o = resize(image=image_o)["image"]
                    image = self.albu_transform(image=image_o)["image"]
                    image = self.transform(image)
                else:
                    if self.anchor:
                        h = random.randint(160, max(min(image_o.shape[0:2]), 160))
                        interp = 4 if h > 256 else 2
                        res = A.Compose([
                                    A.PadIfNeeded(256, 256),
                                    A.RandomCrop(256, 256),
                                    A.OneOf([
                                        A.OneOf([
                                            A.ImageCompression(quality_lower=50, quality_upper=95, compression_type=0, p=1),
                                            A.ImageCompression(quality_lower=50, quality_upper=95, compression_type=1, p=1),
                                        ], p=1),
                                        A.GaussianBlur(blur_limit=(3, 7), p=1.0),
                                        A.GaussNoise(var_limit=(3.0, 10.0), p=1.0)
                                    ], p=1)
                                ], p=0.5)
                        if self.resize:
                            res = A.OneOf([
                                A.Compose([
                                    A.PadIfNeeded(256, 256),
                                    A.RandomResizedCrop(256, 256),
                                    A.OneOf([
                                        A.ImageCompression(quality_lower=50, quality_upper=95, compression_type=0, p=1),
                                        A.ImageCompression(quality_lower=50, quality_upper=95, compression_type=1, p=1),
                                    ], p=0.5),
                                ], p=0.3),
                                A.Compose([
                                    A.PadIfNeeded(256, 256),
                                    A.RandomCrop(256, 256),
                                    A.OneOf([
                                        A.OneOf([
                                            A.ImageCompression(quality_lower=50, quality_upper=95, compression_type=0, p=1),
                                            A.ImageCompression(quality_lower=50, quality_upper=95, compression_type=1, p=1),
                                        ], p=1),
                                        A.GaussianBlur(blur_limit=(3, 7), p=1.0),
                                        A.GaussNoise(var_limit=(3.0, 10.0), p=1.0)
                                    ], p=0.7)
                                ], p=1)], p=0.5)
                        anchor = A.Compose([
                        res,
                        A.PadIfNeeded(256, 256),
                        A.RandomCrop(256, 256),
                        A.RandomRotate90(p=0.33),
                        A.Flip(p=0.33),
                        ], p=1.0)
                        image = anchor(image=image_o)["image"]
                    else:

                        if self.test_aug:
                            h = random.randint(256, max(min(min(image_o.shape[0:2]), 1000), 256))
                            interp = 4
                            resize =  A.Compose([
                                    A.PadIfNeeded(h, h),
                                    A.RandomCrop(h,h),
                                    A.Resize(256, 256, interpolation=interp),
                                    A.OneOf([
                                            A.ImageCompression(quality_lower=60, quality_upper=100, compression_type=0, p=1),
                                            A.ImageCompression(quality_lower=60, quality_upper=100, compression_type=1, p=1),
                                    ], p=0.75),
                                    ], p=1)
                            image = resize(image=image_o)["image"]
                        else:
                            image = self.albu_default(image=image_o)["image"]
                    image = self.transform(image)
        return image, self.label_transform(list(map(int, label)))


In [3]:
from torchvision import transforms
from torchvision.transforms import ToTensor, Normalize

transform = []
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
transform += [transforms.ToTensor(), normalize]
train_transform = transforms.Compose(transform)

d = ImagiNet('data/images', "__deprecated__/annotations/train_annotations.txt", track="all", train=True, resize=False, transform=train_transform)

In [4]:
d[0]

(tensor([[[ 0.0569,  0.0912,  0.1083,  ..., -0.6281, -1.0048, -1.3130],
          [ 0.0912,  0.1083,  0.1597,  ..., -0.9534, -1.3302, -1.3815],
          [ 0.0056,  0.0056,  0.1939,  ..., -1.3473, -1.3302, -1.2617],
          ...,
          [-0.1314, -0.1486, -0.1657,  ..., -0.5253, -0.5424, -0.5253],
          [-0.0458, -0.0629, -0.0972,  ..., -0.4911, -0.5082, -0.4568],
          [-0.0116, -0.0116, -0.0116,  ..., -0.3712, -0.3541, -0.2342]],
 
         [[ 0.1352,  0.1352,  0.1176,  ..., -0.4601, -0.9503, -1.3354],
          [ 0.2052,  0.1877,  0.2052,  ..., -0.7227, -1.1779, -1.2304],
          [ 0.0651,  0.0301,  0.1702,  ..., -1.0028, -1.0203, -0.9503],
          ...,
          [-0.0924, -0.1099, -0.1275,  ..., -0.5301, -0.5476, -0.5301],
          [-0.0749, -0.0749, -0.0924,  ..., -0.4776, -0.4951, -0.4426],
          [-0.1099, -0.1099, -0.0924,  ..., -0.3375, -0.3375, -0.2150]],
 
         [[ 0.2348,  0.2522,  0.1999,  ..., -0.0790, -0.5670, -0.9678],
          [ 0.3393,  0.3219,