In [17]:
import pandas as pd
from torchvision.datasets import ImageFolder
import re

data_path = '/ix/akovashka/arr159/imagenet-r' 
dataset = ImageFolder(root=data_path)

# extract relative path
img_ids = ['/'.join(path.split('/')[-2:]) for path, _ in dataset.samples]

# extract attributes
pattern = re.compile(r"([^/]+)_(\d+)\.jpg$")

attributes=[]
for path, _ in dataset.samples:
    match = pattern.search(path)
    if match:
        transformation_type = match.group(1)
    else:
        raise Exception
    attributes.append(transformation_type)
        
ground_truth_classes = [label for _, label in dataset.samples]
gt_codes = [path.split('/')[-2] for path, _ in dataset.samples]

df = pd.DataFrame({
    'img_id': img_ids,
    'attribute': attributes,
    'gt_code': gt_codes,
    'gt': ground_truth_classes
})

In [7]:
df['attribute'].unique()

array(['art', 'cartoon', 'deviantart', 'embroidery', 'graffiti',
       'graphic', 'misc', 'origami', 'painting', 'sculpture', 'sketch',
       'sticker', 'toy', 'videogame', 'tattoo'], dtype=object)

In [14]:
from os import path

mapping_raw = open(path.join(data_path, 'README.txt')).readlines()
mapping = {line.split()[0]: line.split()[1].rstrip() for line in mapping_raw[13:]}

In [19]:
df.to_csv('mock_data/dataset.csv')

In [22]:
import json
with open('mock_data/mapping.json', 'w') as json_file:
    json.dump(mapping, json_file, indent=4)

# Create Pred Splits

In [4]:
import random

def corruption_fn(df, condition_fn, corruption_matrix, corruption_prob):
    predictions = []
    for _, row in df.iterrows():
        if condition_fn(row):
            if random.random() < corruption_prob:
                sampled_class = random.choices(list(range(len(corruption_matrix[row['gt']]))), weights=corruption_matrix[row['gt']], k=1)[0] 
                predictions.append(sampled_class)
            else:
                predictions.append(row['pred'])
        else:
            predictions.append(row['pred']) # keep it the same
    return predictions

In [35]:
import numpy as np
condition = lambda row: True
df['pred'] = df['gt']
corruption_matrix = np.ones((200,200)) # these are the weights for class change
# set diagonal to zero
np.fill_diagonal(corruption_matrix, 0)

predictions = corruption_fn(df, condition, corruption_matrix, corruption_prob=0.25)
with open('mock_data/pred_splits/split_0.txt', 'w') as f:
    for pred in predictions:
        f.write(f'{pred}\n')

In [44]:
condition = lambda row: row['attribute']=='sketch' and mapping[row['gt_code']] == 'goose'
df['pred'] = df['gt']
corruption_matrix = np.ones((200,200)) # these are the weights for class change
# set diagonal to zero
np.fill_diagonal(corruption_matrix, 0)

predictions = corruption_fn(df, condition, corruption_matrix, corruption_prob=0.6)
df['pred'] = predictions

condition = lambda row: not (row['attribute']=='sketch' and mapping[row['gt_code']] == 'goose')
corruption_matrix = np.ones((200,200)) # these are the weights for class change
# set diagonal to zero
np.fill_diagonal(corruption_matrix, 0)

predictions = corruption_fn(df, condition, corruption_matrix, corruption_prob=0.25)

In [45]:
(np.array(predictions) != df['gt'].values).mean()

0.25016666666666665

In [46]:

with open('mock_data/pred_splits/split_1.txt', 'w') as f:
    for pred in predictions:
        f.write(f'{pred}\n')

In [53]:
gt_code2idx = dict(df[['gt_code', 'gt']].drop_duplicates().values)
class_name2idx = {mapping[gt_code]: idx for gt_code, idx in gt_code2idx.items()}

In [54]:
condition = lambda row: row['attribute']=='sketch' and mapping[row['gt_code']] == 'orangutan'
df['pred'] = df['gt']
corruption_matrix = np.ones((200,200)) # these are the weights for class change
# set diagonal to zero
np.fill_diagonal(corruption_matrix, 0)
corruption_matrix[class_name2idx['orangutan']][class_name2idx['chimpanzee']]=50

predictions = corruption_fn(df, condition, corruption_matrix, corruption_prob=0.6)
df['pred'] = predictions

condition2 = lambda row: not condition(row)
corruption_matrix = np.ones((200,200)) # these are the weights for class change
# set diagonal to zero
np.fill_diagonal(corruption_matrix, 0)

predictions = corruption_fn(df, condition2, corruption_matrix, corruption_prob=0.25)

In [55]:
(np.array(predictions) != df['gt'].values).mean()

0.25216666666666665

In [56]:
with open('mock_data/pred_splits/split_2.txt', 'w') as f:
    for pred in predictions:
        f.write(f'{pred}\n')

# Create Demo GT Class-Based Error Slice Splits

In [9]:
import pandas as pd
import numpy as np
import json

with open('mock_data/mapping.json', 'r') as json_file:
    mapping=json.load(json_file)
    
df = pd.read_csv('mock_data/dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,img_id,attribute,gt_code,gt
0,0,n01443537/art_0.jpg,art,n01443537,0
1,1,n01443537/art_1.jpg,art,n01443537,0
2,2,n01443537/art_10.jpg,art,n01443537,0
3,3,n01443537/art_11.jpg,art,n01443537,0
4,4,n01443537/art_12.jpg,art,n01443537,0


In [12]:
condition = lambda row: row['attribute']=='sketch' and mapping[row['gt_code']] == 'goose'
df['pred'] = df['gt']
corruption_matrix = np.ones((200,200)) # these are the weights for class change
# set diagonal to zero
np.fill_diagonal(corruption_matrix, 0)

predictions = corruption_fn(df, condition, corruption_matrix, corruption_prob=1.0)
df['pred'] = predictions

condition = lambda row: not (row['attribute']=='sketch' and mapping[row['gt_code']] == 'goose')
corruption_matrix = np.ones((200,200)) # these are the weights for class change
# set diagonal to zero
np.fill_diagonal(corruption_matrix, 0)

predictions = corruption_fn(df, condition, corruption_matrix, corruption_prob=0.01)
df['pred']=predictions

In [19]:
df[df['pred'] != df['gt']].groupby(['gt']).count()['gt_code'].values

array([ 2,  1,  1,  1,  1,  1,  3,  3,  1,  1,  1,  3,  2,  1,  4,  3,  1,
        2,  1, 24,  1,  2,  1,  1,  6,  1,  2,  2,  1,  5,  1,  2,  2,  1,
        1,  1,  2,  1,  1,  2,  3,  1,  1,  2,  1,  2,  2,  1,  2,  1,  1,
        2,  1,  2,  2,  1,  3,  3,  1,  3,  2,  1,  4,  1,  2,  1,  3,  2,
        1,  1,  1,  2,  1,  1,  2,  4,  3,  2,  3,  3,  1,  2,  2,  2,  2,
        2,  2,  2,  2,  1,  2,  1,  2,  3,  1,  1,  3,  2,  1,  1,  1,  1,
        4,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  1,  2,  1,
        3,  1,  3,  3,  1,  2,  3,  2,  1,  3,  7,  1,  1,  4,  1,  4,  4,
        1,  3,  3,  1,  1,  3,  4,  1,  2,  3,  2,  3,  1,  1,  2,  1,  5,
        5,  2,  3])

In [20]:
with open('mock_data/demo_pred_splits/split_0_goose_sketch.txt', 'w') as f:
    for pred in predictions:
        f.write(f'{pred}\n')