In [27]:
# load err analysis data

import torch
from torch.utils.data import Dataset
from PIL import Image
import pandas as pd
import os
import json


class ErrorAnalysisDataset(Dataset):
    def __init__(self, dataset_root, pred_split, img_root_dir=None, transform=None):

        self.data = pd.read_csv(os.path.join(dataset_root, "dataset.csv"))
        self.img_root_dir = img_root_dir
        self.transform = transform
        
        with open(os.path.join(dataset_root, 'mapping.json'), 'r') as f:
            self.mapping = json.load(f)
            idx_to_mapping = list(self.mapping)
            
        predictions = pd.read_csv(os.path.join(dataset_root, f"pred_splits/{pred_split}"), header=None, names=['pred'])
        self.data['pred']=predictions['pred'].values
        self.data['pred']=self.data['pred'].apply(lambda pred: self.mapping[idx_to_mapping[pred]])
        self.data['gt']=self.data['gt'].apply(lambda pred: self.mapping[idx_to_mapping[pred]])

        
    def __len__(self):
        return len(self.data)
        
    def load_image(self, img_path):
        img_path = os.path.join(self.img_root_dir, img_path)
        image = Image.open(img_path).convert('RGB')  # Ensure RGB
        
        # Apply transformations if specified
        if self.transform:
            image = self.transform(image)

        return image
        
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        # Load image
        img_path = os.path.join(self.img_root_dir, self.data.iloc[idx]['img_id'])
        image = Image.open(img_path).convert('RGB')  # Ensure RGB
        
        # Apply transformations if specified
        if self.transform:
            image = self.transform(image)
        
        # Extract metadata
        attribute = self.data.iloc[idx]['attribute']
        gt_code = self.data.iloc[idx]['gt_code']
        gt = self.data.iloc[idx]['gt']
        pred = self.data.iloc[idx]['pred']

        # Return a dictionary with the image and metadata
        sample = {
            'image': image,
            # 'attribute': attribute,
            # 'gt_code': gt_code,
            # 'gt': gt,
            # 'pred': pred
        }

        return sample


In [28]:
# create a Context class that you can add to and read from, should be LLM ready

In [29]:
# confusion matrix analysis module
import numpy as np
class InitialAnalysis:
    def __init__(self, df, prediction_col, ground_truth_col, k=5):
        # Get unique classes
        classes = sorted(df[ground_truth_col].unique())
        
        # Initialize confusion matrix
        confusion_matrix = pd.DataFrame(
            np.zeros((len(classes), len(classes)), dtype=int),
            index=classes,
            columns=classes
        )
        
        # Populate the confusion matrix
        for _, row in df.iterrows():
            actual = row[ground_truth_col]
            predicted = row[prediction_col]
            confusion_matrix.loc[actual, predicted] += 1
        
        # Extract non-diagonal elements
        errors = []
        for actual in classes:
            for predicted in classes:
                if actual != predicted and confusion_matrix.loc[actual, predicted] > 0:
                    errors.append(((actual, predicted), confusion_matrix.loc[actual, predicted]))

        # Sort errors by count and take top k
        top_k_errors = sorted(errors, key=lambda x: x[1], reverse=True)[:k]

        self.confusion_matrix = confusion_matrix
        self.top_k_errors_pred_conditional = top_k_errors
        self.k = k
        self.classes = classes
        
    def human_readable_topk_pred_conditional_errors(self):
        errors_nl = []
        for (actual, predicted), err_count in self.top_k_errors_pred_conditional:
            errors_nl.append(f"The actual class is '{actual}', however model incorrectly predicts '{predicted}' {err_count} times")
            
        return '\n '.join(errors_nl)
    
    def human_readable_topk_errors_gt(self):
        marginal_errs = []
        for gt in self.classes:
            marginal_errs.append((gt, (self.confusion_matrix.loc[gt].sum()-self.confusion_matrix.loc[gt, gt]).item()))

        # select top k
        marginal_errs = sorted(marginal_errs, key=lambda x: x[1], reverse=True)[:self.k]
        
        return f"The top five marginal errors are for these classes: {marginal_errs}"


In [30]:
# test it out
dataset = ErrorAnalysisDataset(dataset_root='../mock_data_creation/mock_data', pred_split='split_0.txt', img_root_dir='/ix/akovashka/arr159/imagenet-r')
dataset.data.head()

Unnamed: 0.1,Unnamed: 0,img_id,attribute,gt_code,gt,pred
0,0,n01443537/art_0.jpg,art,n01443537,goldfish,missile
1,1,n01443537/art_1.jpg,art,n01443537,goldfish,goldfish
2,2,n01443537/art_10.jpg,art,n01443537,goldfish,soccer_ball
3,3,n01443537/art_11.jpg,art,n01443537,goldfish,goldfish
4,4,n01443537/art_12.jpg,art,n01443537,goldfish,goldfish


In [31]:
analysis = InitialAnalysis(dataset.data, prediction_col='pred', ground_truth_col='gt')
analysis.human_readable_topk_errors_gt()

"The top five marginal errors are for these classes: [('mushroom', 125), ('toucan', 97), ('flamingo', 96), ('bee', 81), ('jellyfish', 80)]"

In [6]:
analysis.human_readable_topk_pred_conditional_errors()

"The actual class is 'ant', however model incorrectly predicts 'burrito' 4 times\n The actual class is 'hotdog', however model incorrectly predicts 'west_highland_white_terrier' 4 times\n The actual class is 'mushroom', however model incorrectly predicts 'pig' 4 times\n The actual class is 'tank', however model incorrectly predicts 'parachute' 4 times\n The actual class is 'African_chameleon', however model incorrectly predicts 'shih_tzu' 3 times"

In [7]:
def sample(data, slice_condition, prediction_col='pred', ground_truth_col='gt', n=10):
    # filter data based on slice condition
    filtered_data = data[data.apply(slice_condition, axis=1)]
    
    # divide into error set and non error set
    error_set = filtered_data[filtered_data[prediction_col] != filtered_data[ground_truth_col]]
    non_error_set = filtered_data[filtered_data[prediction_col] == filtered_data[ground_truth_col]]

    # sample n from both sets
    sampled_error = error_set.sample(n=min(len(error_set), n), random_state=42)  # Use random_state for reproducibility
    sampled_non_error = non_error_set.sample(n=min(len(non_error_set), n), random_state=42)

    return sampled_error['img_id'].values, sampled_non_error['img_id'].values

In [8]:
sample(dataset.data, lambda x: True)

(array(['n04086273/art_3.jpg', 'n02108915/misc_42.jpg',
        'n01882714/toy_15.jpg', 'n07697537/misc_118.jpg',
        'n02356798/cartoon_11.jpg', 'n02094433/misc_37.jpg',
        'n07873807/misc_6.jpg', 'n02480855/tattoo_28.jpg',
        'n02007558/embroidery_5.jpg', 'n10565667/sketch_3.jpg'],
       dtype=object),
 array(['n03649909/misc_0.jpg', 'n01498041/misc_1.jpg',
        'n07768694/toy_0.jpg', 'n02134084/misc_119.jpg',
        'n01518878/origami_7.jpg', 'n01833805/misc_4.jpg',
        'n02802426/tattoo_6.jpg', 'n02769748/cartoon_4.jpg',
        'n01632777/sketch_17.jpg', 'n01843383/embroidery_1.jpg'],
       dtype=object))

In [9]:
# load mllm model
MAX_LENGTH = 384
MODEL_ID = "llava-hf/llava-1.5-7b-hf"

from transformers import AutoProcessor, LlavaForConditionalGeneration
import torch
processor = AutoProcessor.from_pretrained(MODEL_ID)
processor.tokenizer.padding_side = "right" # during training, one always uses padding on the right

model = LlavaForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        # _attn_implementation="flash_attention_2",
).cuda()

Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
images = [dataset[0]['image']] # testing
texts = []
print(predictions)

test


Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


['The image features a colorful fish painted on a pillar, with a blue background. The fish is the main focus of the scene, and it appears to be a decorative piece or a piece of art. The fish is positioned in the middle of the pillar, and its vibrant colors make it stand out against the blue background.']


In [41]:
# captioning, prompt for caption Caption the image with the most salient details

def caption_sets(model, processor, dataset, err_list, non_error_list, prompt="Caption the image with the most salient details"):
    images = []
    texts=[]
    images.extend([dataset.load_image(img_path) for img_path in err_list])
    images.extend([dataset.load_image(img_path) for img_path in non_error_list])

    for img in images:
        prompt = f"USER: <image>\nCaption the image with the most salient details.\nASSISTANT:"
        texts.append(prompt)

    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
    
    input_ids = batch["input_ids"].cuda()
    attention_mask = batch["attention_mask"].cuda()
    pixel_values = batch["pixel_values"].cuda()
    
    generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask,
                                           pixel_values=pixel_values, max_new_tokens=MAX_LENGTH)
    
    predictions = processor.batch_decode(generated_ids[:, input_ids.size(1):], skip_special_tokens=True)
    
    err_list_prompt = [f'A sample with a incorrect prediction has description: ' for i, _ in enumerate(err_list)]
    non_err_list_prompt = [f'A sample with a correct prediction has description: ' for i, _ in enumerate(non_error_list)]
    return [f'{text}{predictions[i]}' for i, text in enumerate([*err_list_prompt, *non_err_list_prompt])]


In [42]:
# testing
err_list, non_err_list = sample(dataset.data, lambda x: True)

caption_sets(model, processor, dataset, err_list, non_err_list)

["A sample with a incorrect prediction has description: The image features a woman wearing a yellow shirt with a gun design on it. The shirt has a green and red color scheme, and the gun is prominently displayed on the front. The woman appears to be the main focus of the image, and the shirt's design adds a unique touch to her outfit.",
 "A sample with a incorrect prediction has description: The image is a black and white drawing of a small dog with a bow tie. The dog is looking directly at the viewer, capturing attention. The drawing is quite detailed, showcasing the dog's facial features and the bow tie it is wearing. The overall composition of the drawing is visually appealing and captures the essence of the dog's personality.",
 'A sample with a incorrect prediction has description: The image features a stuffed koala bear sitting on a wooden table. The koala is wearing a white shirt and has a brown and white color scheme. The bear is positioned in the center of the scene, occupying

In [None]:
# hypothesis + validator class

# 

In [None]:
# conclusion? max steps?

In [None]:
# llm brain class