In [None]:
# sys.path.insert(0, "./docs/transformers/transformers/src")

from datasets import concatenate_datasets
from transformers import Blip2Processor

from lib.easy_vqa.easyvqa_generation import EasyVQAGeneration
from lib.types import HFRepos, Suffix, VQAParameters
from lib.visualization import (
    calculate_label_frequency,
    create_label_frequency_boxplot,
    display_sample_images,
)

%load_ext autoreload
%autoreload 2

# Shared BLIP-2 processor, attached to every VQAParameters instance below.
# The identifier comes from the project's HFRepos constants (presumably a
# Hugging Face Hub repo id — confirm in lib.types).
blip2_repo = HFRepos.BLIP2_OPT
processor = Blip2Processor.from_pretrained(blip2_repo)

# Output directory prefix for every figure path built in this notebook.
# Deliberately has NO trailing slash: call sites join with f"{DIR}/<name>",
# so a trailing slash here would yield paths such as "analysis/easyvqa//1...".
DIR = "analysis/easyvqa"

# Easy-VQA dataset
## Base Dataset
### Visualization

In [None]:
# Build the Easy-VQA wrapper for Suffix.Val (no recompute) and hand its raw
# dataset to the sample-image helper.
# NOTE(review): the original comment on this line read "using combined
# dataset" — confirm what that refers to for this split.
args = VQAParameters(Suffix.Val, recompute=False)
args.processor = processor
dataset = EasyVQAGeneration(args)

sample_images_path = f"{DIR}/0.easyvqa_sample_images.pdf"
# NOTE(review): 10 and 16 are passed positionally; their meaning is defined by
# display_sample_images and is not visible in this notebook — confirm.
display_sample_images(dataset.raw_dataset, "EASY-VQA", sample_images_path, 10, 16)

### 1. Label Frequency Bar Chart


In [None]:
# Rebuild the wrapper for Suffix.Train (no recompute). `dataset` is rebound
# here and is also what the base boxplot cell further down consumes.
# NOTE(review): the original comment on this line read "using combined
# dataset" — confirm what that refers to for this split.
args = VQAParameters(Suffix.Train, recompute=False)
args.processor = processor
dataset = EasyVQAGeneration(args)

bar_base_options = {
    "path": f"{DIR}/1.easyvqa_bar_base",
    "title": "Easy-VQA Label Frequency Bar Chart",
}
# Only the first dataset slot is filled; the remaining two are passed as None.
calculate_label_frequency(dataset, None, None, **bar_base_options)

### 2. Label Frequency Boxplot

In [None]:
# Boxplot for the base dataset. Depends on `dataset` as bound by the preceding
# bar-chart cell (the EasyVQAGeneration wrapper built with Suffix.Train), so
# that cell must have been run first.
boxplot_base_options = {
    "path": f"{DIR}/2.easyvqa_boxplot_base",
    "title": "Easy-VQA Label Frequency Boxplot",
}
create_label_frequency_boxplot(dataset, **boxplot_base_options)

## Processed Dataset
Below are the label frequency bar charts and boxplots for the processed dataset.

### 3. Label Frequency Bar Chart

In [None]:
# Build one EasyVQAGeneration wrapper per split, in the order train -> val ->
# test, each with recompute=True and use_proportional_split=True and the
# shared processor attached.
processed_splits = {}
for split in ("train", "val", "test"):
    args = VQAParameters(split, recompute=True, use_proportional_split=True)
    args.processor = processor
    processed_splits[split] = EasyVQAGeneration(args)

# Keep the per-split names: the processed boxplot cell reads them.
dataset_train = processed_splits["train"]
dataset_val = processed_splits["val"]
dataset_test = processed_splits["test"]

bar_processed_options = {
    "path": f"{DIR}/3.easyvqa_bar_processed",
    "title": "Easy-VQA Processed Dataset Bar Chart",
}
calculate_label_frequency(
    dataset_train, dataset_val, dataset_test, **bar_processed_options
)

### 4. Label Frequency Boxplot

In [None]:
# Merge the raw datasets of the three processed splits (train, val, test — in
# that order) into a single dataset and plot its label-frequency boxplot.
# NOTE(review): this rebinds `dataset` to the concatenated raw dataset, whereas
# the base boxplot cell passes the EasyVQAGeneration wrapper itself to the same
# function — confirm create_label_frequency_boxplot accepts both.
raw_splits = [
    split_dataset.raw_dataset
    for split_dataset in (dataset_train, dataset_val, dataset_test)
]
dataset = concatenate_datasets(raw_splits)

boxplot_processed_options = {
    "path": f"{DIR}/4.easyvqa_boxplot_processed",
    "title": "Easy-VQA Processed Dataset Boxplot",
}
create_label_frequency_boxplot(dataset, **boxplot_processed_options)