In [None]:
# sys.path.insert(0, "./docs/transformers/transformers/src")


from datasets import concatenate_datasets
from transformers import Blip2Processor

from lib.daquar.daquar_classification import DaquarClassification
from lib.daquar.daquar_generation import DaquarGeneration
from lib.types import HFRepos, Suffix, VQAParameters
from lib.visualization import (
    calculate_label_frequency,
    create_label_frequency_boxplot,
    display_sample_images,
)

# Enable IPython's autoreload extension. Mode 2 reloads all modules before each
# cell executes, so edits to the imported lib.* modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load the BLIP-2 processor once from the repo id in HFRepos.BLIP2_OPT; the
# cells below attach it to each VQAParameters instance via `args.processor`.
processor = Blip2Processor.from_pretrained(HFRepos.BLIP2_OPT)

# Output directory for the figures generated by this notebook. No trailing
# slash: the cells below build each target as f"{DIR}/<file>", so a slash here
# would put "//" into every path.
DIR = "analysis/daquar"

# DAQUAR dataset
## Base Dataset
### Visualization

In [None]:
# Build the generation-style DAQUAR dataset for Suffix.All and attach the
# shared processor to its parameters.
# NOTE(review): recompute=True presumably forces preprocessing to be redone on
# every run instead of reusing a cache — confirm against VQAParameters.
args = VQAParameters(Suffix.All, recompute=True)  # Suffix.All: the combined dataset
args.processor = processor
dataset = DaquarGeneration(args)
# Show sample images from the raw dataset; the third argument is presumably the
# output file for the figure.
# NOTE(review): 10 and 16 are undocumented positional arguments — verify their
# meaning against display_sample_images and consider passing them by keyword.
display_sample_images(
    dataset.raw_dataset, "DAQUAR", f"{DIR}/0.daquar_sample_images.pdf", 10, 16
)

### 1. Label Frequency Bar Chart

In [None]:
# Label-frequency bar chart for the base dataset. Only the first dataset slot
# is filled; the two remaining positional slots are explicitly None.
calculate_label_frequency(
    dataset,
    None,
    None,
    title="DAQUAR Base Dataset Bar Chart",
    multilabel=True,
    path=f"{DIR}/1.daquar_bar_base",
)

### 2. Label Frequency Boxplot

In [None]:
# Label-frequency boxplot for the base dataset.
# The file prefix follows this section's number ("2."), in line with the
# numbered prefixes on the other figures; it previously reused "1.", the
# prefix of the bar chart.
create_label_frequency_boxplot(
    dataset,
    path=f"{DIR}/2.daquar_boxplot_base",
    multilabel=True,
    title="DAQUAR Base Dataset Boxplot",
)

## Processed Dataset

### 3. Label Frequency Bar Chart

In [None]:
# Training split of the classification-style dataset, built with the
# proportional split enabled.
args = VQAParameters(Suffix.Train, use_proportional_split=True)
args.processor = processor
train_dataset = DaquarClassification(args)

# Validation split, built with the same settings.
args = VQAParameters(Suffix.Val, use_proportional_split=True)
args.processor = processor
val_dataset = DaquarClassification(args)

# Label-frequency bar chart over the train and validation splits (the third
# dataset slot is unused). The output is named "..._bar_processed" because this
# is the bar chart; the "boxplot" name belongs to the figure in section 4.
# `path` is passed by keyword, as in the base-dataset call.
calculate_label_frequency(
    train_dataset,
    val_dataset,
    None,
    path=f"{DIR}/3.daquar_bar_processed",
    title="DAQUAR Processed Dataset Bar Chart",
    multilabel=True,
)

### 4. Label Frequency Boxplot

In [None]:
# Combine the raw train and validation splits so one boxplot covers the whole
# processed dataset. The result gets its own name so that `dataset` (the base
# DaquarGeneration object used by the earlier cells) is not overwritten and
# those cells stay re-runnable in any order.
processed_dataset = concatenate_datasets(
    [train_dataset.raw_dataset, val_dataset.raw_dataset]
)
# NOTE(review): unlike the base-dataset boxplot call, `multilabel` is not
# passed here — confirm that the default is intended for the processed data.
create_label_frequency_boxplot(
    processed_dataset,
    path=f"{DIR}/4.daquar_boxplot_processed",
    title="DAQUAR Processed Dataset Boxplot",
)