In [34]:
from common.social_model import SocialModel, BLIP_SocialModel, Random_SocialModel
from typing import Callable
from PIL import ImageFile, Image
import numpy as np
from scipy.stats import ttest_rel
import pandas as pd
from nltk.tokenize import SpaceTokenizer
from nltk.translate.meteor_score import single_meteor_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import os
import json


In [35]:
def compare_models_results(
        results_1 : list[str], 
        results_2 : list[str], 
        true_captions : list[str], 
        metric : Callable, 
        significance_level : float = 0.05
    ) -> bool:
    """
    Compares two models' results based on some already captioned_images and determines if model_1 is better than model_2 .\n
    Returns True if model_1 is significantly better; otherwise, False.
    """

    scores_1 = []
    scores_2 = []
    for true_caption, generated_pair in zip(true_captions, zip(results_1, results_2)):
        generated_caption1, generated_caption2 = generated_pair
        scores_1.append(metric(true_caption, generated_caption1))
        scores_2.append(metric(true_caption, generated_caption2))

    _, p_value = ttest_rel(scores_1, scores_2)

    NORMAL_CONST = 1.96 # 95% interval confidence
    # NORMAL_CONST = 1.645 # 90% interval confidence
    mean_diff = np.mean(np.array(scores_1) - np.array(scores_2))
    std_diff = np.std(np.array(scores_1) - np.array(scores_2))

    margin_of_error = NORMAL_CONST * std_diff / np.sqrt(len(scores_1))
    conf_int = (mean_diff - margin_of_error, mean_diff + margin_of_error)

    mean_1, std_1 = np.mean(scores_1), np.std(scores_1)
    mean_2, std_2 = np.mean(scores_2), np.std(scores_2)
    cohen_d = (mean_1 - mean_2) / np.sqrt((std_1**2 + std_2**2) / 2)


    # Combine conditions
    is_stat_significant = p_value < significance_level
    is_confidence_valid = conf_int[0] > 0 or conf_int[1] < 0
    is_effect_size_meaningful = cohen_d > 0.2  # Small effect threshold

    print(f'p-value: {p_value}')
    print(f'confidence-interval: {conf_int}')
    print(f'effect-size: {cohen_d}')

    return bool(is_stat_significant and is_confidence_valid and is_effect_size_meaningful)

In [3]:
def compare_models(
    model_1 : SocialModel, 
    model_2: SocialModel, 
    captioned_images : list[tuple[str, str]], 
    metric : Callable, 
    significance_level : float = 0.05
) -> bool:
    """
    Compares two models based on some already captioned_images and determines if model_1 is better than model_2 .\n
    Returns True if model_1 is significantly better; otherwise, False.
    """

    true_captions = [pair[1] for pair in captioned_images]
    generated_captions_1 = []
    generated_captions_2 = []
    for image_path, _ in captioned_images:
        generated_captions_1.append(model_1.caption(image_path))
        generated_captions_2.append(model_2.caption(image_path))

    return compare_models_results(
        generated_captions_1, 
        generated_captions_2, 
        true_captions, 
        metric,
        significance_level
        )

In [4]:
dataset_path = '../../../external/datasets/flickr8/'
captions_csv = pd.read_csv(f'{dataset_path}captions.txt')
THRESHOLD = 10

captioned_set = set()
captioned_images = []

for value in captions_csv.values:
    image_name = value[0]
    image_caption = value[1] 

    if image_name not in captioned_set:
        captioned_set.add(image_name)
        image_path = f'{dataset_path}/Images/{image_name}'
        captioned_images.append((image_path, image_caption))
    if len(captioned_set) >= THRESHOLD:
        break

In [48]:
def meteor_metric(str1 : str, str2 : str) -> float:
    tokenizer = SpaceTokenizer()
    tokens1 = tokenizer.tokenize(str1)
    tokens2 = tokenizer.tokenize(str2)
    return single_meteor_score(tokens1, tokens2)

def bleu_metric(str1 : str, str2 : str) -> float:
    smooth_fn = SmoothingFunction().method4
    tokenizer = SpaceTokenizer()
    tokens1 = tokenizer.tokenize(str1)
    tokens2 = tokenizer.tokenize(str2)
    return sentence_bleu([tokens1], tokens2, smoothing_function=smooth_fn)

In [6]:
BLIP_PATH = '../../../external/trained_models/blip_image_captioning_base'
model1 = BLIP_SocialModel(BLIP_PATH)
model2 = Random_SocialModel()

result1 = compare_models(model1, model2, captioned_images, meteor_metric)
result2 = compare_models(model2, model1, captioned_images, meteor_metric)
result1, result2

p-value: 0.0008658795285143208
confidence-interval: (np.float64(0.1380198434003062), np.float64(0.30768779973773297))
effect-size: 2.271568666318965
p-value: 0.0007518885897098205
confidence-interval: (np.float64(-0.3151024444733975), np.float64(-0.14395279224752938))
effect-size: -2.3510398703305166


(True, False)

In [7]:
result1 = compare_models(model1, model1, captioned_images, meteor_metric)
result2 = compare_models(model2, model2, captioned_images, meteor_metric)
result1, result2

p-value: nan
confidence-interval: (np.float64(0.0), np.float64(0.0))
effect-size: 0.0
p-value: nan
confidence-interval: (np.float64(0.0), np.float64(0.0))
effect-size: nan


  cohen_d = (mean_1 - mean_2) / np.sqrt((std_1**2 + std_2**2) / 2)


(False, False)

In [64]:
BASE_PATH = '../../../external/'
pipelines_path = f'{BASE_PATH}pipelines/pipeline_2.0/'
test_path = f'{BASE_PATH}test_images.json'

In [65]:
test_dict = []

with open(test_path, 'r') as file:
    test_dict = json.load(file)

test_keys = []
test_values = []

for d in test_dict:
    test_keys.append(d['id'])
    test_values.append(d['caption'])

results : dict[str, list[str]] = {}

for file_name in os.listdir(pipelines_path):
    if 'results' in file_name:
        results[file_name] = []
        with open(pipelines_path+file_name) as file:
            result_dict = json.load(file)
        for key in test_keys:
            results[file_name].append(result_dict[f'{key}'])

In [66]:
best_key : str = None
best_result : str = None
for key, result in results.items():
    if best_key != None:
        first = compare_models_results(best_result, result, test_values, meteor_metric)
        second = compare_models_results(result, best_result, test_values, meteor_metric)
        if not first and second:
            best_key = key
            best_result = result
    else:
        best_key = key
        best_result = result

p-value: nan
confidence-interval: (np.float64(0.0), np.float64(0.0))
effect-size: 0.0
p-value: nan
confidence-interval: (np.float64(0.0), np.float64(0.0))
effect-size: 0.0
p-value: 4.774717803177695e-05
confidence-interval: (np.float64(-0.34968210430562785), np.float64(-0.152775395297167))
effect-size: -1.3390851348672508
p-value: 4.774717803177695e-05
confidence-interval: (np.float64(0.152775395297167), np.float64(0.34968210430562785))
effect-size: 1.3390851348672508
p-value: 0.00022925971400374344
confidence-interval: (np.float64(0.10815343949404373), np.float64(0.28305388033777307))
effect-size: 1.0020185571067852
p-value: 0.00022925971400374344
confidence-interval: (np.float64(-0.28305388033777307), np.float64(-0.10815343949404373))
effect-size: -1.0020185571067852
p-value: 4.182918693198484e-05
confidence-interval: (np.float64(0.15247791986311557), np.float64(0.3456904523061408))
effect-size: 1.3281345026562237
p-value: 4.182918693198484e-05
confidence-interval: (np.float64(-0.345

In [67]:
best_key

'results0.json'