In [1]:
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import os
import random

In [2]:
# Names of the systems being compared; these are also the keys of the score tables below.
models = [
    "llava",
    "llava_x",
    "git",
    "valley",
    "videochatgpt",
    "video_llava",
]
# Anonymised slot labels ("M1".."M6") that the permutation mapping files pair with model names.
mapping_texts = [f"M{slot}" for slot in range(1, 7)]

In [3]:
# Directory holding one exported JSON file of answers per annotator (ann1_op.json, ...).
annotation_dir = "/research/video_metaphor/LLaVA/playground/data/video_data/manual_eval_june/outputs/"


def load_annotations(annotator_id):
    """Load the evaluation entries exported for one annotator.

    :param annotator_id: annotator number used in the file name ``ann<id>_op.json``
    :type annotator_id: int
    :return: parsed JSON content of that annotator's output file
             (later cells index it as a list of per-entry answer dicts)
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale default.
    with open(f"{annotation_dir}ann{annotator_id}_op.json", 'r', encoding="utf-8") as file:
        return json.load(file)


evaluations1 = load_annotations(1)
evaluations2 = load_annotations(2)
evaluations3 = load_annotations(3)

# Sanity check: number of entries loaded per annotator.
print(len(evaluations1), len(evaluations2), len(evaluations3))
# evaluations1[0]


50 50 50


In [4]:
# One flat list: annotator 1's entries first, then annotator 2's, then annotator 3's.
all_evaluations = [*evaluations1, *evaluations2, *evaluations3]

In [5]:
mapper_basepath = "/research/video_metaphor/LLaVA/playground/data/video_data/manual_eval_june/permutation_mapping/"

# Metrics in the order their question groups appear in an entry:
# q1-q6 fluency, q7-q12 creativity, q13-q18 pcc, q19-q24 consistency.
metric_names = ["fluency", "creativity", "pcc", "consistency"]
# Each metric asks one question per anonymised model slot (M1..M6).
slots_per_metric = len(mapping_texts)

# scores[metric][model] counts the "Yes" answers given to that model on that metric.
scores = {metric: {mod: 0 for mod in models} for metric in metric_names}

for count in range(1, 151):
    if count in range(26, 51) or count in range(76, 101):
        # For the videos annotated by all 3 annotators (last 25 of each 50 videos)
        # Ignoring annotations of 2 annotators, so that they don't overweigh the final results
        # (entries 26-50 and 76-100 are skipped; entries 126-150 are kept).
        continue

    # The mapping files are numbered 1..50 and reused for every block of 50 entries.
    permutation_count = (count - 1) % 50 + 1
    with open(f"{mapper_basepath}{permutation_count}.json", 'r', encoding="utf-8") as file:
        column_mapper = json.load(file)
    # Invert the mapping so a slot label such as "M3" resolves to its model name.
    model_mapper = {label: model for model, label in column_mapper.items()}
    evaluation_data = all_evaluations[count - 1]

    for key_indexer in range(1, len(metric_names) * slots_per_metric + 1):
        # Zero-based arithmetic: the block of six picks the metric, the offset inside
        # the block picks the slot. (The previous `(key_indexer % 6) - 1` only worked
        # for every sixth question because index -1 wraps to the last list element.)
        metric = metric_names[(key_indexer - 1) // slots_per_metric]
        model_id = mapping_texts[(key_indexer - 1) % slots_per_metric]
        model_name = model_mapper[model_id]

        data_key_value = f"q{key_indexer}_answer"
        try:
            current_answer = evaluation_data[data_key_value]
        except KeyError:
            # Missing answer: treat it as "No" and report where it happened.
            # Only KeyError is handled so that any other failure still surfaces.
            current_answer = "No"
            print(count)
            print("Key: ", key_indexer)
        if current_answer == "Yes":
            scores[metric][model_name] += 1

In [6]:
# Raw "Yes" counts per metric and per model, as accumulated by the tally loop above.
scores

{'fluency': {'llava': 99,
  'llava_x': 98,
  'git': 95,
  'valley': 36,
  'videochatgpt': 1,
  'video_llava': 96},
 'creativity': {'llava': 89,
  'llava_x': 96,
  'git': 87,
  'valley': 30,
  'videochatgpt': 2,
  'video_llava': 91},
 'pcc': {'llava': 59,
  'llava_x': 53,
  'git': 12,
  'valley': 15,
  'videochatgpt': 10,
  'video_llava': 74},
 'consistency': {'llava': 22,
  'llava_x': 15,
  'git': 3,
  'valley': 3,
  'videochatgpt': 8,
  'video_llava': 34}}

In [7]:
# Report each count as a fraction of the tallied entries.
# The divisor 100 is the 150 loaded entries minus the 50 skipped by the tally loop.
for metric_label, per_model_counts in scores.items():
    print("Metric: ", metric_label)
    for model_label, yes_count in per_model_counts.items():
        print(f"Model: {model_label} score: {yes_count/100}")

Metric:  fluency
Model: llava score: 0.99
Model: llava_x score: 0.98
Model: git score: 0.95
Model: valley score: 0.36
Model: videochatgpt score: 0.01
Model: video_llava score: 0.96
Metric:  creativity
Model: llava score: 0.89
Model: llava_x score: 0.96
Model: git score: 0.87
Model: valley score: 0.3
Model: videochatgpt score: 0.02
Model: video_llava score: 0.91
Metric:  pcc
Model: llava score: 0.59
Model: llava_x score: 0.53
Model: git score: 0.12
Model: valley score: 0.15
Model: videochatgpt score: 0.1
Model: video_llava score: 0.74
Metric:  consistency
Model: llava score: 0.22
Model: llava_x score: 0.15
Model: git score: 0.03
Model: valley score: 0.03
Model: videochatgpt score: 0.08
Model: video_llava score: 0.34


In [8]:
def get_annotations_flattened(evaluation_set):
    """Collect the answers to q1..q24 of every entry into one flat list.

    Answers are ordered entry by entry and, within an entry, by question number.
    The length of the resulting list is printed as a quick sanity check.

    :param evaluation_set: entries, each mapping ``"q<i>_answer"`` to an answer
    :type evaluation_set: list
    :rtype: list
    :return: flattened answers (24 per entry)
    """
    flattened = [
        entry[f"q{question_no}_answer"]
        for entry in evaluation_set
        for question_no in range(1, 25)
    ]
    print(len(flattened))
    return flattened

# Flatten each annotator's answers, in annotator order (prints one length per annotator).
ann1_scores, ann2_scores, ann3_scores = (
    get_annotations_flattened(annotator_entries)
    for annotator_entries in (evaluations1, evaluations2, evaluations3)
)

1200
1200
1200


In [9]:
def cohen_kappa(ann1, ann2, name):
    """Computes Cohen kappa for pair-wise annotators and prints it.

    :param ann1: annotations provided by first annotator
    :type ann1: list
    :param ann2: annotations provided by second annotator, item-aligned with ``ann1``
    :type ann2: list
    :param name: label printed in front of the result
    :type name: str
    :rtype: float
    :return: Cohen kappa statistic rounded to 4 decimals, or ``nan`` when it is
             undefined (both annotators used one and the same label for every item)
    :raises ValueError: if the annotation lists are empty or differ in length
    """
    total = len(ann1)
    if total == 0:
        raise ValueError("cohen_kappa needs at least one annotated item")
    if total != len(ann2):
        # zip() would silently drop the surplus items while the label counts below
        # would still use the full lists, producing a meaningless statistic.
        raise ValueError(
            f"annotation lists differ in length: {total} vs {len(ann2)}"
        )

    matches = sum(1 for an1, an2 in zip(ann1, ann2) if an1 == an2)
    A = matches / total  # observed agreement A (Po)

    E = 0  # expected agreement E (Pe)
    for item in set(ann1).union(ann2):
        # Chance that both annotators pick this label independently.
        E += (ann1.count(item) / total) * (ann2.count(item) / total)

    if E == 1:
        # Only one label was ever used by both annotators: (A - E) / (1 - E) is 0 / 0.
        val = float("nan")
    else:
        val = round((A - E) / (1 - E), 4)
    print(name, " : ", val)
    return val

In [10]:
# Pairwise Cohen kappa between the three annotators over all flattened answers.
# NOTE(review): kappa assumes position i in both lists is the same item. The tally
# cell's comment says only the last 25 of each 50 entries are videos annotated by
# all 3 annotators - confirm the first 25 entries line up across annotators as well.
# The last call is left as a bare expression, so its return value is the cell output.
cohen_kappa(ann1_scores, ann2_scores, "1 vs 2")
cohen_kappa(ann1_scores, ann3_scores, "1 vs 3")
cohen_kappa(ann2_scores, ann3_scores, "2 vs 3")

1 vs 2  :  0.6392
1 vs 3  :  0.6378
2 vs 3  :  0.685


0.685

In [None]:
!pip install statsmodels

In [11]:
from statsmodels.stats import inter_rater as irr


# Binarise every answer: 1 for "Yes", 0 for anything else.
ann1_values = [int(answer == "Yes") for answer in ann1_scores]
ann2_values = [int(answer == "Yes") for answer in ann2_scores]
ann3_values = [int(answer == "Yes") for answer in ann3_scores]

# One row per question: [number of "Yes" votes, number of other votes] out of 3 raters.
# NOTE(review): rows pair answers by position across annotators - confirm that every
# position refers to the same item for all three before interpreting the statistic.
true_false_array = [
    [yes_votes, 3 - yes_votes]
    for yes_votes in map(sum, zip(ann1_values, ann2_values, ann3_values))
]

irr.fleiss_kappa(true_false_array, method='fleiss')


0.6528666318049821