 WildCLIP by Gabeff et al.  
© ECEO and A. Mathis Lab  
https://github.com/amathislab/wildclip   

Licensed under GNU Lesser General Public License v3.0


# Performance evaluation

Given a prediction file, the original input file, and a list of test queries, this notebook computes the mean average precision (mAP) over these queries.

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import sys
from matplotlib import pyplot as plt
from sklearn.metrics import average_precision_score
sys.path.append("../")

from utils import visualization as viz

In [None]:
# Directory containing the cropped test images
data_path = Path("../<path_to_test_images>/")

# CSV written by eval_clip.py (predictions)
predictions_file = Path("../<path_to_output_predictions>")

# CSV that was given as input to eval_clip.py (annotations)
annotations_file = Path("../<path_to_input_csv_to_test>")

# File listing the queries to compute performance for.
# A query is only evaluated if it appears in the "true" column of the prediction file
# (i.e. it has ground truth) and is also one of the prediction columns
# (i.e. its similarity to the test images has been computed).
# A csv file must contain the columns:
#   "query": the query to test
#   "template": id of the template, either 1, 8, 9 or 10
#   "n_attributes": number of attributes in the query
# NOTE(review): `templates` below holds strings such as "template_1" while the
# description above lists bare ids -- confirm the format stored in the csv.
#
# Alternatively, this can be a text file with one test query per row
queries_path = Path("../captions/<path_to_test_queries>")


# Queries are tested with either template 1 or templates 8-10
templates = ["template_1"]
#templates = ["template_8", "template_9", "template_10"]

# Number of attributes in a test query: e.g. "A camera trap picture of a lion eating"
# contains two attributes ("lion" and "eating" for species and behavior, respectively).
# For simplicity in the paper, we tested with n_attr = 1 only.
n_attr = 1

# Words of the novel vocabulary, one per line; queries containing none of them
# are treated as base-vocabulary queries
ood_words = Path("../captions/ood_words.txt").read_text().splitlines()

In [None]:
def mean_average_precision_score(queries, results_df):
    """Compute the mean average precision (mAP) over a set of queries.

    Args:
        queries: iterable of query strings. Each query that has ground truth
            must also be a column of ``results_df`` holding its relevance
            score for every row.
        results_df: DataFrame with a "true" column whose entries are
            containers of the ground-truth captions of each row (membership
            is tested with ``query in entry``), plus one score column per
            query.

    Returns:
        A tuple ``(mAP, ap_per_query)``. ``ap_per_query`` maps each query to
        its average precision, or NaN when the query never appears in the
        ground truth. ``mAP`` is the mean of the non-NaN values, or NaN when
        there is none (no queries, or no query with ground truth).
    """
    ap_per_query = {}
    for query in queries:
        # Binary relevance of every row for this query
        relevant = results_df["true"].apply(lambda captions: query in captions).astype(int).values
        if not relevant.any():
            # Query never seen in ground truth: AP is undefined.
            ap_per_query[query] = np.nan
            continue
        scores = results_df[query].values  # Cosine similarity between this query and all images
        ap_per_query[query] = average_precision_score(relevant, scores)

    # Average only the defined APs. Calling np.nanmean on an all-NaN (or empty)
    # list would emit a "Mean of empty slice" RuntimeWarning.
    defined_aps = [ap for ap in ap_per_query.values() if not np.isnan(ap)]
    if not defined_aps:
        return np.nan, ap_per_query
    return np.mean(defined_aps), ap_per_query


## One result file

In [None]:
predictions_df = pd.read_csv(predictions_file, index_col=0)
annotations_df = pd.read_csv(annotations_file, index_col=0)
# Test and input csv crops have the same index, so the two tables are joined on it
results_df = annotations_df[["crop_path", "CaptureEventID"]].merge(
    predictions_df, left_index=True, right_index=True
)

# Each image corresponds to many captions, all separated by a "; "
results_df["true"] = results_df["true"].str.split("; ")

# Load the candidate queries. For a csv file, keep only those matching the
# templates and number of attributes of interest.
if queries_path.suffix == ".txt":
    with open(queries_path, "r") as f:
        queries = f.read().splitlines()
elif queries_path.suffix == ".csv":
    queries_df = pd.read_csv(queries_path, index_col=0)
    queries_subset_df = queries_df[(queries_df["template"].isin(templates)) & (queries_df["n_attributes"] == n_attr)]
    queries = queries_subset_df["query"].values
else:
    # Fail here rather than with a NameError on `queries` further down
    raise ValueError(f"Unsupported queries file {queries_path}: expected a .txt or .csv file")

# Keep the queries that have at least one ground truth image and for which the
# cosine similarity with the test images has been computed. Sorting makes the
# query order (and the resulting column order) reproducible across runs.
ground_truth_captions = {caption for captions in results_df["true"].values for caption in captions}
queries = sorted(set(queries) & ground_truth_captions & set(results_df.columns))

# A query is "new" when it contains at least one word of the novel vocabulary
new_queries = [q for q in queries if any(word in q for word in ood_words)]
novel_set = set(new_queries)
base_queries = [q for q in queries if q not in novel_set]

# We take max similarity for all queries along the event images to also get predictions at the event level
predictions_event_df = (
    results_df[queries + ["CaptureEventID"]]
    .groupby("CaptureEventID")
    .max()  # for each column, which are the similarity between test queries and test images
    .reset_index()
)
# One row per event: the non-query columns (including "true") come from the
# first row of each event.
# NOTE(review): this presumably relies on all images of an event sharing the same captions -- confirm.
results_event_df = (
    results_df.drop(queries, axis=1)
    .merge(predictions_event_df, on="CaptureEventID")
    .drop_duplicates("CaptureEventID")
)

In [None]:
#Compute performance separately for test base and test novel queries, at the image and event level 
base_mAP, base_APi = mean_average_precision_score(base_queries, results_df)
new_mAP, new_APi = mean_average_precision_score(new_queries, results_df)
print("*", predictions_file.stem, f"mAP base classes at crop level: {base_mAP: .3f}")
print("*", predictions_file.stem, f"mAP new classes at crop level: {new_mAP: .3f}")

base_mAP_event, base_APi_event = mean_average_precision_score(base_queries, results_event_df)
new_mAP_event, new_APi_event = mean_average_precision_score(new_queries, results_event_df)
print("*", predictions_file.stem, f"mAP base classes at event level: {base_mAP_event: .3f}")
print("*", predictions_file.stem, f"mAP new classes at event level: {new_mAP_event: .3f}")

In [None]:
# Event-level AP of each base query, best first. Queries without ground truth
# have a NaN AP, which would make a plain sort order arbitrary, so they are put last.
sorted(base_APi_event.items(), key=lambda item: (np.isnan(item[1]), -item[1]))

In [None]:
# Event-level AP of each novel query, best first. Queries without ground truth
# have a NaN AP, which would make a plain sort order arbitrary, so they are put last.
sorted(new_APi_event.items(), key=lambda item: (np.isnan(item[1]), -item[1]))

## Image Retrieval Visualization

In [None]:
#Be careful with the number of queries to plot: one figure is drawn per query
n_rows, n_cols = 2, 5
n_shown = n_rows * n_cols
for query in sorted(base_queries + new_queries):
    print(query)

    #We drop duplicates at the event level -- we are interested in retrieving separate events
    top_annotations = (
        results_df
        .sort_values(query, ascending=False)
        .drop_duplicates("CaptureEventID", keep="first")
        .head(n_shown)
    )
    preds = top_annotations[query].values

    fig, axs = plt.subplots(n_rows,
                            n_cols,
                            figsize=(4*n_cols, 4*n_rows),
                            gridspec_kw={"wspace":0.01, "hspace":0.01},
                            squeeze=False)
    axs = axs.flatten()
    # Hide every axis up front so that panels left empty (fewer retrieved
    # events than panels) do not show ticks and frames
    for ax in axs:
        ax.set_axis_off()

    for ax, row, pred in zip(axs, top_annotations.itertuples(), preds):
        img = viz.open_img(data_path / row.crop_path, max_dims=600)
        # Green border when the query is in the ground truth captions of the image, red otherwise
        border_color = "green" if query in row.true else "red"
        img = viz.add_border_color(img, border_color, width=8)
        ax.imshow(img, aspect="equal")
        ax.patch.set_linewidth(10)
        # Similarity score in the top-left corner, crop file name lower down
        ax.text(20, 20, str(round(pred, 3)), backgroundcolor="w")
        ax.text(20, 370, Path(row.crop_path).stem, backgroundcolor="w")
    plt.show()