# Which is Better Analysis

This notebook calculates Elo ratings and correlates them with fitness function distances for image data from the Which is Better experiment.

We begin by importing the libraries we need and choosing our image and its corresponding data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import matplotlib.pyplot as pplt
import EloAnalysis as EA
import Correlation as Cor
import imageio.v2 as imageio
import os

# Image set under analysis; the same name selects the choice matrices below
# and the images/<image>/ folder read by the later cells.
image = "Waterbottle"

# One choices-matrix CSV per person, listed in a fixed order.
files = [
    f"Which_Is_Better_Data/{rater}_{image}_choices_matrix.csv"
    for rater in ("Dirk", "Maryam", "Nathan", "Doruk", "Andrew")
]


Now we will calculate and plot the raw Elo scores for this image set.

In [None]:
# Value handed to EA.process_files alongside the choice files
# (presumably the Elo K-factor -- confirm against EloAnalysis).
K = 32

# Rate every object from the choice files, then rank from the highest
# rating to the lowest.
initial_ratings = EA.process_files(files, K)
sorted_ratings = sorted(
    initial_ratings.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

# Transpose the (object, rating) pairs into two parallel tuples for plotting.
objects, ratings = zip(*sorted_ratings)
EA.plot_ratings(objects, ratings, "Raw Elo Scores", "Objects", "Elo Score")

These will be more useful for comparison with fitness function results when normalized to a 0 to 1 scale, so we proceed with this and plot them once more.

In [None]:
# Normalize the ratings (the accompanying text describes a 0-to-1 scale) and
# build the distance matrix over the normalized scores.
normalized_scores = EA.sort_and_normalize_ratings(initial_ratings)
distance_matrix, objects_list = EA.create_distance_matrix(normalized_scores)

# Split the (object, normalized rating) pairs into parallel tuples for plotting.
objects, normalized_ratings = zip(*normalized_scores)

# The title is derived from `image` rather than hard-coded, so it stays
# correct when a different image set is selected.
EA.plot_ratings(
    objects,
    normalized_ratings,
    f"{image} - Normalized ELO Scores",
    "Objects",
    "Normalized Score",
)

Let's take a look at the actual ground truth images in the order they were ranked, best to worst from left to right. Because of the simplified image naming convention for the data, we need to check what the file extension is for each image, which might take a few seconds.

In [None]:
# Show one image per ranked object, side by side in ranking order.
N = len(objects)

# squeeze=False always yields a 2-D array of Axes; without it a single
# object (N == 1) returns a bare Axes that cannot be iterated.
fig, axes = pplt.subplots(1, N, figsize=(30, 15), squeeze=False)
axes = axes.ravel()

# The image files do not share one extension, so each candidate is tried in turn.
extensions = ['.png', '.jpg', '.jpeg']

for ax, obj in zip(axes, objects):
    base_path = f"images/{image}/{image}_{obj}"
    for ext in extensions:
        file_path = f"{base_path}{ext}"
        try:
            # Only the read can raise FileNotFoundError; keep the try minimal.
            im = imageio.imread(file_path)
        except FileNotFoundError:
            continue  # Try the next extension
        ax.imshow(im)
        ax.axis('off')
        break  # Stop at the first extension that exists
    else:
        # Reached only when no extension matched for this object.
        print(f"File not found for {obj} in any of the expected formats.")

And now we'll export this to a CSV file for use in the regression analysis.

In [None]:
# Label both axes of the Elo distance matrix with the object names and
# write it into this image set's folder.
csv_file_path = f'images/{image}/Elo_distance_matrix.csv'
distance_df = pd.DataFrame(
    data=distance_matrix,
    index=objects_list,
    columns=objects_list,
)
distance_df.to_csv(csv_file_path)

Now we'll produce the distance matrices for the fitness functions. It takes time to run all of the fitness functions, so this one may take a little while.

In [None]:
# Hand this image set's folder to the Correlation module, which (per the
# accompanying text) produces the fitness-function distance matrices.
image_folder = f"images/{image}"
Cor.process_images_in_folder(image_folder)

To have tidy data, we will want the order of rows and columns as well as the names to be the same throughout all of the distance matrices, so we'll use some data cleaning functions to accomplish this.

In [None]:
# Both clean-up passes work on the same folder; labels are updated first,
# then the CSVs are reordered.
csv_folder = f'images/{image}'
Cor.update_csv_labels(csv_folder)
Cor.reorder_csv(csv_folder)

Then, finally, we'll run and visualize our linear regression.

In [None]:
# Elo distances are the independent variable of the regression.
independent_csv = f'images/{image}/Elo_distance_matrix.csv'

# (display label, CSV file stem) for each fitness function; keeping each pair
# together stops the label and path lists from drifting out of step.
fitness_outputs = [
    ('MADLAD', 'FF_ML2DHD_V2'),
    ('Gamma', 'FF_Gamma'),
    ('Hamming', 'FF_Hamming'),
    ('LAD', 'FF_ML2DHD'),
]
dependent_csvs = [
    f'images/{image}/{stem}_distance_matrix.csv' for _, stem in fitness_outputs
]
labels = [label for label, _ in fitness_outputs]

Cor.plot_regression(independent_csv, dependent_csvs, labels)
Cor.perform_regression_significance_test(independent_csv, dependent_csvs, labels)

Since our correlations weren't very strong, comparison may be easier if we use a column chart as opposed to scatterplots.

In [None]:
# R^2 per fitness function; each list is read positionally against image_labels.
r_squared = {
    'Gamma': [0.001, 0.001, 0.16],
    'Hamming': [0.001, 0.01, 0.001],
    'LAD': [0.23, 0.49, 0.15],
    'MADLAD': [0.16, 0.10, 0.06]
}
image_labels = ['A Walk in the Park', 'Mushrooms', 'Waterbottle']
colors = ['green', 'brown', 'blue']

# One group of bars per fitness function along the x axis.
categories = list(r_squared)
n_categories = len(categories)
pos = np.arange(n_categories)
bar_width = 0.2

fig, ax = plt.subplots(figsize=(10, 6))

# One bar series per image, each shifted right by one more bar width.
for i, (img, color) in enumerate(zip(image_labels, colors)):
    r_squared_values = [scores[i] for scores in r_squared.values()]
    ax.bar(pos + i * bar_width, r_squared_values, width=bar_width, label=img, color=color)

# With three series, an offset of one bar width puts the tick under the middle bar.
ax.set_xticks(pos + bar_width)
ax.set_xticklabels(categories)
ax.set_ylabel('$R^2$ Values')
ax.set_title('$R^2$ Values by Fitness Function and Image')
ax.legend(title='Images')

plt.show()



We will also do the same thing for the p-values.

In [None]:
# p-value per fitness function; each list is read positionally against image_labels.
p_value = {
    'Gamma': [0.73, 0.78, 0.005],
    'Hamming': [0.70, 0.53, 0.005],
    'LAD': [0.005, 0.005, 0.005],
    'MADLAD': [0.005, 0.02, 0.08]
}
image_labels = ['A Walk in the Park', 'Mushrooms', 'Waterbottle']
colors = ['green', 'brown', 'blue']

# One group of bars per fitness function along the x axis.
categories = list(p_value)
n_categories = len(categories)
pos = np.arange(n_categories)
bar_width = 0.2

fig, ax = plt.subplots(figsize=(10, 6))

# One bar series per image, each shifted right by one more bar width.
for i, (img, color) in enumerate(zip(image_labels, colors)):
    p_values = [scores[i] for scores in p_value.values()]
    ax.bar(pos + i * bar_width, p_values, width=bar_width, label=img, color=color)

# With three series, an offset of one bar width puts the tick under the middle bar.
ax.set_xticks(pos + bar_width)
ax.set_xticklabels(categories)
ax.set_ylabel('p Values')
ax.set_title('p Values by Fitness Function and Image')
ax.legend(title='Images')

plt.show()