<a href="https://colab.research.google.com/github/ZhePang/Painting_Identification/blob/main/Painting_Identification_Evaluation_local.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluation (local)

*   Evaluate the performance of the fine-tuned ResNetV2 model on the training and test data



load training data

In [4]:
# Mount Google Drive at /content/drive; the CSV, zip and weight files read
# later in this notebook are all addressed under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import pandas as pd

# Path to the training-data CSV file on Google Drive
csv_file_path = '/content/drive/My Drive/painting_identification_data.csv'

# Load the CSV file into a DataFrame. `df` is wrapped by TripletDataset
# further down, which reads the image location and label from each row.
df = pd.read_csv(csv_file_path)

In [3]:
import zipfile
import os

# Path to the training-data zip archive on Google Drive
zip_file_path = '/content/drive/My Drive/painting_identification_data.zip'

# Local directory the archive is unpacked into
extract_to_path = '/content/data'

# Create the target directory; exist_ok=True keeps a re-run of this cell from
# failing when the directory is already there
os.makedirs(extract_to_path, exist_ok=True)

# Unpack every member of the archive into the target directory
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_path)

Create Evaluation Dataset and Dataloader

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
import numpy as np
from torchvision import transforms

In [6]:
# Seed NumPy's and PyTorch's global random number generators with a fixed
# value. The cell's displayed output is the Generator returned by
# torch.manual_seed.
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7ac3f956fcf0>

In [7]:
class TripletDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs from a DataFrame.

    Despite its name, this class performs no triplet or P-K sampling: item
    ``idx`` is simply row ``idx`` of ``dataframe``.
    """

    def __init__(self, dataframe, base_path, transform=None):
        """
        Args:
            dataframe (DataFrame): One row per image. Must contain 'Label'
                and either 'Filepath' (used as-is) or 'Filename' (resolved
                relative to ``base_path``).
            base_path (str): Directory where images are stored. Only
                consulted for frames that have no 'Filepath' column.
            transform (callable, optional): Applied to the RGB PIL image.
                Defaults to a 224x224 resize, tensor conversion and
                normalization with the per-channel mean/std given below.
        """
        self.dataframe = dataframe
        self.base_path = base_path
        self.transform = transform or transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        """Number of rows (images) in the underlying DataFrame."""
        return len(self.dataframe)

    def _resolve_path(self, row):
        """Return the image path for ``row``, preferring an explicit 'Filepath'."""
        if 'Filepath' in row.index:
            return row['Filepath']
        # Fall back to the documented layout: <base_path>/<Filename>
        return os.path.join(self.base_path, row['Filename'])

    def __getitem__(self, idx):
        """Load, convert to RGB and transform the image of row ``idx``.

        Returns:
            tuple: ``(transformed_image, label)`` where ``label`` is the
            row's 'Label' value.
        """
        row = self.dataframe.iloc[idx]
        # The context manager releases the file handle right away; convert()
        # hands back an independent image that stays valid after the close.
        with Image.open(self._resolve_path(row)) as source_image:
            image = source_image.convert('RGB')
        return self.transform(image), row['Label']

In [10]:
# Wrap the training DataFrame in a dataset and iterate it in file order
# (no shuffling), 16 images per batch.
dataset = TripletDataset(dataframe=df, base_path='/content/data')
dataloader = DataLoader(
    dataset=dataset,
    batch_size=16,
    shuffle=False,
)

load finetuned model

In [None]:
!pip install timm

In [None]:
import timm

# Pick the device first so the checkpoint can be mapped straight onto it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reinitialize the model architecture without downloading pretrained weights
# (num_classes=0 presumably makes the model emit embeddings instead of class
# logits; its output is used as an embedding below)
model_rn = timm.create_model('resnetv2_50x1_bit.goog_in21k', pretrained=False, num_classes=0)

# Load the fine-tuned weights from Google Drive. map_location remaps the
# stored tensors onto `device`, so a checkpoint saved on a GPU still loads
# on a CPU-only runtime.
model_rn.load_state_dict(
    torch.load(
        '/content/drive/My Drive/painting_identification_model_weights_p18k4n500e10.pth',
        map_location=device,
    )
)

# Move the model to the selected device
model_rn.to(device)

generate embeddings of training data

In [12]:
from tqdm.notebook import tqdm

def collect_embeddings(model, dataloader, device):
    """Run ``model`` over every batch of ``dataloader`` and gather the results.

    The model is put into eval mode and gradient tracking is disabled for
    the whole pass.

    Args:
        model: Callable module; invoked as ``model(images)``.
        dataloader: Sized iterable of ``(images, labels)`` tensor batches.
        device: Device the image batches are moved to before the forward pass.

    Returns:
        tuple: ``(embeddings_array, labels_array)`` as NumPy arrays, with the
        per-batch model outputs stacked row-wise and the per-batch labels
        concatenated in iteration order.
    """
    model.eval()
    batch_outputs, batch_labels = [], []

    progress = tqdm(dataloader, total=len(dataloader), desc="Test")
    with torch.no_grad():
        for images, labels in progress:
            outputs = model(images.to(device))
            # Bring results back to host memory before converting to NumPy
            batch_outputs.append(outputs.cpu().numpy())
            batch_labels.append(labels.cpu().numpy())

    return np.vstack(batch_outputs), np.concatenate(batch_labels)


In [13]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

def plot_embeddings_3d(embeddings, labels, method='PCA'):
    """Reduce embeddings to three dimensions and show a 3-D scatter plot.

    Args:
        embeddings: Data handed to the reducer's ``fit_transform``.
        labels: Per-sample values passed as ``c=`` to colour the points.
        method (str): ``'PCA'`` or ``'t-SNE'``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Validate up front: an unknown method used to fall through both branches
    # and only fail later with an UnboundLocalError.
    if method not in ('PCA', 't-SNE'):
        raise ValueError(f"Unsupported method {method!r}; expected 'PCA' or 't-SNE'")

    if method == 'PCA':
        pca = PCA(n_components=3)
        reduced_embeddings = pca.fit_transform(embeddings)
    else:
        import inspect

        # Newer scikit-learn releases renamed TSNE's `n_iter` to `max_iter`;
        # use whichever keyword the installed version accepts.
        tsne_params = inspect.signature(TSNE).parameters
        iter_keyword = 'max_iter' if 'max_iter' in tsne_params else 'n_iter'
        tsne = TSNE(n_components=3, verbose=1, perplexity=40, **{iter_keyword: 300})
        reduced_embeddings = tsne.fit_transform(embeddings)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    scatter = ax.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], reduced_embeddings[:, 2], c=labels, cmap='viridis', alpha=0.7)

    # Legend entries are derived from the colour values of the scatter
    legend1 = ax.legend(*scatter.legend_elements(), title="Classes")
    ax.add_artist(legend1)

    plt.title(f'3D Embedding Space Visualization using {method}')
    plt.show()


In [13]:
# Embed every image served by `dataloader` (built from the training
# DataFrame) with the fine-tuned model; both results are NumPy arrays.
embeddings, labels = collect_embeddings(model_rn, dataloader, device)

Test:   0%|          | 0/1300 [00:00<?, ?it/s]

In [None]:
# Visualize the training embeddings in 3-D using the t-SNE branch of the helper
plot_embeddings_3d(embeddings, labels, method='t-SNE')

load original painting data and generate reference embeddings for searching

In [47]:
# Path to the CSV describing the original paintings on Google Drive
# (note: this rebinds the `csv_file_path` used for the training CSV above)
csv_file_path = '/content/drive/My Drive/painting_identification_original_data.csv'

# Load the CSV file into its own DataFrame
original_df = pd.read_csv(csv_file_path)

In [48]:
# Path to the zip archive of original paintings on Google Drive
# (relies on `zipfile` and `os` imported in an earlier cell)
zip_file_path = '/content/drive/My Drive/painting_identification_original_data.zip'

# Local directory the archive is unpacked into
extract_to_path = '/content/originals'

# Create the target directory; exist_ok=True keeps a re-run from failing
os.makedirs(extract_to_path, exist_ok=True)

# Unpack every member of the archive into the target directory
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_path)

In [49]:
# Dataset/dataloader over the original paintings, iterated in file order
# (no shuffling), 16 images per batch.
original_dataset = TripletDataset(dataframe=original_df, base_path='/content/originals')
original_dataloader = DataLoader(
    dataset=original_dataset,
    batch_size=16,
    shuffle=False,
)

In [50]:
# Embed the original paintings; these arrays act as the reference set that
# evaluate_embeddings searches for nearest neighbours.
original_embeddings, original_labels = collect_embeddings(model_rn, original_dataloader, device)

Test:   0%|          | 0/62 [00:00<?, ?it/s]

evaluate accuracy on training data

In [52]:
def evaluate_embeddings(test_embeddings, test_labels, original_embeddings, original_labels):
    """Nearest-neighbour accuracy of test embeddings against a reference set.

    Each test embedding is assigned the label of the original embedding that
    is closest to it in Euclidean distance; the result is the fraction of
    test items whose assigned label equals their own label.

    Args:
        test_embeddings: Tensor or array-like, one row per test item.
        test_labels: Tensor or array-like, one label per test item.
        original_embeddings: Tensor or array-like, one row per reference item.
        original_labels: Tensor or array-like, one label per reference item.

    Returns:
        float: Share of test items matched to a reference item with the same
        label.
    """
    # Accept NumPy arrays / lists as well as tensors; as_tensor leaves
    # existing tensors untouched.
    test_embeddings = torch.as_tensor(test_embeddings)
    original_embeddings = torch.as_tensor(original_embeddings)
    test_labels = torch.as_tensor(test_labels)
    original_labels = torch.as_tensor(original_labels)

    # torch.cdist needs floating-point inputs that share one dtype
    if not test_embeddings.is_floating_point():
        test_embeddings = test_embeddings.float()
    original_embeddings = original_embeddings.to(test_embeddings.dtype)

    # Calculate pairwise Euclidean distances (rows: test, columns: original)
    distances = torch.cdist(test_embeddings, original_embeddings)

    # Index of the closest original embedding for each test embedding
    min_indices = torch.argmin(distances, dim=1)

    # Labels of those closest original embeddings
    closest_labels = original_labels[min_indices]

    # Fraction of test items whose nearest original carries the same label
    matches = test_labels == closest_labels
    accuracy = matches.float().mean().item()

    return accuracy


In [53]:
# Nearest-neighbour accuracy of the training embeddings against the
# original-painting embeddings; each NumPy array is converted to a tensor first.
accuracy = evaluate_embeddings(
    *map(torch.tensor, (embeddings, labels, original_embeddings, original_labels))
)
print(f"Accuracy of matching original images: {accuracy:.2%}")

Accuracy of matching original images: 97.26%


load test data

In [23]:
import zipfile

# Path to the test-data zip archive on Google Drive
zip_file_path = '/content/drive/My Drive/painting_identification_test_data.zip'

# Local directory the archive is unpacked into
extract_to_path = '/content/evaluate'

# Create the target directory; exist_ok=True keeps a re-run from failing
# (`os` comes from an earlier cell's import)
os.makedirs(extract_to_path, exist_ok=True)

# Unpack every member of the archive into the target directory
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_path)

In [31]:
# Path to the test-data CSV file on Google Drive
# NOTE(review): this path spells the Drive folder 'MyDrive', while the other
# cells use 'My Drive' — confirm both resolve on the target runtime.
csv_file_path = '/content/drive/MyDrive/painting_identification_test_data.csv'

# Load the CSV file into a DataFrame
df_evaluate = pd.read_csv(csv_file_path)

In [42]:
# Preview the first rows of the test-data table
df_evaluate.head()

Unnamed: 0,Filename,Title,Label,Filepath
0,Cornelia_Street_John_French_Sloan_1920_origina...,Cornelia_Street_John_French_Sloan_1920,556,/content/evaluate/Cornelia_Street_John_French_...
1,Cornelia_Street_John_French_Sloan_1920_augment...,Cornelia_Street_John_French_Sloan_1920,556,/content/evaluate/Cornelia_Street_John_French_...
2,Cornelia_Street_John_French_Sloan_1920_augment...,Cornelia_Street_John_French_Sloan_1920,556,/content/evaluate/Cornelia_Street_John_French_...
3,"O_Diabo,_a_Paraquedista,_Etc_Rene_Bertholo_199...","O_Diabo,_a_Paraquedista,_Etc_Rene_Bertholo_1997.0",858,"/content/evaluate/O_Diabo,_a_Paraquedista,_Etc..."
4,"O_Diabo,_a_Paraquedista,_Etc_Rene_Bertholo_199...","O_Diabo,_a_Paraquedista,_Etc_Rene_Bertholo_1997.0",858,"/content/evaluate/O_Diabo,_a_Paraquedista,_Etc..."


evaluate accuracy on test data

In [43]:
# Dataset/dataloader over the test images, iterated in file order
# (no shuffling), 16 images per batch.
test_dataset = TripletDataset(dataframe=df_evaluate, base_path='/content/evaluate')
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=16,
    shuffle=False,
)

In [44]:
# Embed every test image with the same fine-tuned model
test_embeddings, test_labels = collect_embeddings(model_rn, test_dataloader, device)

Test:   0%|          | 0/186 [00:00<?, ?it/s]

In [54]:
# Nearest-neighbour accuracy of the test embeddings against the
# original-painting embeddings; each NumPy array is converted to a tensor first.
test_accuracy = evaluate_embeddings(
    *map(torch.tensor, (test_embeddings, test_labels, original_embeddings, original_labels))
)
print(f"Accuracy of matching original images: {test_accuracy:.2%}")

Accuracy of matching original images: 97.61%
