In [1]:
!pip install pandas
!pip install torch
!pip install torchvision
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git
!pip install scikit-learn
!pip install tensorboard


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip

In [2]:
import csv
import glob
import numpy as np
import os
import pandas as pd
import pickle
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

from pathlib import Path
from tqdm.auto import trange

from classes.adaptation import calc_loss, construct_model
from classes.asymmetricRecall import AsymmetricRecall
from classes.embDataset import EmbDataset

from utils.adaptationPreprocess import create_paired_embeddings_dict, merge_dicts_with_csv
from utils.train_functions import train, validate, split_train_test

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
LEFT_FILES_PATH = 'data/train/left/*'
RIGHT_FILES_PATH = 'data/train/right/*'
ALL_FILES_PATH = 'data/train/all/*'
ALL_ENCODINGS_PATH = 'data/train_all_encodings.npy'
ALL_FULL_ENCODINGS_PATH = 'data/train_all_full_encodings.npy'
CSV_FILENAME = 'data/train.csv'
SAVE_TO = 'output/'

TEST_LEFT_FILES_PATH = 'data/test/left/*'
TEST_RIGHT_FILES_PATH = 'data/test/right/*'
TEST_ALL_FILES_PATH = 'data/test/all/*'
TEST_ALL_FULL_ENCODINGS_PATH = 'data/test_all_full_encodings.npy'
TEST_CSV_FILENAME = 'data/test_candidates.csv'

train_csv = 'data/train.csv'
train_candidates_csv = 'data/train_candidates.csv'
test_candidates_csv = 'data/test_candidates.csv'

In [4]:
is_cuda = torch.cuda.is_available()
if not is_cuda:
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

Define parameters

In [5]:
runs = 1
temperature = 1
topk = [2]
epochs = 10
test_split = 0.7
model_name = 'original'

In [6]:
# Load encoded images
test_images_enc = np.load('data/test_all_full_encodings.npy')

# Read candidates
test_candidates = pd.read_csv(test_candidates_csv)

# Get images names
test_images_names = np.array([x.split('.')[0] for x in os.listdir('data/test/all')])

In [8]:
# test_all_full_encodings = np.load(TEST_ALL_FULL_ENCODINGS_PATH)

# dimensions_before_pca = test_all_full_encodings.shape[1]
# print('All encodings shape: ', test_all_full_encodings.shape)

# test_paired_embeddings = create_paired_embeddings_dict(test_all_full_encodings, TEST_ALL_FILES_PATH, TEST_LEFT_FILES_PATH, TEST_RIGHT_FILES_PATH)
# test_merged_dicts = merge_test_dicts_with_csv(TEST_CSV_FILENAME, test_paired_embeddings)
# print(test_merged_dicts)

Import peviously computed encodings

In [9]:
all_encodings = np.load(ALL_ENCODINGS_PATH)
all_full_encodings = np.load(ALL_FULL_ENCODINGS_PATH)

dimensions_before_pca = all_encodings.shape[1]
print('All encodings shape: ', all_encodings.shape)
print('All full encodings shape:', all_full_encodings.shape)

All encodings shape:  (4000, 640)
All full encodings shape: (4000, 6080)


Match embeddings with their original image name, link the left image with the ground truth right image

In [10]:
paired_embeddings = create_paired_embeddings_dict(all_full_encodings, ALL_FILES_PATH, LEFT_FILES_PATH, RIGHT_FILES_PATH)

merged_dicts = merge_dicts_with_csv(CSV_FILENAME, paired_embeddings)
print(merged_dicts['left'].shape)

(2000, 6080)


Define et construct the model

In [11]:
model = construct_model(model_name, dimensions_before_pca).to(device)

Load the dataset and split

In [12]:
embds_dataset = EmbDataset(merged_dicts, only_original=False)

print("Total dataset size:  ", len(embds_dataset))
in_dim = embds_dataset[0][0]["left"].shape[0]

splitted = split_train_test(embds_dataset, test_split, device)
train_set, valid_set = splitted["train"], splitted["test"]

print("train set without aug size ", len(train_set[0]["left"]))
print("train set with    aug size ", len(train_set[1]["left"]))
print("test  set without aug size  ", len(valid_set["left"]))

metrics_of_all_runs = []

info = (
    "Embd path: "
    + str(ALL_FULL_ENCODINGS_PATH)
    + "\n"
    + "Using augmented images: "
    + str(embds_dataset.augmented)
    + "\n"
)

Total dataset size:   2000
train set without aug size  300
train set with    aug size  300
test  set without aug size   700


Train the model, save it, and get the metrics/loss for each run

In [13]:
for run in trange(runs):
    run_path = Path(SAVE_TO) / f"run{run+1}"
    model = construct_model(model_name, in_dim).to(device)

    writer = SummaryWriter(run_path)
    writer.add_text("Temperature: ", str(temperature))
    writer.add_text("Model", str(model).replace("\n", "  \n"))
    writer.add_text("Informations: ", info.replace("\n", "  \n"))

    if model != "dummy":
        optimizer = torch.optim.Adam(model.parameters())
        # writer.add_text("Optimizer", str(optimizer).replace("\n", "  \n"))

        metrics_per_run = train(
            model,
            optimizer,
            calc_loss,
            train_set,
            valid_set,
            epochs,
            run_path,
            temperature,
            topk,
            device,
        )
        # save the model and the optimizer state_dics for this run
        torch.save(
            {
                "epoch": epochs,
                "state_dict": model.state_dict(),
                "optimzier": optimizer.state_dict(),
            },
            run_path / "model_and_optimizer.pt",
        )
    else:
        metrics_per_run = validate(
            model,
            calc_loss,
            valid_set,
            temperature,
            topk,
            device,
        )
        for metric, val in metrics_per_run.items():
            writer.add_scalar(metric, val, 1)
            writer.flush()
            

    np.save(run_path / "data", metrics_per_run)

    metrics_of_all_runs.append(metrics_per_run)

    splitted = split_train_test(embds_dataset, test_split, device)
    train_set, valid_set = splitted["train"], splitted["test"]

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch  10 / 10

100%|██████████| 10/10 [00:06<00:00,  1.63it/s]
100%|██████████| 1/1 [00:06<00:00,  6.81s/it]


Save all the metrics

In [14]:
# save all runs metrics into one file
np.save(Path(SAVE_TO) / "all_runs", metrics_of_all_runs)

avg_path = Path(SAVE_TO) / "avg"
writer = SummaryWriter(avg_path)

Display the saved metrics

In [15]:
test = np.load('output/run1/data.npy', allow_pickle=True)
print(test)

{'train': {'Train/Loss': [5.70373010635376, 5.70353889465332, 5.703559875488281, 5.703567981719971, 5.703570365905762, 5.703568458557129, 5.703555107116699, 5.7035417556762695, 5.703525066375732, 5.7035064697265625], 'Train/top 2': [0.0033333333333333335, 0.016666666666666666, 0.056666666666666664, 0.07333333333333333, 0.09333333333333334, 0.09333333333333334, 0.1, 0.1, 0.1, 0.10666666666666667]}, 'test': {'Test/Loss': [6.551097869873047, 6.551090240478516, 6.551087856292725, 6.551086902618408, 6.55108642578125, 6.551087379455566, 6.551086902618408, 6.551088333129883, 6.551088333129883, 6.551088333129883], 'Test/top 2': [0.002857142857142857, 0.002857142857142857, 0.002857142857142857, 0.002857142857142857, 0.002857142857142857, 0.002857142857142857, 0.004285714285714286, 0.004285714285714286, 0.002857142857142857, 0.002857142857142857]}}


### Inference

We load the test_dataset, and the previously saved model. We get the prediction of the model on the test dataset, then post process it before submitting

In [16]:
# Start with training data for validation score
images = np.load('data/train_all_full_encodings.npy')
pca = pickle.load(open('models/pca_module.pkl', 'rb'))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_np(left, right_images):
    """Compute cosine similarity between left image and all right images in row"""
    return np.array([cosine_similarity(left, right_images[i])[0][0] for i in range(len(right_images))])

def cos_sim_enc(images_enc, images_names, candidates_csv):
    """returns a data frame with format suitable for submitting, except that values 
    don't represent probabilities but cosine similarities."""
    results = []
    start_time = time.time()
    for i, row in candidates_csv.iterrows():

        # Compute cosine similarity between left and all other images in row
        left = images_enc[np.where(images_names == row['left'])]
        right_images = np.array([images_enc[np.where(images_names == row[f'c{i}'])] for i in range(20)])
        sim_array = cos_sim_np(left, right_images)
        res = [row['left']] + list(sim_array)
        results.append(list(res))

        if i%400 == 0 and i!=0:
            print(f"Processed: {i}")
            print("Elapsed time: ", time.time() - start_time)
            start_time = time.time()
            
    results = np.array(results)
    column_names = ['left'] + [f'c{i}' for i in range(20)]
    df = pd.DataFrame(results, columns=column_names)

    return df

# Functions to make dataframes values into probabilities
def softmax(x):
    """Compute softmax values for each sets of scores in x (usually a row of dataframe)."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def make_df_as_probs(df):
    # Make the values into probabilities with softmax
    df_as_probs = df.apply(lambda row: softmax(np.array(row[1:]).astype(np.float64)), axis=1)
    # Make into new dataframe
    df_as_probs = pd.DataFrame(df_as_probs.values.tolist(), columns=df.columns[1:])
    # Add left column in first position
    df_as_probs.insert(0, 'left', df['left'])
    
    return df_as_probs

# Evaluate result for training data (gives an idea of performance). For each row in df_train get 
# index of colmuns of top 2 values and check if the true match is in the top 2
def get_score(row, row_nb, true_labels, candidates):
    """return 1 if true label in top_2, else return 0"""
    # Get top 2
    top_2 = row[1:].argsort()[-2:][::-1].values

    # Get true label
    label_row = true_labels[true_labels['left'] == row['left']]
    true_label = label_row['right'].values[0]

    # Get top 2 predicted labels. i+1 to account for 'left' column
    top_2_names = [candidates.iloc[row_nb, i+1] for i in top_2]
    if true_label in top_2_names:
        return 1
    else:
        return 0

def eval(df, true_labels, candidates):
    """Evaluate score on df"""
    score = 0 
    for i, row in df.iterrows():
        score += get_score(row, i, true_labels, candidates)
    return score/len(df)

In [79]:
# Load encoded images
train_images_enc = np.load('data/train_all_full_encodings.npy')
test_images_enc = np.load('data/test_all_full_encodings.npy')

# Read candidates
test_candidates = pd.read_csv(test_candidates_csv)
train_candidates = pd.read_csv(train_candidates_csv)

# Get images names
test_images_names = np.array([x.split('.')[0] for x in os.listdir('data/test/all')])
train_images_names = np.array([x.split('.')[0] for x in os.listdir('data/train/all')])

In [80]:

# Apply PCA on embedded images
images_train_pca = pca.transform(train_images_enc)
images_test_pca = pca.transform(test_images_enc)

# # Get dataframe with cosine similarities between left image and candidates
# df_train = cos_sim_enc(images_train_pca, train_images_names, train_candidates)
# df_test = cos_sim_enc(images_test_pca, test_images_names, test_candidates)

# # Make the values into probabilities with softmax
# df_train_as_probs = make_df_as_probs(df_train)
# df_test_as_probs = make_df_as_probs(df_test)

# # Evaluate performance on training set
# train_labels = pd.read_csv('data/train.csv')
# eval(df_train_as_probs, train_labels, train_candidates)

# # Write csv
# df_test_as_probs.to_csv('output/cos_sim_full_enc_pca.csv', index=False)

In [19]:
print(type(model))
model.eval()

# output = model()

<class 'classes.adaptation.Adaptation'>


Adaptation(
  (weight_matrix): Linear(in_features=6080, out_features=1024, bias=False)
)

In [108]:
def inference(model, loss_func, topk, left, right_images):
    """Compute cosine similarity between left image and all right images in row"""
    for right_image in right_images:
        print(right_image.all() == left.all())
        out_left, out_right = model(torch.tensor(left)), model(torch.tensor(right_image))
        if left.all() == right_image.all():
            print('error')
        # print(type(left), type(right), type(out_left), type(out_right))
        # print(left.shape, left)
        # print(out_left, out_right)
        test_loss = loss_func(out_left, out_right, temperature, device)
        # print(test_loss)
        metrics = {"Test/Loss": test_loss.item()}

        aR = AsymmetricRecall(out_left.detach().cpu(), out_right.detach().cpu())

        for k in topk:
            metrics[f"Test/top {k}"] = aR.eval(at=k)

        return metrics
    return np.array([model(left, right_images[i])[0][0] for i in range(len(right_images))])


def get_score(model, loss_func, topk,images_enc, images_names, candidates_csv):
    """returns a data frame with format suitable for submitting, except that values 
    don't represent probabilities but cosine similarities."""
    results = []
    start_time = time.time()
    for i, row in candidates_csv.iterrows():

        # Compute cosine similarity between left and all other images in row
        # print(np.where(images_names == row['left'])[0] -1)
        # print([np.where(images_names == row[f'c{i}']) for i in range(20)])
        left = images_enc[np.where(images_names == row['left'])[0] -1]
        right_images = np.array([images_enc[np.where(images_names == row[f'c{i}'])] for i in range(20)])
        print([np.where(images_names == row['left'])[0] -1])
        print([np.where(images_names == row[f'c{i}']) for i in range(20)])
        print(left - right_images[1])
        sim_array = inference(model, loss_func, topk, left, right_images)
        print(sim_array)
        res = [row['left']] + list(sim_array)
        results.append(list(res))

        if i%400 == 0 and i!=0:
            print(f"Processed: {i}")
            print("Elapsed time: ", time.time() - start_time)
            start_time = time.time()
        break
    print('okkk')
    results = np.array(results)
    column_names = ['left'] + [f'c{i}' for i in range(20)]
    df = pd.DataFrame(results, columns=column_names)

    return df

In [109]:
get_score(model, calc_loss, topk, test_images_enc, test_images_names, test_candidates)

[array([2267])]
[(array([1378]),), (array([1462]),), (array([467]),), (array([2561]),), (array([1813]),), (array([2836]),), (array([1344]),), (array([156]),), (array([788]),), (array([2794]),), (array([2778]),), (array([3324]),), (array([2062]),), (array([1670]),), (array([3569]),), (array([2977]),), (array([3109]),), (array([413]),), (array([868]),), (array([1766]),)]
[[ 0.          0.          0.         ...  0.01539125  0.03738412
  -0.03599776]]
True
error
{'Test/Loss': 0.0, 'Test/top 2': 1.0}
okkk


ValueError: Shape of passed values is (1, 3), indices imply (1, 21)