In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

from classes.CNNModel import CNNModel
from classes.TrainDataset import TrainDataset
from classes.TestDataset import TestDataset
from utils.cos_sim import cos_sim_make_output

## Plan:
- Use a pre-trained model for feature extraction
- Make pre-trained/homemade model for feature processing
- Compute 20*20 asymmetric cosine similarity matrix
- Select top 2 images most similar to 'left' image

# 0. Pipeline

In [2]:
# Image folders of the two splits, and the folder holding the CSV files
train_folder, test_folder = 'data/train/', 'data/test/'
csv_folder = 'data/'

# CSV files: training labels and the candidate lists of each split
train_csv = f'{csv_folder}train.csv'
train_candidates_csv = f'{csv_folder}train_candidates.csv'
test_candidates_csv = f'{csv_folder}test_candidates.csv'

In [3]:
# Define the model, loss function, and optimizer
# NOTE: `device` and `model` are rebound further down when the CLIP model is loaded,
# so these objects only describe the baseline CNN set up here.
# Use the GPU when torch reports one, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNModel().to(device)
criterion = nn.CrossEntropyLoss()
# Adam over all parameters of the baseline model, learning rate 1e-3
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [4]:
# Define the data transformations: resize every image to a fixed (height, width)
# of (49, 40), convert it to a tensor and normalise each of the 3 channels with
# mean 0.5 and std 0.5
transform = transforms.Compose([
    transforms.Resize((49, 40)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Define the datasets and data loaders
train_dataset = TrainDataset(train_csv, train_candidates_csv, train_folder+'left', train_folder+'all', transform=transform)
test_dataset = TestDataset(test_candidates_csv, test_folder+'left', test_folder+'all', transform=transform)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
# Inference gains nothing from shuffling: keep the test samples in dataset order so
# that every run visits them identically and the predictions can be lined up with
# the rows of the candidates file.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

Make baseline predictions, no models, no preprocessing. We use cosine similarity as our metric

In [5]:
# Baseline: run the cos_sim_make_output helper (imported from utils.cos_sim) on the
# test loader and the test candidates file; the result is kept in df_output
df_output = cos_sim_make_output(test_loader, test_candidates_csv)

Processed 200 samples in 5.35 seconds
Processed 400 samples in 9.44 seconds
Processed 600 samples in 13.36 seconds
Processed 800 samples in 17.21 seconds
Processed 1000 samples in 21.03 seconds
Processed 1200 samples in 25.17 seconds
Processed 1400 samples in 29.05 seconds
Processed 1600 samples in 33.11 seconds
Processed 1800 samples in 37.12 seconds
Processed 2000 samples in 41.24 seconds


Result: 0.10 on Kaggle, twice as much as random guess.

# 1. Feature Extraction

#### Clip ResNet 50x4

To import the CLIP model, install it from source (https://github.com/openai/CLIP). Python 3.10 is recommended.

In [6]:
# !conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
# !pip install ftfy regex tqdm
# !pip install git+https://github.com/openai/CLIP.git
# !pip install torch

In [21]:
from clip import clip
import torch

# NOTE: `device`, `model` and `transform` were already bound by the baseline cells above;
# from this cell on they refer to the CLIP model and its preprocessing pipeline.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
# clip.load returns the model together with its preprocessing transform
model, transform = clip.load("RN50x4", device)

# Define our own transform, suitable for our problem
# (ToPILImage comes first because get_full_encoding feeds `t` the array read by mpimg.imread)
t = transforms.Compose([transforms.ToPILImage(), 
                        transform, 
                        # add batch dimension
                        transforms.Lambda(lambda x: x.unsqueeze(0))])
# Last expression of the cell: display the CLIP preprocessing pipeline
transform

cpu


Compose(
    Resize(size=288, interpolation=bicubic, max_size=None, antialias=warn)
    CenterCrop(size=(288, 288))
    <function _convert_image_to_rgb at 0x0000024C4D9BD090>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

The model is composed of a visual model (a modified ResNet, which is the part we are going to use), a transformer, a token embedding and a final layer. Uncomment and execute the next cell for details.

In [8]:
# model

The modified ResNet is composed of 4 layers and an attention pool. We're going to extract the representation after each of them and concatenate these into one vector of size 6000.

In [22]:
# Names of the stages of the visual model whose outputs we want to collect
# (names can be found in the detail of the model)
layers_names = ["layer1", "layer2", "layer3", "layer4", "attnpool"]

def get_layer_index(model, layer_name):
    """Return the position of the attribute `layer_name` of `model` within `model.modules()`."""
    return list(model.modules()).index(getattr(model, layer_name))

# get indices
layers_indices = [get_layer_index(model.visual, layer) for layer in layers_names]
print(layers_indices)

# Build the (name, module) listing once instead of rebuilding it on every loop iteration
visual_named_modules = list(model.visual.named_modules())

# Those are the modules we're interested in getting the output from
modules = [visual_named_modules[layer_index][1] for layer_index in layers_indices]

# Those are the modules used before layer1, we need them to get the input of layer1.
# Entry 0 of the listing is model.visual itself, so the pre-modules are everything
# between it and layer1 (no hard-coded count).
pre_modules = []
for layer_name, pre_module in visual_named_modules[1:layers_indices[0]]:
    pre_modules.append(pre_module)
    print(layer_name, pre_module)

[11, 60, 131, 246, 317]
conv1 Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
bn1 BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
relu1 ReLU(inplace=True)
conv2 Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
bn2 BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
relu2 ReLU(inplace=True)
conv3 Conv2d(40, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
bn3 BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
relu3 ReLU(inplace=True)
avgpool AvgPool2d(kernel_size=2, stride=2, padding=0)


In [25]:
# Helper that encodes all images of a folder (the PCA is fitted on these encodings later)
import glob
import time
import matplotlib.image as mpimg

def get_full_encoding(folder_path, pre_modules, modules, transform):
    """Get the encoding of every ``*.jpg`` image found in `folder_path`.

    Each image is read, passed through `transform`, then through `pre_modules`
    (whose outputs are not kept) and finally through `modules`, one after the
    other. The output of every module in `modules` is turned into a vector
    (4-dimensional outputs, batch dimension included, are averaged over their
    last two dimensions), normalized with the L2 norm, and all vectors are
    concatenated.

    Args:
        folder_path: folder containing the ``.jpg`` images.
        pre_modules: callables applied in order before `modules`; they only
            produce the input of the first module.
        modules: callables applied in order whose outputs are kept.
        transform: callable turning the array returned by ``mpimg.imread``
            into the input of the first pre-module (batch dimension included).

    Returns:
        A list with one flat numpy array per image. Encoded images are in the
        same order as the paths returned by ``glob.glob`` (not sorted).

    Raises:
        RuntimeError: if `modules` is empty and the folder contains an image
            (there is nothing to concatenate).
    """
    images = []
    start_time = time.time()

    # Inference only: without autograd no graph is built for each image,
    # which saves both time and memory
    with torch.no_grad():
        # One by one with glob
        for count, img_path in enumerate(glob.glob(f'{folder_path}/*.jpg'), start=1):
            # get and preprocess
            img = mpimg.imread(img_path)
            out = transform(img)

            # layers for which we're not interested in the output
            for pre_module in pre_modules:
                out = pre_module(out)

            # layers for which we're interested in the output
            vecs = []
            for module in modules:
                out = module(out)
                # Average the spatial dimensions of 4-D outputs, keep others as they are
                vec = torch.mean(out, dim=(2, 3)) if out.ndim == 4 else out
                # L2 normalization
                vecs.append(vec / vec.norm(dim=-1, keepdim=True))

            # concatenate outputs
            img_encoding = torch.cat(vecs, dim=1)

            # Make numpy array from tensor (.cpu() so this also works for non-CPU tensors)
            images.append(img_encoding.detach().cpu().numpy().flatten())

            if count % 200 == 0:
                print(f"Processed: {count}, Elapsed time: ", time.time() - start_time)
                start_time = time.time()
    return images

In [26]:
# Expect 25 mins of processing. Encode all training images for validation
# `t` is the CLIP preprocessing pipeline with the extra batch dimension defined above
images = get_full_encoding("data/train/all", pre_modules, modules, t)
# Turn the list of per-image vectors into a single array and cache it on disk,
# so that later cells can reload it instead of re-encoding
images_np = np.array(images)
np.save('data/train_all_full_encodings.npy', images_np)

Processed: 200, Elapsed time:  60.352022647857666
Processed: 400, Elapsed time:  57.6754264831543
Processed: 600, Elapsed time:  60.924506187438965
Processed: 800, Elapsed time:  61.80785918235779
Processed: 1000, Elapsed time:  55.01278114318848
Processed: 1200, Elapsed time:  56.16243553161621
Processed: 1400, Elapsed time:  57.27299475669861
Processed: 1600, Elapsed time:  56.95402932167053
Processed: 1800, Elapsed time:  57.45021677017212
Processed: 2000, Elapsed time:  57.05836224555969
Processed: 2200, Elapsed time:  57.81105041503906
Processed: 2400, Elapsed time:  57.90114736557007
Processed: 2600, Elapsed time:  59.954755783081055
Processed: 2800, Elapsed time:  60.240116119384766
Processed: 3000, Elapsed time:  58.31308674812317
Processed: 3200, Elapsed time:  59.88385820388794
Processed: 3400, Elapsed time:  57.90711307525635
Processed: 3600, Elapsed time:  58.74224328994751
Processed: 3800, Elapsed time:  58.89434480667114
Processed: 4000, Elapsed time:  56.58669114112854


In [27]:
from sklearn.decomposition import PCA
import pickle

# Embedded vectors have 6000 dimensions, we reduce to 256 (93% variance explained)
# A fixed random_state keeps the fit reproducible whenever sklearn selects its
# randomized SVD solver
pca = PCA(n_components=256, random_state=0)
pca.fit(images)
images_pca = pca.transform(images)
# Printed explicitly: a bare expression in the middle of a cell is not displayed
print(f"Explained variance ratio: {sum(pca.explained_variance_ratio_):.4f}")

# Save the model (create the target folder first so the dump cannot fail on a fresh checkout)
os.makedirs('models', exist_ok=True)
with open('models/pca_module.pkl', 'wb') as f:
    pickle.dump(pca, f)

In [28]:
# Expect 25 mins of processing. Encode all test images. We'll use the PCA fit on training data
# NOTE: this rebinds `images` and `images_np`, which held the training encodings until now
images = get_full_encoding("data/test/all", pre_modules, modules, t)
# Turn the list of per-image vectors into a single array and cache it on disk
images_np = np.array(images)
np.save('data/test_all_full_encodings.npy', images_np)

Processed: 200, Elapsed time:  60.38086676597595
Processed: 400, Elapsed time:  57.38793420791626
Processed: 600, Elapsed time:  58.32379364967346
Processed: 800, Elapsed time:  59.23546552658081
Processed: 1000, Elapsed time:  58.334389209747314
Processed: 1200, Elapsed time:  58.5935435295105
Processed: 1400, Elapsed time:  59.32722544670105
Processed: 1600, Elapsed time:  59.9698121547699
Processed: 1800, Elapsed time:  58.60418176651001
Processed: 2000, Elapsed time:  58.26839780807495
Processed: 2200, Elapsed time:  57.99907374382019
Processed: 2400, Elapsed time:  57.608824729919434
Processed: 2600, Elapsed time:  57.610082387924194
Processed: 2800, Elapsed time:  57.27380180358887
Processed: 3000, Elapsed time:  58.05987501144409
Processed: 3200, Elapsed time:  59.79752492904663
Processed: 3400, Elapsed time:  57.83853077888489
Processed: 3600, Elapsed time:  58.522308349609375
Processed: 3800, Elapsed time:  57.76077127456665
Processed: 4000, Elapsed time:  57.74093961715698


# 2. Adapter

# 3. Similarity score

In [29]:
# Start with training data for validation score
images = np.load('data/train_all_full_encodings.npy')
# Reload the fitted PCA. The context manager closes the file handle deterministically.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('models/pca_module.pkl', 'rb') as f:
    pca = pickle.load(f)

In [37]:
# Load encoded images
train_images_enc = np.load('data/train_all_full_encodings.npy')
test_images_enc = np.load('data/test_all_full_encodings.npy')

# Read candidates
test_candidates = pd.read_csv(test_candidates_csv)
train_candidates = pd.read_csv(train_candidates_csv)

# Get images names. The encodings were produced by iterating over
# glob.glob('<folder>/*.jpg') in get_full_encoding, so list the folders with the very
# same pattern: a plain os.listdir would also return non-jpg entries and shift every
# following name relative to its encoding. (`glob` is imported in the encoding cell.)
test_images_names = np.array([os.path.basename(p).split('.')[0] for p in glob.glob('data/test/all/*.jpg')])
train_images_names = np.array([os.path.basename(p).split('.')[0] for p in glob.glob('data/train/all/*.jpg')])

# Row i of the encodings must describe image i of the listing
assert len(test_images_names) == len(test_images_enc), "test images and encodings are out of sync"
assert len(train_images_names) == len(train_images_enc), "train images and encodings are out of sync"

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_np(left, right_images):
    """Cosine similarity of `left` against every entry of `right_images`.

    Each entry is compared with `left` through `cosine_similarity` and only
    element [0][0] of the returned matrix is kept. Returns a numpy array with
    one similarity per entry, in the order of `right_images`.
    """
    similarities = []
    for position in range(len(right_images)):
        pairwise = cosine_similarity(left, right_images[position])
        similarities.append(pairwise[0][0])
    return np.array(similarities)

def cos_sim_enc(images_enc, images_names, candidates_csv, n_candidates=20):
    """Cosine similarity between each 'left' image and its candidates.

    Returns a data frame with a format suitable for submitting, except that
    values don't represent probabilities but cosine similarities.

    Args:
        images_enc: 2-D array of encodings; row k belongs to images_names[k].
        images_names: sequence of image names aligned with `images_enc`. If a
            name appears several times, its first occurrence is used.
        candidates_csv: despite its name, a DataFrame (it is iterated with
            ``.iterrows()``) holding a 'left' column and the candidate columns
            'c0' ... 'c{n_candidates - 1}'.
        n_candidates: number of candidate columns to read (default 20).

    Returns:
        DataFrame with columns 'left', 'c0', ..., one row per row of
        `candidates_csv`; similarity columns are numeric.

    Raises:
        KeyError: if a 'left' or candidate name has no entry in `images_names`.
    """
    # Name -> row lookup built once, instead of scanning all names for every single image
    row_of = {}
    for idx, name in enumerate(images_names):
        row_of.setdefault(name, idx)

    def encoding_of(name):
        """Encoding of `name` with shape (1, n_features), as expected by cos_sim_np."""
        try:
            return images_enc[[row_of[name]]]
        except KeyError:
            raise KeyError(f"No encoding found for image '{name}'") from None

    candidate_cols = [f'c{k}' for k in range(n_candidates)]
    results = []
    start_time = time.time()
    for i, row in candidates_csv.iterrows():

        # Compute cosine similarity between left and all other images in row
        left = encoding_of(row['left'])
        right_images = np.array([encoding_of(row[col]) for col in candidate_cols])
        sim_array = cos_sim_np(left, right_images)
        results.append([row['left']] + list(sim_array))

        if i%400 == 0 and i!=0:
            print(f"Processed: {i}")
            print("Elapsed time: ", time.time() - start_time)
            start_time = time.time()

    # Build the frame straight from the rows: going through np.array would turn the
    # similarities into strings (mixed str/float rows) and fail on an empty input
    return pd.DataFrame(results, columns=['left'] + candidate_cols)

# Functions to make dataframes values into probabilities
def softmax(x):
    """Softmax of the scores in x (typically one row of a dataframe).

    The global maximum of x is subtracted before exponentiating; the
    exponentials are then divided by their sum along axis 0.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    normalizer = exponentials.sum(axis=0)
    return exponentials / normalizer

def make_df_as_probs(df):
    """Turn the scores of every row of `df` into probabilities with a softmax.

    Args:
        df: DataFrame whose first column is 'left' and whose remaining columns
            hold scores (numbers, or strings that can be parsed as numbers).

    Returns:
        A new DataFrame with a default integer index, 'left' in first position
        and, in the other columns, the row-wise softmax of the scores.
    """
    # All score columns at once, converted to float (they may be stored as strings)
    scores = df.iloc[:, 1:].to_numpy().astype(np.float64)

    # Row-wise softmax; subtracting the row maximum keeps the exponentials bounded
    exps = np.exp(scores - scores.max(axis=1, keepdims=True))
    probs = exps / exps.sum(axis=1, keepdims=True)

    # Make into new dataframe
    df_as_probs = pd.DataFrame(probs, columns=df.columns[1:])
    # Add left column in first position. Insert by position (raw values): inserting
    # the Series itself would align on the index and yield NaN whenever `df` does
    # not carry a default 0..n-1 index.
    df_as_probs.insert(0, 'left', df['left'].to_numpy())

    return df_as_probs

# Evaluate result for training data (gives an idea of performance). For each row in df_train get 
# index of colmuns of top 2 values and check if the true match is in the top 2
def get_score(row, row_nb, true_labels, candidates):
    """Return 1 when the true match of row['left'] is among the two
    best-scored candidates of the row, and 0 otherwise."""
    # Positions, within the score columns, of the two highest values (best first)
    ranking = row[1:].argsort()
    top_2 = ranking[-2:][::-1].values

    # Ground truth: the 'right' value recorded for this 'left' image
    matching = true_labels.loc[true_labels['left'] == row['left'], 'right']
    true_label = matching.values[0]

    # Names of the two predicted candidates; +1 skips the 'left' column of `candidates`
    top_2_names = []
    for position in top_2:
        top_2_names.append(candidates.iloc[row_nb, position + 1])

    return int(true_label in top_2_names)

def eval(df, true_labels, candidates):
    """Share of the rows of `df` for which get_score returns 1.

    Note: this name shadows the built-in `eval` in the notebook namespace.
    """
    hits = sum(get_score(row, i, true_labels, candidates) for i, row in df.iterrows())
    return hits/len(df)

In [43]:
# Apply PCA on embedded images
# (the same PCA object, reloaded from models/pca_module.pkl, is applied to both splits)
images_train_pca = pca.transform(train_images_enc)
images_test_pca = pca.transform(test_images_enc)

# Get dataframe with cosine similarities between left image and candidates
df_train = cos_sim_enc(images_train_pca, train_images_names, train_candidates)
df_test = cos_sim_enc(images_test_pca, test_images_names, test_candidates)

# Make the values into probabilities with softmax
df_train_as_probs = make_df_as_probs(df_train)
df_test_as_probs = make_df_as_probs(df_test)

# Evaluate performance on training set
# `eval` is the function defined in this notebook (it shadows the built-in): it returns the
# fraction of rows whose true match is among the two best-scored candidates
train_labels = pd.read_csv('data/train.csv')
print(f'Accuracy:  {eval(df_train_as_probs, train_labels, train_candidates)*100}%')

# Write csv
# (test-set probabilities only; the 'output' folder must already exist)
df_test_as_probs.to_csv('output/cos_sim_full_enc_pca.csv', index=False)

Processed: 400
Elapsed time:  2.1653501987457275
Processed: 800
Elapsed time:  2.0913102626800537
Processed: 1200
Elapsed time:  2.081334114074707
Processed: 1600
Elapsed time:  2.110325813293457
Processed: 400
Elapsed time:  2.1007297039031982
Processed: 800
Elapsed time:  2.1041040420532227
Processed: 1200
Elapsed time:  2.123558282852173
Processed: 1600
Elapsed time:  2.1378159523010254
Accuracy:  63.55%


A top-2 accuracy of about 64% on the training set is encouraging.

# 4. Improving the model

Here are a few approaches to try for improving the model:
-   Preprocessing the input. Right now we use the transform provided with the model. The resizing and cropping are necessary for the model, but the normalization is standard for images from ImageNet. Our dataset comes from the reddit TTL, we might look into deriving the mean and std values from our dataset.

In [42]:
# Display the preprocessing pipeline that clip.load returned together with the model
transform

Compose(
    Resize(size=288, interpolation=bicubic, max_size=None, antialias=warn)
    CenterCrop(size=(288, 288))
    <function _convert_image_to_rgb at 0x0000024C4D9BD090>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

-   Tuning hyperparameters: is keeping 93% of the variance too much? Would reducing the number of PCA components improve or hurt performance and generalisation? To explore.
-   Include more layers output in the representation