In [2]:
import cv2
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

# data augmentation
import albumentations as A
from albumentations.pytorch import ToTensorV2

# pretrained models
import torchvision
from torchvision import models, transforms

from functions import *
from classes import *

## Load in metadata

In [2]:
# Load the complete metadata table for the painting collection from the raw CSV export.
all_metadata = pd.read_csv('data/all_data_info.csv')

In [3]:
# Painters whose works are kept for the same-artist task (order preserved).
artists = [
    'John Singer Sargent', 'Pablo Picasso', 'Pierre-Auguste Renoir', 'Paul Cezanne',
    'Camille Pissarro', 'Paul Gauguin', 'Claude Monet', 'Edgar Degas',
    'Henri Matisse', 'Vincent van Gogh', 'Childe Hassam', 'Pyotr Konchalovsky',
    'Martiros Saryan', 'Boris Kustodiev', 'Nicholas Roerich', 'Salvador Dali',
    'Alfred Sisley', 'Henri Martin', 'Rene Magritte', 'Konstantin Korovin',
    'Mary Cassatt', 'Gustave Loiseau', 'John Henry Twachtman', 'Georges Braque',
    'Pierre Bonnard', "Georgia O'Keeffe", 'Gustave Caillebotte', 'Ilya Mashkov',
    'Andy Warhol', 'Theo van Rysselberghe', 'Georges Seurat', 'Edward Hopper',
    'Maxime Maufra', 'Diego Rivera', 'Henri-Edmond Cross', 'Robert Julian Onderdonk',
    'Guy Rose', 'Andre Derain', 'Willard Metcalf', 'Frida Kahlo',
    'Paul Signac', 'William James Glackens', 'Frantisek Kupka', 'Julian Alden Weir',
    'Paul Serusier', 'Max Pechstein', 'Victor Borisov-Musatov', 'Armand Guillaumin',
    'Spyros Papaloukas', 'Nicolae Darascu', 'Albert Marquet', 'Ion Theodorescu-Sion',
]

In [4]:
# Flag the rows whose artist is one of the selected painters.
# Series.isin performs the membership test in one vectorised call instead of
# invoking a Python lambda per row; the result is the same boolean column
# (missing artist values are flagged False either way).
all_metadata['selected'] = all_metadata['artist'].isin(artists)

In [5]:
# Keep only the rows flagged as belonging to a selected artist.
metadata = all_metadata.loc[all_metadata['selected'].eq(True)]

In [6]:
# The helper flag has served its purpose; remove it before the table is saved.
metadata = metadata.drop(columns=['selected'])

In [7]:
# Preview the first rows of the filtered metadata (the index still carries the row labels of all_metadata).
metadata.head()

Unnamed: 0,artist,date,genre,pixelsx,pixelsy,size_bytes,source,style,title,artist_group,in_train,new_filename
15,Paul Serusier,1890.0,genre painting,7099.0,5857.0,9803854.0,wikiart,Cloisonnism,Seaweed Gatherer,train_and_test,False,32996.jpg
41,Georges Seurat,1884.0,,6367.0,4226.0,11579390.0,wikipedia,Pointillism,Bathers at Asnières,train_and_test,True,39751.jpg
65,Paul Signac,,cityscape,5616.0,4312.0,10612858.0,wikiart,Pointillism,View of the Port of Marseilles,train_and_test,True,74221.jpg
69,Georges Seurat,1884.0,genre painting,5910.0,4001.0,5330653.0,wikiart,Pointillism,Study for A Sunday on La Grande Jatte,train_and_test,True,31337.jpg
96,Gustave Caillebotte,1881.0,genre painting,5164.0,4087.0,3587461.0,wikiart,Impressionism,Rising Road,train_and_test,False,29616.jpg


In [8]:
# Save the filtered metadata without the index column so it reloads with a fresh 0..n-1 index.
metadata.to_csv('data/metadata.csv', index=False)

In [3]:
# Reload the filtered metadata written by the earlier to_csv cell; this is the working table for the rest of the notebook.
df = pd.read_csv('data/metadata.csv')

In [4]:
# Preview the reloaded table; columns run from 'artist' (position 0) to 'new_filename' (position 11).
df.head()

Unnamed: 0,artist,date,genre,pixelsx,pixelsy,size_bytes,source,style,title,artist_group,in_train,new_filename
0,Paul Serusier,1890.0,genre painting,7099.0,5857.0,9803854.0,wikiart,Cloisonnism,Seaweed Gatherer,train_and_test,False,32996.jpg
1,Georges Seurat,1884.0,,6367.0,4226.0,11579390.0,wikipedia,Pointillism,Bathers at Asnières,train_and_test,True,39751.jpg
2,Paul Signac,,cityscape,5616.0,4312.0,10612858.0,wikiart,Pointillism,View of the Port of Marseilles,train_and_test,True,74221.jpg
3,Georges Seurat,1884.0,genre painting,5910.0,4001.0,5330653.0,wikiart,Pointillism,Study for A Sunday on La Grande Jatte,train_and_test,True,31337.jpg
4,Gustave Caillebotte,1881.0,genre painting,5164.0,4087.0,3587461.0,wikiart,Impressionism,Rising Road,train_and_test,False,29616.jpg


In [5]:
# Training portion: rows marked in_train, re-indexed from 0 so that positions
# and index labels coincide.
train_df = df.loc[df['in_train'].eq(True)].reset_index(drop=True)

In [6]:
# Test portion: rows not marked in_train, re-indexed from 0.
test_df = df.loc[df['in_train'].eq(False)].reset_index(drop=True)

## Create matched pairs dataframe

In [7]:
# Create the training pairs dataframe: every training image is paired with 36
# randomly drawn training images, labelled by whether both share an artist.
col_names = ['id1', 'id2', 'filename_1', 'filename_2', 'same_artist']

# Pull the two needed columns out once, by name. This avoids magic positional
# indices (11 = new_filename, 0 = artist) and several slow DataFrame scalar
# lookups for every generated pair.
train_files = train_df['new_filename'].tolist()
train_artists = train_df['artist'].tolist()
n_train = len(train_df)

# Dedicated seeded generator: the pairs are reproducible across kernel
# restarts and the global `random` state is left untouched.
pair_rng = random.Random(42)

# NOTE(review): choices() samples with replacement from *all* rows, so an image
# can occasionally be paired with itself (a trivial positive) or with the same
# partner twice — confirm whether such pairs should be filtered out.
pairs = []
for i in range(n_train):
    for j in pair_rng.choices(range(n_train), k=36):
        pairs.append((
            i,
            j,
            f"data/train_224_crop/{train_files[i]}",
            f"data/train_224_crop/{train_files[j]}",
            train_artists[i] == train_artists[j],
        ))

all_train_pairs_df = pd.DataFrame(pairs, columns=col_names)

In [8]:
# Create the test pairs dataframe: every test image is paired with 36 randomly
# drawn test images, labelled by whether both share an artist.
# (col_names is the column list defined in the training-pairs cell.)

# Pull the two needed columns out once, by name. This avoids magic positional
# indices (11 = new_filename, 0 = artist) and several slow DataFrame scalar
# lookups for every generated pair.
test_files = test_df['new_filename'].tolist()
test_artists = test_df['artist'].tolist()
n_test = len(test_df)

# Dedicated seeded generator: the pairs are reproducible across kernel
# restarts and the global `random` state is left untouched.
pair_rng = random.Random(42)

# NOTE(review): choices() samples with replacement from *all* rows, so an image
# can occasionally be paired with itself or with the same partner twice —
# confirm whether such pairs should be filtered out.
pairs = []
for i in range(n_test):
    for j in pair_rng.choices(range(n_test), k=36):
        pairs.append((
            i,
            j,
            f"data/test_224_crop/{test_files[i]}",
            f"data/test_224_crop/{test_files[j]}",
            test_artists[i] == test_artists[j],
        ))

test_pairs_df = pd.DataFrame(pairs, columns=col_names)

In [119]:
# 80/20 split of the training pairs into train and validation rows.
# The seed is fixed so that re-running this cell always yields the same
# partition; with an unseeded sample, frames derived from an earlier run of
# this cell could silently overlap with a freshly drawn validation set.
# NOTE(review): the split is done per pair, not per image, so the same image
# can appear in both train and validation pairs — confirm this is acceptable.
train_pairs_df = all_train_pairs_df.sample(frac=0.8, random_state=42)

# Validation = every row that was not sampled for training. Assigning fresh
# frames (rather than resetting a slice in place) avoids SettingWithCopy issues.
val_pairs_df = all_train_pairs_df.drop(index=train_pairs_df.index).reset_index(drop=True)
train_pairs_df = train_pairs_df.reset_index(drop=True)

In [107]:
# Reduce class imbalance: keep every same-artist pair but only a 5% random
# sample of the different-artist pairs (the classes end up closer in size,
# not necessarily equal). The seed makes the sample reproducible.
is_same_artist = train_pairs_df['same_artist'] == True
train_true_pairs = train_pairs_df[is_same_artist]
train_false_pairs = train_pairs_df[~is_same_artist].sample(frac=0.05, random_state=42)
train_pairs_df_small = pd.concat([train_true_pairs, train_false_pairs], ignore_index=True)

In [125]:
# Same reduction for the validation pairs: keep every same-artist pair and a
# reproducible 5% sample of the different-artist pairs.
is_same_artist = val_pairs_df['same_artist'] == True
val_true_pairs = val_pairs_df[is_same_artist]
val_false_pairs = val_pairs_df[~is_same_artist].sample(frac=0.05, random_state=42)
val_pairs_df_small = pd.concat([val_true_pairs, val_false_pairs], ignore_index=True)

In [179]:
# Persist the reduced validation pairs (no index column) so the same set can be reused later.
val_pairs_df_small.to_csv('validation_pairs.csv', index=False)

In [185]:
# Make the size of the test set more manageable: keep a 9% random sample of
# the test pairs. The seed makes the sample reproducible, and the index is
# reset so this frame has a clean 0..n-1 index like the train/validation frames.
# Unlike train/validation, the different-artist pairs are not downsampled here,
# so the class ratio stays that of randomly drawn pairs.
test_pairs_df_small = test_pairs_df.sample(frac=0.09, random_state=42).reset_index(drop=True)

In [189]:
# Persist the sampled test pairs (no index column) so the same set can be reused later.
test_pairs_df_small.to_csv('test_pairs.csv', index=False)

## Data preprocessing

In [None]:
# Resize every image listed in df, routing it by its in_train flag:
# training images go from data/my_train to data/train_256_border,
# all others from data/my_test to data/test_256_border.
# resize_img comes from the project's helper modules; the trailing 256 is
# presumably the target size in pixels — TODO confirm.
for idx in tqdm(range(len(df))):
    row = df.iloc[idx]
    fname = row['new_filename']
    # choose the source/destination folders for this image's split
    if row['in_train'] == True:
        src_dir, dst_dir = 'data/my_train', 'data/train_256_border'
    else:
        src_dir, dst_dir = 'data/my_test', 'data/test_256_border'
    old_path = src_dir + '/' + fname
    new_path = dst_dir + '/' + fname
    resize_img(old_path, new_path, 256)

In [33]:
# Center crop every image listed in df to a 224x224 area, routing it by its
# in_train flag into data/train_224_crop/ or data/test_224_crop/.
# NOTE(review): inputs are read from data/train_256/ and data/test_256/, while
# the resize loop in this notebook writes to data/train_256_border and
# data/test_256_border — confirm which folders are meant to feed this step.
for idx in tqdm(range(len(df))):
    row = df.iloc[idx]
    fname = row['new_filename']
    # choose the source/destination folders for this image's split
    if row['in_train'] == True:
        src_dir, dst_dir = 'data/train_256/', 'data/train_224_crop/'
    else:
        src_dir, dst_dir = 'data/test_256/', 'data/test_224_crop/'
    old_path = src_dir + fname
    new_path = dst_dir + fname
    center_crop(old_path, new_path, 224)

  0%|          | 0/13894 [00:00<?, ?it/s]

## Create dataloader

In [127]:
# Training pairs wrapped in the project's dataset class, served in shuffled
# mini-batches of 16.
train_ds = ArtistPairsDataset(train_pairs_df_small)
train_dl = DataLoader(dataset=train_ds, batch_size=16, shuffle=True)

In [128]:
# Validation pairs in mini-batches of 16.
# Evaluation does not benefit from shuffling; a fixed order keeps the batches,
# and any per-sample outputs returned by the evaluation helpers, reproducible.
val_ds = ArtistPairsDataset(val_pairs_df_small)
val_dl = DataLoader(val_ds, batch_size=16, shuffle=False)

In [186]:
# Test pairs in mini-batches of 16.
# Evaluation does not benefit from shuffling; a fixed order keeps the batches,
# and any per-sample outputs returned by the evaluation helpers, reproducible.
test_ds = ArtistPairsDataset(test_pairs_df_small)
test_dl = DataLoader(test_ds, batch_size=16, shuffle=False)

## Model

In [90]:
# Load torchvision's AlexNet with its pretrained weights; it is used below as the feature extractor handed to SiameseNetwork.
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in favour of the `weights=` argument — check against the installed version.
features = models.alexnet(pretrained=True)

In [91]:
# Truncate the classifier head to its first four modules (indices 0-3).
# Slicing an nn.Sequential yields a new nn.Sequential holding the same modules.
# NOTE(review): in torchvision's AlexNet these are Dropout, Linear, ReLU,
# Dropout — confirm that ending the feature extractor on a Dropout is intended.
features.classifier = features.classifier[:4]

In [93]:
# Freeze the pretrained network: switch off gradient tracking for every
# parameter of the backbone in one call.
# NOTE(review): freezing parameters does not put Dropout layers into eval mode;
# whether the backbone runs in train or eval mode depends on code outside this
# notebook — verify.
features.requires_grad_(False)

### LR = 0.005

In [130]:
# Build the first siamese model around the frozen, truncated AlexNet (SiameseNetwork is defined in the project's own modules).
model = SiameseNetwork(features)

In [131]:
# Binary same/different objective computed on raw logits, optimised with Adam
# at learning rate 0.005 for this first experiment.
lossFun = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.005)

In [132]:
# Train `model` for a fixed number of epochs, recording the loss of the
# training pass and of the validation pass after each epoch.
num_epochs = 5
train_losses, valid_losses = [], []

for epoch in range(num_epochs):
    print('Epoch: ', epoch)

    # pass over the training pairs (helper called with its default settings)
    train_loss, train_acc = one_pass_siamese(model, train_dl, optimizer, lossFun)
    train_losses += [train_loss]
    print('Train loss: ', train_loss)
    print('Train accuracy: ', train_acc)

    # pass over the validation pairs with backwards=False
    # (presumably evaluation only, no weight update — confirm in one_pass_siamese)
    valid_loss, valid_acc = one_pass_siamese(model, val_dl, optimizer, lossFun, backwards=False)
    valid_losses += [valid_loss]
    print('Valid loss: ', valid_loss)
    print('Valid accuracy: ', valid_acc)

Epoch:  0


  0%|          | 0/1473 [00:00<?, ?it/s]

Train loss:  0.7570927166364182
Train accuracy:  0.6303292600135777


  0%|          | 0/371 [00:00<?, ?it/s]

Valid loss:  0.6419377786451272
Valid accuracy:  0.6326370170709794
Epoch:  1


  0%|          | 0/1473 [00:00<?, ?it/s]

Train loss:  0.6498417607878473
Train accuracy:  0.6359640190088255


  0%|          | 0/371 [00:00<?, ?it/s]

Valid loss:  0.6417733874282425
Valid accuracy:  0.6328616352201257
Epoch:  2


  0%|          | 0/1473 [00:00<?, ?it/s]

Train loss:  0.645276148405839
Train accuracy:  0.6369568906992532


  0%|          | 0/371 [00:00<?, ?it/s]

Valid loss:  0.6405560459409442
Valid accuracy:  0.6325808625336927
Epoch:  3


  0%|          | 0/1473 [00:00<?, ?it/s]

Train loss:  0.6478489681076858
Train accuracy:  0.6370672097759674


  0%|          | 0/371 [00:00<?, ?it/s]

Valid loss:  0.6412584229781622
Valid accuracy:  0.6330300988319856
Epoch:  4


  0%|          | 0/1473 [00:00<?, ?it/s]

Train loss:  0.6454342398526955
Train accuracy:  0.6369908350305499


  0%|          | 0/371 [00:00<?, ?it/s]

Valid loss:  0.6377931555326416
Valid accuracy:  0.6328616352201257


In [190]:
# Persist the first model (trained with lr=0.005) to disk as a whole object.
# NOTE(review): saving the full module pickles it, so loading it later requires the SiameseNetwork class to be importable; saving model.state_dict() is the more portable option.
torch.save(model, 'models/siamese_lr005')

### LR = 0.1

In [136]:
# Build a second siamese model for the higher learning-rate experiment.
# NOTE(review): it receives the same `features` object as `model`; whether the two models share the backbone or each gets a copy depends on SiameseNetwork — verify.
model_2 = SiameseNetwork(features)

In [137]:
# Same objective as the first experiment, but Adam now runs at learning rate
# 0.1 on model_2. Both names are rebound here, so `optimizer` no longer refers
# to the first model's optimizer after this cell.
lossFun = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model_2.parameters(), lr=0.1)

In [138]:
# Train `model_2` for a fixed number of epochs, recording the loss of the
# training pass and of the validation pass after each epoch. The two history
# lists are re-created here, replacing those of the previous experiment.
num_epochs = 5
train_losses, valid_losses = [], []

for epoch in range(num_epochs):
    print('Epoch: ', epoch)

    # pass over the training pairs (helper called with its default settings)
    train_loss, train_acc = one_pass_siamese(model_2, train_dl, optimizer, lossFun)
    train_losses += [train_loss]
    print('Train loss: ', train_loss)
    print('Train accuracy: ', train_acc)

    # pass over the validation pairs with backwards=False
    # (presumably evaluation only, no weight update — confirm in one_pass_siamese)
    valid_loss, valid_acc = one_pass_siamese(model_2, val_dl, optimizer, lossFun, backwards=False)
    valid_losses += [valid_loss]
    print('Valid loss: ', valid_loss)
    print('Valid accuracy: ', valid_acc)

Epoch:  0


  0%|          | 0/1473 [00:00<?, ?it/s]

Train loss:  6.174260411247135
Train accuracy:  0.6345128988458928


  0%|          | 0/371 [00:00<?, ?it/s]

Valid loss:  1.134132933102528
Valid accuracy:  0.6333670260557053
Epoch:  1


  0%|          | 0/1473 [00:00<?, ?it/s]

Train loss:  1.7190021152247075
Train accuracy:  0.6366768499660557


  0%|          | 0/371 [00:00<?, ?it/s]

Valid loss:  0.8360682600592024
Valid accuracy:  0.6330300988319856
Epoch:  2


  0%|          | 0/1473 [00:00<?, ?it/s]

Train loss:  1.3450786353412
Train accuracy:  0.636591989137814


  0%|          | 0/371 [00:00<?, ?it/s]

Valid loss:  0.6581791618442279
Valid accuracy:  0.6329177897574124
Epoch:  3


  0%|          | 0/1473 [00:00<?, ?it/s]

Train loss:  0.71133600738853
Train accuracy:  0.6371350984385608


  0%|          | 0/371 [00:00<?, ?it/s]

Valid loss:  0.6573680991111097
Valid accuracy:  0.6329177897574124
Epoch:  4


  0%|          | 0/1473 [00:00<?, ?it/s]

Train loss:  0.7947154426688009
Train accuracy:  0.6363034623217922


  0%|          | 0/371 [00:00<?, ?it/s]

Valid loss:  0.6579936725110057
Valid accuracy:  0.6329177897574124


In [191]:
# Persist the second model to disk as a whole object.
# NOTE(review): the file name says "lr1" although this model's optimizer was created with lr=0.1; loading a fully pickled module also requires the SiameseNetwork class to be importable.
torch.save(model_2, 'models/siamese_lr1')

## Compare between the two siamese models

In [176]:
# Evaluate the lr=0.005 model on the validation loader; the recorded output is a 2x2 confusion matrix followed by a single score (presumably ROC AUC — confirm in validation_check_siamese).
validation_check_siamese(model, val_dl)

  0%|          | 0/371 [00:00<?, ?it/s]

[[3750    0]
 [2176    0]]
0.5493735294117648


In [177]:
# Evaluate the lr=0.1 model on the same validation loader for comparison; the recorded output also shows the pair of arrays returned by the helper.
validation_check_siamese(model_2, val_dl)

  0%|          | 0/371 [00:00<?, ?it/s]

[[3750    0]
 [2174    2]]
0.5004595588235294


(array([0.3860285, 0.3860285, 0.3860285, ..., 0.3860285, 0.3860285,
        0.3860285]),
 array([0., 0., 0., ..., 1., 0., 0.]))

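Based on the confusion matrices, model 2 seems marginally better than model 1: it is the only one that predicts any "same artist" pairs (2, both correct). The difference is negligible in practice, however — both models label almost every validation pair as "not the same artist", and the score printed under the matrix is actually higher for model 1 (0.549 vs. 0.500).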

## Run test

In [187]:
# Final evaluation of model_2 on the held-out test pairs, using the same helper as for validation.
validation_check_siamese(model_2, test_dl)

  0%|          | 0/643 [00:00<?, ?it/s]

[[10002     0]
 [  282     0]]
0.5


This model classified all test cases as "not the same artist".