In [1]:
%matplotlib inline
from PIL import Image
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
import os
import itertools
from tqdm import *
from tqdm import tqdm_notebook as tqdm
import random
import numpy as np
import math

from IPython.display import clear_output

# Step 1: Load Data Set

In [189]:
fileNames = []
for file in os.listdir("data/train_1"):
    if file.endswith(".jpg"):
        fileNames.append(file)
# fileNamesDF = pd.DataFrame(fileNames)

In [201]:
#only take certain artist so that the probabilities aren't basically all false when doing the tupples
minNumPaintingsPerArtist = 200

# #for each artist, only take n of their paintings
numPaintingsPerAuthor = 100


trainInfo = pd.read_csv('data/train_info.csv')
mostPopularArtists = trainInfo['artist'].value_counts()[trainInfo['artist'].value_counts() > minNumPaintingsPerArtist]

trainInfo = trainInfo[trainInfo['filename'].isin(fileNames)]
fileNamesDFAll = trainInfo[trainInfo['artist'].isin(mostPopularArtists.index)]


fileNamesDF = pd.DataFrame()
for artist in tqdm(mostPopularArtists.index):
    fileNamesDF = fileNamesDF.append(trainInfo[trainInfo['artist'] == artist][:numPaintingsPerAuthor])
    
fileNamesDF = fileNamesDF['filename']

HBox(children=(IntProgress(value=0, max=69), HTML(value='')))




In [202]:
# Load the all_data_info for the dataset we are using
allInfo = pd.read_csv('data/all_data_info.csv')
allInfo = allInfo[allInfo['new_filename'].isin(fileNamesDF)]

In [204]:
featuresDF = pd.DataFrame(allInfo['new_filename'])
featuresDF['pixelsx'] = allInfo['pixelsx']
featuresDF['pixelsy'] = allInfo['pixelsy']

In [205]:
def normalizeSeries(series, maxNum):
    series = series/maxNum
    return series

In [206]:
def featurizeImage(filename):
    # Extract the features from the actual image
    jpgfile = Image.open("data/train_1/" + filename)
#     print(jpgfile)
    
    bands = jpgfile.getbands()
    if bands[0] == 'L':
        # grayscale image
        imageDF = pd.DataFrame(list(jpgfile.getdata()),columns=['grayscale'])
#         imageDF['grayscale'] = normalizeSeries(imageDF['grayscale',])
    elif len(bands) == 4:
        #get red, green and blue chanels 
        imageDF = pd.DataFrame(list(jpgfile.getdata()),columns=['red','green','blue','alpha'])
        imageDF['red'] = normalizeSeries(imageDF['red'],255)
        imageDF['green'] = normalizeSeries(imageDF['green'],255)
        imageDF['blue'] = normalizeSeries(imageDF['blue'],255)  
        imageDF['alpha'] = normalizeSeries(imageDF['alpha'],255)  
    elif len(bands) == 3:
        #get red, green and blue chanels 
        imageDF = pd.DataFrame(list(jpgfile.getdata()),columns=['red','green','blue'])
        imageDF['red'] = normalizeSeries(imageDF['red'],255)
        imageDF['green'] = normalizeSeries(imageDF['green'],255)
        imageDF['blue'] = normalizeSeries(imageDF['blue'],255)  
    else:
        imageDF = pd.DataFrame(list(jpgfile.getdata()))
        print(bands)
        return imageDF, bands

    return imageDF.mean().values

In [207]:
featuresDF.index

Int64Index([   511,    697,    773,    867,    921,    933,    995,   1277,
              1332,   1546,
            ...
            101294, 101629, 101725, 102085, 102113, 102402, 102565, 102862,
            103030, 103049],
           dtype='int64', length=1661)

In [208]:
imageFeaturesDF = pd.DataFrame(index=featuresDF.index,columns=['features']) 
print(imageFeaturesDF.index)
for row in tqdm(list(featuresDF.index)):
    imageFeatures = featurizeImage(featuresDF.loc[row]['new_filename'])
    imageFeaturesDF.loc[row]['features'] = imageFeatures


Int64Index([   511,    697,    773,    867,    921,    933,    995,   1277,
              1332,   1546,
            ...
            101294, 101629, 101725, 102085, 102113, 102402, 102565, 102862,
            103030, 103049],
           dtype='int64', length=1661)


HBox(children=(IntProgress(value=0, max=1661), HTML(value='')))




In [209]:
imageFeaturesDF.head()

Unnamed: 0,features
511,"[0.5428758332157436, 0.567912843558902, 0.4950..."
697,"[0.5401178080546106, 0.5401178080546106, 0.540..."
773,[109.78050646073648]
867,"[0.498340133186191, 0.498340133186191, 0.49834..."
921,"[0.6727659245903405, 0.6727659245903405, 0.672..."


In [210]:
imageFeaturesDF.to_csv('image_features_extracted2.csv')

In [211]:
featuresDF['imgFeatures'] = imageFeaturesDF['features']

In [212]:
featuresDF.head()

Unnamed: 0,new_filename,pixelsx,pixelsy,imgFeatures
511,13211.jpg,4340.0,2952.0,"[0.5428758332157436, 0.567912843558902, 0.4950..."
697,14108.jpg,3899.0,2874.0,"[0.5401178080546106, 0.5401178080546106, 0.540..."
773,1155.jpg,3899.0,2732.0,[109.78050646073648]
867,13673.jpg,3887.0,2601.0,"[0.498340133186191, 0.498340133186191, 0.49834..."
921,16144.jpg,3905.0,2538.0,"[0.6727659245903405, 0.6727659245903405, 0.672..."


In [219]:
# Generate a training Set
trainSet = allInfo.sample(frac=0.1)
trainSet['artist'].index

Int64Index([32661, 76416, 28229, 92405, 35734, 44189, 23441, 37301, 56298,
              933,
            ...
            49947, 42928, 40653, 26010, 18346,  6446, 41905, 88058, 39936,
            26216],
           dtype='int64', length=166)

In [271]:
def compareImages(img1, img2):
    
    diffx = np.abs(img1['pixelsx']-img2['pixelsx']) / (img1['pixelsx'] + img2['pixelsx'])
    diffy = np.abs(img1['pixelsy']-img2['pixelsy']) / (img1['pixelsy'] + img2['pixelsy'])
    
    img1Features = img1['imgFeatures']
    img2Features = img2['imgFeatures']
    
    imgFeatureScore = 0
    
    if (len(img1Features) == len(img2Features)):
        imgFeatureDiffs = np.abs(img1Features - img2Features)
        imgFeatureScore = sum(imgFeatureDiffs)

    score = diffx+diffy+imgFeatureScore
    return score

In [272]:
nTrials = 10
iloc1 = np.floor(np.random.rand(nTrials)*len(featuresDF.index))
iloc2 = np.floor(np.random.rand(nTrials)*len(featuresDF.index))
scores = []
for i in tqdm(range(nTrials)):
    scores.append(compareImages(featuresDF.iloc[int(iloc1[i])],featuresDF.iloc[int(iloc2[i])]))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [273]:
scores

[1.41884168950342,
 0.7600145245765311,
 1.1804089792786792,
 1.6090623187358801,
 1.3314605964809918,
 0.5271163724432565,
 1.4193653655908731,
 2.9654254542892327,
 0.8042178255242909,
 0.6929429410130805]