In [3]:
%matplotlib inline
from PIL import Image
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
import os
import itertools
from tqdm import *
from tqdm import tqdm_notebook as tqdm
import random
import numpy as np
import math

from IPython.display import clear_output

# Step 1: Load Data Set

In [4]:
# Collect the names of every JPEG present in the local training folder.
fileNames = [entry for entry in os.listdir("data/train_1") if entry.endswith(".jpg")]

In [12]:
# Only keep artists with more than this many paintings, so that same-artist
# pairs are not vanishingly rare when paintings are later compared in tuples.
minNumPaintingsPerArtist = 150

# For each retained artist, use at most this many of their paintings.
numPaintingsPerAuthor = 100


trainInfo = pd.read_csv('data/train_info.csv')

# Count paintings per artist once and reuse the result. The counts are taken
# over the whole CSV, i.e. before restricting to the files available locally.
artistCounts = trainInfo['artist'].value_counts()
mostPopularArtists = artistCounts[artistCounts > minNumPaintingsPerArtist]

# From here on trainInfo only describes files that exist in data/train_1.
trainInfo = trainInfo[trainInfo['filename'].isin(fileNames)]
fileNamesDFAll = trainInfo[trainInfo['artist'].isin(mostPopularArtists.index)]


# Gather the per-artist slices in a list and concatenate once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
perArtistFrames = [
    trainInfo[trainInfo['artist'] == artist][:numPaintingsPerAuthor]
    for artist in tqdm(mostPopularArtists.index)
]
# pd.concat rejects an empty list, so fall back to an empty slice that still
# carries the 'filename' column when no artist passes the threshold.
selectedPaintings = pd.concat(perArtistFrames) if perArtistFrames else trainInfo.iloc[:0]

fileNamesDF = selectedPaintings['filename']

HBox(children=(IntProgress(value=0, max=115), HTML(value='')))

In [13]:
# Load the all_data_info for the dataset we are using
allInfo = pd.read_csv('data/all_data_info.csv')
allInfo = allInfo[allInfo['new_filename'].isin(fileNamesDF)]

In [14]:
# Start the feature table with each selected painting's filename and pixel
# dimensions; the row labels are inherited from allInfo.
featuresDF = allInfo[['new_filename', 'pixelsx', 'pixelsy']].copy()

In [15]:
def normalizeSeries(series, maxNum):
    """Return `series` with every value divided by `maxNum`.

    The input object is not modified; the quotient is returned as a new value.
    """
    return series / maxNum

In [16]:
def featurizeImage(filename):
    """Compute the mean value of each colour channel of a training image.

    Parameters
    ----------
    filename : str
        Name of a file inside ``data/train_1/``.

    Returns
    -------
    numpy.ndarray
        One mean per channel, each scaled by 255: a single value for
        grayscale images, three for 3-band images and four for 4-band images.
        Grayscale is scaled like the colour channels so that all feature
        vectors share one scale.
    tuple
        For any other band layout the band names are printed and
        ``(DataFrame of raw pixel data, bands)`` is returned instead, as
        before; callers that expect an array must handle this case.
    """
    # The context manager releases the underlying file handle as soon as the
    # pixel data has been read.
    with Image.open("data/train_1/" + filename) as jpgfile:
        bands = jpgfile.getbands()
        pixels = list(jpgfile.getdata())

    if bands[0] == 'L':
        # grayscale image
        columns = ['grayscale']
    elif len(bands) == 4:
        # red, green and blue channels plus a fourth band
        columns = ['red', 'green', 'blue', 'alpha']
    elif len(bands) == 3:
        # red, green and blue channels
        columns = ['red', 'green', 'blue']
    else:
        # Unrecognised band layout: report it and hand back the raw data.
        imageDF = pd.DataFrame(pixels)
        print(bands)
        return imageDF, bands

    imageDF = pd.DataFrame(pixels, columns=columns)
    # Dividing the whole frame scales every channel column by 255 at once.
    return normalizeSeries(imageDF, 255).mean().values

In [17]:
# Show the row labels of featuresDF; they are carried over from the filtered
# allInfo frame rather than renumbered from zero.
featuresDF.index

Int64Index([    74,    110,    129,    141,    142,    149,    159,    178,
               201,    213,
            ...
            102113, 102402, 102551, 102565, 102862, 102888, 103030, 103049,
            103051, 103066],
           dtype='int64', length=4170)

In [19]:
# One row per painting, sharing featuresDF's index; the single 'features'
# column will hold one per-channel mean vector per image.
imageFeaturesDF = pd.DataFrame(index=featuresDF.index,columns=['features'])
print(imageFeaturesDF.index)
for row in tqdm(list(featuresDF.index)):
    # Use .at for the single-cell read and write. The previous chained form
    # `df.loc[row]['features'] = ...` assigns into a temporary row object and
    # is not guaranteed to update the frame itself.
    imageFeaturesDF.at[row, 'features'] = featurizeImage(featuresDF.at[row, 'new_filename'])


Int64Index([    74,    110,    129,    141,    142,    149,    159,    178,
               201,    213,
            ...
            102113, 102402, 102551, 102565, 102862, 102888, 103030, 103049,
            103051, 103066],
           dtype='int64', length=4170)


HBox(children=(IntProgress(value=0, max=4170), HTML(value='')))





In [20]:
# Preview the first rows of the extracted per-image feature vectors.
imageFeaturesDF.head()

Unnamed: 0,features
74,"[0.677975966039829, 0.6280603114732923, 0.4498..."
110,"[0.3437587672577176, 0.30431301159169066, 0.28..."
129,"[0.694951524931509, 0.6840296064858785, 0.7064..."
141,"[0.7105538101850071, 0.6567182751663135, 0.573..."
142,"[0.4953544452320413, 0.4249800919821063, 0.388..."


In [21]:
# Save the extracted features to CSV (extraction opens every image file).
# NOTE(review): each 'features' cell holds an array, so it is presumably
# written as text and would need parsing when read back — confirm before
# relying on this file as a cache.
imageFeaturesDF.to_csv('image_features_extracted5.csv')

In [22]:
# Attach the feature vectors to featuresDF. imageFeaturesDF was built on
# featuresDF's index, so the column assignment lines up row by row.
featuresDF['imgFeatures'] = imageFeaturesDF['features']

In [23]:
# Preview featuresDF now that it carries the imgFeatures column.
featuresDF.head()

Unnamed: 0,new_filename,pixelsx,pixelsy,imgFeatures
74,17354.jpg,5833.0,3985.0,"[0.677975966039829, 0.6280603114732923, 0.4498..."
110,19834.jpg,3701.0,5490.0,"[0.3437587672577176, 0.30431301159169066, 0.28..."
129,15569.jpg,4122.0,4626.0,"[0.694951524931509, 0.6840296064858785, 0.7064..."
141,100478.jpg,3378.0,5448.0,"[0.7105538101850071, 0.6567182751663135, 0.573..."
142,100829.jpg,3672.0,5004.0,"[0.4953544452320413, 0.4249800919821063, 0.388..."


In [24]:
def compareImages(img1, img2):
    """Score how similar two paintings are from their size and colour features.

    Both arguments must support lookup of 'pixelsx', 'pixelsy' and
    'imgFeatures'. The size term is the average, over both dimensions, of
    |a - b| / (a + b). The feature term is the sum of absolute differences
    between the two feature vectors, or 0 when the vectors have different
    lengths. The result is 1 minus the average of the two terms, so larger
    values mean more similar.
    """
    def relativeGap(key):
        # Absolute difference of one dimension relative to the combined size.
        first, second = img1[key], img2[key]
        return np.abs(first - second) / (first + second)

    sizeGap = (relativeGap('pixelsx') + relativeGap('pixelsy')) / 2

    features1 = img1['imgFeatures']
    features2 = img2['imgFeatures']

    # Vectors of different length (e.g. a different channel count) are not
    # compared; the feature term then contributes nothing.
    if len(features1) == len(features2):
        featureGap = sum(np.abs(features1 - features2))
    else:
        featureGap = 0

    return 1 - (sizeGap + featureGap) / 2

In [25]:
def generatePrediction(score,threshold):
    """Return True when `score` is strictly greater than `threshold`, else False."""
    # bool() keeps the result a plain Python bool, as the if/else form did.
    return bool(score > threshold)

def generatePredictions(scores,threshold):
    """Apply generatePrediction to every score and return the results as a list.

    A transient progress bar is shown while the scores are processed.
    """
    return [
        generatePrediction(curScore, threshold)
        for curScore in tqdm(scores, leave=False)
    ]

In [46]:
def computeStats(truth,predictions):
    """Compute accuracy and confusion-matrix counts for boolean predictions.

    Parameters
    ----------
    truth : array-like of bool
        Ground-truth labels.
    predictions : array-like of bool
        Predicted labels, in the same order as `truth`. A plain list is
        accepted.

    Returns
    -------
    tuple
        ``(accuracy, truePos, trueNeg, falsePos, falseNeg)``. `accuracy` is
        the fraction of positions where prediction and truth agree (NaN for
        empty input); the four counts always add up to the number of samples.

    Raises
    ------
    ValueError
        If `truth` and `predictions` do not have the same shape.
    """
    # Convert both inputs to boolean arrays so element-wise logic works even
    # when a Python list is passed (a list compared with `== True` yields a
    # single bool, not a mask).
    truth = np.asarray(truth, dtype=bool)
    predictions = np.asarray(predictions, dtype=bool)
    if truth.shape != predictions.shape:
        raise ValueError(
            "truth and predictions must have the same shape, got %s and %s"
            % (truth.shape, predictions.shape)
        )

    accuracy = (predictions == truth).mean() if predictions.size else float('nan')

    # Each confusion-matrix cell is the conjunction of a prediction mask and a
    # truth mask. Comparing the masks with `==` would count agreements instead.
    truePos = (predictions & truth).sum()
    trueNeg = (~predictions & ~truth).sum()
    falsePos = (predictions & ~truth).sum()
    falseNeg = (~predictions & truth).sum()

    return accuracy, truePos, trueNeg, falsePos, falseNeg

In [53]:
def trainThreshold(featuresTrain,maxNumIterations = 100,initialProb = 0.5, increaseRate = 0.001):
    """Search upward for the similarity threshold with the best pair accuracy.

    Starting at `initialProb`, each iteration draws fresh random pairs of
    paintings from `featuresTrain`, scores them with compareImages, predicts
    "same artist" for scores above the current threshold and measures the
    accuracy against the artists recorded in the global `trainInfo`. The
    threshold is raised by `increaseRate` until the accuracy drops below the
    previous iteration's accuracy or `maxNumIterations` is reached.

    Parameters
    ----------
    featuresTrain : pandas.DataFrame
        Rows with 'new_filename', 'pixelsx', 'pixelsy' and 'imgFeatures'.
    maxNumIterations : int
        Maximum number of thresholds to evaluate.
    initialProb : float
        First threshold to evaluate.
    increaseRate : float
        Step added to the threshold after each evaluation.

    Returns
    -------
    tuple
        ``(threshold, accuracy)``: the last evaluated threshold whose accuracy
        did not fall below its predecessor's, with the accuracy measured at
        that threshold. ``(initialProb, 0)`` when no iteration runs.
    """
    # filename -> artist lookup, so labels can be fetched in the same order as
    # the scored pairs. Filtering with isin() returns rows in trainInfo's own
    # order, which does not match the pair order.
    artistByFile = trainInfo.drop_duplicates('filename').set_index('filename')['artist']

    probThreshold = initialProb
    bestThreshold, prevAccuracy = initialProb, 0

    for _ in tqdm(range(maxNumIterations),leave=False):
        # Two independent half-samples; the k-th entries of each form pair k.
        iloc1 = featuresTrain.sample(frac=0.5).index
        iloc2 = featuresTrain.sample(frac=0.5).index

        # the score represents how similar two images are
        scores = [
            compareImages(featuresTrain.loc[loc1], featuresTrain.loc[loc2])
            for loc1, loc2 in tqdm(zip(iloc1, iloc2), total=len(iloc1), leave=False)
        ]

        filenames1 = featuresTrain.loc[iloc1, 'new_filename'].values
        filenames2 = featuresTrain.loc[iloc2, 'new_filename'].values

        # A pair is a true match when both paintings have the same artist.
        truth = artistByFile.reindex(filenames1).values == artistByFile.reindex(filenames2).values
        predictions = generatePredictions(scores,probThreshold)

        accuracy, tp, tn, fp, fn = computeStats(truth,predictions)

        if (accuracy < prevAccuracy):
            print("peak accuracy")
            return bestThreshold, prevAccuracy

        # Remember this evaluation so the next one can be compared with it;
        # without this update the early stop above could never trigger.
        bestThreshold, prevAccuracy = probThreshold, accuracy
        probThreshold += increaseRate

    return bestThreshold, prevAccuracy

In [54]:
def computeDevAcc(featuresTrain,probability):
    """Evaluate a fixed similarity threshold on random pairs of paintings.

    Random pairs are drawn from `featuresTrain`, scored with compareImages and
    classified as "same artist" when the score exceeds `probability`. The
    predictions are compared with the artists recorded in the global
    `trainInfo`.

    Parameters
    ----------
    featuresTrain : pandas.DataFrame
        Rows with 'new_filename', 'pixelsx', 'pixelsy' and 'imgFeatures'
        (despite the name, the visible caller passes the dev split here).
    probability : float
        Threshold above which a pair is predicted to share an artist.

    Returns
    -------
    tuple
        ``(accuracy, tp, tn, fp, fn)`` exactly as returned by computeStats.
    """
    # filename -> artist lookup, so labels can be fetched in the same order as
    # the scored pairs. Filtering with isin() returns rows in trainInfo's own
    # order, which does not match the pair order.
    artistByFile = trainInfo.drop_duplicates('filename').set_index('filename')['artist']

    # Two independent half-samples; the k-th entries of each form pair k.
    iloc1 = featuresTrain.sample(frac=0.5).index
    iloc2 = featuresTrain.sample(frac=0.5).index

    # the score represents how similar two images are
    scores = [
        compareImages(featuresTrain.loc[loc1], featuresTrain.loc[loc2])
        for loc1, loc2 in tqdm(zip(iloc1, iloc2), total=len(iloc1), leave=False)
    ]

    filenames1 = featuresTrain.loc[iloc1, 'new_filename'].values
    filenames2 = featuresTrain.loc[iloc2, 'new_filename'].values

    # A pair is a true match when both paintings have the same artist.
    truth = artistByFile.reindex(filenames1).values == artistByFile.reindex(filenames2).values
    predictions = generatePredictions(scores,probability)

    #compute stats
    accuracy, tp, tn, fp, fn = computeStats(truth,predictions)
    return accuracy, tp, tn, fp, fn

In [None]:
# Fraction of featuresDF used to learn the threshold; the remaining rows form
# the dev split (nDevFraction documents that remainder and is not read below).
nTrainFraction = 0.6
nDevFraction = 0.4

thresholds = []
devAccuracies = []
tp = []
tn = []
fp = []
fn = []

# Number of independent train/dev splits to evaluate.
numberSamples = 10
for sampleIdx in tqdm(range(numberSamples)):
    featuresTrain = featuresDF.sample(frac=nTrainFraction)
    featuresDev = featuresDF.drop(featuresTrain.index)
    learnedThreshold, trainAccuracy = trainThreshold(featuresTrain,50,0.5,0.01)
    thresholds.append((learnedThreshold,trainAccuracy))

    # Average the dev accuracy over several random pairings to reduce noise.
    # The loop variable is deliberately distinct from the outer one.
    accuraciesDev = []
    for _ in range(10):
        acc, ctp, ctn, cfp, cfn = computeDevAcc(featuresDev,learnedThreshold)
        accuraciesDev.append(acc)
        tp.append(ctp)
        tn.append(ctn)
        fp.append(cfp)
        fn.append(cfn)
    meanDevAccuracy = np.mean(accuraciesDev)
    devAccuracies.append(meanDevAccuracy)

print(thresholds)
print(devAccuracies)
print(tp)
print(tn)
print(fp)
print(fn)

# The counters are plain lists, which have no .mean() method, so use np.mean.
# All four confusion-matrix averages are reported (tn was previously missing).
print(np.mean(tp))
print(np.mean(tn))
print(np.mean(fp))
print(np.mean(fn))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1251), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))