In [213]:
%matplotlib inline
from PIL import Image
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
import os
import itertools
from tqdm import *
from tqdm import tqdm_notebook as tqdm
import random
import numpy as np
import math

from IPython.display import clear_output

## Create pairs of filenames and truth values from training data

In [193]:
# Collect the names of every JPEG found directly inside the first training folder.
fileNames = [entry for entry in os.listdir("data/train_1") if entry.endswith(".jpg")]

In [194]:
def isPaintingFromPopularArtist(paintingRow,popularArtists):
    """Debug stub: print a painting row and its artist, then return 0.

    Parameters
    ----------
    paintingRow
        Any object that supports ``paintingRow['artist']`` (for example a
        dict or a pandas row).
    popularArtists
        Currently unused; kept so the signature stays stable.

    Returns
    -------
    int
        Always 0 -- the actual popularity check is not implemented yet.
    """
    print(paintingRow)
    # Look the field up by its string label; the bare name `artist` is not
    # defined anywhere and raised NameError on every call.
    print(paintingRow['artist'])
    return 0

In [195]:
# Only keep artists with many paintings so that the pairwise labels
# aren't basically all False when building the tuples.
minNumPaintingsPerArtist = 400
trainInfo = pd.read_csv('data/train_info.csv')
# Paintings per artist, restricted to artists above the threshold.
mostPopularArtists = trainInfo['artist'].value_counts().loc[
    lambda counts: counts > minNumPaintingsPerArtist
]
# Filenames of every painting by one of those artists (original row index kept).
fileNamesDF = trainInfo.loc[trainInfo['artist'].isin(mostPopularArtists.index), 'filename']

In [196]:
# Number of filenames that survived the popular-artist filter.
fileNamesDF.shape

(1216,)

In [197]:
# Fraction of the filtered filenames to draw for the training sample.
fractionOfData = 0.03
# Random, unseeded sample; it keeps the original index, which later cells use
# to exclude these rows when drawing test samples.
trainFiles = fileNamesDF.sample(frac=fractionOfData)
# Plain 1-D array of the sampled filenames.
trainFileNames = trainFiles.values.flatten()

In [222]:
## Helpers to create tuples (same API as the itertools functions of the same name)
def product(*args, repeat=1):
    """Yield the Cartesian product of the input iterables as tuples.

    Delegates to ``itertools.product`` instead of building the complete
    result list in memory before yielding the first tuple.

    Parameters
    ----------
    *args
        The iterables to combine.
    repeat : int, keyword-only
        Number of times the sequence of input iterables is repeated.

    Yields
    ------
    tuple
        One tuple per element of the product, in lexicographic order of the
        input positions.

    Raises
    ------
    ValueError
        If ``repeat`` is negative (raised by ``itertools.product`` when the
        generator is first advanced).
    """
    # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy
    # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111
    yield from itertools.product(*args, repeat=repeat)
def permutations(iterable, r=None):
    """Yield every length-``r`` ordering of distinct positions of ``iterable``.

    Delegates to ``itertools.permutations`` rather than enumerating all
    ``n**r`` index tuples and discarding those with repeated indices.

    Parameters
    ----------
    iterable
        Source elements; elements are treated as distinct by position.
    r : int or None
        Length of each permutation; defaults to the number of elements.

    Yields
    ------
    tuple
        Permutations in lexicographic order of the input positions. Nothing
        is yielded when ``r`` exceeds the number of elements.

    Raises
    ------
    ValueError
        If ``r`` is negative (raised by ``itertools.permutations`` when the
        generator is first advanced).
    """
    yield from itertools.permutations(iterable, r)
def combinations(iterable, r):
    """Yield every length-``r`` combination of the elements of ``iterable``.

    Delegates to ``itertools.combinations`` rather than filtering sorted
    index tuples out of all permutations, which costs time exponential in
    ``r`` and makes pair generation needlessly slow for larger samples.

    Parameters
    ----------
    iterable
        Source elements; elements are treated as distinct by position.
    r : int
        Number of elements in each combination.

    Yields
    ------
    tuple
        Combinations in lexicographic order of the input positions. Nothing
        is yielded when ``r`` exceeds the number of elements.

    Raises
    ------
    ValueError
        If ``r`` is negative (raised by ``itertools.combinations`` when the
        generator is first advanced).
    """
    yield from itertools.combinations(iterable, r)

def isSameArtist(image1,image2,trainInfo):
    """Tell whether two image files are attributed to the same artist.

    Parameters
    ----------
    image1, image2
        Values looked up in the ``'filename'`` column of ``trainInfo``.
    trainInfo : pandas.DataFrame
        Frame with ``'filename'`` and ``'artist'`` columns.

    Returns
    -------
    Result of comparing the first matching ``'artist'`` value of each file.

    Raises
    ------
    IndexError
        If either filename has no row in ``trainInfo``.
    """
    matches1 = trainInfo.loc[trainInfo['filename'] == image1, 'artist']
    matches2 = trainInfo.loc[trainInfo['filename'] == image2, 'artist']
    # Only the first matching row of each file is considered.
    firstArtist = matches1.iloc[0]
    secondArtist = matches2.iloc[0]
    return firstArtist == secondArtist

In [243]:
def generateTrainingSet(trainFileNames,fractionOfData,minNumPaintingsPerArtist):
    """Build the table of all filename pairs with a same-artist label.

    Parameters
    ----------
    trainFileNames
        Sized iterable of filenames to pair up.
    fractionOfData, minNumPaintingsPerArtist
        Only used (via ``str``) to name the CSV files written to the
        working directory.

    Returns
    -------
    pandas.DataFrame
        Columns ``image1``, ``image2`` and ``sameArtist``.

    Side effects
    ------------
    Prints a one-line banner, reads ``data/train_info.csv`` and writes two
    CSV files: the unlabelled pairs and the labelled pairs.
    """
    print("Generating Training Set for " + str(len(trainFileNames)) + " files.")
    # Both output files share the same parameter-derived suffix.
    fileSuffix = str(fractionOfData) + '-' + str(minNumPaintingsPerArtist) + '.csv'

    pairs = list(combinations(trainFileNames,2))
    trainInputs = pd.DataFrame(pairs,columns=['image1','image2'])
    trainInputs.to_csv('trainingTupplesNoTruth' + fileSuffix)

    trainInfo = pd.read_csv('data/train_info.csv')
    labels = []
    for pair in tqdm(trainInputs.values):
        labels.append(isSameArtist(pair[0],pair[1],trainInfo))
    trainInputs['sameArtist'] = pd.Series(labels)

    trainInputs.to_csv('trainingTupples' + fileSuffix)
    return trainInputs

In [244]:
# Build the labelled pair table for the sampled training files
# (generateTrainingSet also writes its CSV files as a side effect).
trainingInputs = generateTrainingSet(trainFileNames,fractionOfData,minNumPaintingsPerArtist)

Generating Training Set for 36 files.


In [200]:
# Class balance of the training pairs: share painted by the same artist.
# Reads `trainingInputs`, the name assigned by the generateTrainingSet(...) cell;
# `trainInputs` only exists inside that function, so it is undefined on a fresh kernel.
PercentageTrue = trainingInputs[trainingInputs['sameArtist'] == True].shape[0] / trainingInputs.shape[0]
PercentageFalse = 1 - PercentageTrue

In [201]:
# Show the share of same-artist pairs and its complement.
print(PercentageTrue)
print(PercentageFalse)

0.35873015873015873
0.6412698412698412


## Probabilistic Model

In [202]:
def score(testingInputs):
    """Return the fraction of rows whose prediction equals the truth.

    Parameters
    ----------
    testingInputs : pandas.DataFrame
        Must contain ``'sameArtist'`` and ``'predictions'`` columns.

    Returns
    -------
    float
        Number of matching rows divided by the total number of rows.

    Raises
    ------
    ZeroDivisionError
        If the frame has no rows.
    """
    matches = testingInputs['sameArtist'] == testingInputs['predictions']
    # Summing a boolean Series counts its True entries.
    return int(matches.sum()) / len(matches)

In [227]:
def runProbModel(tuppleFrame,probTrue):
    """Random baseline: predict True for each row with probability ``probTrue``.

    Parameters
    ----------
    tuppleFrame
        Object whose ``shape[0]`` gives the number of predictions to make;
        its contents are not read.
    probTrue : float
        Threshold compared against one ``random.random()`` draw per row.

    Returns
    -------
    list of bool
        One prediction per row, in row order.
    """
    rowCount = tuppleFrame.shape[0]
    # Exactly one draw per row, taken from the module-level `random` generator.
    return [random.random() < probTrue for _ in range(rowCount)]

In [235]:
def computeTruth(testingInputs):
    """Add the ground-truth ``'sameArtist'`` column to a pair table, in place.

    Parameters
    ----------
    testingInputs : pandas.DataFrame
        Pair table whose first two columns hold the two filenames of each pair.

    Side effects
    ------------
    Reads ``data/train_info.csv``, mutates ``testingInputs`` and writes it
    to a CSV whose name embeds the module-level ``fractionOfData``.
    """
    # Compute truths for the dev set.
    trainInfo = pd.read_csv('data/train_info.csv')
    truths = []
    for pair in testingInputs.values:
        truths.append(isSameArtist(pair[0],pair[1],trainInfo))
    testingInputs['sameArtist'] = pd.Series(truths)

    outputPath = 'testingResults' + str(fractionOfData) + '.csv'
    testingInputs.to_csv(outputPath)

In [236]:
def getResultsForTestSet(testSet,testFileNames,PercentageTrue):
    """Score the random baseline on every pair of the given test files.

    Parameters
    ----------
    testSet
        Unused; kept so existing callers keep working. (The previous body
        only called ``testSet.head()`` and discarded the result.)
    testFileNames
        Iterable of filenames to pair up.
    PercentageTrue : float
        Probability with which the baseline predicts "same artist".

    Returns
    -------
    float
        Fraction of pairs whose random prediction matches the truth.

    Side effects
    ------------
    Writes ``testingSet.csv``; ``computeTruth`` reads the training info and
    writes its own results CSV.
    """
    testingInputs = pd.DataFrame(list(combinations(testFileNames,2)),columns=['image1','image2'])
    testingInputs.to_csv('testingSet.csv')

    # Predictions are drawn before the truth column is attached.
    testingInputs['predictions'] = runProbModel(testingInputs,PercentageTrue)
    computeTruth(testingInputs)

    return score(testingInputs)

In [259]:
def getAvgScoreForNSplits(n,fraction,fileNamesDF,trainFiles,percentageTrue=None):
    """Score the random baseline on ``n`` random samples of the held-out files.

    Parameters
    ----------
    n : int
        Number of random test samples to draw.
    fraction : float
        Fraction of the held-out filenames drawn for each sample.
    fileNamesDF : pandas.Series
        All candidate filenames.
    trainFiles : pandas.Series
        Training sample; its index labels are dropped from ``fileNamesDF``
        so test files never overlap the training files.
    percentageTrue : float, optional
        Probability of predicting "same artist". When omitted, the
        module-level ``PercentageTrue`` is used, which is what this function
        always did before the parameter existed.

    Returns
    -------
    tuple
        ``(scores, mean)`` -- the list of per-sample scores and
        ``np.mean`` of that list.
    """
    if percentageTrue is None:
        # Backward-compatible fallback to the notebook-level estimate.
        percentageTrue = PercentageTrue

    # Look only at the non-selected files; the exclusion does not depend on
    # the loop iteration, so compute it once.
    heldOut = fileNamesDF.drop(trainFiles.index)

    scores = []
    for _ in tqdm(range(n)):
        testingSet = heldOut.sample(frac=fraction)
        testFileNames = testingSet.values.flatten()
        scores.append(getResultsForTestSet(testingSet,testFileNames,percentageTrue))
    return scores, np.mean(scores)

In [260]:
# Score the random baseline on 10 random samples, each 1% of the filenames not in trainFiles.
getAvgScoreForNSplits(10,0.01,fileNamesDF,trainFiles)

([0.4696969696969697,
  0.6060606060606061,
  0.5909090909090909,
  0.6515151515151515,
  0.36363636363636365,
  0.5303030303030303,
  0.5303030303030303,
  0.6818181818181818,
  0.4090909090909091,
  0.5757575757575758],
 0.54090909090909089)

## Do training and testing over different splits of train and test

In [264]:
def scoreOverRandomTrainingOverRandomSamples(fileNamesDF,nTraining,nScores,fractionTrain,fractionTest):
    """Repeat the baseline experiment over several random train/test splits.

    For each of ``nTraining`` trials a fresh training sample is drawn, the
    same-artist rate of its pairs is estimated, and that rate drives the
    random baseline on ``nScores`` random samples of the remaining files.

    Parameters
    ----------
    fileNamesDF : pandas.Series
        All candidate filenames.
    nTraining : int
        Number of random training samples (trials).
    nScores : int
        Number of random test samples scored per trial.
    fractionTrain, fractionTest : float
        Sampling fractions for the training and test samples.

    Returns
    -------
    tuple
        ``(mean of the trial means, list of trial means, list of per-trial
        score lists)``.

    Notes
    -----
    Reads the module-level ``minNumPaintingsPerArtist``, which
    ``generateTrainingSet`` uses only to name its output files.
    """
    scores = []
    trialScores = []
    for _ in tqdm(range(nTraining)):
        trainFiles = fileNamesDF.sample(frac=fractionTrain)
        trainFileNames = trainFiles.values.flatten()
        trainingInputs = generateTrainingSet(trainFileNames,fractionTrain,minNumPaintingsPerArtist)
        PercentageTrue = trainingInputs[trainingInputs['sameArtist'] == True].shape[0] / trainingInputs.shape[0]

        # Score the held-out samples directly through getResultsForTestSet so
        # that THIS trial's PercentageTrue drives the predictions; previously
        # the value computed above was never passed on.
        heldOut = fileNamesDF.drop(trainFiles.index)
        perTrialScore = []
        for _ in tqdm(range(nScores)):
            testingSet = heldOut.sample(frac=fractionTest)
            testFileNames = testingSet.values.flatten()
            perTrialScore.append(getResultsForTestSet(testingSet,testFileNames,PercentageTrue))
        overallScore = np.mean(perTrialScore)

        scores.append(overallScore)
        trialScores.append(perTrialScore)
    return np.mean(scores), scores, trialScores

In [265]:
# 5 random training samples (3% of the files each), each scored on 10 random test samples (1% each).
scoreOverRandomTrainingOverRandomSamples(fileNamesDF,5,10,0.03,0.01)

Generating Training Set for 36 files.


Generating Training Set for 36 files.


Generating Training Set for 36 files.


Generating Training Set for 36 files.


Generating Training Set for 36 files.


(0.54454545454545455,
 [0.53939393939393931,
  0.52121212121212124,
  0.56515151515151518,
  0.5636363636363636,
  0.53333333333333333],
 [[0.5909090909090909,
   0.5303030303030303,
   0.42424242424242425,
   0.5454545454545454,
   0.5606060606060606,
   0.5454545454545454,
   0.5303030303030303,
   0.5757575757575758,
   0.5303030303030303,
   0.5606060606060606],
  [0.5303030303030303,
   0.5909090909090909,
   0.48484848484848486,
   0.5606060606060606,
   0.48484848484848486,
   0.4393939393939394,
   0.6212121212121212,
   0.42424242424242425,
   0.4696969696969697,
   0.6060606060606061],
  [0.5909090909090909,
   0.6060606060606061,
   0.6212121212121212,
   0.6363636363636364,
   0.5303030303030303,
   0.5303030303030303,
   0.6060606060606061,
   0.5151515151515151,
   0.5757575757575758,
   0.4393939393939394],
  [0.5606060606060606,
   0.6212121212121212,
   0.6363636363636364,
   0.48484848484848486,
   0.5909090909090909,
   0.5606060606060606,
   0.5757575757575758,
   0

## Loading an Image

In [None]:
# Name of the example painting. It must be bound to `fileName`, because both
# the open() call below and the later metadata lookup read that name.
fileName = '10.jpg'
jpgfile = Image.open("data/train_1/" + fileName)

In [None]:
# Print the names of the image's colour bands.
print(jpgfile.getbands())

In [None]:
# One row per pixel. Assumes getdata() yields 3-tuples (an RGB image) -- TODO confirm for non-RGB files.
imageDF = pd.DataFrame(list(jpgfile.getdata()),columns=['red','green','blue'])

In [None]:
# Histogram of the blue channel values.
imageDF['blue'].hist()

## Generating RGB Features

In [None]:
# Stack the three channel columns end-to-end into one long Series (red, then green, then blue).
imgFeatures = pd.concat([imageDF['red'],imageDF['green'],imageDF['blue']])

In [None]:
dataInfo = pd.read_csv('data/all_data_info.csv')
# pixelsx and pixelsy could also be features; select them for the rows
# whose new_filename matches the example image.
pixelsX = dataInfo.loc[dataInfo['new_filename'] == fileName, 'pixelsx']
pixelsY = dataInfo.loc[dataInfo['new_filename'] == fileName, 'pixelsy']

In [None]:
# Append the image dimensions after the pixel values. pd.concat replaces
# Series.append, which was removed in pandas 2.0; the resulting order
# (existing features, then pixelsX, then pixelsY) is unchanged.
# NOTE: re-running this cell appends the dimensions again.
imgFeatures = pd.concat([imgFeatures, pixelsX, pixelsY])

## Creating and training the model

In [None]:
# Instantiate a logistic regression with default settings (not fitted here);
# the bare name on the last line displays it.
model = LogisticRegression()
model

# Test

In [9]:
# Load data/solution_painter.csv into a DataFrame.
df = pd.read_csv('data/solution_painter.csv')

In [12]:
# Summary statistics of the loaded frame's numeric columns.
df.describe()

Unnamed: 0,index,sameArtist
count,21916050.0,21916050.0
mean,10958020.0,0.0131114
std,6326618.0,0.1137519
min,0.0,0.0
25%,5479012.0,0.0
50%,10958020.0,0.0
75%,16437030.0,0.0
max,21916050.0,1.0
