In [76]:
%matplotlib inline
from PIL import Image
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
import os
import itertools
from tqdm import *
from tqdm import tqdm_notebook as tqdm
import random

## Create pairs of filenames and truth values from training data

In [154]:
# Names of all JPEG files found in the first training-image folder.
fileNames = [entry for entry in os.listdir("data/train_1") if entry.endswith(".jpg")]

In [155]:
def isPaintingFromPopularArtist(paintingRow, popularArtists):
    """Tell whether a painting was made by one of the popular artists.

    Parameters
    ----------
    paintingRow : mapping-like
        A single painting record that can be indexed with the key
        ``'artist'`` (for example a row of the training-info table or a dict).
    popularArtists : container
        Anything that supports the ``in`` operator for artist names, such as a
        set or a list. For a pandas Series ``in`` tests the index labels, so a
        ``value_counts()`` result keyed by artist can be passed directly.

    Returns
    -------
    bool
        True when the painting's artist is contained in ``popularArtists``.
    """
    return paintingRow['artist'] in popularArtists

In [156]:
# Keep only prolific artists so that same-artist pairs are not vanishingly
# rare once the paintings are combined into tuples.
minNumPaintingsPerArtist = 250
trainInfo = pd.read_csv('data/train_info.csv')
paintingsPerArtist = trainInfo['artist'].value_counts()
mostPopularArtists = paintingsPerArtist[paintingsPerArtist > minNumPaintingsPerArtist]
# Filenames of every painting whose artist passed the threshold.
isByPopularArtist = trainInfo['artist'].isin(mostPopularArtists.index)
fileNamesDF = trainInfo.loc[isByPopularArtist, 'filename']

In [157]:
# Fraction of the popular-artist paintings used to build training pairs.
fractionOfData = 0.03
# Fixed seed so that re-running the notebook draws the same training sample.
trainSampleSeed = 42
trainFiles = fileNamesDF.sample(frac=fractionOfData, random_state=trainSampleSeed)
# Plain array of the sampled filenames; trainFiles itself keeps the original
# row index, which the test split uses to exclude these rows.
trainFileNames = trainFiles.values.flatten()

In [158]:
## Tuple-generation helpers with the same call signatures as their itertools namesakes
def product(*args, repeat=1):
    """Yield the Cartesian product of the input iterables as tuples.

    Delegates to ``itertools.product`` so results are produced one at a time
    instead of being accumulated in a list first.

    product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy
    product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111

    Parameters
    ----------
    *args : iterables
        The pools to combine; the right-most pool advances fastest.
    repeat : int, default 1
        Number of times the sequence of pools is repeated.

    Yields
    ------
    tuple
        One element from each pool, in lexicographic order of pool positions.

    Raises
    ------
    ValueError
        If ``repeat`` is negative (raised by ``itertools.product`` when the
        generator is first advanced).
    """
    yield from itertools.product(*args, repeat=repeat)
def permutations(iterable, r=None):
    """Yield all length-``r`` orderings of the elements of ``iterable``.

    Delegates to ``itertools.permutations``. Elements are treated as distinct
    by position, not by value.

    Parameters
    ----------
    iterable : iterable
        Source elements.
    r : int or None, default None
        Length of each permutation; None means the full length of the input.

    Yields
    ------
    tuple
        Permutations in lexicographic order of input positions. Nothing is
        yielded when ``r`` exceeds the number of elements.

    Raises
    ------
    ValueError
        If ``r`` is negative (raised by ``itertools.permutations`` when the
        generator is first advanced).
    """
    yield from itertools.permutations(iterable, r)
def combinations(iterable, r):
    """Yield all length-``r`` subsequences of ``iterable`` without repetition.

    Delegates to ``itertools.combinations``, which produces each combination
    directly rather than filtering a larger enumeration. Elements are treated
    as distinct by position, not by value.

    Parameters
    ----------
    iterable : iterable
        Source elements.
    r : int
        Number of elements in each combination.

    Yields
    ------
    tuple
        Combinations in lexicographic order of input positions. Nothing is
        yielded when ``r`` exceeds the number of elements.

    Raises
    ------
    ValueError
        If ``r`` is negative (raised by ``itertools.combinations`` when the
        generator is first advanced).
    """
    yield from itertools.combinations(iterable, r)

def isSameArtist(image1,image2,trainInfo):
    """Report whether two images are attributed to the same artist.

    Parameters
    ----------
    image1, image2 : str
        Values to look up in the ``'filename'`` column of ``trainInfo``.
    trainInfo : pandas.DataFrame
        Table with at least ``'filename'`` and ``'artist'`` columns.

    Returns
    -------
    bool
        Result of comparing the artist of the first row matching each
        filename.

    Raises
    ------
    IndexError
        If either filename has no matching row.
    """
    artistColumn = trainInfo['artist']
    nameColumn = trainInfo['filename']
    # Only the first matching row is consulted for each image.
    firstArtist = artistColumn[nameColumn == image1].iloc[0]
    secondArtist = artistColumn[nameColumn == image2].iloc[0]
    return firstArtist == secondArtist


In [159]:
print("Generating Training Set for " + str(len(trainFileNames)) + " files.")
# Every unordered pair of the sampled training images.
trainInputs = pd.DataFrame(list(combinations(trainFileNames,2)),columns=['image1','image2'])
trainInputs.to_csv('trainingTupplesNoTruth.csv')
trainInfo = pd.read_csv('data/train_info.csv')
# Label all pairs in one vectorised pass instead of filtering the whole info
# table twice per pair. drop_duplicates keeps the first row per filename, which
# matches isSameArtist's "first match" lookup.
artistByFile = trainInfo.drop_duplicates('filename').set_index('filename')['artist']
trainInputs['sameArtist'] = trainInputs['image1'].map(artistByFile) == trainInputs['image2'].map(artistByFile)
trainInputs.to_csv('trainingTupples' + str(fractionOfData) + '.csv')

Generating Training Set for 36 files.


In [160]:
# Share of training pairs painted by the same artist, and its complement.
# Both are fractions in [0, 1], despite the "Percentage" names.
numSameArtistPairs = int((trainInputs['sameArtist'] == True).sum())
PercentageTrue = numSameArtistPairs / trainInputs.shape[0]
PercentageFalse = 1 - PercentageTrue

In [161]:
# Show the class balance: same-artist fraction first, then different-artist.
print(PercentageTrue, PercentageFalse, sep="\n")

0.3349206349206349
0.6650793650793652


## Probabilistic Model

In [162]:
# Draw the test images from the paintings that were NOT sampled for training.
testSampleFraction = 0.01
# Fixed seed so that re-running the notebook draws the same test sample.
testSampleSeed = 42
testingSet = fileNamesDF.drop(trainFiles.index).sample(frac=testSampleFraction, random_state=testSampleSeed)
testFileNames = testingSet.values.flatten()

In [163]:
# Every unordered pair of test images, saved before any labels or predictions
# are attached.
testingInputs = pd.DataFrame(list(combinations(testFileNames,2)),columns=['image1','image2'])
testingInputs.to_csv('testingSet.csv')

In [164]:
def runProbModel(tuppleFrame, probTrue, rng=None):
    """Baseline model: guess "same artist" at random with a fixed probability.

    Parameters
    ----------
    tuppleFrame : object with a ``shape`` attribute
        Only ``shape[0]`` (the number of pairs) is used; the contents are
        never inspected.
    probTrue : float
        Probability of predicting True for any single pair.
    rng : random.Random or None, default None
        Source of uniform draws. None uses the module-level ``random``
        generator; pass a seeded ``random.Random`` for repeatable predictions.

    Returns
    -------
    list of bool
        One prediction per row; True when the uniform draw in [0, 1) falls
        below ``probTrue``.
    """
    draw = random.random if rng is None else rng.random
    return [draw() < probTrue for _ in range(tuppleFrame.shape[0])]

In [165]:
def score(testingInputs, predictionColumn='predictions0'):
    """Accuracy of one prediction column against the ground-truth labels.

    Parameters
    ----------
    testingInputs : pandas.DataFrame
        Must contain a ``'sameArtist'`` column and the column named by
        ``predictionColumn``.
    predictionColumn : str, default 'predictions0'
        Name of the prediction column to evaluate.

    Returns
    -------
    float
        Fraction of rows where the prediction equals ``'sameArtist'``.

    Raises
    ------
    ZeroDivisionError
        If ``testingInputs`` has no rows.
    """
    matches = testingInputs['sameArtist'] == testingInputs[predictionColumn]
    return int(matches.sum()) / matches.shape[0]

In [166]:
numExperiments = 1
# Seed the stdlib generator that runProbModel draws from so the random
# baseline gives the same predictions on every run.
predictionSeed = 0
random.seed(predictionSeed)
for i in tqdm(range(numExperiments)):
    # One prediction column per experiment: predictions0, predictions1, ...
    modelPredictions = runProbModel(testingInputs,PercentageTrue)
    testingInputs['predictions' + str(i)] = modelPredictions

In [167]:
# Compute the ground-truth labels for the test (dev) pairs.
trainInfo = pd.read_csv('data/train_info.csv')
# Label all pairs in one vectorised pass instead of filtering the whole info
# table twice per pair. drop_duplicates keeps the first row per filename, which
# matches isSameArtist's "first match" lookup.
artistByFile = trainInfo.drop_duplicates('filename').set_index('filename')['artist']
testingInputs['sameArtist'] = testingInputs['image1'].map(artistByFile) == testingInputs['image2'].map(artistByFile)
testingInputs.to_csv('testingResults' + str(fractionOfData) + '.csv')

In [168]:
# Fraction of test pairs whose random prediction matches the true label.
score(testingInputs)

0.5454545454545454

## Loading an Image

In [None]:
# Name of the sample painting to inspect. It must be called fileName because
# this cell and the metadata lookup further down both read that variable.
fileName = '10.jpg'
jpgfile = Image.open("data/train_1/" + fileName)

In [None]:
# Show the names of the image's colour bands (e.g. ('R', 'G', 'B') for RGB).
print(jpgfile.getbands())

In [None]:
# One row per pixel. Converting to RGB first guarantees three values per
# pixel, so images stored in another mode (grayscale, CMYK, RGBA, ...) still
# fit the three-column layout.
imageDF = pd.DataFrame(list(jpgfile.convert('RGB').getdata()),columns=['red','green','blue'])

In [None]:
# Histogram of the blue-channel values across all pixels of the sample image.
imageDF['blue'].hist()

## Generating RGB Features

In [None]:
# Flatten the image into one long feature vector: all red values, then all
# green values, then all blue values.
imgFeatures = pd.concat([imageDF[channel] for channel in ('red', 'green', 'blue')])

In [None]:
dataInfo = pd.read_csv('data/all_data_info.csv')
# Metadata row(s) for the sample image; its pixel dimensions could also be
# used as features.
sampleImageInfo = dataInfo[dataInfo['new_filename'] == fileName]
pixelsX = sampleImageInfo['pixelsx']
pixelsY = sampleImageInfo['pixelsy']

In [None]:
# Add the image dimensions to the end of the feature vector. pd.concat is used
# because Series.append no longer exists in pandas 2.x.
# NOTE: this extends imgFeatures in place of itself, so re-run the cell that
# builds imgFeatures before re-running this one or the dimensions are added twice.
imgFeatures = pd.concat([imgFeatures, pixelsX, pixelsY])

## Creating and training the model

In [None]:
# Instantiate a logistic-regression classifier with default settings; it is
# not fitted in this cell. The bare name below displays the estimator.
model = LogisticRegression()
model

# Test

In [9]:
# Load the solution file for inspection (presumably the reference labels for
# the full set of pairs — TODO confirm the file's provenance).
df = pd.read_csv('data/solution_painter.csv')

In [12]:
# Summary statistics of the numeric columns; the mean of 'sameArtist' in the
# output below (~0.013) indicates how rare same-artist pairs are in this file.
df.describe()

Unnamed: 0,index,sameArtist
count,21916050.0,21916050.0
mean,10958020.0,0.0131114
std,6326618.0,0.1137519
min,0.0,0.0
25%,5479012.0,0.0
50%,10958020.0,0.0
75%,16437030.0,0.0
max,21916050.0,1.0
