In [1]:
%matplotlib inline
from PIL import Image
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
import os
import itertools
from tqdm import *
from tqdm import tqdm_notebook as tqdm
import random
import numpy as np
import math

from IPython.display import clear_output

## Create pairs of filenames and truth values from training data

In [2]:
# Names of every .jpg file found directly inside the first training folder.
fileNames = [entry for entry in os.listdir("data/train_1") if entry.endswith(".jpg")]
# fileNamesDF = pd.DataFrame(fileNames)

In [3]:
def isPaintingFromPopularArtist(paintingRow,popularArtists):
    """Debugging stub: print a painting row and its artist, then return 0.

    Parameters
    ----------
    paintingRow : mapping-like
        Anything indexable by the string label ``'artist'``.
    popularArtists : any
        Accepted but not consulted yet.

    Returns
    -------
    int
        Always 0; the membership check itself is not implemented.
    """
    print(paintingRow)
    # Index by the string label: the bare name `artist` is not defined anywhere
    # in this function, so the previous lookup raised NameError.
    print(paintingRow['artist'])
    return 0

In [5]:
# Restrict the data to prolific artists so that same-artist pairs are not
# vanishingly rare once the image tuples are built.
minNumPaintingsPerArtist = 200
trainInfo = pd.read_csv('data/train_info.csv')
# Paintings per artist, keeping only artists strictly above the threshold.
mostPopularArtists = trainInfo['artist'].value_counts().loc[lambda counts: counts > minNumPaintingsPerArtist]
# Filenames of every painting whose artist passed the filter.
fileNamesDF = trainInfo.loc[trainInfo['artist'].isin(mostPopularArtists.index), 'filename']
# print(trainInfo[trainInfo['filename'].isin(fileNames)].shape)

In [6]:
# Number of filenames left after filtering to the popular artists.
fileNamesDF.shape

(21741,)

In [7]:
# Fraction of the filtered filenames drawn for the training sample.
fractionOfData = 0.03
# NOTE(review): sample() is called without random_state, so every run draws a
# different subset and downstream scores are not reproducible.
trainFiles = fileNamesDF.sample(frac=fractionOfData)
# Flat array of the sampled filenames (trainFiles keeps the original index).
trainFileNames = trainFiles.values.flatten()

In [8]:
## itertools-style helpers (see the itertools documentation), used to create tuples
def product(*args, repeat=1):
    """Yield the Cartesian product of the input iterables as tuples.

    Delegates to ``itertools.product`` so tuples are produced one at a time
    instead of materialising the complete result list up front, which needed
    memory proportional to the size of the whole product.

    Parameters
    ----------
    *args : iterables
        The pools to combine.
    repeat : int, keyword-only
        Number of times the sequence of pools is repeated.

    Yields
    ------
    tuple
        One element from each pool, in the same order as ``itertools.product``.

    Raises
    ------
    ValueError
        If ``repeat`` is negative (raised by ``itertools.product`` when the
        generator is first advanced).
    """
    # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy
    # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111
    yield from itertools.product(*args, repeat=repeat)
def permutations(iterable, r=None):
    """Yield all length-``r`` orderings of distinct positions of ``iterable``.

    Delegates to ``itertools.permutations`` rather than filtering all n**r
    index tuples of the full Cartesian product, which made the cost grow with
    n**r even though only n!/(n-r)! tuples are emitted.

    Parameters
    ----------
    iterable : iterable
        Source elements; they are treated as distinct by position.
    r : int or None
        Length of each permutation; ``None`` means the full length.

    Yields
    ------
    tuple
        Permutations in lexicographic order of the input positions. Nothing is
        yielded when ``r`` exceeds the number of elements.
    """
    yield from itertools.permutations(iterable, r)
def combinations(iterable, r):
    """Yield all length-``r`` subsequences of ``iterable`` in input order.

    Delegates to ``itertools.combinations``. The earlier version filtered every
    permutation (itself filtered from the n**r product), so pairing n files
    required work and memory proportional to n**2 lists before the first pair
    appeared.

    Parameters
    ----------
    iterable : iterable
        Source elements; they are treated as distinct by position.
    r : int
        Size of each combination.

    Yields
    ------
    tuple
        Combinations in lexicographic order of the input positions. Nothing is
        yielded when ``r`` exceeds the number of elements.
    """
    yield from itertools.combinations(iterable, r)

def isSameArtist(image1,image2,trainInfo):
    """Tell whether two image filenames are attributed to the same artist.

    Each filename is looked up in ``trainInfo['filename']`` and the artist of
    its first matching row is used. An IndexError is raised when a filename has
    no matching row.
    """
    def artistsFor(fileName):
        # All 'artist' values on rows whose filename matches.
        return trainInfo.loc[trainInfo['filename'] == fileName, 'artist']

    firstMatches = artistsFor(image1)
    secondMatches = artistsFor(image2)
    return firstMatches.iloc[0] == secondMatches.iloc[0]

In [9]:
def generateTrainingSet(trainFileNames,fractionOfData,minNumPaintingsPerArtist,keepTQDM = True):
    """Build every unordered pair of training filenames and label each pair.

    Parameters
    ----------
    trainFileNames : iterable
        Filenames to pair up via ``combinations(trainFileNames, 2)``.
    fractionOfData, minNumPaintingsPerArtist
        Only referenced by the disabled CSV exports below; otherwise unused.
    keepTQDM : bool
        Passed to tqdm as ``leave``.

    Returns
    -------
    pandas.DataFrame
        Columns ``image1``, ``image2`` and ``sameArtist``.

    Raises
    ------
    IndexError
        If a filename has no row in ``data/train_info.csv``.
    """
    trainInputs = pd.DataFrame(list(combinations(trainFileNames,2)),columns=['image1','image2'])
#     trainInputs.to_csv('trainingTupplesNoTruth'+ str(fractionOfData) + '-' + str(minNumPaintingsPerArtist) +'.csv')
    trainInfo = pd.read_csv('data/train_info.csv')
    # Build the filename -> artist lookup once instead of filtering the whole
    # frame twice for every pair. Keeping the first row per filename matches
    # the previous per-pair `.iloc[0]` lookup.
    firstRows = trainInfo.drop_duplicates(subset='filename')
    artistByFile = dict(zip(firstRows['filename'], firstRows['artist']))
    try:
        sameArtist = [artistByFile[first] == artistByFile[second]
                      for first, second in tqdm(trainInputs.values,leave=keepTQDM,desc='Gen Train Set')]
    except KeyError as missing:
        # Same exception type the per-pair `.iloc[0]` lookup raised.
        raise IndexError('no row in data/train_info.csv for filename ' + str(missing)) from None
    trainInputs['sameArtist'] = pd.Series(sameArtist)
#     trainInputs.to_csv('trainingTupples' + str(fractionOfData) + '-' + str(minNumPaintingsPerArtist) + '.csv')
    return trainInputs

In [None]:
# trainingInputs = generateTrainingSet(trainFileNames,fractionOfData,minNumPaintingsPerArtist)

In [None]:
# PercentageTrue = trainingInputs[trainingInputs['sameArtist'] == True].shape[0] / trainingInputs.shape[0]
# PercentageFalse = 1 - PercentageTrue

In [None]:
# print(PercentageTrue)
# print(PercentageFalse)

## Probabilistic Model

In [13]:
def score(testingInputs):
    """Return the fraction of rows where 'predictions' equals 'sameArtist'.

    Raises ZeroDivisionError when ``testingInputs`` has no rows.
    """
    hits = testingInputs['sameArtist'] == testingInputs['predictions']
    # int() keeps the result a plain Python float, as with the shape-based count.
    return int(hits.sum()) / len(hits)

In [14]:
def runProbModel(tuppleFrame,probTrue, keepTQDM=True):
    """Return one random boolean prediction per row of ``tuppleFrame``.

    For each row a value is drawn with ``random.random()`` and the prediction
    is whether that value is below ``probTrue``. Only the row count of
    ``tuppleFrame`` is used; ``keepTQDM`` is passed to tqdm as ``leave``.
    """
    rowCount = tuppleFrame.shape[0]
    progress = tqdm(range(rowCount),desc='Probability: ',leave=keepTQDM)
    return [random.random() < probTrue for _ in progress]

In [15]:
def computeTruth(testingInputs,keepTQDM = True):
    """Add a ``sameArtist`` truth column to ``testingInputs`` in place.

    Parameters
    ----------
    testingInputs : pandas.DataFrame
        Its first two columns are read as the two filenames of each pair.
    keepTQDM : bool
        Passed to tqdm as ``leave``.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    IndexError
        If a filename has no row in ``data/train_info.csv``.
    """
    trainInfo = pd.read_csv('data/train_info.csv')
    # Build the filename -> artist lookup once instead of filtering the whole
    # frame twice for every pair. Keeping the first row per filename matches
    # the previous per-pair `.iloc[0]` lookup.
    firstRows = trainInfo.drop_duplicates(subset='filename')
    artistByFile = dict(zip(firstRows['filename'], firstRows['artist']))
    try:
        sameArtist = [artistByFile[row[0]] == artistByFile[row[1]]
                      for row in tqdm(testingInputs.values,leave=keepTQDM,desc='Gen Tru Dev')]
    except KeyError as missing:
        # Same exception type the per-pair `.iloc[0]` lookup raised.
        raise IndexError('no row in data/train_info.csv for filename ' + str(missing)) from None
    testingInputs['sameArtist'] = pd.Series(sameArtist)
#     testingInputs.to_csv('testingResults' + str(fractionOfData) + '.csv')

In [16]:
def getResultsForTestSet(testSet,testFileNames,PercentageTrue,keepTQDM):
    """Score the random baseline on every pair of test filenames.

    Parameters
    ----------
    testSet
        Not used; kept so existing callers keep working. The former
        ``testSet.head()`` statement discarded its result and was removed.
    testFileNames : iterable
        Filenames to pair up via ``combinations(testFileNames, 2)``.
    PercentageTrue : float
        Probability handed to ``runProbModel`` for predicting True.
    keepTQDM : bool
        Forwarded to the progress bars of the helpers.

    Returns
    -------
    float
        The value returned by ``score`` for the predicted and true labels.
    """
    testingInputs = pd.DataFrame(list(combinations(testFileNames,2)),columns=['image1','image2'])
#     testingInputs.to_csv('testingSet.csv')

    # Predictions are stored first; computeTruth then adds 'sameArtist' in place.
    testingInputs['predictions'] = runProbModel(testingInputs,PercentageTrue,keepTQDM)
    computeTruth(testingInputs,keepTQDM)

    return score(testingInputs)

In [25]:
def getAvgScoreForNSplits(n,fraction,fileNamesDF,trainFiles,PercentageTrue,keepTQDM = True):
    """Score the baseline on ``n`` random samples of the non-training files.

    Parameters
    ----------
    n : int
        Number of random test samples to draw.
    fraction : float
        ``frac`` passed to ``sample`` for each test draw.
    fileNamesDF : pandas.Series
        All candidate filenames.
    trainFiles : pandas.Series
        Training sample; its index labels are removed from ``fileNamesDF``.
    PercentageTrue : float
        Probability of predicting True, forwarded to ``getResultsForTestSet``.
    keepTQDM : bool
        Passed to tqdm as ``leave`` and forwarded to the helpers.

    Returns
    -------
    tuple
        ``(scores, np.mean(scores))`` where ``scores`` has one entry per sample.
    """
    # The held-out pool is the same for every split, so drop the training rows once.
    heldOut = fileNamesDF.drop(trainFiles.index)
    scores = []
    for _ in tqdm(range(n),leave=keepTQDM,desc='Compute Sco: '):
        testingSet = heldOut.sample(frac=fraction)
        testFileNames = testingSet.values.flatten()
        scores.append(getResultsForTestSet(testingSet,testFileNames,PercentageTrue,keepTQDM))
    return scores, np.mean(scores)

In [18]:
# NOTE(review): getAvgScoreForNSplits takes PercentageTrue as its fifth
# positional parameter, so the trailing False below is bound to PercentageTrue
# and keepTQDM keeps its default. Presumably a probability was meant to be
# passed before it - confirm.
getAvgScoreForNSplits(10,0.01,fileNamesDF,trainFiles,False)

HBox(children=(IntProgress(value=0, description='Compute Sco: ', max=10), HTML(value='')))

NameError: name 'PercentageTrue' is not defined

## Do training and testing over different splits of train and test

In [26]:
def scoreOverRandomTrainingOverRandomSamples(fileNamesDF,nTraining,nScores,fractionTrain,fractionTest,keepTQDM = True):
    """Repeat the train/score cycle over ``nTraining`` random training samples.

    Each round samples ``fractionTrain`` of ``fileNamesDF``, measures the share
    of same-artist pairs in that sample, and uses that share as the baseline
    probability when scoring ``nScores`` random test samples of size
    ``fractionTest``. Reads the module-level ``minNumPaintingsPerArtist``.

    Returns ``(mean of round means, list of round means, list of per-round
    score lists)``.
    """
    roundMeans = []
    roundScores = []
    for _ in tqdm(range(nTraining),desc='Progress: ',leave=keepTQDM):
        trainFiles = fileNamesDF.sample(frac=fractionTrain)
        pairs = generateTrainingSet(trainFiles.values.flatten(),fractionTrain,minNumPaintingsPerArtist,keepTQDM)
        # Share of training pairs painted by the same artist.
        sameArtistCount = int((pairs['sameArtist'] == True).sum())
        PercentageTrue = sameArtistCount / pairs.shape[0]

        splitScores, splitMean = getAvgScoreForNSplits(nScores,fractionTest,fileNamesDF,trainFiles,PercentageTrue,keepTQDM = keepTQDM)
        roundMeans.append(splitMean)
        roundScores.append(splitScores)
    return np.mean(roundMeans), roundMeans, roundScores

In [None]:
# 5 training samples, 10 test samples each, 3% of the files for both; [0] keeps
# only the overall mean from the returned tuple.
meanScoreModel = scoreOverRandomTrainingOverRandomSamples(fileNamesDF,5,10,0.03,0.03,False)[0]

In [None]:
# Overall mean score of the random baseline.
print(meanScoreModel)

## Do training and testing just like before but over different train and dev set sizes

In [None]:
# Grid of mean scores: one row per training fraction, one column per dev fraction.
results = pd.DataFrame(index=np.linspace(0.01, 0.5, num=10),columns=np.linspace(0.01, 0.5, num=10))
numTrain = 3
numDev = 3
# The output name depends only on the grid shape and run settings, so build it once.
resultsPath = ('ScoreResults-'
               + str(results.shape[0]) +
               '-' +
               str(results.shape[1])
               + '-' + str(numTrain) + 'train' + str(numDev) + 'test-'
               + str(minNumPaintingsPerArtist) + '.csv')
for i in tqdm(results.index,desc='train size'):
    for j in tqdm(results.columns,desc='dev size',leave=False):
        meanScoreModel = scoreOverRandomTrainingOverRandomSamples(fileNamesDF,numTrain,numDev,i,j,False)[0]
        # Write with .loc[row, column]. The previous results[i][j] was chained
        # assignment (not guaranteed to reach the frame) and addressed column i /
        # row j, i.e. the transpose of the train-rows / dev-columns layout.
        results.loc[i, j] = meanScoreModel
        # Checkpoint after every grid cell so a long run can be inspected or resumed.
        results.to_csv(resultsPath)

HBox(children=(IntProgress(value=0, description='train size', max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, description='dev size', max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Progress: ', max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Train Set', max=23436), HTML(value='')))



Exception in thread Thread-5:
Traceback (most recent call last):
  File "/home/adrien/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/adrien/anaconda3/lib/python3.6/site-packages/tqdm/_monitor.py", line 62, in run
    for instance in self.tqdm_cls._instances:
  File "/home/adrien/anaconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



HBox(children=(IntProgress(value=0, description='Compute Sco: ', max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Probability: ', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Tru Dev', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Probability: ', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Tru Dev', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Probability: ', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Tru Dev', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Train Set', max=23436), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Compute Sco: ', max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Probability: ', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Tru Dev', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Probability: ', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Tru Dev', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Probability: ', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Tru Dev', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Train Set', max=23436), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Compute Sco: ', max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Probability: ', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Tru Dev', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Probability: ', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Tru Dev', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Probability: ', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Tru Dev', max=23005), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Progress: ', max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Train Set', max=23436), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Compute Sco: ', max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Probability: ', max=961191), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Gen Tru Dev', max=961191), HTML(value='')))

In [None]:
# Preview the first rows of the score grid.
results.head()

## Loading an Image

In [None]:
# Sample painting to inspect. The name must be `fileName`: it is what the
# Image.open call below and the later pixel-size lookup read, and the previous
# `filePair` assignment left `fileName` undefined.
fileName = '10.jpg'
jpgfile = Image.open("data/train_1/" + fileName)

In [None]:
# Show the names of the image's bands.
print(jpgfile.getbands())

In [None]:
# One row per pixel, one column per colour channel.
# NOTE(review): assumes every pixel yields exactly three values; an image with
# a different number of bands would not fit these columns - confirm.
imageDF = pd.DataFrame(list(jpgfile.getdata()),columns=['red','green','blue'])

In [None]:
# Histogram of the blue channel values.
imageDF['blue'].hist()

## Generating RGB Features

In [None]:
# Stack the three channels end to end into a single Series: red, then green, then blue.
imgFeatures = pd.concat([imageDF['red'],imageDF['green'],imageDF['blue']])

In [None]:
dataInfo = pd.read_csv('data/all_data_info.csv')
# Rows describing the current image; both lookups share this mask.
isThisImage = dataInfo['new_filename'] == fileName
# pixelsx and pixelsy could also be features
pixelsX = dataInfo.loc[isThisImage, 'pixelsx']
pixelsY = dataInfo.loc[isThisImage, 'pixelsy']

In [None]:
# Add the image dimensions after the channel values. pd.concat replaces the two
# Series.append calls, which no longer exist in pandas 2.x; order and index
# labels of the result are the same.
# NOTE: re-running this cell appends the dimensions again.
imgFeatures = pd.concat([imgFeatures, pixelsX, pixelsY])

## Creating and training the model

In [None]:
# Logistic regression with default settings; no fit call appears in the cells shown here.
model = LogisticRegression()
# Display the estimator's repr.
model

# Test

In [None]:
# Load data/solution_painter.csv for inspection.
df = pd.read_csv('data/solution_painter.csv')

In [None]:
# Summary statistics of the loaded frame.
df.describe()