In [6]:
%matplotlib inline
from PIL import Image
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
import os
import itertools
from tqdm import *
from tqdm import tqdm_notebook as tqdm

## Create pairs of filenames and truth values from training data

In [7]:
# Collect the names of the JPEG files in the first training folder.
# os.listdir returns entries in arbitrary, platform-dependent order, so the
# names are sorted to keep the row order of fileNamesDF (and any seeded sample
# drawn from it) the same from one run or machine to the next.
fileNames = sorted(
    name for name in os.listdir("data/train_1") if name.endswith(".jpg")
)
# One file name per row, in a single unnamed column.
fileNamesDF = pd.DataFrame(fileNames)

In [8]:
# Draw a 1% random sample of the rows of fileNamesDF.
# A fixed seed makes the sample -- and every number derived from it further
# down the notebook -- reproducible after a kernel restart; change RANDOM_SEED
# to obtain a different draw.
RANDOM_SEED = 42
trainFiles = fileNamesDF.sample(frac=0.01, random_state=RANDOM_SEED)
# Flatten the sampled frame into a 1-D array of its values.
trainFileNames = trainFiles.values.flatten()

In [9]:
## Adapted from the recipes in the itertools documentation, used to create tuples
def product(*args, repeat=1):
    """Yield the Cartesian product of the given iterables as tuples.

    Every positional iterable is first copied into a tuple, and that list of
    pools is repeated ``repeat`` times. The partial results are built fully
    in memory, one pool at a time (with a tqdm bar over the pools), before the
    first tuple is yielded.
    """
    # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy
    # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111
    snapshots = [tuple(arg) for arg in args] * repeat
    partials = [[]]
    for pool in tqdm(snapshots):
        # Extend every partial result with every element of the current pool.
        extended = []
        for prefix in partials:
            for item in pool:
                extended.append(prefix + [item])
        partials = extended
    yield from map(tuple, partials)
def permutations(iterable, r=None):
    """Yield every r-length ordering of distinct positions of ``iterable``.

    ``r`` defaults to the number of elements. Candidates come from
    ``product`` over the index range; any index tuple that uses a position
    more than once is discarded.
    """
    items = tuple(iterable)
    size = len(items)
    if r is None:
        r = size
    for index_tuple in product(range(size), repeat=r):
        # A tuple with a repeated index has fewer than r distinct entries.
        if len(set(index_tuple)) != r:
            continue
        yield tuple(items[i] for i in index_tuple)
def combinations(iterable, r):
    """Yield every r-length combination of ``iterable`` as a tuple.

    Tuples are emitted in lexicographic order of element position, so for a
    pool ``(a, b, c)`` and ``r=2`` the result is ``(a, b), (a, c), (b, c)``.

    The work is delegated to ``itertools.combinations`` instead of building
    all ``n**r`` index tuples in memory and filtering them twice; the tuples
    produced and their order are the same. The tqdm bar counts the
    combinations actually emitted.

    Args:
        iterable: any iterable; it is copied into a tuple once up front.
        r: number of elements per combination. A negative ``r`` yields
            nothing, and ``r`` larger than the pool yields nothing.

    Yields:
        tuple: one combination of ``r`` elements from ``iterable``.
    """
    pool = tuple(iterable)
    if r < 0:
        # itertools.combinations raises ValueError here; the previous
        # hand-rolled version simply produced no results, so keep that.
        return
    yield from tqdm(itertools.combinations(pool, r))

def isSameArtist(image1,image2,trainInfo):
    """Return whether two images are attributed to the same artist.

    Args:
        image1: file name of the first image.
        image2: file name of the second image.
        trainInfo: DataFrame with at least 'filename' and 'artist' columns.

    Returns:
        The result of comparing the 'artist' value of the first row matching
        each file name.

    Raises:
        IndexError: if either file name has no matching row in ``trainInfo``.
    """
    def firstArtist(image):
        # All 'artist' values on rows whose 'filename' equals this image;
        # only the first match is used.
        matches = trainInfo.loc[trainInfo['filename'] == image, 'artist']
        return matches.iloc[0]

    return firstArtist(image1) == firstArtist(image2)


In [11]:
# Report how many sampled files the pairs are built from.
print("Generating Training Set for " + str(len(trainFileNames)) + " files.")

# Every 2-element combination of the sampled file names, one pair per row.
trainInputs = pd.DataFrame(
    data=list(combinations(trainFileNames, 2)),
    columns=['image1', 'image2'],
)
# Save the unlabeled pairs before the labeling step, which scans trainInfo
# once per image of every pair.
trainInputs.to_csv('trainingTupplesNoTruth.csv')

trainInfo = pd.read_csv('data/train_info.csv')
# Truth value per pair: do both images carry the same 'artist' in trainInfo?
trainInputs['sameArtist'] = pd.Series(
    [
        isSameArtist(image1, image2, trainInfo)
        for image1, image2 in tqdm(trainInputs.values)
    ]
)
trainInputs.to_csv('trainingTupples.csv')

Generating Training Set for 110 files.


In [22]:
# Share of pairs labeled as same-artist. Despite the names, both values are
# fractions between 0 and 1, not percentages.
PercentageTrue = len(trainInputs.loc[trainInputs['sameArtist'] == True]) / len(trainInputs)
# The remaining share: pairs not labeled as same-artist.
PercentageFalse = 1 - PercentageTrue

In [23]:
# Show the two class shares, one per line (same-artist first).
print(PercentageTrue, PercentageFalse, sep="\n")

0.002001668056713928
0.997998331943286


## Loading an Image

In [None]:
# Name of the single training image inspected in the cells below.
# Image.open on the next line and the later metadata lookup both read
# `fileName`, so that is the name that must be bound here; binding only
# `filePair` left `fileName` undefined and raised NameError on a fresh kernel.
fileName = '10.jpg'
filePair = fileName  # previous name, kept as an alias in case another cell uses it
jpgfile = Image.open("data/train_1/" + fileName)

In [None]:
# Print the band names reported for the opened image. The next cell labels
# the pixel data with three columns (red, green, blue), so three bands are
# expected here -- NOTE(review): confirm for images that are not RGB.
print(jpgfile.getbands())

In [None]:
# Table of pixel values with one column per colour channel.
# NOTE(review): assumes every item from getdata() is a 3-value tuple, i.e. the
# image has exactly three bands -- confirm before using other files.
imageDF = pd.DataFrame(
    data=list(jpgfile.getdata()),
    columns=['red', 'green', 'blue'],
)

In [None]:
# Histogram of the values in the 'blue' column of imageDF.
imageDF['blue'].hist()

## Generating RGB Features

In [None]:
# Stack the three channel columns end to end (red, then green, then blue)
# into one long Series of pixel values.
imgFeatures = pd.concat(
    [imageDF[channel] for channel in ('red', 'green', 'blue')]
)

In [None]:
# Per-image metadata table; rows are matched on the 'new_filename' column.
dataInfo = pd.read_csv('data/all_data_info.csv')
# Width and height entries for the image named by `fileName` (which must
# already be defined by an earlier cell). Each result is a Series holding the
# value from every matching row.
# pixelsx and pixelsy could also be features
pixelsX = dataInfo.loc[dataInfo['new_filename'] == fileName, 'pixelsx']
pixelsY = dataInfo.loc[dataInfo['new_filename'] == fileName, 'pixelsy']

In [None]:
# Add the image dimensions to the end of the pixel features.
# Series.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat builds the same Series (imgFeatures, then pixelsX, then pixelsY)
# on old and new pandas versions alike.
# NOTE: imgFeatures is rebuilt from itself, so running this cell a second
# time without re-running the cell that creates imgFeatures adds the
# dimensions again.
imgFeatures = pd.concat([imgFeatures, pixelsX, pixelsY])

## Creating and training the model

In [None]:
# Create a scikit-learn LogisticRegression with its default settings.
# Nothing is fitted in this cell; the bare `model` expression on the last
# line only displays the estimator as the cell's output.
model = LogisticRegression()
model