In [1]:
#=====================================================================================================
# Author: Ben Grauer
# Purpose: Script that will read images (split by a list of 4 groups for running in parallel),
#           assign to a class and then write properties to a file to model on
#=====================================================================================================

In [None]:
import cv2
import numpy as np
import pandas as pd
import math

import os
import os.path
import glob
import sys
from datetime import datetime

from PIL import Image
from PIL import ImageEnhance
from PIL import ImageFilter

from sklearn.cluster import KMeans
from scipy.stats import itemfreq

from unidecode import unidecode
import codecs

In [2]:
# All Features we are trying to capture

# Image RGB of Mean, Std, Var
# Image Size
# Color Dominance

# - Sharpness / Blur
# - Brightness
# - Contrast 
# - Dominant Colors - Need to break apart and store

## Class to store all the image properties

In [3]:
class AvitoImage():
    
    def __init__(self, imageFileName):

        # Instantiate Variables
        self.imageFileName = ''
        
        self.imageTypePil = None
        self.imageTypeCv2 = None
        
        # color channels
        self.clrchn_b_shape = 0
        self.clrchn_g_shape = 0
        self.clrchn_r_shape = 0
        
        self.clrchn_b_mean = 0
        self.clrchn_g_mean = 0
        self.clrchn_r_mean = 0
        
        self.clrchn_b_std = 0
        self.clrchn_g_std = 0
        self.clrchn_r_std = 0
        
        self.clrchn_b_var = 0
        self.clrchn_g_var = 0
        self.clrchn_r_var = 0
        
        self.imageSize = 0
        
        self.blurColorScale = 0
        self.blurGreyScale = 0
        
        self.contrastColorScale = 0
        self.contrastGreyScale = 0
        
        self.brightness = 0
        self.brightnessLuminanceA = 0
        self.brightnessLuminanceB = 0
        self.brightnessLuminanceC = 0
        
        self.imageHist = []
        self.imageCentroids = []
        
        self.imageCentroid1 = []
        self.imageCentroid2 = []
        self.imageCentroid3 = []
        self.imageCentroid4 = []
    
        # Assign the input image File Name
        self.imageFileName = imageFileName
    
        # Load / Open the images
        self.load_image()
    
    # load the image (keep both types for the differening operations)
    def load_image(self): # , imageFileName):
        
        try:
        
            self.imageTypeCv2 = cv2.imread(self.imageFileName)
            self.imageTypePil = Image.open(self.imageFileName)
        
        except:
            e = sys.exc_info()[0]
            print('ERROR (load_image): ' + str(e))

    def assign_bluegreenred_channel_statistics(self):
        try:
            # Split the image
            b,g,r = cv2.split(self.imageTypeCv2)

            # All the same
            self.clrchn_b_shape = b.shape
            self.clrchn_g_shape = g.shape
            self.clrchn_r_shape = r.shape

            self.clrchn_b_mean = b.mean()
            self.clrchn_g_mean = g.mean()
            self.clrchn_r_mean = r.mean()

            self.clrchn_b_std = b.std()
            self.clrchn_g_std = g.std()
            self.clrchn_r_std = r.std()

            self.clrchn_b_var = b.var()
            self.clrchn_g_var = g.var()
            self.clrchn_r_var = r.var()
        except:
            e = sys.exc_info()[0]
            print('ERROR (assign_bluegreenred_channel_statistics): ' + str(e))        
    
    # Get the width x height
    # https://stackoverflow.com/questions/1575625/how-can-i-read-how-many-pixels-an-image-has-in-python
    def assign_image_size(self):
        try:
            width, height = self.imageTypePil.size
            self.imageSize = (width * height)
        except:
            e = sys.exc_info()[0]
            print('ERROR (assign_image_size): ' + str(e))  
            
    # used for blurriness of an image
    def variance_of_laplacian(self, image):
        try:
            # compute the Laplacian of the image and then return the focus
            # measure, which is simply the variance of the Laplacian
            return cv2.Laplacian(image, cv2.CV_64F).var()
        except:
            e = sys.exc_info()[0]
            print('ERROR (variance_of_laplacian): ' + str(e))

    def assign_image_blur(self):
        try:
            # Color
            self.blurColorScale = cv2.Laplacian(self.imageTypeCv2, cv2.CV_64F).var()
            # GreyScale
            self.blurGreyScale = self.variance_of_laplacian(cv2.cvtColor(self.imageTypeCv2, cv2.COLOR_BGR2GRAY))
 
        except:
            e = sys.exc_info()[0]
            print('ERROR (assign_image_blur): ' + str(e))          
        
    # Credit for this one
    def determine_brightness_scale(self):
        try:
            #import Image
            imag = self.imageTypePil
            #Convert the image te RGB if it is a .gif for example
            imag = imag.convert ('RGB')
            #coordinates of the pixel
            X,Y = 0,0
            #Get RGB
            pixelRGB = imag.getpixel((X,Y))
            R,G,B = pixelRGB 

            self.brightness = sum([R,G,B])/3 ##0 is dark (black) and 255 is bright (white)
            #print(brightness)

            #Standard
            self.brightnessLuminanceA = (0.2126*R) + (0.7152*G) + (0.0722*B)

            #Percieved A
            self.brightnessLuminanceB = (0.299*R + 0.587*G + 0.114*B)

            #Perceived B, slower to calculate
            self.brightnessLuminanceC = np.sqrt(0.299*R**2 + 0.587*G**2 + 0.114*B**2 )
            
        except:
            e = sys.exc_info()[0]
            print('ERROR (determine_brightness_scale): ' + str(e))        
        
    def determine_contrast(self):
        try:
            self.contrastColorScale = np.var(self.imageTypeCv2)
            self.contrastGreyScale = np.var(cv2.cvtColor(self.imageTypeCv2, cv2.COLOR_BGR2GRAY))

        except:
            e = sys.exc_info()[0]
            print('ERROR (determine_contrast): ' + str(e))
        
    def determine_kmeans_cv2_dominant_colors(self):
        try:
            #img = cv2.imread(file1)
            img = self.imageTypeCv2

            Z = np.float32(img.reshape((-1,3)))
            #Z = np.float32(Z)

            # define criteria, number of clusters(K) and apply kmeans().  Going with 4 for now
            criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
            K=4
            _, labels, self.imageCentroids=cv2.kmeans(Z, K, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)

            #####################
            # Contstruct a histogram
            numLabels = np.arange(0, len(np.unique(labels)) + 1)
            (self.imageHist, _) = np.histogram(labels, bins = numLabels)

            # normalize the histogram, such that it sums to one
            self.imageHist = self.imageHist.astype("float")
            self.imageHist /= self.imageHist.sum()

            # Assign separately (additional for now)
            self.imageCentroid1 = self.imageCentroids[0]
            self.imageCentroid2 = self.imageCentroids[1]
            self.imageCentroid3 = self.imageCentroids[2]
            self.imageCentroid4 = self.imageCentroids[3]
    
        except:
            e = sys.exc_info()[0]
            print('ERROR (determine_kmeans_cv2_dominant_colors): ' + str(e))            
        
    # Run through all the functions below
    def load_all_image_properties(self):
        self.assign_bluegreenred_channel_statistics()
        self.assign_image_size()
        self.assign_image_blur()
        self.determine_brightness_scale()
        self.determine_contrast()
        self.determine_kmeans_cv2_dominant_colors()
        #print('All Properties loaded for image: ' + self.imageFileName)
    
    # used to create a dictionary to write to csv later (and then merge back into main data set)
    def to_dictionary(self):
        return {
            'imageFileName': self.imageFileName,
            'clrchn_b_shape':self.clrchn_b_shape,
            'clrchn_g_shape':self.clrchn_g_shape,
            'clrchn_r_shape':self.clrchn_r_shape,

            'clrchn_b_mean':self.clrchn_b_mean,
            'clrchn_g_mean':self.clrchn_g_mean,
            'clrchn_r_mean':self.clrchn_r_mean,
            
            'clrchn_b_std':self.clrchn_b_std,
            'clrchn_g_std':self.clrchn_g_std,
            'clrchn_r_std':self.clrchn_r_std,
            
            'clrchn_b_var':self.clrchn_b_var,
            'clrchn_g_var':self.clrchn_g_var,
            'clrchn_r_var':self.clrchn_r_var,
            
            'imageSize':self.imageSize,
            'blurColorScale':self.blurColorScale,
            'blurGreyScale':self.blurGreyScale,
            'contrastColorScale':self.contrastColorScale,
            'contrastGreyScale':self.contrastGreyScale,
            'brightness':self.brightness,
            'brightnessLuminanceA':self.brightnessLuminanceA,
            'brightnessLuminanceB':self.brightnessLuminanceB,
            'brightnessLuminanceC':self.brightnessLuminanceC,
            'imageHist':self.imageHist,
            #'imageCentroids':self.imageCentroids,
            
            'imageCentroid1':self.imageCentroid1,
            'imageCentroid2':self.imageCentroid2,
            'imageCentroid3':self.imageCentroid3,
            'imageCentroid4':self.imageCentroid4
            
        }
        

In [4]:
workdir = 'd:/project/data/kg_avito_demand'
testWorkDir = 'd:/project/data/kg_avito_demand/test_jpg/'
trainWorkDir = 'd:/project/data/kg_avito_demand/train_jpg/'

## Now Process the File / Group

In [5]:
## SET THE GROUP NUMBER WE ARE PROCESSING
#CONST_GROUP_NUM = 1
CONST_GROUP_NUM = 2
#CONST_GROUP_NUM = 3
#CONST_GROUP_NUM = 4

In [6]:
def add_additional_columns_to_dataframe(df, columnList):
    
    # Create the translated dataframe column
    for i in columnList:
        # check for existing
        if i in df:
            print(str(i) + ' - exists')
        else:
            print('Added col: ' + str(i))
            df[i] = ''


    return df

In [7]:
# verify we can convert an empty class to a dictionary (for writing the class items to a csv later)
#  also ensure I have captured all the properties
emptyClass = AvitoImage('')

colList = []
dictKeys = emptyClass.to_dictionary()
for key in dictKeys.keys():
    print(key)
    
    colList.append(key)

ERROR (load_image): <class 'AttributeError'>
imageFileName
clrchn_b_shape
clrchn_g_shape
clrchn_r_shape
clrchn_b_mean
clrchn_g_mean
clrchn_r_mean
clrchn_b_std
clrchn_g_std
clrchn_r_std
clrchn_b_var
clrchn_g_var
clrchn_r_var
imageSize
blurColorScale
blurGreyScale
contrastColorScale
contrastGreyScale
brightness
brightnessLuminanceA
brightnessLuminanceB
brightnessLuminanceC
imageHist
imageCentroid1
imageCentroid2
imageCentroid3
imageCentroid4


In [8]:
# read in the file (or resume)
groupedFileName = workdir + '/FullImageFileList_' + str(CONST_GROUP_NUM) + '.csv'

# Check to see if group file exists or not
if os.path.isfile(groupedFileName) == False:
    print('file not found - reading raw file')
    # Create the dataframe from the base file
    dfGroup = pd.read_csv(workdir + '/FullImageFileList.csv')
    
    # append the additional columns
    dfGroup = add_additional_columns_to_dataframe(dfGroup, colList)
    
    # else read it in, and re-write to it
    df = dfGroup[dfGroup['group']==CONST_GROUP_NUM].copy()
    df = df.reset_index(drop=True)
    
else:
    print('file found - read in')
    dfGroup = pd.read_csv(groupedFileName)
    df = dfGroup.copy()

file found - read in


  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
print('Total row to convert: ' + str(len(df[df['processed']==0])))
df.head()

Total row to convert: 19999


Unnamed: 0,filename,test_or_train,group,processed,imageFileName,clrchn_b_shape,clrchn_g_shape,clrchn_r_shape,clrchn_b_mean,clrchn_g_mean,...,contrastGreyScale,brightness,brightnessLuminanceA,brightnessLuminanceB,brightnessLuminanceC,imageHist,imageCentroid1,imageCentroid2,imageCentroid3,imageCentroid4
0,d:/project/data/kg_avito_demand/train_jpg\5c1c...,train,2,1,d:/project/data/kg_avito_demand/train_jpg\5c1c...,"(360, 480)","(360, 480)","(360, 480)",158.97044,156.503704,...,2841.112662,141.0,139.5736,139.869,139.877557,[0.18982639 0.26171875 0.25585069 0.29260417],[245.62064 241.65446 239.44666],[187.77763 183.85225 182.5845 ],[140.56493 136.5893 138.04141],[ 93.08344 94.2135 107.01523]
1,d:/project/data/kg_avito_demand/train_jpg\5c1c...,train,2,1,d:/project/data/kg_avito_demand/train_jpg\5c1c...,"(480, 360)","(480, 360)","(480, 360)",152.528929,159.513519,...,3919.177765,188.666667,192.2242,190.984,191.04422,[0.25399306 0.26102431 0.12935764 0.355625 ],[ 89.84919 108.05334 152.33958],[171.01332 178.05391 181.72234],[37.335255 41.96524 62.39659 ],[225.62987 225.41664 224.77307]
2,d:/project/data/kg_avito_demand/train_jpg\5c1c...,train,2,1,d:/project/data/kg_avito_demand/train_jpg\5c1c...,"(360, 480)","(360, 480)","(360, 480)",128.5607,138.093194,...,1793.43546,124.0,124.4578,125.726,126.166897,[0.33345486 0.18453125 0.29853009 0.1834838 ],[129.87323 152.87642 154.2678 ],[68.41424 68.16198 78.74619],[180.51736 182.70004 181.96062],[102.1311 108.98148 117.57919]
3,d:/project/data/kg_avito_demand/train_jpg\5c1c...,train,2,1,d:/project/data/kg_avito_demand/train_jpg\5c1c...,"(480, 270)","(480, 270)","(480, 270)",128.429228,139.702276,...,756.944826,149.333333,152.3828,153.102,153.53526,[0.21593364 0.41996914 0.07979167 0.28430556],[114.38771 125.91903 147.27962],[134.00777 145.8285 165.53654],[51.312542 60.092255 84.46697 ],[152.49663 163.46423 182.89972]
4,d:/project/data/kg_avito_demand/train_jpg\5c1c...,train,2,1,d:/project/data/kg_avito_demand/train_jpg\5c1c...,"(360, 543)","(360, 543)","(360, 543)",163.693774,106.737022,...,2680.099758,123.666667,124.8516,124.729,124.741328,[0.39176386 0.13312359 0.18520053 0.28991201],[241.27737 145.68028 118.6317 ],[174.61337 170.27794 174.73953],[151.79695 92.76654 92.58876],[61.45172 33.859665 61.450893]


In [11]:
print('Total rows to convert: ' + str(len(df[df['processed']==0])) + ' out of: ' + str(len(df)))
startTime = datetime.now()


# For each row in the data set
for i, row in df.iterrows():

    #print(str(i))
    #print(str(row))
    
    #print('row ' + str(i))
    # Here skip if we have already processed (or only process if still outstanding)
    if df.at[i, 'processed'] == 0:
        
        # load the class item
        imgClass = AvitoImage(df.at[i, 'filename'])
        imgClass.load_all_image_properties()

        dictImage = imgClass.to_dictionary()
        
        # now cycle through each of the individual columns at a time
        for key in dictImage.keys():
            # 
            tempAttr= getattr(imgClass,key)
            # get the attribute of the class to assign to the dataframe
            df.iloc[i,df.columns.get_loc(key)] = str(getattr(imgClass,key))

        # Set the processed
        df.at[i, 'processed'] = 1

        # every 10k records - write the file for checkpoint
        if (i % 10000 == 0):
            print('Processed row (' + str(datetime.now()) + '): ' + str(i) + ' and creating file snapshot. ' + str(datetime.now() - startTime))

            file = codecs.open(groupedFileName, 'w', 'utf-8') 
            df.to_csv(file, index=False)
            file.close()

        # Test 25
        #if i % 10 == 0:
        #    break

print (datetime.now() - startTime)
print('Finished - Writing final File')

# Final Write when finished with loop
file = codecs.open(groupedFileName, 'w', 'utf-8') 
df.to_csv(file, index=False)
file.close()

print('All Rows Processed!')

Total rows to convert: 19999 out of: 500000
Processed row (2018-06-05 00:08:12.745122): 490000 and creating file snapshot. 2:23:05.276118
4:50:34.860955
Finished - Writing final File
All Rows Processed!
