In [None]:
import cv2
import numpy as np
import pandas as pd
import math

import os
import os.path
import glob
import sys
from datetime import datetime

from PIL import Image
from PIL import ImageEnhance
from PIL import ImageFilter

from sklearn.cluster import KMeans
from scipy.stats import itemfreq

from unidecode import unidecode
import codecs

In [None]:
# Features captured for each image

# Per-channel (B, G, R) mean, standard deviation and variance
# Image size (pixel count)
# Color dominance

# - Sharpness / blur
# - Brightness
# - Contrast
# - Dominant colors - need to be broken apart and stored individually

## Class to store all the image properties

In [None]:
class AvitoImage():
    """Extracts and stores simple visual features for a single image file.

    The image is loaded twice, once with OpenCV (BGR array) and once with
    Pillow, because the feature methods use both APIs.  Construction only
    loads the image; call load_all_image_properties() to compute the features
    and to_dictionary() to export them.

    Every feature method is best-effort: on failure it prints an
    'ERROR (<method>): ...' line and leaves its attributes at their defaults,
    so one unreadable image does not abort a long batch run.  Only
    ``Exception`` subclasses are swallowed, so Ctrl-C still stops the run.
    """

    # Number of dominant colours extracted by k-means; one imageCentroidN
    # attribute exists per cluster.
    _NUM_DOMINANT_COLORS = 4

    def __init__(self, imageFileName):
        """Initialise all feature attributes to defaults and load the image.

        Args:
            imageFileName: path of the image file.  An empty string creates a
                "template" instance (nothing is loaded) whose to_dictionary()
                keys can be used as column names.
        """
        self.imageFileName = imageFileName

        # Loaded image objects (None until load_image succeeds)
        self.imageTypePil = None
        self.imageTypeCv2 = None

        # Per-channel statistics (OpenCV channel order: blue, green, red)
        self.clrchn_b_shape = 0
        self.clrchn_g_shape = 0
        self.clrchn_r_shape = 0

        self.clrchn_b_mean = 0
        self.clrchn_g_mean = 0
        self.clrchn_r_mean = 0

        self.clrchn_b_std = 0
        self.clrchn_g_std = 0
        self.clrchn_r_std = 0

        self.clrchn_b_var = 0
        self.clrchn_g_var = 0
        self.clrchn_r_var = 0

        # width * height in pixels
        self.imageSize = 0

        # Variance of the Laplacian (higher = sharper)
        self.blurColorScale = 0
        self.blurGreyScale = 0

        # Pixel-value variance
        self.contrastColorScale = 0
        self.contrastGreyScale = 0

        self.brightness = 0
        self.brightnessLuminanceA = 0
        self.brightnessLuminanceB = 0
        self.brightnessLuminanceC = 0

        # Dominant colours: cluster weights and cluster centres
        self.imageHist = []
        self.imageCentroids = []

        self.imageCentroid1 = []
        self.imageCentroid2 = []
        self.imageCentroid3 = []
        self.imageCentroid4 = []

        # Load / Open the images
        self.load_image()

    def _report_error(self, methodName, error):
        """Print a one-line description of a failure inside `methodName`."""
        fileName = getattr(self, 'imageFileName', '')
        print('ERROR (' + methodName + '): ' + repr(error) + ' [file: ' + repr(fileName) + ']')

    # load the image
    def load_image(self):
        """Load self.imageFileName with both OpenCV and Pillow.

        On failure the corresponding attribute stays None and an error line
        is printed; nothing is raised.
        """
        if not self.imageFileName:
            # Template instance: there is nothing to load.
            return

        try:
            # cv2.imread reports an unreadable file by returning None rather
            # than raising, so the failure has to be checked for explicitly.
            self.imageTypeCv2 = cv2.imread(self.imageFileName)
            if self.imageTypeCv2 is None:
                self._report_error('load_image', IOError('cv2.imread could not read the image'))

            # Image.open is lazy and keeps the file open; load() reads the
            # pixel data now so the file handle is released promptly.
            pilImage = Image.open(self.imageFileName)
            pilImage.load()
            self.imageTypePil = pilImage

        except Exception as e:
            self._report_error('load_image', e)

    def assign_bluegreenred_channel_statistics(self):
        """Store shape, mean, std and variance of each B/G/R channel."""
        try:
            # Split the image
            b, g, r = cv2.split(self.imageTypeCv2)

            # All the same
            self.clrchn_b_shape = b.shape
            self.clrchn_g_shape = g.shape
            self.clrchn_r_shape = r.shape

            self.clrchn_b_mean = b.mean()
            self.clrchn_g_mean = g.mean()
            self.clrchn_r_mean = r.mean()

            self.clrchn_b_std = b.std()
            self.clrchn_g_std = g.std()
            self.clrchn_r_std = r.std()

            self.clrchn_b_var = b.var()
            self.clrchn_g_var = g.var()
            self.clrchn_r_var = r.var()
        except Exception as e:
            self._report_error('assign_bluegreenred_channel_statistics', e)

    # Get the width x height
    # https://stackoverflow.com/questions/1575625/how-can-i-read-how-many-pixels-an-image-has-in-python
    def assign_image_size(self):
        """Store the pixel count (width * height) in self.imageSize."""
        try:
            width, height = self.imageTypePil.size
            self.imageSize = (width * height)
        except Exception as e:
            self._report_error('assign_image_size', e)

    def variance_of_laplacian(self, image):
        """Return the variance of the Laplacian of `image` (a focus measure).

        Returns None (after printing an error line) if the computation fails.
        """
        try:
            # compute the Laplacian of the image and then return the focus
            # measure, which is simply the variance of the Laplacian
            return cv2.Laplacian(image, cv2.CV_64F).var()
        except Exception as e:
            self._report_error('variance_of_laplacian', e)

    def assign_image_blur(self):
        """Store the Laplacian variance of the colour and greyscale image."""
        try:
            # Color
            self.blurColorScale = cv2.Laplacian(self.imageTypeCv2, cv2.CV_64F).var()
            # GreyScale
            self.blurGreyScale = self.variance_of_laplacian(cv2.cvtColor(self.imageTypeCv2, cv2.COLOR_BGR2GRAY))
        except Exception as e:
            self._report_error('assign_image_blur', e)

    def determine_brightness_scale(self):
        """Store brightness measures computed from the image's mean colour.

        The mean R, G and B over all pixels are used.  (Earlier revisions
        sampled only the pixel at (0, 0), which describes the top-left corner
        rather than the image; values produced by that version are not
        comparable with the ones produced here.)
        """
        try:
            # Convert to RGB so palette / greyscale / RGBA inputs (e.g. .gif)
            # all yield exactly three channels.
            rgbPixels = np.asarray(self.imageTypePil.convert('RGB'), dtype=np.float64)
            R, G, B = (float(channelMean) for channelMean in rgbPixels.reshape(-1, 3).mean(axis=0))

            # 0 is dark (black) and 255 is bright (white)
            self.brightness = (R + G + B) / 3

            # Standard (relative luminance)
            self.brightnessLuminanceA = (0.2126*R) + (0.7152*G) + (0.0722*B)

            # Perceived A
            self.brightnessLuminanceB = (0.299*R + 0.587*G + 0.114*B)

            # Perceived B, slower to calculate
            self.brightnessLuminanceC = np.sqrt(0.299*R**2 + 0.587*G**2 + 0.114*B**2)

        except Exception as e:
            self._report_error('determine_brightness_scale', e)

    def determine_contrast(self):
        """Store the pixel-value variance of the colour and greyscale image."""
        try:
            self.contrastColorScale = np.var(self.imageTypeCv2)
            self.contrastGreyScale = np.var(cv2.cvtColor(self.imageTypeCv2, cv2.COLOR_BGR2GRAY))
        except Exception as e:
            self._report_error('determine_contrast', e)

    @staticmethod
    def _cluster_histogram(labels, numClusters):
        """Return the fraction of labels assigned to each of `numClusters` clusters.

        The result always has `numClusters` entries, entry k belonging to
        cluster k, even when some cluster received no labels.
        """
        counts = np.bincount(np.asarray(labels).ravel(), minlength=numClusters).astype("float")
        total = counts.sum()
        if total > 0:
            # normalize the histogram, such that it sums to one
            counts /= total
        return counts

    def determine_kmeans_cv2_dominant_colors(self):
        """Cluster the pixels into dominant colours with OpenCV k-means.

        Stores the cluster centres (BGR) in imageCentroids and
        imageCentroid1..4, and each cluster's share of pixels in imageHist.
        """
        try:
            K = self._NUM_DOMINANT_COLORS

            # One row per pixel, one column per channel
            Z = np.float32(self.imageTypeCv2.reshape((-1, 3)))

            # define criteria, number of clusters(K) and apply kmeans()
            criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
            _, labels, self.imageCentroids = cv2.kmeans(Z, K, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)

            # Construct the histogram over all K clusters.  Deriving the bins
            # from the labels that happen to occur would misalign the
            # histogram with the centroids whenever a cluster is empty.
            self.imageHist = self._cluster_histogram(labels, K)

            # Assign separately (additional for now)
            self.imageCentroid1 = self.imageCentroids[0]
            self.imageCentroid2 = self.imageCentroids[1]
            self.imageCentroid3 = self.imageCentroids[2]
            self.imageCentroid4 = self.imageCentroids[3]

        except Exception as e:
            self._report_error('determine_kmeans_cv2_dominant_colors', e)

    # Run through all the functions below
    def load_all_image_properties(self):
        """Compute every feature; each step reports its own failure."""
        self.assign_bluegreenred_channel_statistics()
        self.assign_image_size()
        self.assign_image_blur()
        self.determine_brightness_scale()
        self.determine_contrast()
        self.determine_kmeans_cv2_dominant_colors()

    def to_dictionary(self):
        """Return the exported features as a dict keyed by attribute name.

        Every key is also the name of the attribute holding the value.  The
        combined imageCentroids array is not exported; the four centroids are
        exported individually instead.
        """
        return {
            'imageFileName': self.imageFileName,
            'clrchn_b_shape': self.clrchn_b_shape,
            'clrchn_g_shape': self.clrchn_g_shape,
            'clrchn_r_shape': self.clrchn_r_shape,

            'clrchn_b_mean': self.clrchn_b_mean,
            'clrchn_g_mean': self.clrchn_g_mean,
            'clrchn_r_mean': self.clrchn_r_mean,

            'clrchn_b_std': self.clrchn_b_std,
            'clrchn_g_std': self.clrchn_g_std,
            'clrchn_r_std': self.clrchn_r_std,

            'clrchn_b_var': self.clrchn_b_var,
            'clrchn_g_var': self.clrchn_g_var,
            'clrchn_r_var': self.clrchn_r_var,

            'imageSize': self.imageSize,
            'blurColorScale': self.blurColorScale,
            'blurGreyScale': self.blurGreyScale,
            'contrastColorScale': self.contrastColorScale,
            'contrastGreyScale': self.contrastGreyScale,
            'brightness': self.brightness,
            'brightnessLuminanceA': self.brightnessLuminanceA,
            'brightnessLuminanceB': self.brightnessLuminanceB,
            'brightnessLuminanceC': self.brightnessLuminanceC,
            'imageHist': self.imageHist,
            #'imageCentroids':self.imageCentroids,

            'imageCentroid1': self.imageCentroid1,
            'imageCentroid2': self.imageCentroid2,
            'imageCentroid3': self.imageCentroid3,
            'imageCentroid4': self.imageCentroid4

        }
        

In [None]:
# Root data directory; the image folders live directly underneath it.
workdir = 'd:/project/data/kg_avito_demand'
testWorkDir = workdir + '/test_jpg/'
trainWorkDir = workdir + '/train_jpg/'


## TESTING FROM HERE

In [None]:
# Load one training image as a smoke test; the constructor itself calls
# load_image(), so no separate load call is needed.
ben = AvitoImage(trainWorkDir + '000cc55987a5dbeef75f6628bd6acda928b1a8758274e196e5738ab2c6b053ee.jpg')
#ben.load_image()
# Display the stored path to confirm which file was requested.
ben.imageFileName

In [None]:
# Compute every feature for the sample image; a step that fails prints an
# 'ERROR (...)' line instead of raising.
ben.load_all_image_properties()

In [None]:
# Spot-check one of the brightness measures set by determine_brightness_scale().
ben.brightnessLuminanceC

In [None]:
# Display all exported properties of the sample image.
ben.to_dictionary()

In [None]:
# Wrap the property dictionary in a one-row DataFrame.
# NOTE: the name `df` is reassigned later by the group-loading cell.
df = pd.DataFrame.from_records([ben.to_dictionary()])

In [None]:
# Preview the frame currently bound to `df`.
df.head()

In [None]:
# Export the sample frame in two formats so the round trips can be compared:
# CSV (values become text) and pickle (Python objects are preserved).
df.to_csv(workdir + '/exportBen.csv', index=False)
df.to_pickle(workdir + '/exportBen.pickle')

In [None]:
# Display the directory the export files were written to.
workdir

In [None]:
# Read the CSV export back to inspect what survives the round trip.
newDf = pd.read_csv(workdir + '/exportBen.csv')
newDf

In [None]:
# to_dictionary() does not export a combined 'imageCentroids' column (asking
# for it raises KeyError); the centroids are stored one per column as
# imageCentroid1..imageCentroid4.
arrList = np.asarray(newDf['imageCentroid1'])
#arrList
# After the CSV round trip the centroid comes back in its text form, so show
# the whole value rather than indexing into it.
arrList[0]

In [None]:
# Read the pickle export back.
# NOTE: unpickling can execute arbitrary code; only load files written by this notebook.
newDfPickle = pd.read_pickle(workdir + '/exportBen.pickle')

In [None]:
# The frame has no combined 'imageCentroids' column (to_dictionary() exports
# imageCentroid1..imageCentroid4 instead), so read the first centroid column.
arrPickle = np.asarray(newDfPickle.imageCentroid1)

In [None]:
# Index into the first row's entry of the array built from the pickled frame.
arrPickle[0][0]

In [None]:
workdir = 'd:/project/data/kg_avito_demand'
testWorkDir = 'd:/project/data/kg_avito_demand/test_jpg/'
trainWorkDir = 'd:/project/data/kg_avito_demand/train_jpg/'

# AvitoImage requires the file name in its constructor, which also loads the
# image; load_image() itself takes no path argument.
ben = AvitoImage(trainWorkDir + '000cc55987a5dbeef75f6628bd6acda928b1a8758274e196e5738ab2c6b053ee.jpg')

In [None]:
# Read the master list of image files from the working directory.
dfFullFileList = pd.read_csv(workdir + '/FullImageFileList.csv')

In [None]:
# Preview the first rows of the master file list.
dfFullFileList.head()

In [None]:
# Now add each of the items to the dataframe

In [None]:
# Then loop through each one for the group and then start appending

In [None]:
# Keep the sample image's exported properties as a plain dict.
dicBen = ben.to_dictionary()

In [None]:
# Read a single property straight from the instance; the processing loop
# fetches attributes by name in the same way.
ben2 = ben.imageCentroid2
ben2

## Now Process the File / Group

In [None]:
## Group number to process in this run. It selects the rows whose 'group'
## column matches and is part of the name of the per-group checkpoint CSV.
CONST_GROUP_NUM = 1

In [None]:
def add_additional_columns_to_dataframe(df, columnList):
    """Make sure every name in `columnList` is a column of `df`.

    Missing columns are added in place, filled with empty strings; columns
    that already exist are left untouched.  One status line is printed per
    name.  Returns the same (mutated) `df`.
    """
    for columnName in columnList:
        label = str(columnName)

        if columnName not in df:
            print('Added col: ' + label)
            df[columnName] = ''
            continue

        # already present - report it and keep its current contents
        print(label + ' - exists')

    return df

In [None]:
# A template instance (empty file name) is used only for its dictionary keys,
# which become the feature column names.
emptyClass = AvitoImage('')
dictKeys = emptyClass.to_dictionary()

colList = list(dictKeys)
for key in colList:
    print(key)

In [None]:
groupedFileName = workdir + '/FullImageFileList_' + str(CONST_GROUP_NUM) + '.csv'

if os.path.isfile(groupedFileName):
    # A checkpoint for this group already exists: resume from it.
    print('file found - read in')
    dfGroup = pd.read_csv(groupedFileName)
    df = dfGroup.copy()

else:
    # First run for this group: start from the master list, add the feature
    # columns, then keep only this group's rows with a fresh 0..n-1 index.
    print('file not found - reading raw file')
    dfGroup = add_additional_columns_to_dataframe(
        pd.read_csv(workdir + '/FullImageFileList.csv'),
        colList,
    )
    df = dfGroup.loc[dfGroup['group'] == CONST_GROUP_NUM].copy().reset_index(drop=True)

In [None]:
# Rows whose 'processed' flag is still 0 are the ones left to convert.
print('Total row to convert: ' + str(int((df['processed'] == 0).sum())) + ' out of: ' + str(len(df)))
df.tail()


In [None]:
print('Total row to convert: ' + str(len(df[df['processed']==0])) + ' out of: ' + str(len(df)))
startTime = datetime.now()


def write_group_snapshot(frame, targetFileName):
    """Write `frame` to `targetFileName` as a UTF-8 CSV without the index.

    The data is written to a temporary sibling file first and then moved over
    the target, so an interruption while writing cannot leave a truncated
    checkpoint behind.
    """
    tempFileName = targetFileName + '.tmp'
    with codecs.open(tempFileName, 'w', 'utf-8') as snapshotFile:
        frame.to_csv(snapshotFile, index=False)
    os.replace(tempFileName, targetFileName)


# Only rows that are still outstanding are visited; rows already marked as
# processed (e.g. restored from a checkpoint) are skipped entirely.
pendingRows = df.index[df['processed'] == 0]

for i in pendingRows:

    # load the class item
    imgClass = AvitoImage(df.at[i, 'filename'])
    imgClass.load_all_image_properties()

    # Store every property as text in the column of the same name.  Label
    # based .at is used throughout so the feature cells and the 'processed'
    # flag always refer to the same row.
    for key, value in imgClass.to_dictionary().items():
        df.at[i, key] = str(value)

    # Set the processed
    df.at[i, 'processed'] = 1

    # Periodic checkpoint so a long run can be resumed
    if (i % 20000 == 0):
        print('Processed row (' + str(datetime.now()) + '): ' + str(i) + ' and creating file snapshot. ' + str(datetime.now() - startTime))
        write_group_snapshot(df, groupedFileName)

print('Finished')

print (datetime.now() - startTime)
print('Finished - Writing final File')

# Final Write
write_group_snapshot(df, groupedFileName)

print('All Rows Processed!')