In [1]:
import re
import pandas as pd
import os
import glob
import cv2
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm_notebook

In [31]:
# Directory that `main` writes its pickles into (draft checkpoint, path table,
# colour table) and that the later cells read them back from.
# NOTE(review): this is a machine-specific absolute path; the commented-out
# alternative below uses the current working directory instead.
#root = os.getcwd()
root = r"D:\adham-till-code"
root

'D:\\adham-till-code'

In [3]:
def total_images(main_folder):
    """
    Count the image files stored anywhere below ``main_folder``.

    The folder is searched recursively for names whose extension matches the
    glob character classes ``[jp][pn]g``, which covers ``.jpg`` and ``.png``.
    ``main`` uses the result as the progress-bar total; files that later fail
    to decode are still included in this count.
    ---
    main_folder is the root directory of the search (string path)
    returns the number of matching paths as an int
    """
    # Same pattern as `reader`, so both walk the same set of files.
    matches = glob.iglob(main_folder + '/**/*.[jp][pn]g', recursive=True)

    # Only the amount matters here, so the paths are counted lazily
    # instead of being collected into a list first.
    return sum(1 for _ in matches)

In [4]:
# creating the generator
def reader(main_folder, image_counter=0):
    '''
    Generator over every loadable jpg/png image found below ``main_folder``.
    ---
    main_folder is the root directory, searched recursively
    image_counter is the number used for the first image id; it grows by one
    for every image that loads successfully (unreadable files do not consume
    an id)
    ---
    yields (unique_id, image array, file path), where unique_id is the counter
    zero-padded to 7 digits followed by an underscore, e.g. "0000042_"
    '''
    # Because this is a generator, the folder is printed (and the file list
    # built) when the first item is requested, not when reader() is called.
    print(main_folder)

    # Collect all candidate paths up front (extensions matching [jp][pn]g)
    candidates = glob.glob(main_folder + '/**/*.[jp][pn]g', recursive=True)

    for image_path in candidates:
        pixels = cv2.imread(image_path)

        # cv2.imread signals failure by returning None: report it and move on
        if pixels is None:
            print(f"Error while loading image: {image_path}")
            continue

        # build the id for this image, then advance the counter for the next one
        tag = f"{image_counter:07}_"
        image_counter += 1

        yield (tag, pixels, image_path)

In [5]:
def get_colour_hist(image, bins=30, swap_rb=False):
    """
    Compute one min-max normalised histogram per colour channel of an image.
    ---
    image is an array with at least three channels on its last axis
    (height, width, channels); only channels 0, 1 and 2 are used.
    bins is the number of histogram bins per channel over the range [0, 256).
    swap_rb selects the order in which the channels are returned:
      * False (default): channel 0, channel 1, channel 2 -- the historical
        behaviour of this function.
      * True: channel 2, channel 1, channel 0.
    ---
    returns a tuple of three histograms, each scaled so that its smallest bin
    is 0 and its largest bin is 1.

    GOTCHA: images loaded with cv2.imread are stored in BGR order, so with the
    default setting the first histogram of such an image is the BLUE channel
    and the third is the RED one. Pass swap_rb=True to get a true
    (red, green, blue) tuple for cv2-loaded images. The default is left
    unchanged on purpose: switching it would reorder the feature columns and
    make new vectors incomparable with histograms saved by earlier runs.

    raises ValueError if the image does not have three dimensions with at
    least three channels.
    """
    if getattr(image, "ndim", 0) != 3 or image.shape[2] < 3:
        raise ValueError(
            "get_colour_hist expects an image array of shape (height, width, channels>=3)"
        )

    # channel indices in the order they are returned
    channel_order = (2, 1, 0) if swap_rb else (0, 1, 2)

    histograms = []
    for channel_index in channel_order:
        channel = image[:, :, channel_index]

        # Calculate colour histogram for this single channel
        hist = cv2.calcHist([channel], [0], None, [bins], [0, 256])

        # Normalize the histogram values into [0, 1]
        hist = cv2.normalize(hist, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX)
        histograms.append(hist)

    return tuple(histograms)

In [24]:
def _strip_windows_drive(path):
    """Return ``path`` without a leading drive specifier such as ``D:``; other paths are returned unchanged."""
    # str.lstrip would treat its argument as a *set of characters*, not as a
    # prefix, and would eat the start of any path that has no drive letter.
    return re.sub(r"^[A-Za-z]:", "", path, count=1)


def _assemble_frame(image_ids, file_paths, histogram_rows):
    """Build the combined table: one row per image, histogram values in columns 0..n-1 plus ``file_path``, indexed by ``image_id``."""
    frame = pd.DataFrame(histogram_rows, index=pd.Index(image_ids, name="image_id"))
    frame["file_path"] = file_paths
    return frame


def main(filepath, pc="windows", path_file='file_path.pkl', color_file='color_histogram.pkl', counter=0,
         output_dir=None, checkpoint_every=1000):
    """
    Extract a colour-histogram feature vector for every image below ``filepath``
    and save the results as two pickled DataFrames indexed by ``image_id``.
    ---
    filepath is the parent folder of the images (searched recursively by `reader`),
    pc is either "windows" or anything else (e.g. "apple"): for "windows" the drive
        specifier ("D:") is removed from every stored path so the path is less
        machine specific; otherwise the path is stored unchanged,
    path_file and color_file are the desired names of the pkl files,
    counter is the starting counter for the image ids,
    output_dir is the folder the pkl files are written to; when None the
        notebook-level variable `root` is used (the original behaviour),
    checkpoint_every is the number of processed images between two saves of the
        draft file 'draft_df.pkl'; 0 or None disables the draft saves.
    ---
    Each feature row is built by placing the three histograms returned by
    `get_colour_hist` side by side and flattening the result, so the values of
    the three channels are interleaved bin by bin (assuming each histogram is a
    column vector, as cv2.calcHist returns).

    If no image could be read, a message is printed and nothing is written.
    """
    out_dir = root if output_dir is None else output_dir

    # Rows are collected in plain lists and turned into a DataFrame once.
    # Growing a DataFrame with pd.concat on every iteration copies all previous
    # rows each time, which is quadratic in the number of images.
    image_ids, file_paths, histogram_rows = [], [], []

    progress = tqdm_notebook(reader(filepath, image_counter=counter),
                             desc="extracting RGB information from the images",
                             total=total_images(filepath))

    # LOOP OVER ALL IMAGES
    for image_id, image, file_path in progress:
        # get the colour histograms of the three channels
        first_hist, second_hist, third_hist = get_colour_hist(image)

        # merge the three histograms into one flat list (interleaved per bin)
        histogram_rows.append(
            np.concatenate((first_hist, second_hist, third_hist), axis=1).flatten().tolist()
        )
        image_ids.append(image_id)

        # save paths that are consistent for both macbook and windows
        file_paths.append(_strip_windows_drive(file_path) if pc == "windows" else file_path)

        # basic checkpoint: save a draft so partial progress can be processed later
        processed = len(image_ids)
        if checkpoint_every and processed % checkpoint_every == 0:
            draft = _assemble_frame(image_ids, file_paths, histogram_rows)
            draft.to_pickle(os.path.join(out_dir, 'draft_df.pkl'))
            print(f"draft file saved after {processed} checkpoints")

    # Without any row there is no "file_path" column to split on; stop here
    # instead of failing with a KeyError or writing empty result files.
    if not image_ids:
        print(f"No readable images found under {filepath}; nothing was saved")
        return

    master_df = _assemble_frame(image_ids, file_paths, histogram_rows)

    # split the path from the colour histogram
    path_df = master_df[["file_path"]]
    color_df = master_df.drop("file_path", axis="columns")

    # save as pickle
    path_df.to_pickle(os.path.join(out_dir, path_file))
    color_df.to_pickle(os.path.join(out_dir, color_file))

    print("-------------------------")
    print("Notes:")
    if pc == "windows":
        print("path and colour information saved")
    else:
        print("This path might only works on your pc system. Please consider the changes of path name if you are using another device")

In [25]:
# Parent folder of the images to index; it is passed to `main` in the next cell.
# The commented-out lines are alternative inputs kept for reference.
# NOTE(review): all of these are machine-specific locations.
#filepath = os.path.join(os.getcwd(), "weather_image_recognition")
#filepath = r"D:\images" # path for image in hard drive
filepath = r"D:\images\FFHQ_images" # extra images

In [None]:
# Run the extraction for the images under `filepath` and store the results in
# the two "*_extra.pkl" files. counter=167104 is the number given to the first
# image, which `reader` formats as the id "0167104_".
# NOTE(review): presumably this continues the numbering of an earlier batch -- confirm.
if __name__ == "__main__":
    main(filepath, pc="windows", path_file='file_path_extra.pkl', color_file='color_histogram_extra.pkl', counter=167104)

In [27]:
# Load the draft that `main` saves every 1000 processed images; it holds the
# histogram columns together with the "file_path" column.
# NOTE: pd.read_pickle must only be used on files from a trusted source.
load_checkpoint = pd.read_pickle(os.path.join(root, 'draft_df.pkl'))

In [32]:
# Sanity check of the saved path table: one "file_path" column indexed by
# image_id; with pc="windows" the stored paths carry no drive letter.
new_path_df = pd.read_pickle(os.path.join(root,'file_path_extra.pkl'))
new_path_df.head()

Unnamed: 0_level_0,file_path
image_id,Unnamed: 1_level_1
0167104_,\images\FFHQ_images\00000\00000.png
0167105_,\images\FFHQ_images\00000\00001.png
0167106_,\images\FFHQ_images\00000\00002.png
0167107_,\images\FFHQ_images\00000\00003.png
0167108_,\images\FFHQ_images\00000\00004.png


In [34]:
# Sanity check of the saved colour table: one row per successfully loaded image
# and 90 feature columns (3 channels x 30 histogram bins).
color_df = pd.read_pickle(os.path.join(root, 'color_histogram_extra.pkl'))
color_df.shape

(46901, 90)