In [None]:
#import sklearnx to speed up code (must be imported before sklearn)

from sklearnex import patch_sklearn
patch_sklearn() 

In [22]:
from download_from_database import DownloadTables
import pandas as pd
import torch
import numpy as np 
from PIL import Image
from alive_progress import alive_bar
from torchvision.transforms import ToTensor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from os.path import isfile
from clean_tabular import CleanTabular

In [23]:
class PrepareData:
    """
    Class to process images for a classifcation model.

        Attributes:
            crediential_location(str): Optionally, the location of the yaml file containing the database credentials. It should contain DATABASE_TYPE, DBAPI, ENDPOINT, USER, PASSWORD, PORT, DATABASE
    """
    def __init__(self, credentials_location:str=None):
        """
        See help(PrepareData)
        """
        product_df, image_df = self.retrieve_dataframes(credentials_location)

        self.product_df = product_df
        self.image_df = image_df

    def retrieve_dataframes(self, credentials_location:str=None):
        """
        Retrives the Dataframes..

        
        Attributes:
            crediential_location(str): Optionally, the location of the yaml file containing the database credentials. It should contain DATABASE_TYPE, DBAPI, ENDPOINT, USER, PASSWORD, PORT, DATABASE
        
        Returns:
            product_df(DataFrame): The clean DataFrame from the products tables. 
            image_df (DataFrame): The clean DataFrame from the images table. 

        
        """
        #if credentials location is given, attempt to download the tables
        if credentials_location != None:
            downloader = DownloadTables(credentials_location)
            downloader.download_table("images")
            downloader.download_table("products")
        
        # check the appropriate products table exists
        
        product_table_location = "data/products_table_clean.json"
        # product_table_location = "data/products_table_logistic_regression.json"
        if isfile(product_table_location) == False:
            print("Creating ", product_table_location)
            data_location = "data/products_table.json"
            product_data = pd.read_json(data_location)
            
            cleaner = CleanTabular(product_data)
            cleaner.create_main_category_column()
            product_data = cleaner.get_product_df()
            product_data.to_json(product_table_location)

        # read the data into dataframes
        product_df = pd.read_json(product_table_location)
        image_df = pd.read_json("data/images_table.json")

        return product_df, image_df

    def retrieve_image_category(self, image_name:str) -> str:
        """
        Given the ID of an image, retrieves the category. 

        Attributes:
            image_name(str): the name of when 
        """
        image_row = self.image_df.loc[self.image_df["id"] == image_name]
        product_id = image_row.iloc[0]["product_id"]

        product_row = self.product_df.loc[self.product_df["id"] == product_id]
        category = product_row.iloc[0]["main_category"]
        
        return category

    @staticmethod
    def image_to_array(image_location):
        """
        Converts an image to a numpy array, 
        """
        image = Image.open(image_location)
        image = ToTensor()(image)
        image = torch.flatten(image)
        return image.numpy()

    
    def convert_to_image_array(self, image):
        image_category = self.retrieve_image_category(image)
        image_location = images_folder +"/"+ image +".jpg"
        image_array = self.image_to_array(image_location)
        return image_array, image_category

    
    def create_dict_of_categories(self): 
        categories = set(self.product_df["main_category"]) 
        categories_dict = {k: v for v, k in enumerate(categories)} 
        self.categories_dict = categories_dict
        return categories_dict


    def form_arrays(self, image_size:int, n:int = None):
        
        images = list(image_df["id"])

        # if the number of images hasn't been specified, look at all images
        if n == None:
            n = len(images)

        # sets up the arrays
        # array_size is based on a square image with three channels
        array_size = (image_size**2)*3
        X = np.zeros((n,array_size))
        y = np.zeros(n)

        
        #set up a dictionary assigning categories to integers
        pipeline.create_dict_of_categories()
        

        for index in range(n):
            image = images[index]
            try:
                features, label = pipeline.convert_to_image_array(image)
                X[index, :] = features
                y[index] = self.categories_dict[label]

            except:
                #TODO deal with this because it might be affecting the model
                X[index, :] = np.zeros(array_size)
                y[index] = 0


        return X, y
            

In [24]:
credentials_location = ".gitignore/credentials_for_marketplace.yml"

pipeline = PrepareData()
product_df, image_df = pipeline.retrieve_dataframes(credentials_location)

data/images_table.json  is already downloaded, skipping
data/products_table.json  is already downloaded, skipping


In [25]:
pipeline.create_dict_of_categories()
pipeline.categories_dict

{'diy_tools_materials': 0,
 'other_goods': 1,
 'clothes_footwear_accessories': 2,
 'sports_leisure_travel': 3,
 'computers_software': 4,
 'appliances': 5,
 'baby_kids_stuff': 6,
 'video_games_consoles': 7,
 'home_garden': 8,
 'phones_mobile_phones_telecoms': 9,
 'music_films_books_games': 10,
 'office_furniture_equipment': 11,
 'health_beauty': 12}

In [26]:

images_folder = "data/cleaned_images_128"
image_size = 128


X, y = pipeline.form_arrays(image_size)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, shuffle=False)

In [28]:

model = LogisticRegression(max_iter=100)
model.fit(X_train, y_train)

score = model.score(X_test, y_test)
print(score)

0.01190003966679889


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
