In [46]:
from download_from_database import DownloadTables
import pandas as pd
import torch
import numpy as np 
from PIL import Image
from alive_progress import alive_bar
from torchvision.transforms import ToTensor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
 

In [56]:
class PrepareData:
    """
    Class to process images for a classifcation model.

    """
    def __init__(self):
        """
        See help(PrepareData)
        """
        pass


    def retrieve_dataframes(self, credentials_location:str=None):
        """
        Retrives the Dataframes and assigns them as instance variables.

        Attributes:
            crediential_location(str): Optionally, the location of the yaml file containing the database credentials. It should contain DATABASE_TYPE, DBAPI, ENDPOINT, USER, PASSWORD, PORT, DATABASE
        
        Returns:
            product_df(DataFrame): The clean DataFrame from the products tables. 
            image_df (DataFrame): The clean DataFrame from the images table. 

        
        """
        if credentials_location != None:
            downloader = DownloadTables(credentials_location)
            downloader.download_table("images")
            downloader.download_table("products")
        
        #TODO add in a step to clean the products table if needed
    
        product_df = pd.read_json("data/products_table_clean.json")
        image_df = pd.read_json("data/images_table.json")
        
        self.product_df = product_df
        self.image_df = image_df

        return product_df, image_df

    def retrieve_image_category(self, image_name:str) -> str:
        """
        Given the ID of an image, retrieves the category. 

        Attributes:
            image_name(str): the name of when 
        """
        image_row = self.image_df.loc[self.image_df["id"] == image_name]
        product_id = image_row.iloc[0]["product_id"]

        product_row = self.product_df.loc[self.product_df["id"] == product_id]
        category = product_row.iloc[0]["main_category"]
        
        return category

    @staticmethod
    def image_to_array(image_location):
        """
        Converts an image to a numpy array, 
        """
        image = Image.open(image_location)
        image = ToTensor()(image)
        image = torch.flatten(image)
        return image.numpy()

    
    def convert_to_image_array(self, image):
        image_category = self.retrieve_image_category(image)
        image_location = images_folder +"/"+ image +".jpg"
        image_array = self.image_to_array(image_location)
        return image_array, image_category

    
    def create_dict_of_categories(self): 
        categories = set(self.product_df["main_category"]) 
        categories_dict = {k: v for v, k in enumerate(categories)} 
        self.categories_dict = categories_dict
        return categories_dict


    def form_arrays(self, image_size:int, n:int = None):
        
        images = list(image_df["id"])

        # if the number of images hasn't been specified, look at all images
        if n == None:
            n = len(images)

        # sets up the arrays
        # array_size is based on a square image with three channels
        array_size = (image_size**2)*3
        X = np.zeros((n,array_size))
        y = np.zeros(n)

        
        #set up a dictionary assigning categories to integers
        pipeline.create_dict_of_categories()
        

        for index in range(n):
            image = images[index]
            try:
                features, label = pipeline.convert_to_image_array(image)
                X[index, :] = features
                y[index] = self.categories_dict[label]

            except:
                #TODO deal with this because it might be affecting the model
                X[index, :] = np.zeros(array_size)
                y[index] = 0


        return X, y
            

In [57]:
credentials_location = ".gitignore/credentials_for_marketplace.yml"

pipeline = PrepareData()
product_df, image_df = pipeline.retrieve_dataframes(credentials_location)

data/images_table.json  is already downloaded, skipping
data/products_table.json  is already downloaded, skipping


In [59]:

images_folder = "data/cleaned_images_64"
image_size = 64


X, y = pipeline.form_arrays(image_size)



In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [55]:

model = LogisticRegression(max_iter=100)
model.fit(X_train, y_train)

score = model.score(X_test, y_test)
print(score)

0.16699722332407774


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
