In [35]:
import pandas as pd
from clean_images import CleanImages
from clean_tabular import CleanData
import os
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from PIL import Image
import multiprocessing
from torchvision.transforms import Normalize, ToPILImage, ToTensor
from torch.nn import Module
from torch import nn
from pathlib import Path



In [156]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, transformer = transforms.ToTensor(), X = 'image', y = 'major_category_encoded', img_dir = Path(Path.cwd(), 'images'), img_size=128):
        self.transformer = transformer
        self.img_dir = img_dir
        self.img_size = img_size
        '''Yielding images dataset from CleanImages python script'''
        image_class = CleanImages()
        image_df = image_class.total_clean(size=img_size, normalize=False).copy()

        '''Yielding product dataset from CleanData python script'''
        product_class = CleanData(tab_names=['products', 'products_2'])
        product_class.try_merge(['products', 'products_2'])
        product_class.get_na_vals(df='combined')
        products_df = product_class.expand_category().copy()

        '''Merging both the previous dataset to link image with associated product category '''
        merged_df = image_df.merge(products_df, left_on='id', right_on='id')
        filtered_df = merged_df.loc[:, ['image_id', 'image','image_array', 'minor_category', 'minor_category_encoded','major_category', 'major_category_encoded']].copy()
        filtered_df.dropna(inplace=True)
        print('Total observations in remaining dataset: ', len(filtered_df))
        self.y = filtered_df[y].values
        self.X = filtered_df[X].values
        print(self.X.dtype)

    # Not dependent on index
    def __getitem__(self, index):        
        if isinstance(self.X[0], (np.ndarray, pd.core.series.Series)) or torch.is_tensor(self.X[0]) :
            self.X[index] = torch.from_numpy(self.X[index])
            self.y[index] = int(self.y[index])
        else:
            self.X[index] =  Image.open(os.path.join(self.img_dir, self.X[index]))
            print(self.X[index])
            self.y[index] = int(self.y[index])
            if self.transformer is not None:
                self.X[index] = self.transformer(self.X[index])
        return self.X[index], self.y[index]

    def __len__(self):
        return len(self.y)

In [159]:
image_dataset = Dataset(X='image_array')
loaded_data = DataLoader(image_dataset, num_workers=multiprocessing.cpu_count()/2, batch_size=60)
for batch, (i, j) in enumerate(loaded_data.dataset, start=1):
    print('\n'*2)
    print('#'*20)
    print("Batch Number: ", batch)
    print('INPUT ARRAY')
    print(i)
    print("Shape of input array: ", i.shape)
    print('-'*20)    
    print('Encoded target label value: ', j)

images
                                     id                            product_id  \
0  912bb259-3ad9-457b-9db1-ce1da9016057  5f5f57d7-778f-4336-bb10-b43863418c8c   
1  b166d305-b852-4bdd-83f4-465b20da94fa  5f5f57d7-778f-4336-bb10-b43863418c8c   
2  68f5a29d-0075-4d60-81c1-ab684a82e50c  c2c8949f-3cde-4651-a234-4a4a1b2a9ad4   
3  f6a309d7-d247-446a-9b5e-aceefdd4334d  c2c8949f-3cde-4651-a234-4a4a1b2a9ad4   
4  2c2b3a6f-15b3-4289-937a-15482d9f5781  8292aa4e-7f1b-4655-bf0e-f1f2c9e3ffaf   

                                                                               bucket_link  \
0  https://aicore-product-images.s3.amazonaws.com/912bb259-3ad9-457b-9db1-ce1da9016057.jpg   
1  https://aicore-product-images.s3.amazonaws.com/b166d305-b852-4bdd-83f4-465b20da94fa.jpg   
2  https://aicore-product-images.s3.amazonaws.com/68f5a29d-0075-4d60-81c1-ab684a82e50c.jpg   
3  https://aicore-product-images.s3.amazonaws.com/f6a309d7-d247-446a-9b5e-aceefdd4334d.jpg   
4  https://aicore-product-images.s3.

In [143]:
print(type(loaded_data))
loaded_data

<class 'torch.utils.data.dataloader.DataLoader'>


<torch.utils.data.dataloader.DataLoader at 0x7fc434c64df0>

In [56]:
class Net(nn.Module):
    def __init__(self, num_features = len(category_list), input_array = X): #,  pool1 = 2, pool2 =2, pool3 =2):
        super(Net, self).__init__()
        dimensions = input_array.apply(lambda i: i.shape).unique()
        self.dimensions = dimensions
        print(self.dimensions)
        print(input_array.shape)
        if len(dimensions) == 1:
            self.features = nn.Sequential(nn.Conv2d(in_channels=1, out_channels=16, kernel_size=2, padding=1),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=2, padding= 1), 
            nn.MaxPool2d(kernel_size=2, stride=2), 
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=2, padding=1), 
            )
        
            self.classifier = nn.Sequential(nn.Linear(int((dimensions[0][0]/4)*(dimensions[0][1]/4))*64, 1000), #dimensions[0][0]*dimensions[0][1]/(2*2)
            nn.ReLU(inplace=True), nn.Dropout(0.5), nn.Linear(1000, 2000), nn.ReLU(inplace=True), 
            nn.Linear(2000, num_features))
        
    def forward(self, x):
        print(self.dimensions)
        x = self.features(x)
        x = x.reshape(-1, 64*self.dimensions[0]*self.dimensions[1])
        x = self.classifier(x)
        return x

        
    
        

In [57]:
model = Net()

[(128, 128)]
(1200,)


In [144]:
filtered_df.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199],
           dtype='int64', length=1200)