In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import interact, interactive
from src.products_dataset import ProductsDataset

In [2]:
xl = pd.ExcelFile('./data/products.xlsx')
df = xl.parse()
print('Raw data table length: {}'.format(len(df)))

# Dropping the rows with Nan value in target columns
df = df.loc[:, ['id', 'condition', 'category']].dropna()
df = df.reset_index(drop=True)
print('Filtered data table length: {}'.format(len(df)))

Raw data table length: 11128
Filtered data table length: 11121


In [11]:
dataset = ProductsDataset(xlsx_filepath='./data/products.xlsx',
                          root_dir='./data/images')

In [2]:
dataset = ProductsDataset(xlsx_filepath='./data/products.xlsx',
                          root_dir='./data/images')

{'condition': {0: 10.831202046035806, 1: 1.0, 2: 1.0574282147315854, 3: 1.7471122112211221, 4: 64.16666666666667}, 'category': {0: 2.0, 1: 7.4787234042553195, 2: 35.15, 3: 5.428571428571429, 4: 1.3201877934272301, 5: 1.7444168734491314, 6: 1.0, 7: 1.0323054331864905, 8: 1.1986359761295822, 9: 56.24, 10: 1.0100574712643677, 11: 4.184523809523809, 12: 1.5037433155080213, 13: 3.017167381974249, 14: 1.4569948186528496}}


---
## Dataset visualization

In [20]:
@interact(index=widgets.IntSlider(continuous_update=False, min=0, max=len(dataset) - 1, step=1))
def f(index):
    sample = dataset[index]
    plt.imshow(sample['image'])
    print('category: {}'.format(sample['category']))
    print('condition: {}'.format(sample['condition']))
    height, width = sample['image'].shape[:2]
    print('image size: {}x{}'.format(width, height))

interactive(children=(IntSlider(value=0, continuous_update=False, description='index', max=11120), Output()), …

---
## Image distribution by classes

In [83]:
def print_image_distribution(df, fields=('category', 'condition')):
    for field in fields:
        num_images_by_class = {}

        for c in list(set(df[field])):
            num_images_by_class[c] = sum(df[field] == c)
        
        print('-' * 40)
        print('Images distribution by {}'.format(field))
        print('-' * 40)
        for key, value in num_images_by_class.items():
            print('{: <31}: {}'.format(key, value))
        print('-' * 40)
        print('{: <31}: {}'.format('Total', len(df)))
        print()

In [84]:
print_image_distribution(df)

----------------------------------------
Images distribution by category
----------------------------------------
Guitars                        : 1406
Amplifiers & Effects           : 703
Wind & Woodwind Instruments    : 965
Drums & Percussion             : 806
Stringed Instruments           : 935
Instrument Accessories         : 1362
DJ, Electronic Music & Karaoke : 1065
Bass Guitars                   : 40
Studio Recording Equipment     : 466
Brass Instruments              : 259
Microphones & Accessories      : 1392
Live Sound & Stage             : 25
Other                          : 336
Band & Orchestra               : 188
Keyboards                      : 1173
----------------------------------------
Total                          : 11121

----------------------------------------
Images distribution by condition
----------------------------------------
Poor                           : 66
Like New                       : 4005
New                            : 2424
Good                

## Train / Val / Test split

In [None]:
df_train = pd.DataFrame()
df_val = pd.DataFrame()
df_test = pd.DataFrame()

for cat in list(set(df['category'])):
    cat_df = df[df['category'] == cat]

    cat_df_train = cat_df.sample(frac=0.7)
    cat_df = cat_df.drop(cat_df_train.index)
    cat_df_val = cat_df.sample(frac=0.33)
    cat_df_test = cat_df.drop(cat_df_val.index)
    
    df_train = df_train.append(cat_df_train)
    df_val = df_val.append(cat_df_val)
    df_test = df_test.append(cat_df_test)

In [None]:
df_train.to_excel('./data/products-train.xlsx')
df_val.to_excel('./data/products-val.xlsx')
df_test.to_excel('./data/products-test.xlsx')

In [86]:
print_image_distribution(df_val)

----------------------------------------
Images distribution by category
----------------------------------------
Guitars                        : 139
Amplifiers & Effects           : 70
Wind & Woodwind Instruments    : 95
Drums & Percussion             : 80
Stringed Instruments           : 93
Instrument Accessories         : 135
DJ, Electronic Music & Karaoke : 105
Bass Guitars                   : 4
Studio Recording Equipment     : 46
Brass Instruments              : 26
Microphones & Accessories      : 138
Live Sound & Stage             : 2
Other                          : 33
Band & Orchestra               : 18
Keyboards                      : 116
----------------------------------------
Total                          : 1100

----------------------------------------
Images distribution by condition
----------------------------------------
Poor                           : 4
Like New                       : 398
New                            : 228
Good                           : 444
Fa