In [1]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import entropy

In [2]:
# Input locations, named once so they are easy to repoint on another machine.
META_CSV = Path("/data2/datasets/clobotics/ccth/indices/ccth_sku_metadata_0120.csv")
TRAIN_LABEL_DIR = Path("/data2/datasets/clobotics/ccth/labels/train/20200129")

# SKU metadata table (one row per ProductId is assumed — TODO confirm uniqueness).
meta = pd.read_csv(META_CSV)

# Glob order is filesystem-dependent; sorting the file list keeps the row order of
# `train` reproducible across runs and machines.
train_label_files = sorted(TRAIN_LABEL_DIR.glob("*.csv"))

# Each file keeps its own row index (no ignore_index), so index labels in `train`
# repeat across files and must not be used as unique row keys.
train = pd.concat([pd.read_csv(csv_path) for csv_path in train_label_files])

In [3]:
# Sanity check: (rows, columns) of the concatenated training-label frame.
train.shape

(1611967, 12)

In [4]:
# Number of labelled rows per ProductId, most frequent first.
train["ProductId"].value_counts()

1          187446
1037728     81842
1037709     67799
1037742     62589
1037769     50387
            ...  
1079369         1
1079258         1
1037612         1
1079339         1
1079436         1
Name: ProductId, Length: 634, dtype: int64

### Check Meta

In [5]:
# Show the metadata row(s) for ProductId 1082140.
meta.loc[meta["ProductId"].eq(1082140)]

Unnamed: 0,ProductId,SKUType,Package,UnitCount,Volume,VolumeType,Industry,Juice,CategoryId,Brand,SubBrand,SubBrandId,Series,Flavor,Market,SKUName
852,1082140,,,,,,Drink,Juice,1037852,,Jele Light,1065198.0,,,Thailand,Jele Light Juice-Other


In [9]:
# Show the metadata row(s) for ProductId 1082143.
meta.loc[meta["ProductId"].eq(1082143)]

Unnamed: 0,ProductId,SKUType,Package,UnitCount,Volume,VolumeType,Industry,Juice,CategoryId,Brand,SubBrand,SubBrandId,Series,Flavor,Market,SKUName
855,1082143,,,,,,Drink,Juice,1037852,,Jub Jub,1065201.0,,,Thailand,Jub Jub Juice-Other


### Check train data

In [10]:
# List the training labels for ProductId 1082144 (an empty frame means it has none).
train.loc[train["ProductId"].eq(1082144)]

Unnamed: 0,ImgUrl,ProductId,xmin,ymin,xmax,ymax,ImageQuality,SceneType,Rotation,TaskItemId,TaskId,RequestId


In [11]:
# ProductIds to inspect in the training labels.
product_ids_to_inspect = [
    1037848,
    1037849,
    1037850,
    1037851,
    1037852,
    1037853,
    1051447,
    1051448,
    1051868,
    1051869,
]

# All training rows labelled with any of the ids above.
train.loc[train["ProductId"].isin(product_ids_to_inspect)]

Unnamed: 0,ImgUrl,ProductId,xmin,ymin,xmax,ymax,ImageQuality,SceneType,Rotation,TaskItemId,TaskId,RequestId
482,https://fileman-na.clobotics.com/api/file/0759...,1037850,0.827719,0.067999,0.928515,0.228892,[],[],0.0,1.0,95918.0,1.0
483,https://fileman-na.clobotics.com/api/file/bc26...,1037850,0.665159,0.471645,0.729852,0.618170,[],[],0.0,1.0,95918.0,1.0
484,https://fileman-na.clobotics.com/api/file/e8ff...,1037850,0.666424,0.396536,0.733207,0.543395,[],[],0.0,1.0,95918.0,1.0
485,https://fileman-na.clobotics.com/api/file/8aa1...,1037850,0.834002,0.295201,0.905318,0.417726,[],[],0.0,1.0,95918.0,1.0
486,https://fileman-na.clobotics.com/api/file/5133...,1037850,0.356364,0.490907,0.403178,0.584707,[],[],0.0,1.0,95918.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3048,https://fileman-na.clobotics.com/api/file/dbc2...,1037851,0.714010,0.052305,0.954706,0.463363,[],[],0.0,1.0,95590.0,1.0
3049,https://fileman-na.clobotics.com/api/file/f2bb...,1037851,0.777484,0.581401,0.899307,0.806822,[],[],0.0,1.0,95590.0,1.0
164,https://fileman-na.clobotics.com/api/file/d6f3...,1037851,0.780077,0.427865,0.857726,0.549411,[],[],0.0,1.0,95920.0,1.0
165,https://fileman-na.clobotics.com/api/file/0963...,1037851,0.778473,0.657958,0.876541,0.871245,[],[],0.0,1.0,95920.0,1.0


In [12]:
# ProductIds that occur in the training labels but have no row in the metadata.
train_product_ids = set(train["ProductId"].unique().tolist())
meta_product_ids = set(meta["ProductId"].unique().tolist())
train_product_ids - meta_product_ids

{1, 1079969, 1080433, 1080718, 1080801}

### Check how many samples come from pure SKU classes and how many from sub-brand-other or category-other classes

In [24]:
# Metadata rows whose ProductId equals their own CategoryId / SubBrandId.
# Presumably these are the category-level and sub-brand-level "other" classes — TODO confirm.
# NOTE(review): some "...-Other" SKUs in meta (e.g. ProductId 1082140) have a ProductId
# that differs from their SubBrandId, so this rule may not capture every "other" class.
is_category_row = meta["ProductId"].eq(meta["CategoryId"])
is_subbrand_row = meta["ProductId"].eq(meta["SubBrandId"])

category_ids = meta.loc[is_category_row, "ProductId"].tolist()
subbrand_other_ids = meta.loc[is_subbrand_row, "ProductId"].tolist()

# Combined size of both id lists (ids present in both lists are counted twice).
len(category_ids) + len(subbrand_other_ids)

324

In [22]:
# Ids treated as "other" classes: the category / sub-brand ids from the metadata plus
# ProductId 1, which occurs in the training labels but not in the metadata.
# NOTE(review): grouping id 1 with the "other" classes is assumed intentional — confirm.
other_ids = category_ids + subbrand_other_ids + [1]

# Filter the large frame once and reuse the result for both statistics.
other_rows = train[train.ProductId.isin(other_ids)]

print("subbrand-other or category other\n" + "-" * 40)
print(f"# samples: {len(other_rows)} ")
# dropna=False keeps the count identical to len(Series.unique()).
print(f"# classes: {other_rows.ProductId.nunique(dropna=False)} ")

subbrand-other or category other
----------------------------------------
# samples: 258574 
# classes: 196 


In [23]:
# Ids treated as "other" classes: the category / sub-brand ids from the metadata plus
# ProductId 1, which occurs in the training labels but not in the metadata.
# NOTE(review): grouping id 1 with the "other" classes is assumed intentional — confirm.
other_ids = category_ids + subbrand_other_ids + [1]

# Everything that is not an "other" class counts as a pure SKU class.
# Filter the large frame once and reuse the result for all three statistics.
sku_rows = train[~train.ProductId.isin(other_ids)]

print("pure sku class\n" + "-" * 40)
print(f"# samples: {len(sku_rows)} ")
# dropna=False keeps the count identical to len(Series.unique()).
print(f"# classes: {sku_rows.ProductId.nunique(dropna=False)} ")
print(f"\ntop large classes:\n{sku_rows.ProductId.value_counts()}")

pure sku class
----------------------------------------
# samples: 1353393 
# classes: 438 

top large classes:
1037728    81842
1037709    67799
1037742    62589
1037769    50387
1037800    41068
           ...  
1037652        1
1037612        1
1037791        1
1047785        1
1047805        1
Name: ProductId, Length: 438, dtype: int64
