In [6]:
import os
import glob
import yaml
import pandas as pd
from sklearn.model_selection import train_test_split
import shutil

In [7]:
# Load the project configuration. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open for the kernel's lifetime.
# NOTE(review): if config.yml only holds plain scalars/lists/maps, yaml.safe_load
# is the more conservative loader — confirm before switching.
with open("../config.yml", 'r') as config_file:
    cfg = yaml.full_load(config_file)

**Build dataset**

In [4]:
# Raw-data directory, read from the PATHS.RAW_DATA entry of the config.
data_path = cfg['PATHS']['RAW_DATA']

In [5]:
# Echo the configured raw-data directory.
data_path

'/home/annahung/189nas/Wotcha/dataset/shopee'

In [6]:
# Read the training manifest and force the dtypes the rest of the notebook
# relies on: file names as strings, category labels as integers.
train_csv_path = os.path.join(data_path, 'train.csv')
data_df = pd.read_csv(train_csv_path).astype({'filename': str, 'category': int})

In [7]:
# Preview the first rows of the training manifest.
data_df.head()

Unnamed: 0,filename,category
0,45e2d0c97f7bdf8cbf3594beb6fdcda0.jpg,3
1,f74d1a5fc2498bbbfa045c74e3cc333e.jpg,3
2,f6c172096818c5fab10ecae722840798.jpg,3
3,251ffd610399ac00fea7709c642676ee.jpg,3
4,73c7328b8eda399199fdedec6e4badaf.jpg,3


In [8]:
# Validation split size from DATA.VAL_SPLIT; it is passed as test_size to
# train_test_split when the dataset is divided.
val_split = cfg['DATA']['VAL_SPLIT']

In [9]:
# Echo the configured validation split.
val_split

0.1

In [10]:
# Stratified train/validation split on the category label.
# A fixed random_state makes the shuffle repeatable, so re-running the notebook
# regenerates the same split instead of a different one each time.
file_df_train, file_df_val = train_test_split(
    data_df,
    test_size=val_split,
    stratify=data_df['category'],
    random_state=42,
)

In [11]:
# Save the training and validation sets.
# exist_ok=True makes the directory creation idempotent and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(cfg['PATHS']['PROCESSED_DATA'], exist_ok=True)
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed column by
# default; pass index=False if downstream readers do not expect it — confirm
# against the consumers of these files before changing.
file_df_train.to_csv(cfg['PATHS']['TRAIN_SET'])
file_df_val.to_csv(cfg['PATHS']['VAL_SET'])

**Statistics**

In [10]:
# Reload the saved training split and show the number of rows per category.
train_set_csv = cfg['PATHS']['TRAIN_SET']
train_data_df = pd.read_csv(train_set_csv)
train_categories = train_data_df.groupby(by="category")
train_categories.size()

category
0     2415
1     2432
2     2418
3     2433
4     2433
5     2377
6     2377
7     2394
8     2430
9     2428
10    2405
11    1659
12    2422
13    2414
14    2415
15    2369
16    2398
17    1398
18    1893
19    2411
20    2388
21    2338
22    2361
23    2286
24    2434
25    2423
26    2415
27    2432
28    2305
29    1924
30    2434
31    2409
32    1941
33     516
34    2339
35    2392
36    2417
37    1552
38    2406
39    2410
40    2413
41    2396
dtype: int64

In [11]:
# Reload the saved validation split and show the number of rows per category.
val_set_csv = cfg['PATHS']['VAL_SET']
val_data_df = pd.read_csv(val_set_csv)
val_categories = val_data_df.groupby(by="category")
val_categories.size()

category
0     268
1     270
2     269
3     270
4     270
5     264
6     264
7     266
8     270
9     270
10    267
11    184
12    269
13    268
14    269
15    263
16    267
17    155
18    210
19    268
20    265
21    260
22    262
23    254
24    271
25    269
26    269
27    270
28    256
29    214
30    271
31    268
32    216
33     57
34    260
35    266
36    269
37    173
38    267
39    268
40    268
41    266
dtype: int64

In [48]:
# Create one zero-padded class folder ("00" .. "41") per category for both the
# train and val splits of the processed dataset. The copy cells that follow
# write into <split>/<category>/, so both trees must exist before they run.
# os.makedirs(..., exist_ok=True) also creates a missing parent folder and keeps
# the cell re-runnable (os.mkdir raises FileExistsError on a second run).
# NOTE(review): the root is a hard-coded absolute path; presumably it matches
# cfg['PATHS']['PROCESSED_DATA'] — confirm and read it from the config.
processed_root = '/home/annahung/189nas/Wotcha/dataset/shopee_processed/'
for split in ('train', 'val'):
    for i in range(42):
        folderName = str(i).zfill(2)
        PATH = os.path.join(processed_root, split, folderName)
        os.makedirs(PATH, exist_ok=True)

In [50]:

# Copy every validation image from its raw class folder into the matching class
# folder of the processed "val" split.
# NOTE(review): both roots are hard-coded absolute paths; presumably they mirror
# entries in cfg['PATHS'] — confirm and read them from the config.
raw_train_root = os.path.join('/home/annahung/189nas/Wotcha/dataset/shopee/', 'train')
val_out_root = os.path.join('/home/annahung/189nas/Wotcha/dataset/shopee_processed/', 'val')
# Iterate the two needed columns directly instead of iterrows() + per-row .loc lookups.
for category, filename in zip(val_data_df['category'], val_data_df['filename']):
    class_dir = str(category).zfill(2)
    # shutil.copyfile does not create parent directories, so make sure the
    # destination class folder exists first.
    os.makedirs(os.path.join(val_out_root, class_dir), exist_ok=True)
    srcPath = os.path.join(raw_train_root, class_dir, filename)
    dstPath = os.path.join(val_out_root, class_dir, filename)
    shutil.copyfile(srcPath, dstPath)

In [51]:

# Copy every training image from its raw class folder into the matching class
# folder of the processed "train" split.
# NOTE(review): both roots are hard-coded absolute paths; presumably they mirror
# entries in cfg['PATHS'] — confirm and read them from the config.
raw_train_root = os.path.join('/home/annahung/189nas/Wotcha/dataset/shopee/', 'train')
train_out_root = os.path.join('/home/annahung/189nas/Wotcha/dataset/shopee_processed/', 'train')
# Iterate the two needed columns directly instead of iterrows() + per-row .loc lookups.
for category, filename in zip(train_data_df['category'], train_data_df['filename']):
    class_dir = str(category).zfill(2)
    # shutil.copyfile does not create parent directories, so make sure the
    # destination class folder exists first.
    os.makedirs(os.path.join(train_out_root, class_dir), exist_ok=True)
    srcPath = os.path.join(raw_train_root, class_dir, filename)
    dstPath = os.path.join(train_out_root, class_dir, filename)
    shutil.copyfile(srcPath, dstPath)

**Create small subset**

In [13]:
# Create the zero-padded class folders ("00" .. "41") for every split of the
# small subset. The original cell only created the "test" tree, while the cells
# that follow copy into "train" and "val", so those trees are created here too.
# os.makedirs(..., exist_ok=True) creates missing parents and is safe to re-run.
# NOTE(review): the root is a hard-coded absolute path — consider moving it to
# the config.
small_set_root = '/home/annahung/189nas/Wotcha/dataset/smallSet/'
for split in ('train', 'val', 'test'):
    for i in range(42):
        folderName = str(i).zfill(2)
        PATH = os.path.join(small_set_root, split, folderName)
        os.makedirs(PATH, exist_ok=True)

In [15]:
# Build the small "train" subset from the first 60 rows of the training split.
# The original cell iterated val_data_df here, which made smallSet/train an
# exact copy of the smallSet/val subset written by the following cell.
# NOTE(review): both roots are hard-coded absolute paths — consider moving them
# to the config.
small_set_size = 60
raw_train_root = os.path.join('/home/annahung/189nas/Wotcha/dataset/shopee/', 'train')
small_train_root = os.path.join('/home/annahung/189nas/Wotcha/dataset/smallSet/', 'train')
# head() replaces the manual counter/break and also copes with fewer than 60 rows.
small_train_df = train_data_df.head(small_set_size)
for category, filename in zip(small_train_df['category'], small_train_df['filename']):
    class_dir = str(category).zfill(2)
    # shutil.copyfile does not create parent directories.
    os.makedirs(os.path.join(small_train_root, class_dir), exist_ok=True)
    srcPath = os.path.join(raw_train_root, class_dir, filename)
    dstPath = os.path.join(small_train_root, class_dir, filename)
    shutil.copyfile(srcPath, dstPath)

In [16]:
# Build the small "val" subset from the first 60 rows of the validation split.
# NOTE(review): both roots are hard-coded absolute paths — consider moving them
# to the config.
small_set_size = 60
raw_train_root = os.path.join('/home/annahung/189nas/Wotcha/dataset/shopee/', 'train')
small_val_root = os.path.join('/home/annahung/189nas/Wotcha/dataset/smallSet/', 'val')
# head() replaces the manual counter/break and also copes with fewer than 60 rows.
small_val_df = val_data_df.head(small_set_size)
for category, filename in zip(small_val_df['category'], small_val_df['filename']):
    class_dir = str(category).zfill(2)
    # The class folders under smallSet/val are not created by any earlier cell,
    # and shutil.copyfile does not create parent directories.
    os.makedirs(os.path.join(small_val_root, class_dir), exist_ok=True)
    srcPath = os.path.join(raw_train_root, class_dir, filename)
    dstPath = os.path.join(small_val_root, class_dir, filename)
    shutil.copyfile(srcPath, dstPath)

**Clean test data**

In [20]:
import glob

# Collect the bare file names of everything currently stored in the test image folder.
test_image_paths = glob.glob('/home/annahung/189nas/Wotcha/dataset/shopee/test/43/*')
test_img_file = [os.path.basename(image_path) for image_path in test_image_paths]

In [9]:
import pandas as pd
# Load the test manifest (test.csv).
# NOTE(review): this rebinds val_data_df, which earlier cells use for the
# validation split; from here on the name refers to the test manifest.
# Consider a dedicated name (and updating the cell that reads it) to avoid
# stale-state mix-ups when cells are re-run out of order.
val_data_df = pd.read_csv('/home/annahung/189nas/Wotcha/dataset/shopee/test.csv')

In [17]:
# Values of the manifest's 'filename' column as a plain Python list.
df2 = val_data_df['filename'].to_list()

In [15]:

# Files present in the test image folder but not listed in the test manifest.
# The original cell referenced data1/data2, which are not defined in any visible
# cell and raise NameError on a fresh kernel; use the lists built above instead.
diff1 = set(test_img_file).difference(df2)


In [22]:
# Names found on disk (aa) minus names listed in the manifest (bb).
aa, bb = set(test_img_file), set(df2)
diff = aa - bb

In [24]:
# Delete every test image that is on disk but absent from the manifest.
test_image_dir = '/home/annahung/189nas/Wotcha/dataset/shopee/test/43/'
stale_paths = [os.path.join(test_image_dir, a_diff) for a_diff in diff]
for a_file in stale_paths:
    os.remove(a_file)