<a href="https://colab.research.google.com/github/bluezdot/ShopeeImage/blob/main/ShopeeImagePreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing

## Import

In [1]:
# Clone the project repository into the current working directory; the next cell
# changes into its dataset/data_use folder, which the rest of the notebook reads from.
!git clone https://github.com/bluezdot/ShopeeImage.git

Cloning into 'ShopeeImage'...
remote: Enumerating objects: 32475, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 32475 (delta 14), reused 50 (delta 14), pack-reused 32425[K
Receiving objects: 100% (32475/32475), 1.68 GiB | 34.98 MiB/s, done.
Resolving deltas: 100% (59/59), done.
Checking out files: 100% (32426/32426), done.


In [2]:
# Work from the dataset folder so that the relative paths used later
# ('./train_images', './test_images', './train.csv', './test.csv') resolve.
%cd ShopeeImage/dataset/data_use

/content/ShopeeImage/dataset/data_use


In [3]:
!pip install albumentations==0.4.6

Collecting albumentations==0.4.6
  Downloading albumentations-0.4.6.tar.gz (117 kB)
[K     |████████████████████████████████| 117 kB 5.0 MB/s 
Collecting imgaug>=0.4.0
  Downloading imgaug-0.4.0-py2.py3-none-any.whl (948 kB)
[K     |████████████████████████████████| 948 kB 47.2 MB/s 
Building wheels for collected packages: albumentations
  Building wheel for albumentations (setup.py) ... [?25l[?25hdone
  Created wheel for albumentations: filename=albumentations-0.4.6-py3-none-any.whl size=65172 sha256=e3097a7ba304b57f78ca9f2d265979fe6abe61f7716d42e82b2f6e6fad4f5f69
  Stored in directory: /root/.cache/pip/wheels/cf/34/0f/cb2a5f93561a181a4bcc84847ad6aaceea8b5a3127469616cc
Successfully built albumentations
Installing collected packages: imgaug, albumentations
  Attempting uninstall: imgaug
    Found existing installation: imgaug 0.2.9
    Uninstalling imgaug-0.2.9:
      Successfully uninstalled imgaug-0.2.9
  Attempting uninstall: albumentations
    Found existing installation: album

In [5]:
import random
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.io import read_image
import albumentations as A
import albumentations.pytorch
from albumentations.pytorch import ToTensorV2

In [6]:
# Image folders for the two splits, relative to the current working directory.
train_img_path, test_img_path = './train_images', './test_images'

# Per-split metadata tables (train first, then test).
train_df, test_df = (pd.read_csv(csv_file) for csv_file in ('./train.csv', './test.csv'))

In [7]:
def my_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: value handed unchanged to every seeding function and written
            (as a string) to the ``PYTHONHASHSEED`` environment variable.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Remaining generators, seeded in the same order as before: NumPy, torch CPU, torch CUDA.
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # Ask cuDNN for deterministic kernels and turn off its benchmark mode.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

## Drop images with duplicate perceptual hashes (pHash)

In [8]:
# De-duplicate on the perceptual hash: for every image_phash keep only the first row seen.
is_repeat = train_df.duplicated(subset='image_phash', keep='first')
train_df = train_df[~is_repeat]

# Sanity check: each remaining hash should be counted exactly once.
train_df['image_phash'].value_counts()

b8d1c7cc98663671    1
d195575106474ee7    1
c0fe9edf5a650422    1
8b4efcfc96c2444a    1
ee2c91d26e2d91d2    1
                   ..
a910e5231fcf2aec    1
b83bca69cc69cb28    1
e373954c499d94cc    1
9e47f90fa224d0d3    1
87be8e053f0e02f9    1
Name: image_phash, Length: 28735, dtype: int64

In [9]:
# Count the images left in each label group after de-duplication,
# to see whether the label groups were preserved.
train_df['label_group'].value_counts()

1141798720    44
159351600     43
1091404026    39
3489985175    37
562358068     36
              ..
370710977      1
3601891778     1
2067677642     1
2213950251     1
1332066608     1
Name: label_group, Length: 11004, dtype: int64

In [12]:
class ShopeeDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs described by a DataFrame.

    Every row of ``dataframe`` must provide an ``image`` entry (file name inside
    ``dir_path``) and a ``label_group`` entry (the label value).

    Args:
        dataframe: table with one row per sample; rows are addressed by position.
        dir_path: directory that contains the image files.
        transforms: optional callable invoked as ``transforms(image=img)`` that
            returns a mapping holding the result under ``'image'``.
            ``None`` (the default) returns the decoded image untouched.
    """

    def __init__(self, dataframe, dir_path, transforms=None):
        self.dataframe = dataframe
        self.dir_path = dir_path
        self.transforms = transforms

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Load the sample at positional ``index``.

        Returns:
            tuple: ``(img, label)``. ``img`` is the RGB image, or whatever
            ``transforms`` returns under ``'image'``. ``label`` is a one-element
            tensor holding the row's ``label_group``.

        Raises:
            FileNotFoundError: if OpenCV cannot read the image file.
        """
        img_info = self.dataframe.iloc[index]
        img_file = os.path.join(self.dir_path, img_info['image'])

        # cv2.imread reports a missing or undecodable file by returning None
        # rather than raising, so fail here with the offending path instead of
        # letting a later step crash on None.
        img = cv2.imread(img_file)
        if img is None:
            raise FileNotFoundError(f'Could not read image file: {img_file}')

        # OpenCV decodes to BGR channel order; convert to RGB so the image is
        # displayed with the right colours by matplotlib.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        label = torch.tensor([img_info['label_group']])

        # Explicit None check: only a missing pipeline disables the transforms.
        if self.transforms is not None:
            img = self.transforms(image=img)['image']

        return img, label

# Augmentation pipeline, applied in the listed order. It needs an albumentations
# release that provides both Sharpen and ColorJitter.
albumentations_transforms = A.Compose([
    # Bring every image to 300x300 using Lanczos interpolation.
    A.Resize(height=300, width=300, interpolation=cv2.INTER_LANCZOS4),
    A.Sharpen(),
    # Jitter brightness and contrast only; saturation and hue stay untouched.
    A.ColorJitter(
        brightness=0.1,
        contrast=0.4,
        saturation=0,
        hue=0,
    ),
    # Random shift/scale plus rotation of up to 180 degrees, applied half of the time.
    A.ShiftScaleRotate(
        rotate_limit=180,
        interpolation=cv2.INTER_LANCZOS4,
        p=0.5,
    ),
    # Zero mean / unit std, i.e. no per-channel statistics are applied.
    A.Normalize(mean=(0, 0, 0), std=(1, 1, 1)),
    # Convert to a torch tensor (the preview code permutes it back for plotting).
    ToTensorV2(),
])

# Training dataset: the de-duplicated table plus the augmentation pipeline.
train_ds = ShopeeDataset(
    dataframe=train_df,
    dir_path=train_img_path,
    transforms=albumentations_transforms,
)

# Seed every RNG before any augmentation is sampled.
my_seed(33)

train_dl = DataLoader(train_ds, shuffle=True, batch_size=4)

# Preview the augmentations: sample 32 is fetched once per panel, and each fetch
# re-runs the transform pipeline, so the row shows num_samples draws of the
# same image rather than a batch of different images.
# (To look at a real shuffled batch instead, iterate next(iter(train_dl))[0].)
num_samples = 8
fig, ax = plt.subplots(nrows=1, ncols=num_samples, figsize=(25, 10))
for panel in ax:
    augmented_img = train_ds[32][0]
    # Permute axes (1, 2, 0) before handing the tensor to imshow.
    panel.imshow(augmented_img.permute(1, 2, 0))
    panel.axis('off')

AttributeError: ignored