In [1]:
import pandas as pd
import numpy
import os

In [2]:
# Folder that holds the dataset; the attributes CSV is read from here.
src = 'data'
# Folder that holds the image files; CSV filenames are resolved against it.
img_src = 'data/images'

In [3]:
# Load the attribute table (one row per image filename) from the dataset folder.
df = pd.read_csv(os.path.join(src, 'attributes.csv'))

In [4]:
# Preview the first rows to check that the CSV parsed as expected.
df.head()

Unnamed: 0,filename,neck,sleeve_length,pattern
0,cdc8cd2a-0938-4970-a3b5-f5ed9595222c1527925869...,6.0,,4.0
1,11469770662809-Metersbonwe-Navy-T-shirt-485146...,5.0,3.0,9.0
2,11479107741104-Tommy-Hilfiger-Men-Navy-Blue-St...,6.0,1.0,9.0
3,f7ad67ab-eeb1-4449-8f63-7b580d2797e71532342804...,,0.0,9.0
4,11516770810185-Splash-Men-Tshirts-767151677081...,6.0,3.0,9.0


In [5]:
# Row count of the raw table, before any filtering.
len(df)

2238

In [6]:
def check_file_present(filename, root=None):
    """Tell whether `filename` names a regular file inside the image folder.

    Args:
        filename: File name (relative to `root`) taken from the attributes
            table. Empty strings and non-path values such as NaN are
            treated as "not present" instead of raising.
        root: Directory to look in. Defaults to the notebook-level
            `img_src` when omitted.

    Returns:
        bool: True only if the joined path exists and is a regular file.
    """
    if root is None:
        root = img_src
    # A missing filename read from CSV arrives as float NaN, and an empty
    # string would resolve to the directory itself; neither is an image.
    if not isinstance(filename, (str, os.PathLike)) or filename == '':
        return False
    # isfile (not exists) so that a sub-directory is never counted as an image.
    return os.path.isfile(os.path.join(root, filename))

# Flag each row by whether its image exists on disk. Mapping over the
# filename column avoids building a row object per record (as a row-wise
# apply does) and still yields a boolean Series when the table is empty.
df['file_present'] = df['filename'].map(check_file_present)
# Keep only rows whose image was found; the flag column is already boolean,
# so it can be used directly as the mask.
df = df[df['file_present']].reset_index(drop=True)

In [7]:
# Show the table after dropping rows whose image file is missing.
df

Unnamed: 0,filename,neck,sleeve_length,pattern,file_present
0,cdc8cd2a-0938-4970-a3b5-f5ed9595222c1527925869...,6.0,,4.0,True
1,11469770662809-Metersbonwe-Navy-T-shirt-485146...,5.0,3.0,9.0,True
2,f7ad67ab-eeb1-4449-8f63-7b580d2797e71532342804...,,0.0,9.0,True
3,11516770810185-Splash-Men-Tshirts-767151677081...,6.0,3.0,9.0,True
4,11505295751483-FOREVER-21-Men-White-Self-Desig...,1.0,3.0,,True
...,...,...,...,...,...
1792,11503030359861-Blackberrys-Men-Black-Solid-Man...,2.0,3.0,9.0,True
1793,11495782729779-Roadster-Men-Black-Printed-V-Ne...,5.0,3.0,9.0,True
1794,11507360109998-Nautica-Men-Blue-Striped-V-Neck...,5.0,3.0,9.0,True
1795,8f236dcd-4a2b-49ff-9229-981e7db946f91537273774...,6.0,,5.0,True


In [8]:
# Discard every row whose filename occurs more than once (keep=False removes
# all copies, not just the repeats), then drop the helper flag column.
df = (
    df
    .drop_duplicates(subset="filename", keep=False)
    .drop(columns='file_present')
)

In [9]:
# Renumber rows 0..n-1 after the duplicate removal; the old index is discarded.
df = df.reset_index(drop=True)

In [10]:
# Show the cleaned table (missing-image rows and duplicated filenames removed).
df

Unnamed: 0,filename,neck,sleeve_length,pattern
0,cdc8cd2a-0938-4970-a3b5-f5ed9595222c1527925869...,6.0,,4.0
1,11469770662809-Metersbonwe-Navy-T-shirt-485146...,5.0,3.0,9.0
2,f7ad67ab-eeb1-4449-8f63-7b580d2797e71532342804...,,0.0,9.0
3,11516770810185-Splash-Men-Tshirts-767151677081...,6.0,3.0,9.0
4,11505295751483-FOREVER-21-Men-White-Self-Desig...,1.0,3.0,
...,...,...,...,...
1763,11503030359861-Blackberrys-Men-Black-Solid-Man...,2.0,3.0,9.0
1764,11495782729779-Roadster-Men-Black-Printed-V-Ne...,5.0,3.0,9.0
1765,11507360109998-Nautica-Men-Blue-Striped-V-Neck...,5.0,3.0,9.0
1766,8f236dcd-4a2b-49ff-9229-981e7db946f91537273774...,6.0,,5.0


In [11]:
# Save the cleaned table to the current working directory, without the index.
# This runs before the missing-label fill below, so unlabeled attributes are
# written as empty fields rather than as the fill values.
df.to_csv('attributes_clean.csv', index=False)

In [12]:
# Replace missing labels with a dedicated placeholder value per attribute.
# NOTE(review): each placeholder appears to be one past the largest labelled
# class id for that attribute — confirm against the label definitions.
df = df.fillna({
    'neck': 7.0,
    'sleeve_length': 4.0,
    'pattern': 10.0,
})

In [13]:
# For every label column, print its distinct values followed by their count.
for column in ('neck', 'sleeve_length', 'pattern'):
    distinct = df[column].unique()
    print(distinct, len(distinct))

[6. 5. 7. 1. 4. 0. 2. 3.] 8
[4. 3. 0. 1. 2.] 5
[ 4.  9. 10.  8.  3.  6.  7.  1.  2.  0.  5.] 11


In [14]:
# Print the class frequency table of each label column, most frequent first.
for column in ('neck', 'sleeve_length', 'pattern'):
    print(df[column].value_counts())

6.0    804
7.0    367
1.0    109
4.0    109
5.0    100
2.0     97
3.0     93
0.0     89
Name: neck, dtype: int64
3.0    1090
4.0     357
2.0     114
1.0     108
0.0      99
Name: sleeve_length, dtype: int64
9.0     1151
10.0     352
6.0       43
3.0       38
4.0       33
5.0       31
1.0       28
8.0       26
7.0       24
2.0       24
0.0       18
Name: pattern, dtype: int64


In [15]:
# Sanity check: list any rows whose neck label is still NaN (expected: none).
df.loc[df['neck'].isna()].reset_index(drop=True)

Unnamed: 0,filename,neck,sleeve_length,pattern


In [16]:
import cv2
# Read one sample image from the image folder to inspect its array shape.
# NOTE(review): cv2.imread returns None instead of raising when the file
# cannot be read, so a missing file would surface as an AttributeError on
# `.shape` below — confirm this hard-coded file is present.
x = cv2.imread(os.path.join(img_src, '0117ff63-32ea-41b9-91ea-e0a1dc8013cb1527919734744-NA-831527919734624-3.jpg'))
x.shape

(300, 225, 3)

In [17]:
import torch
import pandas as pd
import os
import numpy as np
from PIL import Image

from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

In [18]:
class FashionDataset(Dataset):
    """Image-only dataset over the files listed in an attributes table.

    Each item is the image named in the `filename` column, loaded from the
    image folder, forced to three RGB channels and converted with
    `transforms.ToTensor()`. No labels are returned.

    Args:
        dataframe: Table with a `filename` column. Defaults to the
            notebook-level `df` when omitted.
        image_dir: Folder containing the image files. Defaults to the
            notebook-level `img_src` when omitted.
    """

    def __init__(self, dataframe=None, image_dir=None):
        super().__init__()
        # Fall back to the notebook globals so `FashionDataset()` keeps working.
        self.dataframe = df if dataframe is None else dataframe
        self.image_dir = img_src if image_dir is None else image_dir
        self.to_tensor = transforms.ToTensor()
        total_samples = len(self.dataframe)
        # Positions 0..n-1 into the table, in table order.
        self.idx_perm = np.arange(total_samples)

    def __getitem__(self, index):
        """Load the image at position `index` and return it as a tensor."""
        df_idx = self.idx_perm[index]
        # idx_perm holds positions, so look the row up by position (iloc);
        # a label lookup only works while the index happens to be 0..n-1.
        filename = self.dataframe['filename'].iloc[df_idx]
        image_path = os.path.join(self.image_dir, filename)
        # Close the file promptly, and force RGB so grayscale / RGBA / palette
        # files all yield the same channel count (needed for batching and
        # for per-channel statistics).
        with Image.open(image_path) as img:
            image = img.convert('RGB')
        return self.to_tensor(image)

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.idx_perm)

In [None]:
dataset = FashionDataset()
loader = DataLoader(
    dataset,
    batch_size=10,
    num_workers=1,
    shuffle=False
)


def _channel_stats(batches):
    """Per-channel mean and standard deviation over every pixel seen.

    Accumulates the sum and the sum of squares per channel across all
    batches, so the result is the statistic of the whole dataset rather
    than an average of per-image statistics (which understates the spread
    whenever images differ in brightness).

    Args:
        batches: Iterable of image batches; dimension 0 is the batch,
            dimension 1 the channel, remaining dimensions are flattened.

    Returns:
        (mean, std, n_images): two float32 tensors with one entry per
        channel, and the number of images processed as a float. `std` is
        the population standard deviation.

    Raises:
        ValueError: if `batches` yields no images.
    """
    pixel_sum = 0.
    pixel_sq_sum = 0.
    n_pixels = 0
    n_images = 0
    for batch in batches:
        n_in_batch = batch.size(0)
        # Flatten everything after the channel dim; accumulate in float64 to
        # limit rounding error over many images.
        flat = batch.reshape(n_in_batch, batch.size(1), -1).double()
        pixel_sum = pixel_sum + flat.sum(dim=(0, 2))
        pixel_sq_sum = pixel_sq_sum + flat.pow(2).sum(dim=(0, 2))
        n_pixels += n_in_batch * flat.size(2)
        n_images += n_in_batch
    if n_pixels == 0:
        raise ValueError('cannot compute channel statistics: the loader yielded no images')
    channel_mean = pixel_sum / n_pixels
    # Var = E[x^2] - E[x]^2; clamp guards against tiny negatives from rounding.
    channel_var = (pixel_sq_sum / n_pixels - channel_mean ** 2).clamp(min=0)
    return channel_mean.float(), channel_var.sqrt().float(), float(n_images)


mean, std, nb_samples = _channel_stats(loader)

In [None]:
# Display the per-channel mean computed in the previous cell.
mean

In [None]:
# Display the per-channel standard deviation computed above.
std