# Import Libraries

In [1]:
# for loading/processing the images  
from keras.preprocessing.image import img_to_array, load_img 
from keras.applications.vgg16 import preprocess_input #preprocessing function for VGG16 compatibility

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle
import shutil
from IPython.display import FileLink

# Extract Image Names

In [2]:
# Collect the full path of every entry in the image directory.
files_path = '../input/fashion-product-images-dataset/fashion-dataset/images/'
file_names = [files_path + entry for entry in os.listdir(files_path)]

# Transfer Learning (VGG16)

With VGG16's final layer removed, the new final layer is a fully-connected layer with 4,096 output nodes; its output, a vector of 4,096 numbers, is the feature vector. We can now pass an image through the model's predict method to get its feature vector.

In [3]:
# Build the complete VGG16 network, then wrap it so that the model's input is
# VGG16's input and its output is the output of the second-to-last layer
# (layers[-2]); the final layer is left out.
base_model = VGG16()
model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)

2022-06-11 11:37:40.688548: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


# Preprocess and Extract Features

- VGG model expects the images to be preprocessed as per the function preprocess_input()
- preprocess_input() receives inputs as 224x224 NumPy arrays in the format (num_of_samples, rows, columns, channels).

In [4]:
def preprocess_extract_features(file):
    """Return the features that the global `model` predicts for one image.

    Parameters
    ----------
    file : path of the image file to load.

    Returns
    -------
    The result of `model.predict` on a batch holding this single image.
    """
    # Load the image resized to 224x224 (for VGG16 preprocess compatibility)
    # and shape it as a one-image batch:
    # (num_of_samples, rows, columns, channels).
    pixels = np.array(load_img(file, target_size=(224, 224)))
    batch = pixels.reshape(1, 224, 224, 3)

    # Apply the VGG16 preprocessing, then run the truncated network.
    return model.predict(preprocess_input(batch))

Create a dictionary with the file name as key and the extracted features as value.

In [5]:
# Map each image's bare file name to the features extracted from it.
features_dict = {}

for image_path in file_names:
    # Key: last path component, cut at its first '.'.
    image_id = image_path.split('/')[-1].split('.')[0]
    features_dict[image_id] = preprocess_extract_features(image_path)

2022-06-11 11:37:45.699163: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [6]:
# Persist the feature dictionary to disk as a pickle file.
with open('features.pkl', 'wb') as pkl_out:
    pickle.dump(features_dict, pkl_out)

# Read Features

In [7]:
# Load a saved feature dictionary.
# NOTE(review): this reads features_large.pkl from a separate input dataset,
# not the features.pkl written earlier in this notebook -- confirm that both
# hold the same kind of dictionary.
# pickle.load can execute arbitrary code: only load files from a trusted source.
with open('../input/fashion-features/features_large.pkl', 'rb') as pkl_in:
    data = pickle.load(pkl_in)

In [8]:
# Split the dictionary into two parallel arrays: the keys become the file
# names, the values are stacked into rows of 4096 features each.
filenames = np.array([*data])
features = np.array([*data.values()]).reshape((-1, 4096))
features.shape

(44441, 4096)

# Read CSV
Read fashion csv data

In [9]:
# Load the product metadata; on_bad_lines='skip' makes pandas skip malformed
# lines instead of raising.
styles_csv = '../input/fashion-product-images-dataset/fashion-dataset/styles.csv'
df = pd.read_csv(styles_csv, on_bad_lines='skip')
df

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt
...,...,...,...,...,...,...,...,...,...,...
44419,17036,Men,Footwear,Shoes,Casual Shoes,White,Summer,2013.0,Casual,Gas Men Caddy Casual Shoe
44420,6461,Men,Footwear,Flip Flops,Flip Flops,Red,Summer,2011.0,Casual,Lotto Men's Soccer Track Flip Flop
44421,18842,Men,Apparel,Topwear,Tshirts,Blue,Fall,2011.0,Casual,Puma Men Graphic Stellar Blue Tshirt
44422,46694,Women,Personal Care,Fragrance,Perfume and Body Mist,Blue,Spring,2017.0,Casual,Rasasi Women Blue Lady Perfume


In [10]:
# Number of distinct values in each category column of the dataset.
category_columns = ['masterCategory', 'subCategory', 'articleType']
df[category_columns].nunique()

masterCategory      7
subCategory        45
articleType       143
dtype: int64

In [11]:
# Clustering is based on subCategory: collect its values and the distinct
# ones among them, then report how many distinct values there are.
label = df['subCategory'].tolist()
unique_labels = [*set(label)]
print(len(unique_labels))

45


# PCA
Reduce Dimensionality using PCA (4096->1000)

In [12]:
# Fit PCA on the feature matrix, then project the same matrix onto the
# 1000 retained components.
pca = PCA(n_components=1000, random_state=22)
x = pca.fit(features).transform(features)
x.shape

(44441, 1000)

In [13]:
# Cluster the reduced features into as many groups as there are distinct labels.
cluster_count = len(unique_labels)
kmeans = KMeans(n_clusters=cluster_count, random_state=22)
kmeans.fit(x)

KMeans(n_clusters=45, random_state=22)

In [14]:
# Sanity check: there should be one cluster label per file name.
kmeans.labels_.shape[0], filenames.shape[0]

(44441, 44441)

In [15]:
#create dictionary with filepath and labels assigned by KMeans
groups = {}
for file, cluster in zip(filenames,kmeans.labels_):
    if cluster not in groups.keys():
        groups[cluster] = []
        groups[cluster].append(files_path+file)
    else:
        groups[cluster].append(files_path+file)

# Create Zip
Copy the fashion images into a separate folder per cluster label, zip each folder, and then delete the temporary folder.

In [16]:
# For every cluster: stage its images in a folder named after the cluster
# label, zip that folder to "<label>.zip", then delete the staging folder.
for cluster_id, image_paths in groups.items():
    cluster_dir = os.path.join('.', str(cluster_id))

    # Start from an empty staging folder, so that a re-run after an interrupted
    # attempt neither fails on mkdir nor archives leftover files.
    shutil.rmtree(cluster_dir, ignore_errors=True)
    os.mkdir(cluster_dir)

    for image_path in image_paths:
        # Paths stored in `groups` carry no extension; the images are .jpg files.
        source = image_path + ".jpg"
        shutil.copy(source, os.path.join(cluster_dir, os.path.basename(source)))

    shutil.make_archive(cluster_dir, 'zip', cluster_dir)
    shutil.rmtree(cluster_dir)

In [17]:
# List the working directory to confirm the archives were written. The explicit
# %ls magic is used because a bare `ls` relies on IPython automagic, which is
# only applied to single-line cells and is shadowed by any variable named `ls`.
%ls

0.zip   14.zip  2.zip   25.zip  30.zip  36.zip  41.zip  7.zip
1.zip   15.zip  20.zip  26.zip  31.zip  37.zip  42.zip  8.zip
10.zip  16.zip  21.zip  27.zip  32.zip  38.zip  43.zip  9.zip
11.zip  17.zip  22.zip  28.zip  33.zip  39.zip  44.zip  __notebook__.ipynb
12.zip  18.zip  23.zip  29.zip  34.zip  4.zip   5.zip   features.pkl
13.zip  19.zip  24.zip  3.zip   35.zip  40.zip  6.zip


# Get Download Link for Each Zip File

In [18]:
# Show one download link per archive. The links are driven by the cluster
# labels in `groups` (the labels the archives were created from) rather than
# by the number of CSV labels, so no link can point at an archive that was
# never written.
for cluster_id in sorted(groups):
    display(FileLink(str(cluster_id) + '.zip'))