# Import Libraries

## Clustering

In [None]:
# for loading/processing the images  
from keras.preprocessing.image import img_to_array, load_img 
from keras.applications.vgg16 import preprocess_input #preprocessing function for VGG16 compatibility

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle
import shutil

## Classification

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt
import keras
from keras.models import Sequential
from keras.layers import Dense,Conv2D,Flatten,MaxPool2D,Dropout
import wandb
from wandb.keras import WandbCallback

# Initialize Variables

In [None]:
# --- Clustering settings -----------------------------------------------------
# VGG16 feature extraction is done once and the result is stored as a pickle.
#   True  -> extract features with VGG16 now and save them as a pickle file
#   False -> skip the extraction and load the previously saved pickle file
use_VGG16 = False

# --- Classification settings -------------------------------------------------
target_size = (75, 75)       # size the data generators resize every image to
batch_size = 32
train_val_split = 0.2        # passed as validation_split to ImageDataGenerator
num_classes = 45             # width of the classifier's softmax output layer
nb_epochs = 1000
init_epoch = 29              # not referenced by any of the cells visible here
wandb_resume_state = True    # True -> resume the previous W&B run, False -> start a new one
exp_name = 'semi_super'      # W&B group name (replaced by a generated id for new runs)

# Extract Image Names

In [None]:
# Directory holding the product images. The trailing slash matters: paths are
# built by plain string concatenation here and in later cells.
files_path = '../input/fashion-product-images-dataset/fashion-dataset/images/'

# Full path of every entry in the image directory, in os.listdir() order.
file_names = [files_path + entry for entry in os.listdir(files_path)]

# Transfer Learning (VGG16)

VGG16's final (classification) layer is removed, so the new final layer is a fully-connected layer with 4,096 output nodes. Passing an image through the model's predict method therefore returns a vector of 4,096 numbers, which is used as that image's feature vector.

In [None]:
# Feature extractor: same input as VGG16, output taken from its second-to-last
# layer (layers[-2]), i.e. the network with only the final layer removed.
#
# The extractor is only needed when features are computed in this session.
# When use_VGG16 is False the features come from a pickle file, so building
# VGG16 (and loading its weights) would be wasted work.
if use_VGG16:
    vgg16_full = VGG16()
    model = Model(inputs=vgg16_full.inputs, outputs=vgg16_full.layers[-2].output)

# Preprocess and Extract Features

- VGG model expects the images to be preprocessed as per the function preprocess_input()
- preprocess_input() receives inputs as 224x224 NumPy arrays in the format (num_of_samples, rows, columns, channels).

In [None]:
def preprocess_extract_features(file):
    """Compute the feature vector of a single image with the global ``model``.

    Parameters
    ----------
    file : path of the image file to load.

    Returns
    -------
    Whatever ``model.predict`` returns for the one-image batch (one row of
    features for the image).
    """
    # Load the image resized to 224x224, the input size VGG16 preprocessing expects.
    pixels = np.array(load_img(file, target_size=(224, 224)))  # (rows, columns, channels)

    # Add the leading batch axis -> (num_of_samples, rows, columns, channels).
    batch = pixels.reshape(1, 224, 224, 3)

    # Apply the VGG16-specific preprocessing, then run the truncated network.
    # Only the final layer was removed, so the output is the feature vector.
    return model.predict(preprocess_input(batch))

Create a dictionary with the file name (without extension) as key and the image's feature vector as value

In [None]:
# Map each image's base name (text before the first '.') to its feature vector.
# Extraction only happens when use_VGG16 is set; otherwise the dictionary stays
# empty and the features are loaded from the pickle file further down.
features_dict = (
    {
        image_path.split('/')[-1].split('.')[0]: preprocess_extract_features(image_path)
        for image_path in file_names
    }
    if use_VGG16
    else {}
)

In [None]:
# Persist the feature dictionary, but only when it was actually extracted in
# this session (it is empty otherwise).
if use_VGG16:
    with open('features.pkl', 'wb') as features_out:
        pickle.dump(features_dict, features_out)

# Read Features

In [None]:
# `data` is the {file name: feature vector} dictionary used by every later cell.
# It must be defined in both modes; previously it was only set when loading the
# pickle, so running with use_VGG16=True ended in a NameError.
if use_VGG16:
    # Features were extracted in this session: use them directly.
    data = features_dict
else:
    # Load the feature dictionary saved by an earlier extraction run.
    # NOTE(security): pickle.load can execute arbitrary code; only load pickle
    # files that come from a trusted source.
    with open('../input/fashion-features/features_large.pkl', 'rb') as f:
        data = pickle.load(f)

In [None]:
# Split the dictionary into two parallel arrays: keys -> file names, values -> features.
filenames = np.array([*data.keys()])

# Stack the per-image feature vectors and flatten them to one 4096-wide row each.
feature_stack = np.array([*data.values()])
features = feature_stack.reshape(-1, 4096)
features.shape

# Read CSV
Read fashion csv data

In [None]:
# Product metadata; malformed rows in the CSV are skipped rather than raising.
styles_csv = '../input/fashion-product-images-dataset/fashion-dataset/styles.csv'
df = pd.read_csv(styles_csv, on_bad_lines='skip')
df

In [None]:
# Number of distinct values at each level of the category hierarchy.
category_columns = ['masterCategory', 'subCategory', 'articleType']
df[category_columns].nunique()

In [None]:
# Sub-category of every product in the CSV. The number of distinct
# sub-categories is used as the number of KMeans clusters further down.
label = df['subCategory'].tolist()
unique_labels = [*set(label)]
print(len(unique_labels))

# PCA
Reduce Dimensionality using PCA (4096->1000)

In [None]:
# Reduce the 4096-dimensional features to 1000 principal components.
# fit() returns the fitted estimator, so fitting and projecting can be chained.
pca = PCA(n_components=1000, random_state=22)
x = pca.fit(features).transform(features)
x.shape

In [None]:
# Cluster the PCA-reduced features into one group per distinct sub-category.
n_clusters = len(unique_labels)
kmeans = KMeans(n_clusters=n_clusters, random_state=22)
kmeans.fit(x)

In [None]:
# Sanity check: there should be exactly one cluster label per file name.
tuple(len(seq) for seq in (kmeans.labels_, filenames))

In [None]:
#create dictionary with filepath and labels assigned by KMeans
groups = {}
for file, cluster in zip(filenames,kmeans.labels_):
    if cluster not in groups.keys():
        groups[cluster] = []
        groups[cluster].append(files_path+file)
    else:
        groups[cluster].append(files_path+file)

# Create Folders
Copy the fashion images into separate folders based on the cluster label

In [None]:
# Copy every image into './Clustered Data/<cluster label>/' so that
# flow_from_directory can later treat each cluster as one class.
clustered_root = './Clustered Data'

# exist_ok=True keeps the cell re-runnable: plain os.mkdir raises
# FileExistsError as soon as the folders from a previous run are present.
# Note that files copied by an earlier run are left in place.
os.makedirs(clustered_root, exist_ok=True)

for cluster_id, image_paths in groups.items():
    cluster_dir = os.path.join(clustered_root, str(cluster_id))
    os.makedirs(cluster_dir, exist_ok=True)
    for image_path in image_paths:
        # Paths stored in `groups` carry no extension; '.jpg' is appended for
        # both the source file and the copy.
        image_file = os.path.basename(image_path) + '.jpg'
        shutil.copy(image_path + '.jpg', os.path.join(cluster_dir, image_file))

# Classification

In [None]:
# Credentials must never be committed with the notebook: the API key is read
# from the WANDB_API_KEY environment variable instead of being hard-coded.
# If the variable is unset, key=None lets wandb fall back to its own
# credential lookup.
# SECURITY: the key that used to be hard-coded here has been exposed and
# should be revoked in the W&B account settings.
wandb.login(key=os.environ.get('WANDB_API_KEY'))

In [None]:
wandb_project = 'Fashion-Semi-Supervised'

if not wandb_resume_state:
    # Fresh run: generate a new group id and record the training settings.
    run_config = {
        'Image Size': 75,
        'Num Channels': 3,
        'Epoch': nb_epochs,
        'Batch_size': batch_size,
        'Loss': "categorical_crossentropy",
        'Optimizer': 'Adam',
    }
    exp_name = wandb.util.generate_id()
    myrun = wandb.init(project=wandb_project, group=exp_name, config=run_config)
else:
    # Resume the previous run inside the existing experiment group.
    wandb.init(project=wandb_project, resume=True, group=exp_name)

config = wandb.config
print(exp_name)

# Data Generator

In [None]:
# One generator object provides both subsets: pixel values are rescaled by
# 1/255 and train_val_split is passed as the validation_split.
train_data_gen = ImageDataGenerator(rescale=1./255, validation_split=train_val_split)

# Settings shared by the training and validation iterators; each sub-folder of
# './Clustered Data/' (one per cluster) is treated as one class.
flow_kwargs = dict(
    directory='./Clustered Data/',
    target_size=target_size,
    batch_size=batch_size,
    class_mode='categorical',
)

train_generator = train_data_gen.flow_from_directory(subset='training', **flow_kwargs)
validation_generator = train_data_gen.flow_from_directory(subset='validation', **flow_kwargs)

In [None]:
# Class names (the cluster folder names) in the order stored in class_indices.
labels = [*train_generator.class_indices]

# Model

In [None]:
# CNN classifier: four tanh convolutions (two of them followed by 2x2 max
# pooling), then two small ReLU dense layers and a softmax over num_classes.
# The architecture is always built in code, for new and for resumed runs.
model = Sequential([
    Conv2D(kernel_size=(3, 3), filters=32, activation='tanh', input_shape=(*target_size, 3)),
    Conv2D(filters=30, kernel_size=(3, 3), activation='tanh'),
    MaxPool2D(2, 2),
    Conv2D(filters=30, kernel_size=(3, 3), activation='tanh'),
    MaxPool2D(2, 2),
    Conv2D(filters=30, kernel_size=(3, 3), activation='tanh'),
    Flatten(),
    Dense(20, activation='relu'),
    Dense(15, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

if wandb.run.resumed:
    # The W&B callback in this notebook is created with save_weights_only=True,
    # so "model-best.h5" holds weights but no architecture and
    # keras.models.load_model cannot rebuild a model from it. Load the weights
    # into the architecture built above instead.
    restored = wandb.restore("model-best.h5")
    if restored is None:
        raise FileNotFoundError(
            'model-best.h5 could not be restored from the resumed W&B run'
        )
    model.load_weights(restored.name)

# Compiled in both cases, so a resumed run starts with a fresh Adam optimizer.
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')

In [None]:
# Print Keras' layer-by-layer summary of the current `model`.
model.summary()

# Fit

In [None]:
# W&B <-> Keras integration: a callback that reports the training run to the
# active W&B run according to the flags below.
callback_val_steps = validation_generator.samples // batch_size

wandb_call = WandbCallback(
    # data used by the callback
    training_data=train_generator,
    validation_data=validation_generator,
    validation_steps=callback_val_steps,
    labels=labels,
    predictions=180,
    input_type='images',
    # what gets saved / logged
    save_model=True,
    save_graph=True,
    save_weights_only=True,
    log_weights=True,
    log_gradients=True,
)

In [None]:
# Number of full batches available per epoch in each subset.
train_steps = train_generator.samples // batch_size
val_steps = validation_generator.samples // batch_size

# Train up to nb_epochs, starting the epoch counter at the current W&B run
# step, and report progress through the W&B callback.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    steps_per_epoch=train_steps,
    validation_steps=val_steps,
    epochs=nb_epochs,
    initial_epoch=wandb.run.step,
    callbacks=[wandb_call],
)