# Analysis of Fashion data

## Overview
**Goal:** Use a recommendation model to optimize predictions and create a recommender system to provide outfit recommendations based on product popularity.

### Load our data

In [747]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# data preprocessing and tuning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder

from tensorflow.keras import Model, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LeakyReLU, Embedding, Flatten, Dot, Dense, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# for webapp
from flask import Flask, request, jsonify

# to get the newest version of helper
import importlib
importlib.reload(helper)

# Setup to Ignore Version Errors and Deprecations
import warnings
warnings.filterwarnings("ignore")

### Load the data

In [715]:
# loading in the article data
articles_df = pd.read_csv("../data/articles.csv")
articles_df.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [716]:
# load the modeldata from previous notebook
modeldata_df = pd.read_csv("../data/modeldata_df.csv")
modeldata_df.head()

Unnamed: 0,product_code,product_type_no,product_group_name,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,popularity
0,108775,253,Garment Upper body,1010016,9,4,5,1676,A,1,16,1002,low
1,108775,253,Garment Upper body,1010016,10,3,9,1676,A,1,16,1002,high
2,108775,253,Garment Upper body,1010017,11,1,9,1676,A,1,16,1002,low
3,110065,306,Underwear,1010016,9,4,5,1339,B,1,61,1017,medium
4,110065,306,Underwear,1010016,10,3,9,1339,B,1,61,1017,medium


In [717]:
modeldata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 13 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   product_code                105542 non-null  int64 
 1   product_type_no             105542 non-null  int64 
 2   product_group_name          105542 non-null  object
 3   graphical_appearance_no     105542 non-null  int64 
 4   colour_group_code           105542 non-null  int64 
 5   perceived_colour_value_id   105542 non-null  int64 
 6   perceived_colour_master_id  105542 non-null  int64 
 7   department_no               105542 non-null  int64 
 8   index_code                  105542 non-null  object
 9   index_group_no              105542 non-null  int64 
 10  section_no                  105542 non-null  int64 
 11  garment_group_no            105542 non-null  int64 
 12  popularity                  105542 non-null  object
dtypes: int64(10), object(3)
memor

## Feature Selection and Engineering

In [718]:
modeldata_df.columns

Index(['product_code', 'product_type_no', 'product_group_name',
       'graphical_appearance_no', 'colour_group_code',
       'perceived_colour_value_id', 'perceived_colour_master_id',
       'department_no', 'index_code', 'index_group_no', 'section_no',
       'garment_group_no', 'popularity'],
      dtype='object')

Our recommender model is going to use the product group name to recommend outfits, so let's clean that feature up a bit.

In [719]:
modeldata_df["product_group_name"].value_counts()

product_group_name
Garment Upper body       42741
Garment Lower body       19812
Garment Full body        13292
Accessories              11158
Underwear                 5490
Shoes                     5283
Swimwear                  3127
Socks & Tights            2442
Nightwear                 1899
Unknown                    121
Underwear/nightwear         54
Cosmetic                    49
Bags                        25
Items                       17
Furniture                   13
Garment and Shoe care        9
Stationery                   5
Interior textile             3
Fun                          2
Name: count, dtype: int64

In [720]:
# drop rows of irrelevant product groups
modeldata_df = modeldata_df[modeldata_df["product_group_name"].isin(["Garment Upper body", "Garment Lower body", "Garment Full body", "Accessories", "Swimwear", "Bags"])]
modeldata_df["product_group_name"].value_counts()

product_group_name
Garment Upper body    42741
Garment Lower body    19812
Garment Full body     13292
Accessories           11158
Swimwear               3127
Bags                     25
Name: count, dtype: int64

We want to map the product name to a number for easier processing.

In [721]:
# map each product group to a number
product_group_mapper = {
    "Garment Upper body": 1,
    "Garment Lower body": 2,
    "Garment Full body": 3,
    "Accessories": 4,
    "Swimwear": 5,
    "Bags": 6,
}

In [722]:
# replace the product_group_name with a number
modeldata_df["product_group_name"] = modeldata_df["product_group_name"].replace(product_group_mapper)
modeldata_df["product_group_name"].value_counts()

product_group_name
1    42741
2    19812
3    13292
4    11158
5     3127
6       25
Name: count, dtype: int64

In [723]:
# we also want to map the popularity index to numbers
popularity_mapper = {
    "low": 0,
    "medium": 1,
    "high": 2
}

In [762]:
# replace the popularity with a number
modeldata_df["popularity"] = modeldata_df["popularity"].replace(popularity_mapper)
modeldata_df["popularity"].value_counts()

popularity
1    60384
2    18666
0    11105
Name: count, dtype: int64

## Artificial Neural Network (ANN)
74% was the best our previous ANN could do in the previous machine learning notebook, using LeakyReLU and a laearning rate scheduler (LRS). We might need this model later as a backup recommendation system.

### Train-test split

In [724]:
TARGET = ["popularity"]
X, y = modeldata_df.drop(columns=TARGET, axis=1), modeldata_df[TARGET]

In [725]:
# use encoder helper function on the text columns in X
# for easier analysis
helper.encode_strings(X)

In [726]:
# One-hot encode target variable (for multiclass classification)
y_onehot = pd.get_dummies(y)

In [727]:
# Split data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y_onehot, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

### Tuning: Standard Scaler

In [728]:
# use standard scaler
# check if that will gain better results
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

### Modeling: ANN

In [729]:
# Number of features after preprocessing
input_dim = X_train_scaled.shape[1]

# Number of classes (low, medium, high)
output_dim = y_train.shape[1]

input_dim, output_dim

(12, 3)

In [730]:
# Best performing model from previous notebook:
# ANN Model with LeakyRelu and learning rate scheduler
ann_model = helper.ANN(input_size=input_dim, hidden_layers=[64, 32], dropouts=[0.3, 0.3], output_size=output_dim, learning_rate=0.001, leaky=True)
ann_model.model_summary()

Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_63 (Dense)            (None, 64)                832       
                                                                 
 leaky_re_lu_31 (LeakyReLU)  (None, 64)                0         
                                                                 
 dropout_32 (Dropout)        (None, 64)                0         
                                                                 
 dense_64 (Dense)            (None, 32)                2080      
                                                                 
 leaky_re_lu_32 (LeakyReLU)  (None, 32)                0         
                                                                 
 dropout_33 (Dropout)        (None, 32)                0         
                                                                 
 dense_65 (Dense)            (None, 3)               

In [731]:
# DO NOT DELETE
# Commenting out for faster notebook execution
# Train the model
"""
history = ann_model.train_model(X_train=X_train_scaled, y_train=y_train,
                      X_validation=X_val_scaled, y_validation=y_val,
                      epoch=50, batch_size=32)
"""

'\nhistory = ann_model.train_model(X_train=X_train_scaled, y_train=y_train,\n                      X_validation=X_val_scaled, y_validation=y_val,\n                      epoch=50, batch_size=32)\n'

In [732]:
# DO NOT DELETE
# Evaluate on the test set
# ann_model.evaluate_accuracy(X_test=X_test_scaled, y_test=y_test)

In [733]:
# DO NOT DELETE
# predict on the model and print confusion matrix
# y_pred_probs = ann_model.predict_model(X_test=X_test_scaled, y_test=y_test)

## Recommendation System
We're going to use a Generalized Matrix Factorization recommender system to recommend outfits based on dot product proximity. Let's create the recoemmender class and train the model.

In [734]:
# Generalized Matrix Factorization (GMF)
# GMF-Based Outfit Recommender
class OutfitRecommenderGMF:
    def __init__(self, num_products, num_features, embedding_dim=8, learning_rate=0.01):
        self.num_products = num_products
        self.num_features = num_features
        self.embedding_dim = embedding_dim
        self.learning_rate = learning_rate
        self.model = self.initialize_model()
        
    def initialize_model(self):
        # Inputs
        product_input = Input(shape=(1,), name="Product_Input")
        feature_input = Input(shape=(self.num_features,), name="Feature_Input")

        # Embedding layer for product
        product_embedding = Embedding(input_dim=self.num_products, output_dim=self.embedding_dim, name="Product_Embedding")(product_input)

        product_embedding = Flatten()(product_embedding)

        # Transform feature input to match embedding size
        transformed_features = Dense(self.embedding_dim, activation='relu')(feature_input)

        # Dot product of the two embeddings
        dot_product = Dot(axes=1)([product_embedding, transformed_features])

        # Dense layer for prediction
        output = Dense(1, activation="linear", name="Output")(dot_product)

        # Compile the model
        model = Model(inputs=[product_input, feature_input], outputs=output)

        # print the model summary
        model.summary()

        model.compile(optimizer=Adam(learning_rate=self.learning_rate), loss="mse", metrics=["accuracy"])
        return model

    def train(self, product_codes, features, popularity, epochs=100, batch_size=32, verbose=True):
        # fit the model
        self.model.fit([product_codes, features], popularity, epochs=epochs, batch_size=batch_size, verbose=verbose, validation_split=0.2)

    def predict(self, product_codes, features):
        # predict the model using the given product codes and features
        return self.model.predict([product_codes, features])

    def recommend_outfit(self, product_codes, features, df, preference):
        # return the recommended outfit based on the predicted values
        # of the given product codes and features
        predictions = self.predict(product_codes, features)
        
        if preference == 0: # low popularity
            recommended_idx = np.argmin(predictions)
        elif preference == 2: # high popularity
            recommended_idx = np.argmax(predictions)
        else: # medium popularity
            recommended_idx = np.argmax(np.mean(predictions))
        return df.iloc[recommended_idx]

In [735]:
# encode the strings in modeldata_df
helper.encode_strings(modeldata_df)

In [736]:
# split the product codes, popularity, and features of the model
product_codes = modeldata_df["product_code"].values
popularity = modeldata_df["popularity"].values
features = modeldata_df.drop(["popularity", "product_code"], axis=1).values

In [737]:
# scale the features
feature_scaler = StandardScaler()
features = feature_scaler.fit_transform(features)

In [738]:
# train test split the product codes, popularity, and features
prod_train, prod_test, feat_train, feat_test, pop_train, pop_test = train_test_split(
        product_codes, features, popularity, test_size=0.2, random_state=42)

In [739]:
 # Create and Train the Model

# num_products: max value of product_code
# num_features: number of features in the model
# embedding_dim: want to recommend 1 outfit
recommender = OutfitRecommenderGMF(num_products=modeldata_df["product_code"].max() + 1, num_features=features.shape[1], embedding_dim=1, learning_rate=0.01)

# train the model
recommender.train(prod_train, feat_train, pop_train, epochs=50, batch_size=32)

Model: "model_24"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Product_Input (InputLayer)     [(None, 1)]          0           []                               
                                                                                                  
 Product_Embedding (Embedding)  (None, 1, 1)         959462      ['Product_Input[0][0]']          
                                                                                                  
 Feature_Input (InputLayer)     [(None, 11)]         0           []                               
                                                                                                  
 flatten_25 (Flatten)           (None, 1)            0           ['Product_Embedding[0][0]']      
                                                                                           

## Getting User Input
In order to provide more accurate outfit recommendations for each user, we are going to ask the user for their preferences.
1. What type of outfit they're looking for
2. How popular they want their outfit to be. Some users may want outfit recommendations with more alternative styles.

In [740]:
popularity_mapper

{'low': 0, 'medium': 1, 'high': 2}

In [741]:
product_group_mapper

{'Garment Upper body': 1,
 'Garment Lower body': 2,
 'Garment Full body': 3,
 'Accessories': 4,
 'Swimwear': 5,
 'Bags': 6}

In [742]:
# ask customer for user input

# give multiple choice
user_outfit_type = int(input("What type of garment are you looking for? Top: 1, Bottoms: 2, Dress: 3, Accessories: 4, Swimwear: 5, Bags: 6"))

# give multiple choice
user_preference = int(input("What level of popularity do you want your clothing to be? low: 0, medium: 1, high: 2"))

In [753]:
# function to recommend outfit based on user preferences
def recommend_outfit_for_user(user_outfit_type, user_preference):
    # Filter Data Based on User Input
    filtered_df = modeldata_df[modeldata_df["product_group_name"] == user_outfit_type]

    # get filtered product codes, popularity, and features based on user input
    filtered_product_codes = filtered_df["product_code"].values
    filtered_popularity = filtered_df["popularity"].values
    filtered_features = feature_scaler.transform(filtered_df.drop(["popularity", "product_code"], axis=1).values)

    # get the recommended outfit
    recommended_outfit = recommender.recommend_outfit(filtered_product_codes, filtered_features, filtered_df, preference=user_preference)

    # print the recommended product
    recommended_product_name = articles_df[articles_df["product_code"] == recommended_outfit["product_code"]]["prod_name"].values[0]
    print(f"Recommended Outfit Based on Your Preference:\nProduct Name: {recommended_product_name}, Product Code: {recommended_outfit['product_code']}, Popularity: {recommended_outfit['popularity']}")

    # return recommended product name and popularity
    return (recommended_product_name, recommended_outfit['popularity'])
    

In [752]:
# call the recommend_out_for_user function when we have the user preferences
recommend_outfit_for_user(user_outfit_type=user_outfit_type, user_preference=user_preference)

Recommended Outfit Based on Your Preference:
Product Name: Alma Party dress, Product Code: 801938, Popularity: 2


In [None]:
# Filter Data Based on User Input
# filtered_df = modeldata_df[modeldata_df["product_group_name"] == user_outfit_type]
# filtered_df

Unnamed: 0,product_code,product_type_no,product_group_name,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,popularity
204,192460,265,3,1010016,8,4,12,5963,3,2,58,1003,1
238,202017,265,3,1010017,10,3,9,1676,0,1,16,1002,1
261,212629,265,3,1010016,9,4,5,1643,3,2,51,1002,0
262,212629,265,3,1010016,93,4,19,1643,3,2,51,1002,1
263,212629,265,3,1010016,53,4,18,1643,3,2,51,1002,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
105522,948152,270,3,1010001,22,2,8,7920,6,4,79,1002,2
105523,948152,270,3,1010001,92,1,19,7920,6,4,79,1002,2
105535,952937,265,3,1010001,13,2,1,1641,0,1,18,1005,1
105539,956217,265,3,1010016,9,4,5,1641,0,1,18,1005,0


In [None]:
# filtered_product_codes = filtered_df["product_code"].values
# filtered_popularity = filtered_df["popularity"].values
# filtered_features = feature_scaler.transform(filtered_df.drop(["popularity", "product_code"], axis=1).values)

In [None]:
# recommended_outfit = recommender.recommend_outfit(filtered_product_codes, filtered_features, filtered_df, preference=user_preference)
# recommended_outfit



product_code                   801938
product_type_no                   265
product_group_name                  3
graphical_appearance_no       1010002
colour_group_code                  51
perceived_colour_value_id           1
perceived_colour_master_id          4
department_no                    7613
index_code                          6
index_group_no                      4
section_no                         76
garment_group_no                 1014
popularity                          2
Name: 80409, dtype: int64

In [None]:
# print the recommended product
# recommended_product_name = articles_df[articles_df["product_code"] == recommended_outfit["product_code"]]["prod_name"].values[0]

# print(f"Recommended Outfit Based on Your Preference:\nProduct Name: {recommended_product_name}, Product Code: {recommended_outfit['product_code']}, Popularity: {recommended_outfit['popularity']}")

Recommended Outfit Based on Your Preference:
Product Name: Alma Party dress, Product Code: 801938, Popularity: 2


## Model API

In [757]:
# use Flask to create an API endpoint
app = Flask(__name__)

In [758]:
# Route for recommendations
@app.route('/recommend', methods=['POST'])
def recommend():
    data = request.json
    outfit_type = data['outfitType']
    preference = data['preference']

    # Filter dataset based on user input
    name, popularity = recommend_outfit_for_user(user_outfit_type=outfit_type, user_preference=preference)
    
    response = {
        "name": name,
        "popularity": popularity
    }
    return jsonify(response)

In [761]:
if __name__ == "__main__":
    app.run(port=8000)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:8000
[33mPress CTRL+C to quit[0m
 * Restarting with stat
Traceback (most recent call last):
  File "/opt/anaconda3/envs/Cohort-Env/lib/python3.8/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/envs/Cohort-Env/lib/python3.8/site-packages/traitlets/config/application.py", line 1074, in launch_instance
    app.initialize(argv)
  File "/opt/anaconda3/envs/Cohort-Env/lib/python3.8/site-packages/traitlets/config/application.py", line 118, in inner
    return method(app, *args, **kwargs)
  File "/opt/anaconda3/envs/Cohort-Env/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 692, in initialize
    self.init_sockets()
  File "/opt/anaconda3/envs/Cohort-Env/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 331, in init_sockets
    self.shell_port = self._bind_socket(self.shell_socket, self.shell_port)
  File "/opt/anaconda3/envs/Cohort-Env/lib/python3.8/site-packages/ipykernel/kernel

SystemExit: 1

In [None]:
# next steps:
# adjust the user input to have top and bottom as one outfit, accessories as an add-on
# improve the UI

## Summary
Since we want to use this recommender model in a web app, we are going to move and refactor our code into python files that can be run from the terminal. Please see `../server/model.py` for the model and `../ui/outfitapp.js` for the web UI.