In [1]:
# Importing relevant libraries
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import spacy
import warnings
warnings.filterwarnings("ignore")



In [2]:
from typing import List
import en_core_web_md
nlp = en_core_web_md.load()
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
from tqdm import tqdm, tqdm_notebook

We are tasked with finding the best recommendation based on a given input. We have divided our analyses into two parts:
- In the first part, we try to predict the label if only free-form data is given as input. Here, we first pre-processed the data and then tried different models for prediction, then chose the best model for prediction and predicted the type of the outfit. This is to ensure that if free-form data is given to us, we don't end up finding a product with different type
- In the second part, we have coded the required results, i.e. outputting based on given input

## Part I

## Loading the data

In [4]:
# Loading the outfits data
outfits_df = pd.read_csv('outfit_combinations.csv', encoding = 'latin')

In [5]:
# looking at the top 5 observations
outfits_df.head(5)

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
1,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
2,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2S5T9W793F4CY41HE,accessory1,kate spade new york,medium margaux leather satchel
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump
4,01DMHCX50CFX5YNG99F3Y65GQW,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt


In [6]:
# looking at the outfit_item_type unique values
outfits_df.outfit_item_type.unique()

array(['bottom', 'top', 'accessory1', 'shoe', 'onepiece', 'accessory2',
       'accessory3'], dtype=object)

In [7]:
# changing accessories label to one label accessory
def change_outfit_type(x):
    '''
    This function converts similar labels into one for style category
    '''
    x = x.lower()
    if x == 'accessory1':
        return 'accessory'
    if x == 'accessory2':
        return 'accessory'
    if x == 'accessory3':
        return 'accessory'
    else:
        return x

# keeping values of similar labels into one value
outfits_df.outfit_item_type = outfits_df.outfit_item_type.apply(change_outfit_type)

# looking at the unique values
outfits_df.outfit_item_type.unique()

array(['bottom', 'top', 'accessory', 'shoe', 'onepiece'], dtype=object)

In [8]:
# checking for null values in the dataframe
outfits_df.isna().sum()

outfit_id            0
product_id           0
outfit_item_type     0
brand                0
product_full_name    0
dtype: int64

In [9]:
# retaining the unique product IDs in one dataframe
prod_IDs = pd.DataFrame(data = list(outfits_df.product_id.unique()), columns = ['product_id'])

In [10]:
len(prod_IDs)

804

In [11]:
# loading data with description
df_all = pd.read_csv("Full Data.csv")

In [12]:
# looking at the top 5 observations
df_all.head(5)

Unnamed: 0,product_id,brand,mpn,product_full_name,description,brand_category,created_at,updated_at,deleted_at,brand_canonical_url,details,labels,bc_product_id
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,514683.0,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,2019-11-11 22:37:15.719107+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,"A modern pump, in a rounded silhouette with an...","{""Needs Review""}",
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,526676.0,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,2019-11-11 22:36:50.682513+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,Dress it down with jeans and sneakers or dress...,"{""Needs Review""}",
2,01DSJX8GD4DSAP76SPR85HRCMN,Loewe,400100000000.0,52MM Padded Leather Round Sunglasses,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,2019-11-13 17:33:59.581661+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/loewe-52mm-pad...,100% UV protection\nCase and cleaning cloth in...,"{""Needs Review""}",
3,01DSJVKJNS6F4KQ1QM6YYK9AW2,Converse,400012000000.0,Baby's & Little Kid's All-Star Two-Tone Mid-To...,The iconic mid-top design gets an added dose o...,"JustKids/Shoes/Baby024Months/BabyGirl,JustKids...",2019-11-13 17:05:05.203733+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/converse-babys...,Canvas upper\nRound toe\nLace-up vamp\nSmartFO...,"{""Needs Review""}",
4,01DSK15ZD4D5A0QXA8NSD25YXE,Alexander McQueen,400011000000.0,64MM Rimless Sunglasses,Hexagonal shades offer a rimless view with int...,JewelryAccessories/SunglassesReaders/RoundOval,2019-11-13 18:42:30.941321+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/alexander-mcqu...,100% UV protection\nGradient lenses\nAdjustabl...,"{""Needs Review""}",


In [13]:
# keeping the columns relevant to the analyses
df_all = df_all[['product_id', 'brand', 'product_full_name', 'description', 'brand_category', 'details', 'labels']]

In [14]:
df_all.shape

(48979, 7)

In [15]:
# Checking NA values
df_all.isnull().sum()

product_id              0
brand                   0
product_full_name       0
description          7974
brand_category        238
details              9866
labels                  0
dtype: int64

In [16]:
# there are some rows which do not contain any description or details
# replace the null values with
df_all=df_all.fillna("Unknown")

In [17]:
# checking for duplicate product id entries
df_check = df_all.groupby("product_id").size().reset_index(name = 'count').sort_values('count', ascending = False)
#len(df_check[df_check['count']>1])
df_check.head(5)

Unnamed: 0,product_id,count
0,01DMBRYVA2P5H24WK0HTK4R0A1,2
41932,01DT51234VHAHGPTR89SZJ50V0,2
6064,01DPGTXH6QTM161M660N9W7C3S,2
6063,01DPGTXD3HEJ83GAWGBNB0PV92,2
42074,01DTJCE596G5WGANPMXNENAXFJ,2


In [18]:
# Dropping duplicate product ids
df_all = df_all.drop_duplicates(subset="product_id")

In [19]:
df_all.shape

(48072, 7)

In [20]:
# loading first file with tags
df_tagged = pd.read_excel("USC+Product+Attribute+Data+03302020.xlsx")

# loading second file with tags
df_tagged2 = pd.read_csv('usc_additional_tags.csv')

In [21]:
# combining both tag files in one dataframe
df_tag = pd.concat([df_tagged, df_tagged2])

In [22]:
df_tag.head(5)

Unnamed: 0,product_id,product_color_id,attribute_name,attribute_value
0,01DVBTBPHR8WJTCVEN5AJRHF47,01DVBTBPJ41VVT00JJCG8TTZ2W,gender,Women
1,01DVA7QRXM928ZM0WWR7HFNTC1,01DVA7QRXXR9F0TWVE1HMC5ZQ3,Primary Color,Blacks
2,01DPGV4YRP3Z8J85DASGZ1Y99W,01DPGVGBK6YGNYGNF2S6FSH02T,style,Casual
3,01E1JM43NQ3H17PB22EV3074NX,01E1JM5WFWWCCCH3JTTTCYQCEQ,style,Modern
4,01DSE8Z2ZDAZKZ2SKCS1E3B3HK,01DSE8ZG8Y3FR8KWE2TY1QDWBF,shoe_width,Medium


In [23]:
df_tag.attribute_name.unique()

array(['gender', 'Primary Color', 'style', 'shoe_width', 'length_top',
       'category', 'fit', 'occasion', 'subcategory_bottom',
       'sleeve_length', 'upper_material', 'subcategory_top',
       'strap_material', 'Additional Color', 'sizing', 'class_dress',
       'Pattern', 'class_pants_and_leggings', 'dry_clean_only',
       'length_one_piece', 'material_clothing', 'subcategory_accessory',
       'closure_blazers_coats_and_jackets', 'leg_style', 'toe_style',
       'neckline', 'class_sandals', 'length_skirts',
       'class_jumpsuit_and_romper', 'length_pants_and_leggings',
       'subcategory_sweater', 'closure_top',
       'class_blazers_coats_and_jackets', 'heel_height', 'class_handbags',
       'closure_handbag', 'class_pumps_and_heels', 'sheer',
       'class_skirts', 'material_purse', 'embellishment',
       'class_mules_and_slides', 'rise', 'subcategory_one_piece',
       'subcategory_blazers_coats_and_jackets', 'material',
       'subcategory_shoe', 'trend', 'class_flats'

In [24]:
# We manually searched for categories which are most relevant for our analyses
final_cat = ['category']

In [25]:
# https://stackoverflow.com/questions/37292872/how-can-i-one-hot-encode-in-python
# we then one hot encoded these categories so that we can keep the ones required
one_hot = pd.get_dummies(df_tag['attribute_name'])

In [26]:
# keeping the required category
one_hot = one_hot[final_cat]
one_hot.head(5)

Unnamed: 0,category
0,0
1,0
2,0
3,0
4,0


In [27]:
# creating a dataframe to store the one hot encoded values
df_tag_keep = df_tag.copy()

In [28]:
# merging the one hot encoded values with the dataframe containing values of product id
df_tag_keep = pd.concat([df_tag_keep, one_hot], axis=1)
df_tag_keep.head(5)

Unnamed: 0,product_id,product_color_id,attribute_name,attribute_value,category
0,01DVBTBPHR8WJTCVEN5AJRHF47,01DVBTBPJ41VVT00JJCG8TTZ2W,gender,Women,0
1,01DVA7QRXM928ZM0WWR7HFNTC1,01DVA7QRXXR9F0TWVE1HMC5ZQ3,Primary Color,Blacks,0
2,01DPGV4YRP3Z8J85DASGZ1Y99W,01DPGVGBK6YGNYGNF2S6FSH02T,style,Casual,0
3,01E1JM43NQ3H17PB22EV3074NX,01E1JM5WFWWCCCH3JTTTCYQCEQ,style,Modern,0
4,01DSE8Z2ZDAZKZ2SKCS1E3B3HK,01DSE8ZG8Y3FR8KWE2TY1QDWBF,shoe_width,Medium,0


In [29]:
# keeping the rows with style labels
df_cat_temp = df_tag_keep[df_tag_keep['category']>0].copy()

In [30]:
# observing the values of label for the category
df_cat_temp.attribute_value.unique()

array(['Accessory', 'Top', 'Bottom', 'Shoe', 'Blazers, Coats & Jackets',
       'Sweater', 'One Piece', 'Sweatshirt & Hoodie', 'top', 'shoe',
       'blazerscoatsjackets', 'onepiece', 'bottom', 'accessory',
       'sweater', 'sweatshirthoodie'], dtype=object)

In [31]:
# converting the labels into one similar label
def change_label_cat(x):
    '''
    This function converts similar labels into one for cat category
    '''
    x = x.lower()
    if x == 'onepiece':
        return 'one piece'
    else:
        return x

# keeping values of similar labels into one value
df_cat_temp.attribute_value = df_cat_temp.attribute_value.apply(change_label_cat)

# looking at the unique values
df_cat_temp.attribute_value.unique()

array(['accessory', 'top', 'bottom', 'shoe', 'blazers, coats & jackets',
       'sweater', 'one piece', 'sweatshirt & hoodie',
       'blazerscoatsjackets', 'sweatshirthoodie'], dtype=object)

In [32]:
# We then one hot encode the labels for each row
df_cat_temp = df_cat_temp[['product_id', 'attribute_name', 'attribute_value']]

# Remove duplicates
df_cat_temp = df_cat_temp.drop_duplicates(subset = ['product_id', 'attribute_name', 'attribute_value'], keep='first')

# one hot encoding
one_hot_cat = pd.get_dummies(df_cat_temp['attribute_value'])

# combining in one dataframe
df_cat_temp = pd.concat([df_cat_temp, one_hot_cat], axis=1)

In [33]:
# we perform operations to ensure there is only one row for one product id
# sum the rows to get one value per label per row
df_cat_sum = df_cat_temp.groupby('product_id').sum().reset_index()

# get all the values of label in one cell for each row
df_cat_concat = df_cat_temp.groupby(['product_id', 'attribute_name'])['attribute_value'].apply(lambda x: ', '.join(x)).reset_index()

In [34]:
# getting the final dataframe

# retaining the rows with the five labels we are interested in (shoe, accessory, bottom, one piece, top)
df_cat = pd.merge(df_cat_sum, df_cat_concat[['product_id', 'attribute_value']], on = 'product_id')
df_cat = df_cat[['product_id', 'accessory', 'bottom', 'one piece', 'top', 'shoe', 'attribute_value']]
df_cat = df_cat[(df_cat['accessory']>0)|(df_cat['bottom']>0)|(df_cat['top']>0)|(df_cat['one piece']>0)|(df_cat['shoe']>0)]
df_cat.head(5)

Unnamed: 0,product_id,accessory,bottom,one piece,top,shoe,attribute_value
0,01DMBRYVA2P5H24WK0HTK4R0A1,0,1,0,0,0,bottom
2,01DMBRYVA2Q2ST7MNYR6EEY4TK,0,0,1,0,0,one piece
3,01DMBRYVA2S5T9W793F4CY41HE,1,0,0,0,0,accessory
4,01DMBRYVA2ZFDYRYY5TRQZJTBD,0,0,0,0,1,shoe
5,01DMHCNT41E14QWP503V7CT9G6,1,0,0,0,0,accessory


In [35]:
# to check if there is any row with na value after the operations
df_cat.isna().sum()

product_id         0
accessory          0
bottom             0
one piece          0
top                0
shoe               0
attribute_value    0
dtype: int64

In [36]:
# getting the final dataframe for the category with text details from the dataframe containint all information
df_cat = pd.merge(df_cat, df_all, on = 'product_id', how = "left")

In [37]:
# checking if there are values of label more than 1
# since we are doing binary classification for each label, we would like to keep the values as 0 or 1
df_cat[['accessory', 'bottom', 'one piece', 'top', 'shoe']].max()

accessory    1
bottom       1
one piece    1
top          1
shoe         1
dtype: uint8

In [38]:
# looking at the first 5 observations
df_cat.head(5)

Unnamed: 0,product_id,accessory,bottom,one piece,top,shoe,attribute_value,brand,product_full_name,description,brand_category,details,labels
0,01DMBRYVA2P5H24WK0HTK4R0A1,0,1,0,0,0,bottom,Eileen Fisher,Slim Knit Skirt,A nice skirt,Apparel,Unknown,"{""Needs Attributes""}"
1,01DMBRYVA2Q2ST7MNYR6EEY4TK,0,0,1,0,0,one piece,Equipment,Chemelle Midi Dress,A nice dress,Apparel,Unknown,[{'value': 'Needs Attributes'}]
2,01DMBRYVA2S5T9W793F4CY41HE,1,0,0,0,0,accessory,kate spade new york,medium margaux leather satchel,A nice bag,Bags,Unknown,[{'value': 'Needs Attributes'}]
3,01DMBRYVA2ZFDYRYY5TRQZJTBD,0,0,0,0,1,shoe,Tory Burch,Penelope Mid Cap Toe Pump,A nice shoe,Shoes,Unknown,[]
4,01DMHCNT41E14QWP503V7CT9G6,1,0,0,0,0,accessory,Nina,Crystal Clutch,A nice clutch,Accessory,Unknown,[]


In [39]:
# Using a copy of the original dataframe in case of refresh is required
df_pre = df_cat.copy()

### Data pre-processing

In this step, we pre-processed the data, regex cleaning, stop words removal and lemmatization before we use the data for prediction

In [40]:
import unidecode
import string
#!pip install unidecode
#https://stackoverflow.com/questions/44431730/how-to-replace-accented-characters-in-python
    
def clean_punct(review_to_be_cleaned):
    '''
    Function which takes a list of sentences and cleans the data. Returns a cleaned list of sentences
    '''
    temp = review_to_be_cleaned

    #Normalize tabs and remove newlines
    temp = temp.replace('\t', ' ').replace('\n', ' ')
    
    # replacing the accented characters
    #temp = unicode(temp, "utf-8")
    temp = unidecode.unidecode(temp)
    
    # Replace punctuation with whitespace
    punc_syn = string.punctuation
    punc_syn = punc_syn.replace('"','')
    punc_syn = punc_syn.replace('-','')
    temp = re.sub(r'[{}]'.format(punc_syn), ' ', temp)
    
    # Single character removal 
    temp = re.sub(r"\s+[a-zA-Z]\s+", ' ', temp)
    
    #Remove leading whitespaces
    temp = temp.strip()

    #Normalize spaces to 1
    temp = re.sub(" +", " ", temp)

    #Normalize all characters to lowercase
    temp = temp.lower()
    
    return temp

In [41]:
# cleaning the required columns
list_cols = ['brand', 'product_full_name', 'description', 'brand_category', 'details']

for i in list_cols:
    df_pre[i] = df_pre[i].astype(str).apply(clean_punct)

In [42]:
# looking at the first 5 observations
df_pre.head(5)

Unnamed: 0,product_id,accessory,bottom,one piece,top,shoe,attribute_value,brand,product_full_name,description,brand_category,details,labels
0,01DMBRYVA2P5H24WK0HTK4R0A1,0,1,0,0,0,bottom,eileen fisher,slim knit skirt,a nice skirt,apparel,unknown,"{""Needs Attributes""}"
1,01DMBRYVA2Q2ST7MNYR6EEY4TK,0,0,1,0,0,one piece,equipment,chemelle midi dress,a nice dress,apparel,unknown,[{'value': 'Needs Attributes'}]
2,01DMBRYVA2S5T9W793F4CY41HE,1,0,0,0,0,accessory,kate spade new york,medium margaux leather satchel,a nice bag,bags,unknown,[{'value': 'Needs Attributes'}]
3,01DMBRYVA2ZFDYRYY5TRQZJTBD,0,0,0,0,1,shoe,tory burch,penelope mid cap toe pump,a nice shoe,shoes,unknown,[]
4,01DMHCNT41E14QWP503V7CT9G6,1,0,0,0,0,accessory,nina,crystal clutch,a nice clutch,accessory,unknown,[]


In [43]:
# using spacy library to remove the stop words
for i in tqdm(list_cols):
    df_pre[i] = list(
    map(lambda doc: " ".join([token.text for token in nlp(doc) if not token.is_stop]), list(df_pre[i])))

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:30<00:00, 18.07s/it]


In [44]:
from nltk.stem import WordNetLemmatizer
import nltk
# Using nltk library to perform lemmatization
def lemm(list_to_process):
    '''
    This function returns the list of reviews after lemmatization
    '''
    lemmatizer = WordNetLemmatizer()
    sentences = []
    
    for i in list_to_process:
        tokens = nltk.word_tokenize(i)
        words = []
        for word in tokens:
            words.append(lemmatizer.lemmatize(word))
        sentence = " ".join(words)
        sentences.append(sentence)
    return sentences

In [45]:
# performing lemmatization on the required columns
for i in list_cols:
    df_pre[i] = lemm(df_pre[i])

In [46]:
# combining all text in one column
df_pre['combined'] =  df_pre.brand+' '+df_pre.product_full_name+' '+df_pre.description+' '+df_pre.brand_category+' '+df_pre.details

In [47]:
df_pre.head(5)

Unnamed: 0,product_id,accessory,bottom,one piece,top,shoe,attribute_value,brand,product_full_name,description,brand_category,details,labels,combined
0,01DMBRYVA2P5H24WK0HTK4R0A1,0,1,0,0,0,bottom,eileen fisher,slim knit skirt,nice skirt,apparel,unknown,"{""Needs Attributes""}",eileen fisher slim knit skirt nice skirt appar...
1,01DMBRYVA2Q2ST7MNYR6EEY4TK,0,0,1,0,0,one piece,equipment,chemelle midi dress,nice dress,apparel,unknown,[{'value': 'Needs Attributes'}],equipment chemelle midi dress nice dress appar...
2,01DMBRYVA2S5T9W793F4CY41HE,1,0,0,0,0,accessory,kate spade new york,medium margaux leather satchel,nice bag,bag,unknown,[{'value': 'Needs Attributes'}],kate spade new york medium margaux leather sat...
3,01DMBRYVA2ZFDYRYY5TRQZJTBD,0,0,0,0,1,shoe,tory burch,penelope mid cap toe pump,nice shoe,shoe,unknown,[],tory burch penelope mid cap toe pump nice shoe...
4,01DMHCNT41E14QWP503V7CT9G6,1,0,0,0,0,accessory,nina,crystal clutch,nice clutch,accessory,unknown,[],nina crystal clutch nice clutch accessory unknown


In [48]:
# retaining the required columns
df_pre = df_pre[['product_id', 'accessory', 'bottom', 'one piece', 'top', 'shoe', 'attribute_value', 'combined']]

In [49]:
# saving the dataframe into a new dataframe
df_category = df_pre.copy()

In [50]:
# Saving dataframe in a csv file
df_category.to_csv('01 category.csv', index = False)

### Model Selection

In this part we define several vectorizer methods and models. We divide the current data into training and test data set in the proportion of 0.75-0.25. We evaluate our models on the test data with 50% accuracy as the baseline accuracy.

#### Using count vectorization, tfidf vectorization with logistic regression

In [51]:
def count_(docs):
    '''
    This returns count vectorized vectors of the docs using unigrams
    '''
    
    # use English stopwords, and use one-hot encoding, and the word must appear in at least two of the movie plots
    vectorizer = CountVectorizer(ngram_range=(1,1), stop_words="english", binary=True, min_df=10, max_df=5000) 
    vectorizer = vectorizer.fit(docs)
    X = vectorizer.transform(docs)

    vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    return vectorized_df, vectorizer

def count_2gram(docs):
    '''
    This returns count vectorized vectors of the docs using bigrams
    '''
    vectorizer = CountVectorizer(ngram_range=(2,2), stop_words="english", binary=True, min_df=10, max_df=5000)
    vectorizer = vectorizer.fit(docs)
    X = vectorizer.transform(docs)

    vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    return vectorized_df, vectorizer

def count_3gram(docs):
    '''
    This returns count vectorized vectors of the docs using trigrams
    '''
    vectorizer = CountVectorizer(ngram_range=(3,3), stop_words="english", binary=True, min_df=10, max_df=5000)
    vectorizer = vectorizer.fit(docs)
    X = vectorizer.transform(docs)
    
    vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    return vectorized_df, vectorizer

def tfidf(docs):
    '''
    This returns tfidf vectorized vectors of the docs using unigrams
    '''
    vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words="english", max_df=0.75)
    vectorizer = vectorizer.fit(docs)
    X = vectorizer.transform(docs)

    vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    return vectorized_df, vectorizer

def tfidf_2gram(docs):
    '''
    This returns tfidf vectorized vectors of the docs using bigrams
    '''
    vectorizer = TfidfVectorizer(ngram_range=(2,2), stop_words="english", max_df=0.75)
    vectorizer = vectorizer.fit(docs)
    X = vectorizer.transform(docs)

    vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    return vectorized_df, vectorizer

def tfidf_3gram(docs):
    '''
    This returns tfidf vectorized vectors of the docs using trigrams
    '''
    vectorizer = TfidfVectorizer(ngram_range=(3,3), stop_words="english", max_df=0.75)
    vectorizer = vectorizer.fit(docs)
    X = vectorizer.transform(docs)

    vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    
    return vectorized_df, vectorizer

In [52]:
# Model
def log_reg(X_train, Y_train, X_test, Y_test):
    '''
    This function returns the accuracy of the model with the given inputs
    '''
    lr = LogisticRegression()
    lr.fit(X_train, Y_train)
    
    y_pred = lr.predict(X_test)
    
    return round(np.mean(y_pred == Y_test),2)

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [54]:
# creating list to store values of different parameters
category_list = []
label_list = []
vectorizer_list = []
logreg_accuracy = []

i = 'category'
df = df_category.copy()

# getting the labels for each category   
labels = list(df.iloc[:,1:-2].columns)

for j in tqdm_notebook(labels):

    # looping through each vectorization methods defined above in the functions
    for k in ['c_1ng', 'c_2ng', 'c_3ng', 'tfidf_1ng', 'tfidf_2ng', 'tfidf_3ng']:

        # storing values of category and labels
        category_list.append(i)
        label_list.append(j)

        # splitting the data into training and testing
        X_train, X_test, y_train, y_test = train_test_split(df['combined'], df[j], test_size=0.25, random_state=42)

        # for each vectorization method, get the vectorized training data and the vectorizer
        if k=="c_1ng":
            train_df, vectorizer = count_(X_train)
            vectorizer_list.append("count_1gram")

        if k=="c_2ng":
            train_df, vectorizer = count_2gram(X_train)
            vectorizer_list.append("count_2gram")

        if k=="c_3ng":
            train_df, vectorizer = count_3gram(X_train)
            vectorizer_list.append("count_3gram")

        if k=="tfidf_1ng":
            train_df, vectorizer = tfidf(X_train)
            vectorizer_list.append("tfid_1gram")

        if k=="tfidf_2ng":
            train_df, vectorizer = tfidf_2gram(X_train)
            vectorizer_list.append("tfidf_2gram")

        if k=="tfidf_3ng":
            train_df, vectorizer = tfidf_3gram(X_train)
            vectorizer_list.append("tfidf_3gram")

        # converting the test data into vector using the vectorizer obtained above
        test_vector = vectorizer.transform(X_test)
        test_df = pd.DataFrame(test_vector.toarray(), columns=vectorizer.get_feature_names())

        # getting the accuracy of the model and storing in a list
        accuracy = log_reg(train_df, y_train, test_df, y_test)
        logreg_accuracy.append(accuracy)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [55]:
# storing the performance of each approach in a dataframe
results_logreg = pd.DataFrame({"category":category_list, "label":label_list, "vectorizer":vectorizer_list,
                               "LogReg_Accuracy":logreg_accuracy})

In [56]:
# looking at 10 observations
results_logreg.head(10)

Unnamed: 0,category,label,vectorizer,LogReg_Accuracy
0,category,accessory,count_1gram,1.0
1,category,accessory,count_2gram,0.98
2,category,accessory,count_3gram,0.96
3,category,accessory,tfid_1gram,0.98
4,category,accessory,tfidf_2gram,0.95
5,category,accessory,tfidf_3gram,0.93
6,category,bottom,count_1gram,0.99
7,category,bottom,count_2gram,0.96
8,category,bottom,count_3gram,0.89
9,category,bottom,tfid_1gram,0.98


In [57]:
# saving the results in a file
results_logreg.to_csv('01 Results_logreg.csv', index = False)

In [58]:
# performing basic operations to get the model with the highest accuracy

results_logreg = results_logreg.sort_values('LogReg_Accuracy', ascending=False).drop_duplicates(['category','label'])
results_logreg = results_logreg.reset_index()
results_logreg = results_logreg.drop(columns = ['index'])
results_logreg = results_logreg.sort_values(['category', 'label'], ascending=False)
results_logreg.head(5)

Unnamed: 0,category,label,vectorizer,LogReg_Accuracy
4,category,top,count_1gram,0.98
1,category,shoe,tfid_1gram,1.0
2,category,one piece,count_1gram,0.99
3,category,bottom,count_1gram,0.99
0,category,accessory,count_1gram,1.0


#### Using Word Embeddings with Keras

In [59]:
from keras.layers import Flatten, Masking
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [60]:
def get_max_token_length_per_doc(docs: List[List[str]])-> int:
    '''
    This function returns the maximum length of a document within a corpus
    '''
    return max(list(map(lambda x: len(x.split()), docs)))

In [61]:
def integer_encode_documents(docs, tokenizer):
    '''
    This function returns the integer encodings of the docs with the given tokenizer (keras here)
    '''
    return tokenizer.texts_to_sequences(docs)

In [62]:
def keras_m(X_train, y_train, X_test, y_test, vocab_size, EMBEDDING_SIZE, max_length):
    '''
    This function returns accuracy of the keras model with the given parameters
    '''
    # define the model
    model = Sequential()
    model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=max_length))
    model.add(Flatten())
    
    # since we are doing binary classification, activation function is sigmoid
    model.add(Dense(1, activation='sigmoid')) 

    # compiling the model, fitting on training data and returning accuracy
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    model.fit(X_train, y_train, epochs=20, verbose=0)
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    
    return round(accuracy,2)

In [63]:
# creating list to store values of different parameters
category_list = []
label_list = []
vectorizer_list = []
keras_accuracy = []

i = 'category'
df = df_category.copy()
# getting the labels for each category
labels = list(df.iloc[:,1:-2].columns)

for j in tqdm_notebook(labels):

    # getting the word embeddings of the corpus

    # Using keras tokenizer and fitting on the category corpus
    tokenizer = Tokenizer(num_words=5000, oov_token="UNKNOWN_TOKEN")
    tokenizer.fit_on_texts(df['combined'])

    # getting the maximum length of the docs within the corpus
    max_length = get_max_token_length_per_doc(df['combined'])

    # integer encode the training data
    encoded_docs = integer_encode_documents(df['combined'], tokenizer)
    # pad the documents
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # get vocab size
    vocab_size = int(len(tokenizer.word_index) + 1)

    # splitting data into training and testing data
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, df[j], test_size=0.25, random_state=42)


    EMBEDDING_SIZE = 50

    # getting the accuracy of keras model with word embeddings
    accuracy = keras_m(X_train, y_train, X_test, y_test, vocab_size, EMBEDDING_SIZE, max_length)

    # Storing values in a list
    keras_accuracy.append(accuracy)
    category_list.append(i)
    label_list.append(j)
    vectorizer_list.append("keras-word embed")

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [64]:
# storing results in a dataframe
results_keras_w = pd.DataFrame({"category":category_list, "label":label_list, "vectorizer":vectorizer_list,
                               "Keras_Accuracy":keras_accuracy})

In [65]:
# first 10 observations
results_keras_w.head(10)

Unnamed: 0,category,label,vectorizer,Keras_Accuracy
0,category,accessory,keras-word embed,1.0
1,category,bottom,keras-word embed,1.0
2,category,one piece,keras-word embed,0.99
3,category,top,keras-word embed,0.99
4,category,shoe,keras-word embed,1.0


In [66]:
# storing in a csv file
results_keras_w.to_csv('01 Results_keras_w.csv', index = False)

#### Using Pre_Trained Embeddings with Keras

In [67]:
from numpy import array, argmax, asarray, zeros

In [68]:
# getting the embedings
embeddings_index = dict()
f = open('glove.6B.100d.txt', encoding = 'utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [69]:
def keras_gl(X_train, y_train, X_test, y_test, vocab_size, embedding_matrix, max_length, padded_docs):
    
    # define model
    model = Sequential()
    e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)
    model.add(e)
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    
    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    # fit the model
    model.fit(X_train, y_train, epochs=20, verbose=0)
    
    # evaluate the model
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    
    # return accuracy of the model
    return round(accuracy,2)

In [70]:
# list to contain various values of the parameters
category_list = []
label_list = []
vectorizer_list = []
keras_glove_accuracy = []

i = 'category'
df = df_category.copy()
# getting labels for each category
labels = list(df.iloc[:,1:-2].columns)

for j in tqdm_notebook(labels):

    # getting the word embedings
    # similar to what we did before

    tokenizer = Tokenizer(num_words=5000, oov_token="UNKNOWN_TOKEN")
    tokenizer.fit_on_texts(df['combined'])

    max_length = get_max_token_length_per_doc(df['combined'])

    # integer encode the training data
    encoded_docs = integer_encode_documents(df['combined'], tokenizer)
    # pad the documents
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # get vocab size
    vocab_size = int(len(tokenizer.word_index) + 1)

    # create a weight matrix for words in training docs
    embedding_matrix = zeros((vocab_size, 100))
    for word, p in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: # check that it is an actual word that we have embeddings for
            embedding_matrix[p] = embedding_vector

    # splitting into training and test data
    X_train, X_test, y_train, y_test = train_test_split(padded_docs, df[j], test_size=0.25, random_state=42)

    # fixed size for glove approach
    EMBEDDING_SIZE = 100

    accuracy = keras_gl(X_train, y_train, X_test, y_test, vocab_size, embedding_matrix, max_length, padded_docs)

    # Storing values in a list
    keras_glove_accuracy.append(accuracy)
    category_list.append(i)
    label_list.append(j)
    vectorizer_list.append("keras-glove")

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [71]:
# storing values in a dataframe
results_keras_gl = pd.DataFrame({"category":category_list, "label":label_list, "vectorizer":vectorizer_list,
                               "KerasGlove_Accuracy":keras_glove_accuracy})

In [72]:
# first 5 observations
results_keras_gl.head(5)

Unnamed: 0,category,label,vectorizer,KerasGlove_Accuracy
0,category,accessory,keras-glove,0.98
1,category,bottom,keras-glove,0.94
2,category,one piece,keras-glove,0.93
3,category,top,keras-glove,0.89
4,category,shoe,keras-glove,0.99


In [73]:
# storing results in a file
results_keras_gl.to_csv('04 Results_keras_gl.csv', index = False)

### Getting the best model for each label

In [74]:
# performing basic operations to combine all the results
results_logreg['model'] = 'LogReg'
results_logreg = results_logreg.rename(columns={'LogReg_Accuracy':'Accuracy'})

results_logreg.head(5)

Unnamed: 0,category,label,vectorizer,Accuracy,model
4,category,top,count_1gram,0.98,LogReg
1,category,shoe,tfid_1gram,1.0,LogReg
2,category,one piece,count_1gram,0.99,LogReg
3,category,bottom,count_1gram,0.99,LogReg
0,category,accessory,count_1gram,1.0,LogReg


In [75]:
# performing basic operations to combine all the results
results_keras_gl['model'] = 'Keras-Glove'
results_keras_gl = results_keras_gl.rename(columns={'KerasGlove_Accuracy':'Accuracy'})

results_keras_gl.head(5)

Unnamed: 0,category,label,vectorizer,Accuracy,model
0,category,accessory,keras-glove,0.98,Keras-Glove
1,category,bottom,keras-glove,0.94,Keras-Glove
2,category,one piece,keras-glove,0.93,Keras-Glove
3,category,top,keras-glove,0.89,Keras-Glove
4,category,shoe,keras-glove,0.99,Keras-Glove


We can observe that keras-glove approach didn't give higher accuracies, hence we retained count-tfidf approach here.

In [76]:
# performing basic operations to combine all the results
results_keras_w['model'] = 'Keras-Word'
results_keras_w = results_keras_w.rename(columns={'Keras_Accuracy':'Accuracy'})

results_keras_w.head(5)

Unnamed: 0,category,label,vectorizer,Accuracy,model
0,category,accessory,keras-word embed,1.0,Keras-Word
1,category,bottom,keras-word embed,1.0,Keras-Word
2,category,one piece,keras-word embed,0.99,Keras-Word
3,category,top,keras-word embed,0.99,Keras-Word
4,category,shoe,keras-word embed,1.0,Keras-Word


We can observe that word embeding with keras gave us the best accuracies on the test set, hence we moved ahead with using this approach.

In [77]:
results_fin = results_keras_w.copy()

In [78]:
# storing value in a csv file
results_fin.to_csv('01 Final Results.csv', index = False)

### Predictions using the best models

In [79]:
def keras_m_fin(X_train, y_train, vocab_size, EMBEDDING_SIZE, max_length):
    '''
    returns the neural network model by fitting on the given data using the given parameters
    '''
    # define the model
    model = Sequential()
    model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=max_length))
    model.add(Flatten())
    
    # since we are doing binary classification, activation function is sigmoid
    model.add(Dense(1, activation='sigmoid')) 
    
    # compiling and fiting the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    
    model.fit(X_train, y_train, epochs=20, verbose=0)
    
    return model

In [80]:
# creating lists to store values
category_list = []
label_list = []
models_list = []

i = 'category'
df = df_category.copy()
        
# getting all the labels for the category
labels = list(df.iloc[:,1:-2].columns)

# keras tokenizer
tokenizer = Tokenizer(num_words=5000, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(df['combined'])

# maximmum length of a document within the corpus
max_length = get_max_token_length_per_doc(df['combined'])

# integer encode the training data
encoded_docs = integer_encode_documents(df['combined'], tokenizer)
# pad the documents
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# get vocab size
vocab_size = int(len(tokenizer.word_index) + 1)

for j in tqdm_notebook(labels):

    # run for each label
    # storing the values of category and label
    category_list.append(i)
    label_list.append(j)

    # using the whole corpus as the training data
    X_train = df['combined'].copy()
    y_train = df[j].copy()

    EMBEDDING_SIZE = 50
    model = keras_m_fin(padded_docs, y_train, vocab_size, EMBEDDING_SIZE, max_length)
    models_list.append(model)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [81]:
# storing values in a dataframe
mod_vec = pd.DataFrame({"category":category_list, "label":label_list, "model":models_list})

In [82]:
# all observations
mod_vec.head(5)

Unnamed: 0,category,label,model
0,category,accessory,<keras.engine.sequential.Sequential object at ...
1,category,bottom,<keras.engine.sequential.Sequential object at ...
2,category,one piece,<keras.engine.sequential.Sequential object at ...
3,category,top,<keras.engine.sequential.Sequential object at ...
4,category,shoe,<keras.engine.sequential.Sequential object at ...


In [83]:
def predicted_labels(test_df2):
    '''
    This function returns the probabilities of each label with the best combination of vectorizer and model 
    we got from the our analyses
    '''

    category_list = []
    label_list = []
    vectorizer_list = []
    pred_val_list = []

    i = 'category'
    df = df_category.copy()
    
    labels = list(df.iloc[:,1:-2].columns)

    y_test = test_df2['testdoc'].copy()

    for j in tqdm_notebook(labels):

        # keras tokenizer
        tokenizer = Tokenizer(num_words=5000, oov_token="UNKNOWN_TOKEN")
        tokenizer.fit_on_texts(df['combined'])

        max_length = get_max_token_length_per_doc(df['combined'])

        # integer encode the training data
        encoded_docs = integer_encode_documents(y_test, tokenizer)
        # pad the documents
        padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

        model = mod_vec.loc[(mod_vec.category==i)&(mod_vec.label==j),'model'].values[0]

        predi = model.predict(padded_docs)[0]
        y_pred = predi[0]

        category_list.append(i)
        label_list.append(j)
        pred_val_list.append(y_pred)

    pred_labels = pd.DataFrame({"category":category_list, "label":label_list, "pred_val":pred_val_list})

    return pred_labels

In [84]:
def best_labels(df_pred_labels):
    '''
    This function returns the label with highest prediction value
    '''
    pred_label_sel = df_pred_labels[df_pred_labels['pred_val']>0.5].copy()
    if len(pred_label_sel)==0:
        label = ''
        return label
    pred_label_sel = pred_label_sel.sort_values('pred_val', ascending=False)
    pred_label_sel = pred_label_sel.reset_index()
    label = pred_label_sel['label'].iloc[0]
    return label

In [85]:
def test_doc_clean(test_doc):
    '''
    returns a cleaned test document
    '''
    # cleaning the test document
    test_doc['testdoc'] = test_doc['testdoc'].astype(str).apply(clean_punct)

    test_doc['testdoc'] = list(
        map(lambda doc: " ".join([token.text for token in nlp(doc) if not token.is_stop]), list(test_doc['testdoc'])))

    test_doc['testdoc'] = lemm(test_doc['testdoc'])
    
    return test_doc

## Part II: Output

In [86]:
def step1_match_productID(prod_to_match, prod_df):
    '''
    This function checks if the entered product ID is already present in our database, 
    else returns top 5 productIDs closest to the entered product ID using fuzzy library
    Input:
    - prod_to_match: productID to match
    - prod_df: dataframe containing unique products
    Output:
    - prints string based on match success
    - returns a flag to indicate match was successfully found or not
    '''
    prod_to_match = productID
    bests = process.extractBests(prod_to_match, list(prod_df.product_id.unique()), scorer=fuzz.ratio)
    if bests[0][1] == 100:
        print(f'Product with matching ID found!')
        flag = 1
        return flag
    else:
        print(f'We were not able to find the product you were looking for. However, we suggest the following product IDs:')
        for i in range(len(bests)):
            print(f'{bests[i][0]}')
        flag = 0
        return flag

In [87]:
def step2_retproddet(flag, prod_df, prod_to_match):
    '''
    This function checks if the product ID is present in the dataframe. If yes, return the outfit combination.
    Else, show 5 nearest product IDs
    
    '''
      
    if flag == 1:
        
        match_df = prod_df.loc[prod_df['product_id']==prod_to_match].reset_index()
        
        if len(match_df)==0:
            "Unfortunately, we do not have any outfit recommendations for this product. We apologize for the inconvenience."
        
        for i in list(match_df.outfit_id.unique()):
            outfit_rec_df = prod_df.loc[prod_df['outfit_id']==i].reset_index()
            #print(outfit_rec_df)
            break
        
        outfit_rec_df = outfit_rec_df.sort_values('outfit_item_type')
        
        print("We would like to recommend the following outfit combination:")
        
        for i in range(len(outfit_rec_df)):
            outfit_item_type_temp = outfit_rec_df.loc[i,'outfit_item_type'] 
            product_full_name_temp = outfit_rec_df.loc[i,'product_full_name']
            product_id_temp = outfit_rec_df.loc[i,'product_id'] 
            print(f'{outfit_item_type_temp}: {product_full_name_temp} ({product_id_temp})')

### Example A

In [88]:
# Use the below cell if you have the product ID

In [89]:
productID = input("Please enter a product ID:")
prod_to_match = productID

# store the outfits_df in a dataframe
prod_df = outfits_df.copy()

# return 1 if there is a matching product ID else recommend five closes product IDs
flag = step1_match_productID(prod_to_match, prod_df)

# get the recommended outfit
step2_retproddet(flag, prod_df, prod_to_match)

Please enter a product ID:01DMBRYVA2ZFDYRYY5TRQZJTBD
Product with matching ID found!
We would like to recommend the following outfit combination:
bottom: Slim Knit Skirt (01DMBRYVA2P5H24WK0HTK4R0A1)
top: Rib Mock Neck Tank (01DMBRYVA2PEPWFTT7RMP5AA1T)
accessory: medium margaux leather satchel (01DMBRYVA2S5T9W793F4CY41HE)
shoe: Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)


### Example B

In [90]:
def get_label(test_brand, test_description, test_brand_category, test_details):
    '''
    This function returns the best predicted label, combined and cleans the given inputs
    '''
    # combine the inputs into one string
    test_doc = test_brand +" " + test_description + " " + test_brand_category +\
            " " + test_details
    
    # save it in a dataframe to ease further processes
    test_list = ['temp']
    test_list[0] = test_doc
    test_df2 = pd.DataFrame(data = test_list, columns = ["testdoc"])
    
    # cleaning the document
    test_df2 = test_doc_clean(test_df2)
    
    # getting the label predicted
    pred_label = predicted_labels(test_df2)
    
    # getting the label with highest prediction value
    label = best_labels(pred_label)
    #print(label)
    return label, test_df2.iloc[0].values[0]

def subset_df(label, df_category):
    '''
    This function looks at the subset of labels if there is a prediction of the label
    '''
    
    # use the whole dataframe if no label predicted
    if label == '':
        sub_df = df_category.copy()
        #print(sub_df.head(5))
        return df_category
    
    # filter on the required label if a label is predicted
    else:
        sub_df = df_category[df_category[label]>0].reset_index()
        #print(sub_df.head(5))
        return sub_df
    
def prod_ID(sub_df, test_doc):
    '''
    This function applies cosine similarity after using nlp word2vec model and returns the product ID with the highest similarity
    '''
    
    # calculate similarity score
    for i in range(len(sub_df)):
        orig = nlp(sub_df.loc[i,'combined'])
        test = nlp(test_doc)
        sub_df.loc[i,'Score'] =test.similarity(orig)
    
    # sort by score and get the product ID with the highest value
    sub_df = sub_df.sort_values('Score', ascending = False).reset_index()
    #print(sub_df)
    product_ID = sub_df['product_id'].iloc[0]
    
    return product_ID

In [92]:
print("Please enter the following parameters:")
test_brand = input("Brand:")
test_brand_category = input("Brand Category:")
test_details = input("Details:")
test_description = input("Description:")

# Printing the output

# get the predicted label and cleaned, combined test document
label, test_doc = get_label(test_brand, test_description, test_brand_category, test_details)

# get a subset of category dataframe if there is a predicted label
sub_df = subset_df(label, df_category)

# get the product ID with the highest similarity to given input
prod_to_match = prod_ID(sub_df, test_doc)
print(f'Recommended Product ID: {prod_to_match}')

# Return the matching outfit
prod_df = outfits_df.copy()

# return 1 if there is a matching product ID else recommend five closes product IDs
flag = step1_match_productID(prod_to_match, prod_df)

# get the recommended outfit
step2_retproddet(flag, prod_df, prod_to_match)

Please enter the following parameters:
Brand:Sexy silky, a-line mini skirt zipper Benson skirt
Brand Category:
Details:
Description:


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Recommended Product ID: 01DPKMGJ33SDFXM7XHGPQJWQ12
Product with matching ID found!
We would like to recommend the following outfit combination:
shoe: Pointed-toe flats in suede (01DPCRZWX4S2Z8Q5HYDFM4HNEG)
top: Ashlynn Blouse (01DPET2NWSA221STZF740BZ9SW)
bottom: Benson Skirt (01DPKMGJ33SDFXM7XHGPQJWQ12)
