# Overview

This notebook contains the code I wrote for the DSO 560 Text Analytics & NLP final project, which recommends outfits for women's clothing based on user input. The client is ThreadTogether, an Australian non-profit organization.
- Part I focuses on outfit recommendation with product ID input 
- Part II generates outfit recommendation based on text input of brand, brand category, details, or description

Created on: 5.9.2020

Created by: Xinyi (Alex) Guo

In [1]:
import pandas as pd 
import numpy as np

from fuzzywuzzy import fuzz

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

from scipy.spatial.distance import cosine

### 1. Product ID input

#### 1.1 Load outfit data

In [2]:
# Load the outfit table; later cells read its product_id, outfit_id,
# outfit_item_type and product_full_name columns.
outfit_df = pd.read_csv("outfit_combinations.csv")

In [151]:
# outfit_df.isnull().sum()

In [150]:
# outfit_df.head()

#### 1.2 Correct product ID input

In [6]:
# Example product ID used to demonstrate the lookup (the shoe in the recorded output below).
inputProductId = '01DMBRYVA2ZFDYRYY5TRQZJTBD'

In [7]:
def recommendOutfit(outfit_df, inputProductId):
    """Print every item of the first outfit that contains ``inputProductId``.

    Parameters
    ----------
    outfit_df : pandas.DataFrame
        Outfit table with the columns ``product_id``, ``outfit_id``,
        ``outfit_item_type`` and ``product_full_name``.
    inputProductId : str
        Product ID to look up.

    Raises
    ------
    IndexError
        If ``inputProductId`` does not appear in ``outfit_df``. The type matches
        what the bare ``[0]`` lookup used to raise, but the message now names
        the offending ID.
    """
    #after matching the product_id, use the first outfit_id for outfit recommendation
    matchingOutfitIds = outfit_df.loc[outfit_df.product_id == inputProductId, 'outfit_id']
    if matchingOutfitIds.empty:
        raise IndexError(f"product_id {inputProductId!r} does not appear in any outfit")
    targetOutfitId = matchingOutfitIds.iloc[0]
    targetOutfit_df = outfit_df[outfit_df.outfit_id == targetOutfitId]
    print("Recommended Outfit Combination: \n")
    #walk the rows directly so two items of the same type each print their own name and ID
    outfitRows = zip(targetOutfit_df.outfit_item_type,
                     targetOutfit_df.product_full_name,
                     targetOutfit_df.product_id)
    for itemType, fullName, productId in outfitRows:
        print(f'\t-{itemType}: {fullName} ({productId})')

In [8]:
# Print the outfit that contains the example product.
recommendOutfit(outfit_df, inputProductId)

Recommended Outfit Combination: 

	-bottom: Slim Knit Skirt (01DMBRYVA2P5H24WK0HTK4R0A1)
	-top: Rib Mock Neck Tank (01DMBRYVA2PEPWFTT7RMP5AA1T)
	-accessory1: medium margaux leather satchel (01DMBRYVA2S5T9W793F4CY41HE)
	-shoe: Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)


#### 1.3 Handle mistyped product ID input

In [10]:
# Same ID as inputProductId with two characters altered, to simulate a typo.
typoProductId = '01DMBRYVA2Z3DYRYY5TRQZJTB5'

In [11]:
# Similarity between the valid ID and the mistyped one (92 in the recorded output).
fuzz.ratio(inputProductId, typoProductId)

92

In [12]:
def fuzzRatio(df):
    """Score how closely a row's ``typoProductId`` matches its ``product_id``.

    ``df`` is a single row (or any mapping) holding both keys; the result is
    whatever ``fuzz.ratio`` returns for that pair.
    """
    return fuzz.ratio(df['typoProductId'], df['product_id'])

In [13]:
def getCorrectProductId(outfit_df, typoProductId):
    """Return the ``product_id`` in ``outfit_df`` most similar to ``typoProductId``.

    Similarity is ``fuzz.ratio(typoProductId, product_id)``; on ties the first
    row wins. The scores are held in a temporary Series, so no helper columns
    are added to the caller's DataFrame.

    Parameters
    ----------
    outfit_df : pandas.DataFrame
        Table with a ``product_id`` column of candidate IDs.
    typoProductId : str
        The possibly mistyped product ID.

    Raises
    ------
    ValueError
        If ``outfit_df`` has no rows to compare against.
    """
    candidateIds = outfit_df['product_id']
    if candidateIds.empty:
        raise ValueError("outfit_df contains no product IDs to match against")
    scores = candidateIds.map(lambda productId: fuzz.ratio(typoProductId, productId))
    #find the most similar product_id as the correct product_id (argmax keeps the first on ties)
    correctProductId = candidateIds.iloc[scores.to_numpy().argmax()]
    return correctProductId

In [14]:
# Resolve the mistyped ID to its closest match in the outfit table, then recommend as usual.
correctProductId = getCorrectProductId(outfit_df, typoProductId)
recommendOutfit(outfit_df, correctProductId)

Recommended Outfit Combination: 

	-bottom: Slim Knit Skirt (01DMBRYVA2P5H24WK0HTK4R0A1)
	-top: Rib Mock Neck Tank (01DMBRYVA2PEPWFTT7RMP5AA1T)
	-accessory1: medium margaux leather satchel (01DMBRYVA2S5T9W793F4CY41HE)
	-shoe: Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)


### 2. Free form text input (brand, brand category, details, and description)

#### 2.1 Load and clean full Data

In [15]:
# Load the full product table that the free-form text search runs against.
fullData = pd.read_csv("full_data_final version.csv")

In [16]:
# Row/column count of the raw file, before de-duplication.
fullData.shape

(48979, 13)

In [17]:
# Keep the first row for each product_id. Rebinding the name instead of using
# inplace=True leaves the frame read from disk untouched.
fullData = fullData.drop_duplicates('product_id')

In [18]:
# Keep only the identifier, the display name and the four text fields used for matching.
fullData = fullData[['product_id', 'brand', 'product_full_name',
                     'description', 'brand_category', 'details']]

In [19]:
# Shape after dropping duplicate product IDs and narrowing to six columns.
fullData.shape

(48072, 6)

In [20]:
# Missing values per column; these are filled during preprocessing.
fullData.isnull().sum()

product_id              0
brand                   0
product_full_name       0
description          7917
brand_category        238
details              9615
dtype: int64

In [21]:
#Define function to remove punctuation
import string 
def removePunctuation(text, punctuations=string.punctuation+"``"+"’"+"”"):
    """Drop tokens that consist solely of punctuation characters.

    ``text`` is tokenized with ``nltk.word_tokenize``; a token is removed when
    every one of its characters occurs in ``punctuations``. The remaining
    tokens are joined with single spaces. Tokens that merely contain
    punctuation (for example ``lace-up``) are kept.

    ``punctuations`` is treated as a set of characters. A plain ``token in
    punctuations`` test on the string would be a substring match, which lets
    multi-character tokens such as ``''``, ``...`` or ``--`` slip through.
    """
    punctuationChars = set(punctuations)
    words = nltk.word_tokenize(text)
    newWords = [word for word in words
                if not all(char in punctuationChars for char in word)]
    cleanedText = " ".join(newWords)
    return cleanedText

In [22]:
# The NLTK stopword list is stored under the lowercase file id "english"; a
# capitalised id only resolves on case-insensitive filesystems.
nltk_stopwords = set(stopwords.words("english"))

In [23]:
#Define function to remove stopwords
def removeStopwords(text, stopwords=nltk_stopwords):
    """Tokenize ``text`` and return it without stopwords.

    A token is dropped when its lower-cased form is in ``stopwords``; the
    surviving tokens keep their original casing and are joined with single
    spaces.
    """
    keptTokens = []
    for token in nltk.word_tokenize(text):
        if token.lower() in stopwords:
            continue
        keptTokens.append(token)
    return " ".join(keptTokens)

In [24]:
#Define function for lemmatization
def lemmatize(text):
    """Lower-case every token of ``text`` and reduce it to its WordNet lemma.

    Tokens come from ``nltk.word_tokenize`` and the lemmas are joined with
    single spaces.
    """
    wordnet = WordNetLemmatizer()
    return " ".join(wordnet.lemmatize(token.lower())
                    for token in nltk.word_tokenize(text))

In [25]:
def preprocessing(df, columns = ["brand", "brand_category", "description", "details"]):
    """Normalise the text columns of the product table before vectorisation.

    Line breaks in ``details`` become spaces. The literal "Unknown" in
    ``brand_category`` and every missing value in a processed column are
    replaced by ``UNKNOWN_TOKEN``. Each column in ``columns`` then has
    punctuation and stopwords removed and is lemmatized, which also
    lower-cases it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``brand_category``, ``description``, ``details`` and every
        name in ``columns``. The frame is modified in place.
    columns : list of str
        Columns to clean. The default list is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be used in an assignment.
    """
    #a line break separates words, so swap it for a space instead of gluing the neighbours together
    df['details'] = df['details'].str.replace("\n", " ", regex=False)
    #replace 'unknown' or null values with UNKNOWN_TOKEN
    df['brand_category'] = df['brand_category'].str.replace("Unknown", "UNKNOWN_TOKEN", regex=False)
    for col in ['brand_category', 'description', 'details']:
        df[col] = df[col].fillna('UNKNOWN_TOKEN')
    #remove punctuation and stopwords then lemmatize
    for col in columns:
        #a NaN is a float and would crash the tokenizer, so fill every processed column first
        cleaned = df[col].fillna('UNKNOWN_TOKEN')
        for step in (removePunctuation, removeStopwords, lemmatize):
            cleaned = cleaned.apply(step)
        df[col] = cleaned
    return df

In [26]:
# Preview the rows before text preprocessing.
fullData.head()

Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,"A modern pump, in a rounded silhouette with an..."
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,Dress it down with jeans and sneakers or dress...
2,01DSJX8GD4DSAP76SPR85HRCMN,Loewe,52MM Padded Leather Round Sunglasses,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,100% UV protection Case and cleaning cloth inc...
3,01DSJVKJNS6F4KQ1QM6YYK9AW2,Converse,Baby's & Little Kid's All-Star Two-Tone Mid-To...,The iconic mid-top design gets an added dose o...,"JustKids/Shoes/Baby024Months/BabyGirl,JustKids...",Canvas upper Round toe Lace-up vamp SmartFOAM ...
4,01DSK15ZD4D5A0QXA8NSD25YXE,Alexander McQueen,64MM Rimless Sunglasses,Hexagonal shades offer a rimless view with int...,JewelryAccessories/SunglassesReaders/RoundOval,100% UV protection Gradient lenses Adjustable ...


In [27]:
# Clean the four text columns; preprocessing returns the frame it was given.
fullData = preprocessing(fullData)

In [28]:
# Preview the same rows after preprocessing.
fullData.head()

Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details
0,01DSE9TC2DQXDG6GWKW9NMJ416,banana republic,Ankle-Strap Pump,modern pump rounded silhouette ankle strap ext...,unknown_token,modern pump rounded silhouette ankle strap ext...
1,01DSE9SKM19XNA6SJP36JZC065,banana republic,Petite Tie-Neck Top,dress jean sneaker dress tailored trouser heel...,unknown_token,dress jean sneaker dress tailored trouser heel...
2,01DSJX8GD4DSAP76SPR85HRCMN,loewe,52MM Padded Leather Round Sunglasses,padded leather cover classic round sunglass,jewelryaccessories/sunglassesreaders/roundoval...,100 uv protection case cleaning cloth included...
3,01DSJVKJNS6F4KQ1QM6YYK9AW2,converse,Baby's & Little Kid's All-Star Two-Tone Mid-To...,iconic mid-top design get added dose support p...,justkids/shoes/baby024months/babygirl justkids...,canvas upper round toe lace-up vamp smartfoam ...
4,01DSK15ZD4D5A0QXA8NSD25YXE,alexander mcqueen,64MM Rimless Sunglasses,hexagonal shade offer rimless view intricate n...,jewelryaccessories/sunglassesreaders/roundoval,100 uv protection gradient lens adjustable nos...


#### 2.2 Vectorize full data

In [29]:
import en_core_web_lg
# Load the en_core_web_lg pipeline once; vectorize() reuses this shared object.
nlp = en_core_web_lg.load()
def vectorize(text):
    """Run ``text`` through the shared ``nlp`` pipeline and return the document's ``vector``."""
    return nlp(text).vector

In [30]:
# Add one "<field>Vector" column per text field, reporting progress as each finishes.
columns = ["brand", "brand_category", "description", "details"]
for field in columns:
    fieldVectors = fullData[field].apply(vectorize)
    fullData[f"{field}Vector"] = fieldVectors
    print(f"{field} is vectorized.")

brand is vectorized.
brand_category is vectorized.
description is vectorized.
details is vectorized.


In [31]:
# Preview the frame with the four new *Vector columns.
fullData.head()

Unnamed: 0,product_id,brand,product_full_name,description,brand_category,details,brandVector,brand_categoryVector,descriptionVector,detailsVector
0,01DSE9TC2DQXDG6GWKW9NMJ416,banana republic,Ankle-Strap Pump,modern pump rounded silhouette ankle strap ext...,unknown_token,modern pump rounded silhouette ankle strap ext...,"[0.1228375, -0.211389, 0.60544, 0.1499925, -0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.00021044741, 0.12995128, -0.10054972, 0.048...","[-0.047574263, 0.16968253, -0.11397273, 0.0237..."
1,01DSE9SKM19XNA6SJP36JZC065,banana republic,Petite Tie-Neck Top,dress jean sneaker dress tailored trouser heel...,unknown_token,dress jean sneaker dress tailored trouser heel...,"[0.1228375, -0.211389, 0.60544, 0.1499925, -0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.021091318, 0.06911382, -0.18785079, 0.31123...","[-0.015648283, 0.19760883, -0.1847444, 0.19181..."
2,01DSJX8GD4DSAP76SPR85HRCMN,loewe,52MM Padded Leather Round Sunglasses,padded leather cover classic round sunglass,jewelryaccessories/sunglassesreaders/roundoval...,100 uv protection case cleaning cloth included...,"[0.31551, 0.47482, 0.23485, 0.74321, 0.57547, ...","[0.0079012, 0.2155, -0.0106336, 0.152184, 0.23...","[0.09593249, -0.09788016, -0.06213267, 0.10880...","[0.11028513, 0.4177481, -0.025815923, 0.013043..."
3,01DSJVKJNS6F4KQ1QM6YYK9AW2,converse,Baby's & Little Kid's All-Star Two-Tone Mid-To...,iconic mid-top design get added dose support p...,justkids/shoes/baby024months/babygirl justkids...,canvas upper round toe lace-up vamp smartfoam ...,"[-0.18844, -0.40088, -0.3496, -0.06288, 0.5014...","[0.03199641, 0.23442467, 0.0011862485, 0.01239...","[0.050940253, 0.17849508, -0.07587735, 0.15529...","[-0.052187078, -0.008974921, -0.11718154, 0.03..."
4,01DSK15ZD4D5A0QXA8NSD25YXE,alexander mcqueen,64MM Rimless Sunglasses,hexagonal shade offer rimless view intricate n...,jewelryaccessories/sunglassesreaders/roundoval,100 uv protection gradient lens adjustable nos...,"[0.167725, 0.269785, 0.200405, -0.14397, -0.10...","[0.0079012, 0.2155, -0.0106336, 0.152184, 0.23...","[0.16068909, 0.115750454, -0.05685518, 0.06262...","[0.13339709, 0.43507954, -0.056259636, 0.05777..."


In [32]:
# fullData.to_csv('cleaned vectorized fulldata.csv')

#### 2.3 Outfit recommendation based on free form input of brand, brand category, details, and description

In [107]:
#start every per-field similarity column at 0 so fields without input still count in the average
columns = ["brand", "brand_category", "description", "details"]
for field in columns:
    fullData[f"cosine_{field}"] = 0

In [142]:
#test 1: description plus brand; brand category and details left blank
inputDescription = "slim fitting, straight leg pant with a center back zipper and slightly cropped leg"
inputBrand = "Reformation"
inputBrandCategory = inputDetails = ""
inputTextDict = dict(
    description=inputDescription,
    brand=inputBrand,
    brand_category=inputBrandCategory,
    details=inputDetails,
)

In [146]:
#test 2: description only; every other field left blank
inputDescription = "Sexy silky, a-line mini skirt zipper Benson skirt"
inputBrand = inputBrandCategory = inputDetails = ""
inputTextDict = dict(
    description=inputDescription,
    brand=inputBrand,
    brand_category=inputBrandCategory,
    details=inputDetails,
)

In [143]:
def inputPreprocessing(text):
    """Clean user input with the same steps applied to the product table.

    The steps run in order: punctuation removal, stopword removal,
    lemmatization.
    """
    return lemmatize(removeStopwords(removePunctuation(text)))

In [144]:
def calculateCosineDistance(df, vector=None, column=None):
    """Return the cosine similarity between a query vector and one row's stored vector.

    Despite the name, the result is ``1 - cosine distance``, so larger values
    mean a closer match.

    Parameters
    ----------
    df : pandas.Series or mapping
        One row of the product table; must hold the key ``"<column>Vector"``.
    vector : array-like, optional
        Query vector. When omitted, the notebook-level ``inputVector`` is used,
        which keeps ``fullData.apply(calculateCosineDistance, axis=1)`` working.
    column : str, optional
        Field name whose vector is compared. When omitted, the notebook-level
        ``key`` is used.

    Returns
    -------
    float
        The similarity, or ``0.0`` when either vector is all zeros. Cosine
        distance is undefined for a zero vector and would otherwise yield NaN.
    """
    vector1 = inputVector if vector is None else vector
    field = key if column is None else column
    vector2 = df["{}Vector".format(field)]
    #a zero vector has no direction, so treat it as "no similarity" instead of dividing by zero
    if not (np.any(vector1) and np.any(vector2)):
        return 0.0
    return (1 - cosine(vector1, vector2))

In [147]:
#Recommend outfit
#one similarity column per input field; only these columns feed the average below
cosineColumns = ['cosine_{}'.format(field) for field in inputTextDict]
for key in inputTextDict.keys():
    if inputTextDict[key].strip() != "":
        inputText = inputPreprocessing(inputTextDict[key])
        inputVector = vectorize(inputText)
        #calculateCosineDistance reads the current key and inputVector
        fullData['cosine_{}'.format(key)] = fullData.apply(calculateCosineDistance, axis=1)
    else:
        #blank field: clear any scores left behind by an earlier query
        fullData['cosine_{}'.format(key)] = 0

#average the per-field scores explicitly; a frame-wide mean would also pick up
#a cosine_avg column left over from a previous run
fullData['cosine_avg'] = fullData[cosineColumns].mean(axis = 1)
#only products that belong to an outfit can be passed to recommendOutfit
hasOutfit = fullData.product_id.isin(outfit_df.product_id)
candidates = fullData.loc[hasOutfit, ['product_id', 'cosine_avg']].dropna()
if candidates.empty:
    raise ValueError("No product with a known outfit could be scored against the input text.")
#argmax keeps the first row on ties, like taking element [0] of the filtered list
similarProductId = candidates.product_id.iloc[candidates.cosine_avg.to_numpy().argmax()]
recommendOutfit(outfit_df, similarProductId)

Recommended Outfit Combination: 

	-shoe: Pointed-toe flats in suede (01DPCRZWX4S2Z8Q5HYDFM4HNEG)
	-top: Ashlynn Blouse (01DPET2NWSA221STZF740BZ9SW)
	-bottom: Benson Skirt (01DPKMGJ33SDFXM7XHGPQJWQ12)


In [149]:
# fullData.sort_values('cosine_avg', ascending = False)