In [31]:
import pandas as pd 
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from scipy.spatial.distance import cosine
from scipy import spatial

In [2]:
df1 = pd.read_csv("outfit_combinations.csv")
df1.head()

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
1,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
2,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2S5T9W793F4CY41HE,accessory1,kate spade new york,medium margaux leather satchel
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump
4,01DMHCX50CFX5YNG99F3Y65GQW,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt


In [5]:
# if the inputproductid is correct:

def recommend(df1, inputProductId):
    targetOutfitId = df1[df1.product_id == inputProductId].outfit_id.to_list()[0]
    df2 = df1[df1.outfit_id == targetOutfitId]
    print("Recommended Outfit Combination: \n")
    for i in df2.outfit_item_type.to_list():
        temp = df2[df2.outfit_item_type == i]
        print(f'\t-{i}: {temp.product_full_name.to_list()[0]} ({temp.product_id.to_list()[0]})')

In [8]:
# if the inputproductid is not correct: we can use fuzz method to deal with the productid first which can be found in my teammates' works.

# if the input is not productid but some texts:
full_df = pd.read_csv('full_data_final version.csv')
full_df.head()

Unnamed: 0,product_id,brand,mpn,product_full_name,description,brand_category,created_at,updated_at,deleted_at,brand_canonical_url,details,labels,bc_product_id
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,514683.0,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,2019-11-11 22:37:15.719107+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,"A modern pump, in a rounded silhouette with an...","{""Needs Review""}",
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,526676.0,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,2019-11-11 22:36:50.682513+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,Dress it down with jeans and sneakers or dress...,"{""Needs Review""}",
2,01DSJX8GD4DSAP76SPR85HRCMN,Loewe,400100000000.0,52MM Padded Leather Round Sunglasses,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,2019-11-13 17:33:59.581661+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/loewe-52mm-pad...,100% UV protection Case and cleaning cloth inc...,"{""Needs Review""}",
3,01DSJVKJNS6F4KQ1QM6YYK9AW2,Converse,400012000000.0,Baby's & Little Kid's All-Star Two-Tone Mid-To...,The iconic mid-top design gets an added dose o...,"JustKids/Shoes/Baby024Months/BabyGirl,JustKids...",2019-11-13 17:05:05.203733+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/converse-babys...,Canvas upper Round toe Lace-up vamp SmartFOAM ...,"{""Needs Review""}",
4,01DSK15ZD4D5A0QXA8NSD25YXE,Alexander McQueen,400011000000.0,64MM Rimless Sunglasses,Hexagonal shades offer a rimless view with int...,JewelryAccessories/SunglassesReaders/RoundOval,2019-11-13 18:42:30.941321+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/alexander-mcqu...,100% UV protection Gradient lenses Adjustable ...,"{""Needs Review""}",


In [10]:
# Define all preprocessing functions for texts in brand, brand_category, description and details

import string 
def removePunctuation(text, punctuations=string.punctuation+"``"+"’"+"”"):
    words=nltk.word_tokenize(text)
    newWords = [word for word in words if word.lower() not in punctuations]
    cleanedText = " ".join(newWords)
    return cleanedText

def removeStopwords(text, stopwords = set(stopwords.words("English"))):
    words = nltk.word_tokenize(text)
    newWords = [word for word in words if word.lower() not in stopwords]
    cleanedText = " ".join(newWords)
    return cleanedText

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    words = nltk.word_tokenize(text)
    lemmatizedWords = [lemmatizer.lemmatize(word.lower()) for word in words]
    lemmatizedText = " ".join(lemmatizedWords)
    return lemmatizedText

def preprocessing(df, columns = ["brand", "brand_category", "description", "details"]):
    df['details'] = df['details'].str.replace("\n", "")
    df['brand_category'] = df['brand_category'].str.replace("Unknown", "UNKNOWN_TOKEN")
    df['brand_category'] = df['brand_category'].fillna('UNKNOWN_TOKEN')
    df['description'] = df['description'].fillna('UNKNOWN_TOKEN')
    df['details'] = df['details'].fillna('UNKNOWN_TOKEN')
    for col in columns: 
        df[col] = df[col].apply(removePunctuation)
        df[col] = df[col].apply(removeStopwords)
        df[col] = df[col].apply(lemmatize)
    return df

In [11]:
full_df = preprocessing(full_df)

In [25]:
full_df = full_df.loc[:,['product_id','brand','description','brand_category','details']]
full_df.drop_duplicates(inplace = True)
together = df1.merge(full_df, how = 'left', on = ['product_id', 'brand'])
together.fillna('unknown_token', inplace = True)
together = preprocessing(together)
text = [together.brand +' ' +' '+ together.description +' '+ together.brand_category +' '+ together.details][0]

In [26]:
import en_core_web_lg
nlp = en_core_web_lg.load()
def vectorize(text):
    temp = nlp(text)
    return temp.vector

In [36]:
vector_text  =[vectorize(i) for i in text]
df_vector = pd.DataFrame(columns = ['product_id', 'vector'], 
                         index = together.index)
df_vector['product_id'] = together.product_id
df_vector.set_index('product_id', inplace = True)
df_vector['vector']  = [i for i in vector_text]

In [37]:
# New dataframe will all texts combined together and vectorized
df_vector.head()

Unnamed: 0_level_0,vector
product_id,Unnamed: 1_level_1
01DMBRYVA2P5H24WK0HTK4R0A1,"[0.048501667, 0.026456669, -0.125315, -0.02320..."
01DMBRYVA2PEPWFTT7RMP5AA1T,"[0.048501667, 0.026456669, -0.125315, -0.02320..."
01DMBRYVA2S5T9W793F4CY41HE,"[-0.020111887, 0.102002226, -0.025714444, -0.1..."
01DMBRYVA2ZFDYRYY5TRQZJTBD,"[-0.09115001, 0.112751, 0.11167667, -0.0078573..."
01DMBRYVA2P5H24WK0HTK4R0A1,"[0.048501667, 0.026456669, -0.125315, -0.02320..."


In [38]:
def recommend2():
    brand = input('brand: ')
    description = input('description: ')
    details = input('details: ')
    brand_cate = input('brand category: ')
    name = input('product name: ')
    inputs = str(brand) + ' ' + str(description) + ' '+str(details)+' '+str(brand_cate)+' '+str(name)
    inputs = vectorize(inputs)
    simi_list = [1 - spatial.distance.cosine(df_vector.iloc[i,0], inputs) for i in range(len(df_vector))]
    df_vector['similarity'] = simi_list
    product_id = df_vector[df_vector.iloc[:,1] == df_vector.iloc[:,1].max()].iloc[0,:].name
    outfit = together[together.product_id == product_id].outfit_id.iloc[:1].values[0]
    recommendation = together[together.outfit_id == outfit]
    
    for i in range(len(recommendation)):
        types = list(recommendation.outfit_item_type)
        names = list(recommendation.product_full_name)
        product_id = list(recommendation.product_id)
    dicts = { types[i] : [names[i], product_id[i]] for i in range(0, len(types) ) }    
    for i in dicts.keys():
        print(f'{i}: {dicts[i][0]} ({dicts[i][1]})')

In [7]:
# test
recommend(df1, '01DMBRYVA2ZFDYRYY5TRQZJTBD')

Recommended Outfit Combination: 

	-bottom: Slim Knit Skirt (01DMBRYVA2P5H24WK0HTK4R0A1)
	-top: Rib Mock Neck Tank (01DMBRYVA2PEPWFTT7RMP5AA1T)
	-accessory1: medium margaux leather satchel (01DMBRYVA2S5T9W793F4CY41HE)
	-shoe: Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)
