In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models.phrases import Phraser, Phrases
import spacy
from scipy import spatial
from fuzzywuzzy import fuzz



In [2]:
# Import the language data explicitly, before it is used. The original cell
# read spacy.lang.en.stop_words through an attribute chain ahead of this
# import, which presumably only resolved because spacy.load('en') had already
# imported the submodule as a side effect.
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

# String of ASCII punctuation characters; clean_text() drops tokens found in it.
punctuations = string.punctuation
# NOTE(review): `nlp` is rebound by the later `nlp = en_core_web_lg.load()`
# cell and is not used by clean_text(); kept so the cell defines the same names.
nlp = spacy.load('en')
# Stop-word collection consulted by clean_text().
stop_words = STOP_WORDS
# Blank English pipeline; clean_text() uses it to tokenise and read lemma_.
parser = English()

def clean_text(text):
    '''
    Normalise one product text into a space-separated string of lemmas.

    Steps, in order:
      1. HTML tags and non-breaking spaces are turned into spaces and the
         text is lower-cased.
      2. Measurements are replaced by placeholder words (length_val,
         percentage_val, mm_val, cm_val, inches_val, weight_val, size_val).
      3. Remaining punctuation and stand-alone numbers are removed and runs
         of whitespace are collapsed.
      4. The text is tokenised with the module-level `parser`; lemmas that
         are stop words or punctuation are dropped.

    text : str, raw text (may contain HTML)
    returns : str, cleaned lemmas joined by single spaces
    '''
    # Replace tags / nbsp with a space instead of deleting them, so the words
    # on either side are not glued together ("cotton<br>machine"). Extra
    # spaces are collapsed further down.
    text = re.sub(r'<.*?>', ' ', text)
    text = text.lower()
    text = text.replace('\xa0', ' ')

    # Lengths written with an inch mark: 15", 5.5", 5’10” and a bare 5".
    # The second number is optional; the original `\d{1,3}?` was a lazy
    # quantifier (not an optional one) and therefore missed single numbers.
    text = re.sub(r'\d{1,3}(?:[.’]?\d{1,3})?["”]', "length_val", text)
    text = re.sub(r'\d{1,3}\s*?%', "percentage_val", text)

    text = text.strip(string.punctuation).replace("\n", " ").replace("\r", " ")
    # Drop punctuation; "_" counts as a word character, so placeholders survive.
    text = re.sub(r'[^\w\s]', '', text)

    # Unit placeholders are applied after punctuation removal, in this order.
    unit_rules = (
        (r'\d{1,3}\s*?mm', "mm_val"),
        (r'\d{1,3}\s*?cm', "cm_val"),
        (r'\d{1,3}\s*?(inches|inch)', "inches_val"),
        (r'\d{1,3}\s*?(lbs|kg)', "weight_val"),
        (r'size\s*?\d{1,3}\s*?', "size_val"),
    )
    for pattern, placeholder in unit_rules:
        text = re.sub(pattern, placeholder, text)

    # Any number that is still a separate word carries no meaning here.
    text = re.sub(r'\b\d+\b', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    lemmas = (token.lemma_.lower().strip() for token in parser(text))
    # `punctuations` is a str, so `in` is a substring test: it removes
    # punctuation tokens and also the empty lemmas left by whitespace tokens.
    kept = [word for word in lemmas
            if word not in stop_words and word not in punctuations]
    return " ".join(kept)

In [6]:
### Load the outfit table and the product catalogue, then build the text corpus
combo = pd.read_csv('outfit_combinations.csv')

# Keep only the catalogue columns that are needed and drop exact duplicate rows.
full = (
    pd.read_csv('full_data_final version.csv')
    .loc[:, ['product_id','brand','product_full_name','description','brand_category','brand_canonical_url','details']]
    .drop_duplicates()
)

# Attach the catalogue text to every outfit row; missing values get a
# placeholder so the string concatenation below never meets NaN.
combo = (
    combo
    .merge(full, how = 'left', on = ['product_id', 'brand', 'product_full_name'])
    .fillna('unknown_token')
)

# One text per outfit row: brand, name, description, category and details.
x = combo.brand + ' '+ combo.product_full_name +' '+combo.description+' '+combo.brand_category + ' '+combo.details
cleaned_x = list(map(clean_text, x))

In [8]:
# Load the en_core_web_lg pipeline; vectorize() below calls it and returns
# the resulting document's `.vector`.
# Note: this rebinds `nlp`, replacing the pipeline created earlier with
# spacy.load('en').
import en_core_web_lg
nlp = en_core_web_lg.load()
def vectorize(text):
    '''
    run the text through the module-level `nlp` pipeline
    and return the `.vector` of the resulting document
    '''
    return nlp(text).vector

In [9]:
### Vectorize the cleaned text (brand, name, description, brand category, details) with spaCy
vector_text = list(map(vectorize, cleaned_x))

In [10]:
### Build the lookup table: one row per outfit row, indexed by product_id
df_x = combo[['product_id']].set_index('product_id')
# Single column holding the vector of each row, in the same order as combo.
df_x['vector'] = list(vector_text)

In [11]:
# Display the product_id -> vector table (the recorded output abbreviates the middle rows).
df_x

Unnamed: 0_level_0,vector
product_id,Unnamed: 1_level_1
01DMBRYVA2P5H24WK0HTK4R0A1,"[0.13419311, 0.0055919983, -0.23032445, 0.1932..."
01DMBRYVA2PEPWFTT7RMP5AA1T,"[0.051668204, 0.1084152, -0.246434, 0.16985838..."
01DMBRYVA2S5T9W793F4CY41HE,"[-0.09401983, 0.11568582, 0.016127413, -0.1234..."
01DMBRYVA2ZFDYRYY5TRQZJTBD,"[0.0035409087, 0.091781445, 0.103435636, -0.18..."
01DMBRYVA2P5H24WK0HTK4R0A1,"[0.13419311, 0.0055919983, -0.23032445, 0.1932..."
...,...
01E5ZYHZA7186DVWEJ99Q4D2PM,"[0.20001274, 0.064251885, -0.1018092, -0.02918..."
01E2P0SJSKFKNQJ5SVQ8MD1JZT,"[0.011835023, 0.12402969, 0.05941964, -0.06654..."
01E4RW25Y8ZF6WKZRE50Y6SKH5,"[0.10242396, 0.19070508, -0.206334, 0.18994208..."
01E5ZS3R9JD696YWGK9NSG56E1,"[0.03905594, 0.15765749, -0.024056124, 0.07992..."


In [38]:
def get_rec(p_id=None, brand=None, description=None, details=None,
            brand_cate=None, name=None):
    '''
    Print every item of the outfit that contains the requested product.

    All six arguments are optional. Called with no arguments, the function
    asks for each value with input(), as it always did. If at least one
    argument is passed, nothing is asked up front and the omitted values
    count as empty strings.

    If a product id is given, it is looked up in `combo`. An unknown id
    prints the three closest ids (fuzzy string match) and asks once more;
    if the second id is unknown too, a message is printed and nothing is
    recommended.
    Without a product id, the text fields are joined, cleaned with
    clean_text(), vectorized with vectorize() and compared (cosine
    similarity) with every vector in `df_x`; the most similar product wins.

    Side effect: the text search stores its scores in df_x['similarity'].
    Returns None; each item is printed as "<item type>: <name> (<product id>)".
    '''
    prompts = ['product id: ', 'brand: ', 'description: ', 'details: ',
               'brand category: ', 'product name: ']
    given = [p_id, brand, description, details, brand_cate, name]
    if all(value is None for value in given):
        # Interactive mode: same prompts, same order as before.
        answers = [input(prompt) for prompt in prompts]
    else:
        answers = ['' if value is None else str(value) for value in given]
    # Stray spaces around an id would otherwise make a valid id "unknown".
    p_id, brand, description, details, brand_cate, name = [a.strip() for a in answers]

    known_ids = set(combo.product_id)
    if p_id:
        ### Search by product id
        if p_id in known_ids:
            product_id = p_id
        else:
            ### Suggest the most similar product ids and ask again
            fuzz_df = pd.DataFrame({'product_id': df_x.index.unique()})
            fuzz_df['fuzzy'] = [fuzz.ratio(p_id, i) for i in fuzz_df.product_id]
            top_3_id = fuzz_df.sort_values('fuzzy', ascending=False).product_id[:3]
            print(f'input product_id not found, follows are recommended product id:{top_3_id.values}')
            product_id = input('New input product id: ').strip()
            if product_id not in known_ids:
                # Previously this fell through and crashed with IndexError.
                print(f'product id {product_id!r} not found, no recommendation made')
                return
    else:
        ### No product id: find the most similar product from the text fields
        inputs = ' '.join([brand, description, details, brand_cate, name])
        query = vectorize(clean_text(inputs))
        if not np.any(query):
            # An all-zero query vector has no direction: every cosine
            # similarity would be NaN and no "best" product exists.
            print('no usable text given, no recommendation made')
            return
        simi_list = [1 - spatial.distance.cosine(vector, query)
                     for vector in df_x['vector']]
        df_x['similarity'] = simi_list
        if df_x['similarity'].isna().all():
            print('no comparable product found, no recommendation made')
            return
        # Label of the first row holding the highest (non-NaN) similarity.
        product_id = df_x['similarity'].idxmax()

    # A product may belong to several outfits; the first one listed is used.
    outfit = combo.loc[combo.product_id == product_id, 'outfit_id'].iloc[0]
    recom = combo[combo.outfit_id == outfit]

    # Keyed by item type, so repeated rows of the same type collapse to one.
    items = {}
    for item_type, item_name, item_id in zip(recom.outfit_item_type,
                                             recom.product_full_name,
                                             recom.product_id):
        items[item_type] = [item_name, item_id]
    for item_type, (item_name, item_id) in items.items():
        print(f'{item_type}: {item_name} ({item_id})')

### Test 1
#### Correct product ID

In [35]:
# Interactive run: an existing product id is entered, every other prompt is left empty.
get_rec()

product id:01DMBRYVA2ZFDYRYY5TRQZJTBD
brand:
description:
details:
brand category:
product name:
bottom: Slim Knit Skirt (01DMBRYVA2P5H24WK0HTK4R0A1)
top: Rib Mock Neck Tank (01DMBRYVA2PEPWFTT7RMP5AA1T)
accessory1: medium margaux leather satchel (01DMBRYVA2S5T9W793F4CY41HE)
shoe: Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)


#### Wrong product ID

In [36]:
# Interactive run: an unknown product id is entered, so close ids are suggested and a new id is requested.
get_rec()

product id:11DMBRYVA2ZFDYRYY5TRQZJTBD
brand:
description:
details:
brand category:
product name:
input product_id not found, follows are recommended product id:['01DMBRYVA2ZFDYRYY5TRQZJTBD' '01DMBRYVA2Q2ST7MNYR6EEY4TK'
 '01DMBRYVA2PEPWFTT7RMP5AA1T']
New input product id: 01DMBRYVA2ZFDYRYY5TRQZJTBD
bottom: Slim Knit Skirt (01DMBRYVA2P5H24WK0HTK4R0A1)
top: Rib Mock Neck Tank (01DMBRYVA2PEPWFTT7RMP5AA1T)
accessory1: medium margaux leather satchel (01DMBRYVA2S5T9W793F4CY41HE)
shoe: Penelope Mid Cap Toe Pump (01DMBRYVA2ZFDYRYY5TRQZJTBD)


### Test 2 

In [37]:
# Interactive run: no product id; the product is matched from the brand and description text.
get_rec()

product id:
brand:Reformation
description:slim fitting, straight leg pant with a center back zipper and slightly cropped leg
details:
brand category:
product name:
accessory1: Cassi Belt Bag (01DPEHS0XH9PDD1GH5ZE4P43A2)
bottom: Marlon Pant (01DPKMH0D252JKMAA27MFCT5GM)
top: Jane Sweater (01DPKN20Q3J0BE3CS896DQB6ER)
shoe: Giulia Satin Heel (01DPKNHQDG6GPTKV97CFQRJDHE)


### Test 3

In [39]:
# Interactive run: no product id; only a free-text description is entered.
get_rec()

product id: 
brand: 
description: Sexy silky, a-line mini skirt zipper Benson skirt
details: 
brand category: 
product name: 
shoe: Pointed-toe flats in suede (01DPCRZWX4S2Z8Q5HYDFM4HNEG)
top: Ashlynn Blouse (01DPET2NWSA221STZF740BZ9SW)
bottom: Benson Skirt (01DPKMGJ33SDFXM7XHGPQJWQ12)


In [40]:
# Interactive run: free text typed at the brand prompt; get_rec joins all text fields into one query string.
get_rec()

product id: 
brand: Sexy silky. This is an a-line mini skirt with a center back zipper. The Benson pairs well with the Hailee Top.
description: 
details: 
brand category: 
product name: 
shoe: Pointed-toe flats in suede (01DPCRZWX4S2Z8Q5HYDFM4HNEG)
top: Ashlynn Blouse (01DPET2NWSA221STZF740BZ9SW)
bottom: Benson Skirt (01DPKMGJ33SDFXM7XHGPQJWQ12)
