In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the two input CSV files; both paths are relative to the notebook's working directory.
outfit = pd.read_csv('outfit_combinations.csv')
full_data = pd.read_csv('full_data.csv')

In [3]:
# Keep the product id plus the four text fields, then keep only the first row seen
# for every product_id. Written as one chain so the cell produces the same frame
# without mutating an intermediate slice in place.
full_data = (
    full_data
    .loc[:, ['product_id', 'brand', 'description', 'brand_category', 'details']]
    .drop_duplicates(subset=['product_id'])
)

In [4]:
outfit.shape,full_data.shape

((5291, 5), (48072, 5))

## TF-IDF weighted self-trained embedding

In [5]:
df = full_data.copy()

In [6]:
import spacy
nlp = spacy.load("en_core_web_md")

import re
import string
import pickle
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from gensim.models.phrases import Phraser, Phrases
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

# NOTE: this rebinds the name `stopwords` from the imported nltk corpus module to a plain
# set of English stop words. After this line `stopwords.words(...)` no longer works unless
# the import above is executed again (which happens when the whole cell is re-run).
stopwords = set(stopwords.words('english'))
# Characters that preprocess() replaces with a space: everything in string.punctuation
# except '-', '%' and '"', plus the typographic apostrophe.
punc = string.punctuation.replace('-','').replace('%','').replace('"','') + "’"

def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    Args:
        obj: Any picklable object.
        name: Target path without the ``.pkl`` extension.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        # Highest protocol available in the running interpreter.
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object stored in the pickle file ``name + '.pkl'``.

    Args:
        name: Source path without the ``.pkl`` extension.

    Returns:
        The unpickled object.

    SECURITY: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [7]:
def preprocess(text):
    """Normalise one raw text field and return its lemmas.

    Steps, in order: replace non-breaking spaces, replace every character of
    ``punc`` with a space, delete hyphens, lower-case, substitute size /
    percentage / number placeholders, collapse whitespace, tokenise with
    ``word_tokenize``, drop tokens found in ``stopwords`` and lemmatise the
    remainder with the spaCy pipeline ``nlp``.

    Args:
        text: The raw string to clean.

    Returns:
        list[str]: ``token.lemma_`` for every token spaCy produces.
    """
    cleaned = re.sub(r'\xa0', ' ', text)
    for mark in punc:
        cleaned = cleaned.replace(mark, ' ')
    cleaned = cleaned.replace('-', '').lower()

    # NOTE(review): '.', '/' and '|' are part of `punc`, so they have already been
    # replaced by spaces at this point; the decimal-inch alternative below therefore
    # looks unreachable -- confirm whether that is intended.
    size_pattern = r'\s\d+\s*[cm]m[\s/]|\s\d+\.\d+\sinch(es)*|\s\d+\s*["|”]\s'
    substitutions = (
        # The size pattern runs twice: each match consumes its surrounding whitespace,
        # so two measurements separated by a single space cannot both match in one pass.
        (size_pattern, ' sizetoken '),
        (size_pattern, ' sizetoken '),
        (r'\s\d+%\s', ' percentagetoken '),
        # Only numbers with whitespace on both sides are replaced.
        (r'\s\d+\s', ' numbertoken '),
        (r'\s{2,}', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept_tokens = [tok for tok in word_tokenize(cleaned) if tok not in stopwords]
    parsed = nlp(" ".join(kept_tokens))

    lemmas = []
    for token in parsed:
        lemmas.append(token.lemma_)
    return lemmas

In [8]:
def prepare(dataframe, cols):
    """Clean the given text columns and add bigram-merged token columns.

    For every ``col`` in ``cols`` three columns are added to a copy of
    ``dataframe``:

    * ``<col>_bigram_doc``  -- tokens from ``preprocess`` after the bigram model,
      detected pairs joined with ``_``;
    * ``<col>_joined_text`` -- those tokens joined by spaces with every ``_``
      removed (so a detected pair becomes a single word);
    * ``<col>_final_doc``   -- ``<col>_joined_text`` split on whitespace.

    Missing values anywhere in the frame are replaced by ``' NOINFO '`` first.
    The bigram model is fitted on the rows of ``dataframe`` itself and is not
    persisted.

    Args:
        dataframe: Frame holding the raw text columns; it is not modified.
        cols: Names of the text columns to process.

    Returns:
        pandas.DataFrame: The filled copy with the additional columns.
    """
    prepared = dataframe.fillna(' NOINFO ')

    for col in cols:
        token_docs = [preprocess(raw_text) for raw_text in prepared[col]]

        # Fit the phrase detector on this column only; min_count=5 is gensim's
        # minimum-count threshold for the collected words and bigrams.
        phraser = Phraser(Phrases(token_docs, min_count=5))
        prepared[col + '_bigram_doc'] = list(phraser[token_docs])

        joined_text = prepared[col + '_bigram_doc'].map(
            lambda tokens: " ".join(tokens).replace('_', '')
        )
        prepared[col + '_joined_text'] = joined_text
        prepared[col + '_final_doc'] = joined_text.map(lambda sentence: sentence.split())

    return prepared

In [9]:
# Text columns to clean; prepare() adds <col>_bigram_doc, <col>_joined_text and
# <col>_final_doc for each of them.
cols = ['brand','description','brand_category','details']
df1 = prepare(df, cols)

In [10]:
df1.head(3)

Unnamed: 0,product_id,brand,description,brand_category,details,brand_bigram_doc,brand_joined_text,brand_final_doc,description_bigram_doc,description_joined_text,description_final_doc,brand_category_bigram_doc,brand_category_joined_text,brand_category_final_doc,details_bigram_doc,details_joined_text,details_final_doc
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,"A modern pump, in a rounded silhouette with an...",Unknown,"A modern pump, in a rounded silhouette with an...","[banana, republic]",banana republic,"[banana, republic]","[modern, pump, round, silhouette, ankle_strap,...",modern pump round silhouette anklestrap extras...,"[modern, pump, round, silhouette, anklestrap, ...",[unknown],unknown,[unknown],"[modern, pump, round, silhouette, ankle_strap,...",modern pump round silhouette anklestrap extras...,"[modern, pump, round, silhouette, anklestrap, ..."
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,Dress it down with jeans and sneakers or dress...,Unknown,Dress it down with jeans and sneakers or dress...,"[banana, republic]",banana republic,"[banana, republic]","[dress, jean, sneaker, dress, tailor_trouser, ...",dress jean sneaker dress tailortrouser heel ti...,"[dress, jean, sneaker, dress, tailortrouser, h...",[unknown],unknown,[unknown],"[dress, jean_sneaker, dress, tailor, trouser, ...",dress jeansneaker dress tailor trouser heel ti...,"[dress, jeansneaker, dress, tailor, trouser, h..."
2,01DSJX8GD4DSAP76SPR85HRCMN,Loewe,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,100% UV protection Case and cleaning cloth inc...,[loewe],loewe,[loewe],"[padded, leather, cover, classic, round_sunglass]",padded leather cover classic roundsunglass,"[padded, leather, cover, classic, roundsunglass]","[jewelryaccessories, sunglassesreaders, roundo...",jewelryaccessories sunglassesreaders roundoval...,"[jewelryaccessories, sunglassesreaders, roundo...","[100_%, uv_protection, case, clean_cloth, incl...",100% uvprotection case cleancloth includeaceta...,"[100%, uvprotection, case, cleancloth, include..."


In [11]:
def weighted_average(token_lists, tfidf_df, tfidf_tokens, model):
    """Build one TF-IDF weighted average word embedding per document.

    For each document, the embedding of every token that is part of the TF-IDF
    vocabulary is multiplied by that token's TF-IDF score in the document; the
    weighted sum is divided by the sum of the scores used. A token occurring
    several times contributes once per occurrence.

    Args:
        token_lists: Iterable of token lists, one per document.
        tfidf_df: DataFrame of TF-IDF scores with one column per vocabulary
            token. Rows are looked up with the *position* of the document in
            ``token_lists``, so the frame needs a default 0..n-1 index in the
            same order as ``token_lists``.
        tfidf_tokens: Collection of the tokens present as columns of ``tfidf_df``.
        model: Object exposing ``model.wv[token]`` (e.g. a gensim Word2Vec model).

    Returns:
        list[numpy.ndarray]: One vector per document. A document without any
        weighted token yields an all-zero vector instead of the NaN vector a
        0/0 division would produce.

    Raises:
        KeyError: If a token is in ``tfidf_tokens`` but unknown to ``model.wv``.
    """
    keyed_vectors = model.wv
    # Use the model's own dimensionality; fall back to the previous constant for
    # objects that do not expose ``vector_size``.
    dim = getattr(keyed_vectors, 'vector_size', 100)
    # A set gives constant-time membership tests whatever collection was passed in.
    vocabulary = set(tfidf_tokens)

    vectors = []
    for idx, doc in enumerate(token_lists):

        total_scores = 0.0
        running_total_word_embedding = np.zeros(dim)

        for token in doc:
            if token in vocabulary:
                tf_idf_score = tfidf_df.loc[idx, token]
                running_total_word_embedding += tf_idf_score * keyed_vectors[token]
                total_scores += tf_idf_score

        if total_scores > 0:
            document_embedding = running_total_word_embedding / total_scores
        else:
            # Nothing contributed: keep the zero vector rather than dividing by zero.
            document_embedding = running_total_word_embedding
        vectors.append(document_embedding)

    return vectors

In [12]:
def get_vectors(dataframe, cols=None, save_artifacts=False):
    """Add a TF-IDF weighted Word2Vec document vector for each text column.

    For every column a ``TfidfVectorizer`` is fitted on ``<col>_joined_text``
    and a ``Word2Vec`` model is trained on ``<col>_final_doc``; both are then
    combined by ``weighted_average`` into the new column ``<col>_vector``.

    Args:
        dataframe: Frame produced by ``prepare``; it is not modified.
        cols: Base names of the text columns to vectorise. Defaults to
            ``['brand', 'description', 'brand_category', 'details']``.
        save_artifacts: When true, pickle the fitted model and vectorizer as
            ``<col>_w2v_model.pkl`` and ``<col>_vectorizer.pkl`` in the working
            directory. Code that later embeds new text must use the model from
            the same run that produced the stored vectors, so enable this
            whenever the vectors are regenerated.

    Returns:
        pandas.DataFrame: Copy of ``dataframe`` with one ``<col>_vector``
        column per processed column.
    """
    df = dataframe.copy()

    if cols is None:
        cols = ['brand','description','brand_category','details']

    for col in cols:

        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(df[col+'_joined_text'])
        # get_feature_names() no longer exists in current scikit-learn releases;
        # prefer get_feature_names_out() and fall back for old installations.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()
        # NOTE: toarray() materialises a dense documents x vocabulary matrix.
        tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names)
        tfidf_tokens = tfidf_df.columns

        # min_count=1 keeps every token, so each token of the corpus has a vector.
        model = Word2Vec(df[col+'_final_doc'], min_count=1)

        if save_artifacts:
            save_obj(model, col+'_w2v_model')
            save_obj(vectorizer, col+'_vectorizer')

        vectors = weighted_average(df[col+'_final_doc'], tfidf_df, tfidf_tokens, model)
        df[col+'_vector'] = vectors

    return df

In [13]:
df2 = get_vectors(df1)

In [14]:
df2.head(3)

Unnamed: 0,product_id,brand,description,brand_category,details,brand_bigram_doc,brand_joined_text,brand_final_doc,description_bigram_doc,description_joined_text,...,brand_category_bigram_doc,brand_category_joined_text,brand_category_final_doc,details_bigram_doc,details_joined_text,details_final_doc,brand_vector,description_vector,brand_category_vector,details_vector
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,"A modern pump, in a rounded silhouette with an...",Unknown,"A modern pump, in a rounded silhouette with an...","[banana, republic]",banana republic,"[banana, republic]","[modern, pump, round, silhouette, ankle_strap,...",modern pump round silhouette anklestrap extras...,...,[unknown],unknown,[unknown],"[modern, pump, round, silhouette, ankle_strap,...",modern pump round silhouette anklestrap extras...,"[modern, pump, round, silhouette, anklestrap, ...","[-0.006715953814315126, -0.01039340701561759, ...","[-0.004647935761223234, -0.47167691231410513, ...","[-0.0002778382913675159, 0.0014776202151551843...","[0.41041421081801877, -0.24040577652891382, 0...."
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,Dress it down with jeans and sneakers or dress...,Unknown,Dress it down with jeans and sneakers or dress...,"[banana, republic]",banana republic,"[banana, republic]","[dress, jean, sneaker, dress, tailor_trouser, ...",dress jean sneaker dress tailortrouser heel ti...,...,[unknown],unknown,[unknown],"[dress, jean_sneaker, dress, tailor, trouser, ...",dress jeansneaker dress tailor trouser heel ti...,"[dress, jeansneaker, dress, tailor, trouser, h...","[-0.006715953814315126, -0.01039340701561759, ...","[0.6641151532992506, -0.6207354159287056, 0.38...","[-0.0002778382913675159, 0.0014776202151551843...","[-0.008222198637743875, -0.8895652350816768, 0..."
2,01DSJX8GD4DSAP76SPR85HRCMN,Loewe,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,100% UV protection Case and cleaning cloth inc...,[loewe],loewe,[loewe],"[padded, leather, cover, classic, round_sunglass]",padded leather cover classic roundsunglass,...,"[jewelryaccessories, sunglassesreaders, roundo...",jewelryaccessories sunglassesreaders roundoval...,"[jewelryaccessories, sunglassesreaders, roundo...","[100_%, uv_protection, case, clean_cloth, incl...",100% uvprotection case cleancloth includeaceta...,"[100%, uvprotection, case, cleancloth, include...","[-0.0018665628740563989, -0.002845471492037177...","[0.5134071664896477, -0.676150946425393, 0.007...","[0.7606678696910437, 1.1448306721912391, -0.41...","[1.6002830654306337, -0.44923354592997056, 0.7..."


In [15]:
def remove_nan(arr, dim=100):
    """Replace a vector that contains missing values by an all-zero vector.

    Args:
        arr: A vector (array-like) or a scalar placeholder such as ``NaN``/``None``.
        dim: Length of the zero vector returned for a *scalar* missing value,
            where no shape can be taken from the input.

    Returns:
        ``arr`` itself when it holds no missing value (or is empty); otherwise
        a zero array with the same shape as ``arr`` (length ``dim`` for a
        scalar input).
    """
    # Scalars (for instance the NaN a left merge leaves behind) cannot be indexed.
    if np.ndim(arr) == 0:
        return np.zeros(dim) if pd.isnull(arr) else arr

    values = np.asarray(arr)
    # Check every element, not just the first, and keep the input's own shape.
    if values.size and pd.isnull(values).any():
        return np.zeros(values.shape)
    return arr

In [16]:
df4 = df2.copy()

In [17]:
# Run remove_nan over each of the four document-vector columns.
for vector_col in ('brand_vector', 'brand_category_vector',
                   'description_vector', 'details_vector'):
    df4[vector_col] = df4[vector_col].apply(remove_nan)

## Testing

#### Example C

In [18]:
df3 = outfit.copy()

In [19]:
# Left join: every outfit row is kept; rows whose product_id is absent from df4
# receive NaN in the columns coming from df4.
df3 = df3.merge(df4, how='left', on='product_id')

In [20]:
# Free-text query. An empty string marks a field that was not provided; the next cell
# only keeps the non-empty fields.
inputDescription = "Sexy silky, a-line mini skirt zipper Benson skirt"
inputBrand = ""
inputBrandCategory = ""
inputDetails = ""
# One-row frame with the same text column names as the catalogue.
inputTextDF = pd.DataFrame({"description":inputDescription, "brand":inputBrand, 
                 "brand_category":inputBrandCategory, "details":inputDetails}, index = range(1))

In [21]:
# Names of the query fields whose (single) value is not the empty string.
columns = [col for col in inputTextDF.columns if inputTextDF[col].values[0] != '']

In [22]:
# NOTE(review): prepare() fits its bigram model on the frame it receives, here a single
# row, so bigrams merged in the catalogue text are presumably not merged in the query --
# verify against the tokens of the trained vocabulary.
inputTextDF = prepare(inputTextDF, columns)

In [23]:
# Score every outfit row against the query, one cosine-similarity column per query field.
for col in columns:

    text = inputTextDF[col+'_joined_text'].values[0]
    doc = inputTextDF[col+'_final_doc'].values[0]

    # NOTE(review): these pickles are expected to come from the same training run that
    # produced the vectors stored in df3; embeddings from a different Word2Vec run are
    # presumably not comparable -- verify how the files were created.
    vectorizer = load_obj(col+'_vectorizer')
    model = load_obj(col+'_w2v_model')

    X = vectorizer.transform([text])
    # get_feature_names() no longer exists in current scikit-learn releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names)

    # Same weighting as for the catalogue: reuse the helper instead of a copy of its loop.
    document_embedding = np.asarray(
        weighted_average([doc], tfidf_df, tfidf_df.columns, model)[0], dtype=float
    )
    # When no query token is known the average is undefined (0/0 -> NaN) and
    # cosine_similarity would raise; a zero vector scores 0 against every row instead.
    if not np.isfinite(document_embedding).all():
        document_embedding = np.zeros_like(document_embedding)
    document_embedding = document_embedding.reshape(1, -1)

    # Stack the stored vectors into an (n_rows, dim) matrix. Rows left without a vector
    # by the left merge (scalar NaN) are represented by zero vectors.
    dim = document_embedding.shape[1]
    catalogue_vectors = np.vstack([
        np.zeros(dim) if np.ndim(vec) == 0 else np.asarray(vec, dtype=float)
        for vec in df3[col+'_vector']
    ])

    # ravel() yields the 1-D array a single DataFrame column expects.
    df3[f'cosine_{col}'] = cosine_similarity(document_embedding, catalogue_vectors).ravel()

In [24]:
# Average the similarity over every query field that was actually scored (the fields in
# `columns`) instead of assuming that only the description was supplied, then take the
# index label of the best row.
max_idx = df3[[f'cosine_{col}' for col in columns]].mean(axis = 1).idxmax()

In [25]:
# idxmax() returns an index *label*, so look the row up by label (.loc) rather than by
# position (.iloc); the two only coincide while df3 has a default 0..n-1 index.
pid = df3.loc[max_idx, 'product_id']

In [26]:
# First outfit_id among the outfit rows that contain the best-matching product.
outfitid = outfit.loc[outfit.product_id == pid, 'outfit_id'].values[0]

In [27]:
outfit[outfit.outfit_id == outfitid]

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
35,01DQ8PBB3MHENH040414WNDRSM,01DPCWRYQMHRAG7NJ3Q5JJMT42,shoe,J.Crew,Lucie suede pumps
36,01DQ8PBB3MHENH040414WNDRSM,01DPD8C9XN7MQSHSSTKTCZZN1N,accessory1,J.Crew,Loeffler Randall® Marla square shoulder bag
37,01DQ8PBB3MHENH040414WNDRSM,01DPGSFJK363CXPQDA2NZ7KKH1,bottom,Veronica Beard,Vail Skirt
38,01DQ8PBB3MHENH040414WNDRSM,01DPKNAXXK774YWVKC4XQCCEXA,top,Reformation,Anne Top


## Fuzzy matching product id

In [28]:
from fuzzywuzzy import process, fuzz

In [29]:
user_input_id = '01DMBRYVA2ZFDYRYY5TRQZJTBD'

In [30]:
# Pick the product_id in `outfit` that scores highest under fuzz.ratio against the
# user's input; element [0] of the result is the matched id itself.
suggested_id = process.extractOne(user_input_id, outfit.product_id, scorer=fuzz.ratio)[0]

In [31]:
suggested_id

'01DMBRYVA2ZFDYRYY5TRQZJTBD'

In [32]:
outfit_id = outfit.loc[outfit.product_id == suggested_id, 'outfit_id'].values[0]

In [33]:
outfit[outfit.outfit_id == outfit_id]

Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
1,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
2,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2S5T9W793F4CY41HE,accessory1,kate spade new york,medium margaux leather satchel
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump
