In [1]:
import pandas as pd
import numpy as np
import random
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
import warnings;
warnings.filterwarnings('ignore')
from fuzzywuzzy import process
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
# Read the two input tables from CSV files in the current working directory.
outfit, full = (
    pd.read_csv(csv_name)
    for csv_name in ("outfit_combinations.csv", "full+data.csv")
)

In [40]:
# Inner join: keep only the rows whose product_id occurs in both tables.
df = full.merge(outfit, how="inner", on="product_id")

In [41]:
# Tidy the merged frame in a single chain.
df = (
    df
    # Rename columns: expose the "_x" variants under their plain names.
    .rename(columns={"brand_x": "brand", "product_full_name_x": "product_full_name"})
    # Drop unused columns.
    .drop(
        columns=[
            "mpn", "created_at", "updated_at", "deleted_at", "brand_canonical_url",
            "labels", "bc_product_id", "brand_y",
        ]
    )
    # Drop duplicates: keep the first row of every (product_id, outfit_id) pair.
    .drop_duplicates(subset=["product_id", "outfit_id"], keep="first")
)

In [42]:
#clean data
# Normalise letter case and flatten embedded newlines in the text columns.
df["product_id"] = df["product_id"].str.upper()
df["brand"] = df["brand"].str.title()
df["product_full_name"] = df["product_full_name"].str.title()
df["description"] = df["description"].str.title().str.replace("\n", " ", regex=False)
# Turn the "/", "," and ":" separators into spaces.  The pattern is a regular
# expression, so regex=True has to be explicit: since pandas 2.0 Series.str.replace
# matches literally by default, which would silently leave the text unchanged.
df["brand_category"] = (
    df["brand_category"]
    .str.title()
    .str.replace(r"(/|,|:)", " ", regex=True)
    .str.lower()
)
df["details"] = df["details"].str.title().str.replace("\n", " ", regex=False)
df["outfit_id"] = df["outfit_id"].str.upper()
df["outfit_item_type"] = df["outfit_item_type"].str.lower()

# Missing values become empty strings so the concatenations below never yield NaN.
df = df.replace(np.nan, "", regex=True)
# Blank out "unknown"/"Unknown" wherever it occurs inside a string cell.
df = df.replace(r"(unknown|Unknown)", "", regex=True)
# Move the current row labels into an "index" column and renumber the rows from 0.
df = df.reset_index()
# Free-text field combining brand, name, description and details.
df["new_column"] = df["brand"] + " " + df["product_full_name"] + " " + df["description"] + " " + df["details"]
# Display label in the form "<item type>:<product name>(<product id>)".
df["output_name"] = df["outfit_item_type"] + ":" + df["product_full_name"] + "(" + df["product_id"] + ")"
df["outfit_id"] = df["outfit_id"].astype(str)
df["product_id"] = df["product_id"].astype(str)

In [43]:
# Tokenize the sentences into words
from nltk.tokenize import RegexpTokenizer

# A token is either a word (optionally containing an apostrophe or hyphen) or a
# number / hyphenated number range.
tokenizer = RegexpTokenizer(r"(\w+['-]?[a-zA-Z']*[a-z]|[0-9]+-*[0-9]*)")
# Replace every text in new_column by its list of tokens.
df["new_column"] = df["new_column"].apply(tokenizer.tokenize)

In [50]:
# Remove stopwords

from nltk.corpus import stopwords
stop = stopwords.words('english')
# NLTK's English stop words are lower-case; a frozenset gives constant-time lookups.
_STOP_SET = frozenset(stop)

def remove_stopwords(text):
    """Return the tokens of `text` that are not English stop words.

    The comparison is case-insensitive: the tokens were title-cased during
    cleaning ("The", "And", ...) while the stop-word list is lower-case, so an
    exact comparison would let nearly every stop word through.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        The remaining tokens, in their original order and original casing.
    """
    return [w for w in text if w.lower() not in _STOP_SET]

# Filter every tokenised row and store it as one space-separated string.  Joining
# (instead of taking the Python repr of the list) keeps brackets and quote
# characters out of the text; tokens that contain an apostrophe would otherwise
# be wrapped in double quotes by the repr.
df["new_column"] = df["new_column"].apply(lambda tokens: " ".join(remove_stopwords(tokens)))

In [51]:
# select necessary columns
df = df.loc[
    :,
    [
        'index',
        'product_id',
        'brand',
        'name',
        'description',
        'brand_category',
        'details',
        'outfit_id',
        'outfit_item_type',
        'product_full_name',
        'new_column',
        'output_name',
    ],
]

## Recommend by product_id using fuzzywuzzy

In [28]:
# product recommendation by product_id
def similar_products(product_id):
    """Return the items of the outfit containing the best fuzzy match for `product_id`.

    The query is fuzzy-matched (fuzzywuzzy ``process.extractOne``) against the
    distinct product ids in the global ``df``.  The first outfit listed for the
    winning product is selected and every item of that outfit is returned.

    Parameters
    ----------
    product_id : str
        Product id to look up; it does not have to match a stored id exactly.

    Returns
    -------
    pandas.DataFrame
        A single ``output_name`` column with a fresh 0..n-1 index.  The frame is
        empty when ``df`` holds no product ids or no candidate matches.
    """
    no_result = pd.DataFrame({'output_name': pd.Series(dtype=object)})

    # Distinct ids in first-seen order: unlike list(set(...)), the order does not
    # change between sessions, so equally good matches are resolved the same way.
    candidates = df['product_id'].drop_duplicates().tolist()
    if not candidates:
        return no_result

    best_match = process.extractOne(product_id, candidates)
    if best_match is None:
        return no_result
    matched_id = best_match[0]

    # A product can belong to several outfits; only the first one is reported.
    outfit_code = df.loc[df['product_id'] == matched_id, 'outfit_id'].iloc[0]
    return df.loc[df['outfit_id'] == outfit_code, ['output_name']].reset_index(drop=True)

In [29]:
# Example: list the items of the outfit found for one product id.
similar_products('01DMHCNT41E14QWP503V7CT9G6')

Unnamed: 0,output_name
0,shoe:Penelope Mid Cap Toe Pump(01DMBRYVA2ZFDYR...
1,accessory1:Crystal Clutch(01DMHCNT41E14QWP503V...
2,bottom:Slim Knit Skirt(01DMBRYVA2P5H24WK0HTK4R...
3,top:Rib Mock Neck Tank(01DMBRYVA2PEPWFTT7RMP5A...


## Recommend by description using doc2vec

In [80]:
# get doc2vec
# Strip any bracket, quote, comma, "?" and "#" characters from the text, then
# lower-case it.  The pattern is a character class, so regex=True has to be
# explicit: since pandas 2.0 Series.str.replace matches literally by default and
# would leave the text untouched.  The double quote is included because a
# stringified token list wraps tokens containing an apostrophe in double quotes.
df['new_column'] = (
    df['new_column']
    .str.replace(r'[\[\]\'",?#]', '', regex=True)
    .str.lower()
)
# One whitespace-split token list per distinct text.
doc = pd.Series(df['new_column'].dropna().unique()).str.split()
documents = [TaggedDocument(words, [tag]) for tag, words in enumerate(doc)]
# NOTE(review): no seed is passed and training uses 4 workers, so the learned
# vectors (and therefore the recommendations) can differ from run to run.
model = Doc2Vec(documents, vector_size=50, window=4, min_count=2, workers=4)
# Inferred vector of every distinct text, each reshaped to a (1, vector_size) row.
doc_vec = [model.infer_vector(words).reshape(1, -1) for words in doc]

In [81]:
# merge with original data
# One row per distinct text: the text, its token list and its inferred vector.
final_column = df['new_column'].dropna().unique()
vec_df = pd.DataFrame({'new_column': final_column, 'doc': doc, 'doc_vec': doc_vec})
# Left join so that every row of df picks up the tokens and vector of its text.
data = df.merge(vec_df, how='left', on='new_column')
data = data.dropna(subset=['new_column'])
# Token lists for every row of df (not de-duplicated).
doc1 = df['new_column'].dropna().str.split()

In [82]:
# product recommendation by description
def similar_des(des):
    """Return the items of the outfit whose product text best matches `des`.

    The description is lower-cased and split into tokens, embedded once with the
    global Doc2Vec ``model`` and compared (cosine similarity) with the document
    vectors already stored in ``data['doc_vec']``.  The outfit of the
    highest-scoring row is selected and all of its items are returned.

    Side effect: the similarity scores are written to ``data['score']``.

    Parameters
    ----------
    des : str
        Free-text description, e.g. ``'leather ankle boots'``.

    Returns
    -------
    pandas.DataFrame
        A single ``output_name`` column with a fresh 0..n-1 index.  The frame is
        empty when `des` contains no tokens or ``data`` has no rows.
    """
    no_result = pd.DataFrame({'output_name': pd.Series(dtype=object)})

    # The corpus text is lower-cased and whitespace-tokenised, so the query must
    # be too; passing the whole sentence as one token would never hit the vocabulary.
    query_tokens = des.lower().split() if isinstance(des, str) else []
    if not query_tokens or data.empty:
        return no_result

    # Inference is stochastic, so embed the query exactly once and compare every
    # document against that same vector.
    query_vector = model.infer_vector(query_tokens).reshape(1, -1)
    # Each stored vector has shape (1, vector_size); stack them into one matrix.
    doc_matrix = np.vstack(data['doc_vec'].to_list())
    scores = cosine_similarity(doc_matrix, query_vector).ravel()

    # Stored on the shared frame as plain floats, one per row.
    data['score'] = scores
    result_df = data.sort_values(by=['score'], ascending=False)
    needed_id = result_df.iloc[0].outfit_id
    return result_df.loc[result_df['outfit_id'] == needed_id, ['output_name']].reset_index(drop=True)

In [78]:
# Example: list the items of the outfit found for a free-text description.
similar_des('leather ankle boots')

Unnamed: 0,output_name
0,shoe:Kittie Leather Ankle Boots(01DT5147G9J5P7...
1,top:Pleated Clean Collar Shirt(01DPGV52E2XTG7P...
2,accessory2:Theron Tweed Jacket(01DTJ8DDAG7F7XX...
3,accessory1:Devon Camera Bag In Croc-Embossed L...
4,bottom:Wendall Cropped Wide-Leg Jeans(01DTJ8G0...
