<a href="https://colab.research.google.com/github/andrea-bordon/andrea-bordon/blob/main/Product_embeddings_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from datetime import datetime
%matplotlib inline

In [None]:
# Stack the "train" and "prior" order-line files into one frame.
# pd.concat is the public pandas API for this; DataFrame._append is a private
# helper that can change or disappear between pandas releases.
# NOTE(review): assumes both CSVs have the same columns - confirm.
orders = pd.concat(
    [
        pd.read_csv('./data/order_products__train.csv'),
        pd.read_csv('./data/order_products__prior.csv'),
    ]
)
# Product table; it is joined onto the order lines through product_id.
products = pd.read_csv('./data/products.csv')

In [None]:
relevant_cols = ['order_id', 'product_name']

# Fraction of rows to keep. Lower it for faster iteration while testing the
# code; run the full dataset (sample_size = 1) before commit.
# NOTE: sampling is row-wise, so with a fraction below 1 individual orders
# lose some of their products rather than whole orders being dropped.
sample_size = 1
# Fixed seed so the shuffle/downsample returns the same rows on every run.
random_seed = 42

baskets = (orders
           .merge(products, on='product_id', how='left')
           .sample(frac=sample_size, random_state=random_seed)
          )[relevant_cols]

# memory management on my local computer
del orders, products

In [None]:
# Preview: first 20 rows after ordering by order_id.
baskets.sort_values(by='order_id').head(n=20)

In [None]:
# Vocabulary size: number of distinct product names.
num_items = baskets['product_name'].nunique()
# Embedding length: floor of the fourth root of the vocabulary size.
embedding_size = np.floor(num_items ** 0.25).astype('int')
print(
    '''Let's use vectors of length {n} for {tokens} products'''
    .format(n=embedding_size, tokens=num_items)
)

# Largest count of distinct products found in any single order.
biggest_basket = baskets.groupby('order_id')['product_name'].nunique().max()
print(
    '''The biggest basket (window in our algorithm) will be {}'''
    .format(biggest_basket)
)

In [None]:
# One list of product names per order; each list is later passed to Word2Vec
# as a sentence.
# Selecting the column before aggregating avoids calling a Python lambda on a
# full DataFrame per group (slow) and avoids DataFrameGroupBy.apply operating
# on the grouping column, which recent pandas versions deprecate.
# NOTE: despite the name, the result is a pandas Series indexed by order_id.
df_of_basket_lists = (baskets
        .groupby('order_id')['product_name']
        .agg(list)
       )

#memory management
del baskets

In [None]:
# Number of baskets (one entry per order_id).
print(len(df_of_basket_lists))
# The preview must be the cell's last expression: a notebook only renders the
# value of the final expression, so a bare .head() placed before print() was
# evaluated and silently discarded.
df_of_basket_lists.head()

In [None]:
# Train the product embeddings: baskets are the sentences, products the tokens.
# The context window is set to the size of the largest basket.
model = Word2Vec(
    sentences=df_of_basket_lists,
    vector_size=embedding_size,
    window=biggest_basket,
)

In [None]:
def cosine_similarity(word_u,word_v,model):
    """
    Compute the cosine similarity between the embeddings of two products.

    The embeddings are looked up by key in ``model.wv``.

    Arguments:
        word_u - key of the first product in ``model.wv``
        word_v - key of the second product in ``model.wv``
        model - object exposing a ``wv`` lookup that returns a numpy vector
                of shape (n,) for a key (e.g. a trained gensim Word2Vec model)

    Returns:
        cosine similarity between the embeddings of word_u and word_v

    Raises:
        Whatever ``model.wv[key]`` raises when a key is unknown.
    """
    #get embeddings from gensim model
    u = model.wv[word_u]
    v = model.wv[word_v]

    #compute similarity: cos(u, v) = (u . v) / (|u| * |v|)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    # Named `similarity` so the local does not shadow the function's own name.
    similarity = np.dot(u, v) / norm_product

    return similarity

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
def display_pca_scatterplot(model, words=None, sample=0):
    """
    Scatter-plot word vectors projected onto their first two principal components.

    Arguments:
        model - keyed-vector store supporting ``model[word]`` lookup and exposing
                ``index_to_key`` (e.g. the ``wv`` attribute of a gensim 4
                Word2Vec model)
        words - optional sequence of keys to plot; when None the keys are taken
                from the model's vocabulary
        sample - used only when ``words`` is None: if greater than 0, that many
                 distinct keys are drawn at random from the vocabulary (capped
                 at the vocabulary size); otherwise the whole vocabulary is
                 plotted

    Returns:
        None. Draws on a new matplotlib figure; the caller decides when to
        show it.
    """
    # `is None` rather than `== None`: with a numpy array `==` is element-wise
    # and the `if` would raise on the ambiguous truth value.
    if words is None:
        vocabulary = list(model.index_to_key)
        if sample > 0:
            # replace=False so the same key is not plotted and labelled twice;
            # the size is capped because sampling without replacement cannot
            # return more items than exist.
            words = np.random.choice(
                vocabulary, min(sample, len(vocabulary)), replace=False
            )
        else:
            # gensim 4 removed KeyedVectors.vocab; index_to_key is the
            # replacement and matches the sampling branch above.
            words = vocabulary

    word_vectors = np.array([model[w] for w in words])

    # Fit a full PCA and keep the first two components as x/y coordinates.
    twodim = PCA().fit_transform(word_vectors)[:,:2]

    plt.figure(figsize=(16,10))
    plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
    for word, (x,y) in zip(words, twodim):
        # Small offset so the label does not sit on top of its marker.
        plt.text(x+0.05, y+0.05, word)

In [None]:
# Ten most similar products to 'Chicken Fingers' according to the embeddings.
model.wv.most_similar(positive=['Chicken Fingers'], topn=10)

In [None]:
# Hand-picked products from the same aisle and the same category, used as
# the set of points for the PCA scatter plot.
products_draw = [
    'Spicy Chicken Breast Patties',
    'Gluten Free Crispy Battered Haddock',
    'Key West Pink Shrimp',
    'Potato Crunch Fish Fillets',
    'Classic Seasoning with Lemon Skillet Crisp Tilapia',
    'Mini Crispy Crabless Cakes',
    'Whole Grain Breaded Chicken Breast Chunks',
    'Chipotle Black Bean Burger',
    'Cracked Peppercorn Tilapia',
    'Angus Beef Meatballs',
    'Crispy Strips',
    'Chicken Fingers',
    'Patties, Beef, Quarter Pound',
    'Original Turkey Burgers Smoke Flavor Added',
    'Tortilla Crusted Tilapia',
    'Crispy Chicken',
    'Breaded Vegan Coconut Shrimp',
    'Breaded Chicken Patties',
    'Crab Cakes',
    'Breaded Nuggets Chicken Breast',
]

In [None]:
# Plot the hand-picked products in the 2-D PCA projection, then render it.
display_pca_scatterplot(model=model.wv, words=products_draw)
plt.show()