In [3]:
import gensim
import os
# Check that we're using the fast C++ word2vec implementation via Cython

assert gensim.models.doc2vec.FAST_VERSION
base_data_path = "D:\Projects\Datasets"
w2vec_path = os.path.join(base_data_path, "word2vec\GoogleNews-vectors-negative300.bin.gz")
model = gensim.models.KeyedVectors.load_word2vec_format(w2vec_path, binary=True)

In [4]:
# Load the documents and build a tf-idf matrix
import pandas as pd
import os
import numpy as np

document_path = os.path.join(base_data_path, "products\strawberry.json")
all_docs = pd.read_json(document_path)
all_docs = all_docs[all_docs.description.notnull()]
all_docs = all_docs[all_docs.id.notnull()]
all_docs.reset_index(inplace=True)
all_docs.describe()

Unnamed: 0,index,has_variants,id,is_deleted,original_price,price
count,7956.0,7956.0,7956.0,0.0,5846.0,7956.0
mean,5353.855706,0.0,153472.768728,,82.03079,60.909565
std,2781.236612,0.0,52164.105949,,82.578493,74.26447
min,0.0,0.0,7595.0,,5.5,4.0
25%,3205.75,0.0,127082.5,,35.5,25.0
50%,5570.5,0.0,169148.0,,57.0,43.0
75%,7694.25,0.0,193097.0,,96.5,71.0
max,9855.0,0.0,214224.0,,850.0,2870.0


In [5]:
all_docs.iloc[999]

index                                                               1453
brand                                                L'Artisan Parfumeur
category                                                         perfume
description                          Drole De Rose Eau De Toilette Spray
has_variants                                                           0
id                                                                127139
images                 [{'url': 'https://a.cdnsbn.com/images/products...
is_deleted                                                           NaN
original_price                                                     136.5
price                                                               64.5
product_image_urls     [https://a.cdnsbn.com/images/products/l/127139...
request_fingerprint             bdbffe5c72bc1c34a25f5289b2d2e2a6b999966d
size                                                         100ml/3.4oz
title                                Drole De Rose 

In [58]:
model.similar_by_word('sweet_tea', topn=25)

KeyError: "word 'sweet_tea' not in vocabulary"

In [116]:
def generate_sentence_vector_sum(sentence, wv):
    v = np.zeros(wv.vector_size)
    for w in sentence.split():
        if w.lower() in wv:
            #print(w.lower())
            v += wv[w.lower()]
    return v


def generate_averaged_sentence_vector(sentence, wv):
    v = generate_sentence_vector_sum(sentence, wv)
    return v / len(sentence)


def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [117]:
vec_a = generate_averaged_sentence_vector("hello world", model)
vec_b = generate_averaged_sentence_vector("dr dr dr", model)
cosine_sim(vec_a, vec_b)

0.14022197143923812

In [61]:
vec_a = generate_sentence_vector_sum("hello world", model)
vec_b = generate_sentence_vector_sum("dr dr dr", model)
cosine_sim(vec_a, vec_b)

hello
world
dr
dr
dr


0.14022197143923812

In [73]:
query = "Sweet & Savoury Body Scrub"


def find_top_n_docs(query: str, num_docs, sentences, w2v_model):
    
    vectors = sentences.map(lambda title: generate_averaged_sentence_vector(title, w2v_model.wv))
    query_vector = generate_averaged_sentence_vector(query, w2v_model)
    rankings = []
    i = 0
    for product_vector in vectors:
        product_similarity = cosine_sim(query_vector, product_vector)
        rankings.append([i, product_similarity])
        i += 1
        
    return_value = sorted(rankings, key=lambda ranking: -ranking[1])
    
    return return_value[:num_docs]

print(find_top_n_docs(query, 25, all_docs['title'], model))

result_idx = list(map(lambda item: item[0], find_top_n_docs(query, 25, all_docs['title'], model)))
print(result_idx)

all_docs.iloc[result_idx]["title"]



[[395, 0.8312897985602955], [921, 0.75899388591239825], [392, 0.74481696028392608], [414, 0.73493870519356164], [3923, 0.71813767334821355], [2553, 0.71724840490595287], [1068, 0.7118501522458538], [377, 0.71133853979761985], [362, 0.70684602308440425], [403, 0.70597779368451929], [3059, 0.70409631359083125], [1831, 0.69981714154766639], [623, 0.69804007811543878], [1522, 0.69535971870249058], [376, 0.69523443498906767], [1481, 0.69490161303587739], [3639, 0.6935116673535423], [508, 0.68733776178976491], [1525, 0.68732810554056778], [1642, 0.68721691017456821], [1067, 0.686395468771967], [391, 0.68451861403070724], [371, 0.68310969646242092], [1886, 0.68287109044915961], [1195, 0.6793339125331227]]


[395, 921, 392, 414, 3923, 2553, 1068, 377, 362, 403, 3059, 1831, 623, 1522, 376, 1481, 3639, 508, 1525, 1642, 1067, 391, 371, 1886, 1195]


395             Pomegranate, Argan & Grapeseed Body Scrub
921                                      Candy Body Scrub
392     Tarocco Orange, Eucalyptus & Sage Deep Cleansi...
414                        Almond & Aloe Hand & Body Wash
3923    Bundle Of Joy Set: Newborn 2-in-1 Hair & Body ...
2553    Lovely Clean & Perfume Body Lotion (Exp. Date ...
1068                            Lavender Hand & Body Wash
377     Avocado, Olive & Basil Perfect Pair: Bath & Sh...
362                Exfoliating Body Scrub for Smooth Skin
403                        Pear & Pink Magnolia Body Wash
3059                             Seaweed & Sage Body Wash
1831                        Elixir Exfoliating Body Scrub
623                      Creamy Body Scrub - Vanilla Bean
1522    Vanille & Coco Perfumed Body Cream (New Packag...
376     Citron, Honey & Coriander Duo: Bath & Shower G...
1481                        Omnia Coral Gentle Body Scrub
3639    Strawberry Scrub Fruit Enzyme Polisher - For F...
508     Nashi 

In [118]:
# Train a custom word2vec model from document titles

title_model = gensim.models.Word2Vec(all_docs['title'], min_count=10, workers=4, size=100)

In [119]:
query = "Exfoliating Gel"

title_vectors = all_docs["title"]
print(find_top_n_docs(query, 25, title_vectors, title_model))

result_idx = list(map(lambda item: item[0], find_top_n_docs(query, 25, title_vectors, title_model)))
print(result_idx)

all_docs.iloc[result_idx]["title"]

ValueError: non-broadcastable output operand with shape () doesn't match the broadcast shape (100,)