In [7]:
# Takes a long time to download (1.6GB)
#import gensim.downloader as api

#print(api.info())

#model = api.load("word2vec-google-news-300")


In [1]:
import re
import numpy as np
import pandas as pd
import gensim.downloader as api
from tqdm import tqdm 
import os



In [2]:
# pre-processing function
def preprocess_text(text):
    """Turn a raw text value into a list of lowercase alphabetic tokens.

    Parameters
    ----------
    text : str or any scalar
        The raw value, typically one cell of a DataFrame column. Missing
        values (as detected by ``pd.isnull``) are allowed. Non-string scalars
        (e.g. a cell that was parsed as a number) are converted with ``str``
        instead of raising ``AttributeError``.

    Returns
    -------
    list of str
        Whitespace-separated tokens made only of the letters a-z; an empty
        list when the value is missing or has no alphabetic content.
    """
    if pd.isnull(text):
        return []

    # a numeric-looking cell may arrive as int/float rather than str
    if not isinstance(text, str):
        text = str(text)

    # lowercase the text
    text = text.lower()

    # remove special characters and numbers (keep only letters and whitespace)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # tokenize (split on runs of whitespace)
    return text.split()

In [3]:
# sentence embedding generator: mean of the word vectors of the known tokens
def get_word2vec_embeddings(tokens, model):
    """Average the vectors of every token the model knows into one embedding.

    Tokens that are not in ``model`` are skipped. When no token is known
    (including when ``tokens`` is empty) a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    known_vectors = [model[token] for token in tokens if token in model]

    # nothing to average -> fall back to an all-zero vector
    if not known_vectors:
        return np.zeros(model.vector_size)

    # element-wise mean across the collected word vectors
    return np.mean(known_vectors, axis=0)

In [4]:
# Show the names of the pretrained models offered by the gensim downloader.
print("Available models:")
available_models = api.info()['models']
print(available_models.keys())

Available models:
dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])


In [5]:
# Chose a smaller pretrained model than "word2vec-google-news-300" (1.6GB)
# to save download time and disk space.
model_name = "glove-wiki-gigaword-50"  # 70MB

# Load the model through the gensim downloader (`api`).
# NOTE(review): presumably this downloads the model on first use and reuses a
# local copy afterwards — confirm against the gensim downloader docs.
model = api.load(model_name)

In [26]:
# TEST RUN WITH EXAMPLE WORDS FOR TESTING WORD AND SENTENCE EMBEDDINGS
# # example words
# words = ["arda", "asmi", "paige", "tanisha", "rebecca", "vaishnavi"]
# for word in words:
#     embedding = model[word]
#     print(f"Embedding for '{word}':")
#     print(embedding)
#     print(embedding.shape)
# # Example of preprocessing and generating sentence embedding
# sample_text = "american express team 1 is the best team"
# tokens = preprocess_text(sample_text)
# print(f"preprocessed tokens: {tokens}")

# sentence_embedding = get_word2vec_embeddings(tokens, model)
# print(sentence_embedding.shape)
# print(f"Sentence embedding: {sentence_embedding}")


In [6]:
# Input product table and output embeddings file, relative to the notebook's
# working directory.
input_path = '../data/products.csv'
output_path = '../data/brand_embeddings.csv'

# Number of CSV rows read per chunk.
chunk_size = 10_000

# Starts out empty.
embeddings_list = list()

In [7]:
# Stream the product table chunk by chunk, embed each row's brand name, and
# write the results to `output_path` as the columns `brand` and
# `brand_embedding`. Only the `brand` column is used, so it is the only
# column parsed from the CSV.
with pd.read_csv(input_path, usecols=['brand'], chunksize=chunk_size) as reader:
    for chunk_number, chunk in enumerate(tqdm(reader, desc="Processing Chunks")):
        # tokenize the brand names (missing brands become an empty token list)
        tokens = chunk['brand'].apply(preprocess_text)

        # one averaged vector per row, converted to a plain list so it can be
        # stored in a single CSV cell
        embeddings = tokens.apply(
            lambda row_tokens: get_word2vec_embeddings(row_tokens, model).tolist()
        )

        embeddings_df = pd.DataFrame({
            'brand': chunk['brand'],
            'brand_embedding': embeddings,
        })

        # The first chunk creates/overwrites the file and writes the header;
        # every later chunk is appended without repeating the header.
        is_first_chunk = chunk_number == 0
        embeddings_df.to_csv(
            output_path,
            index=False,
            header=is_first_chunk,
            mode='w' if is_first_chunk else 'a',
        )

        # release this chunk's intermediates before the next chunk is parsed
        del tokens, embeddings, embeddings_df


Processing Chunks: 3it [00:01,  2.66it/s]
