In [29]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity
import json 


In [30]:
# Load the training interactions, the item metadata and the id mappings
train_df = pd.read_csv("data/train.csv")
meta_df = pd.read_csv("data/item_metadata_filtered.csv")
with open("data/id_mappings.json") as f:
    id_map = json.load(f)

# Map each ASIN to its integer item id and attach that id to the metadata rows
asin_to_id = {asin: int(raw_id) for asin, raw_id in id_map["item_mapping"].items()}
item_mapping_df = pd.DataFrame(
    {"parent_asin": list(asin_to_id.keys()), "item_id": list(asin_to_id.values())}
)
meta_df = meta_df.merge(item_mapping_df, on="parent_asin", how="left")

# Bring the item's main category onto every training interaction
train_df = pd.merge(
    train_df,
    meta_df[["item_id", "main_category"]],
    on="item_id",
    how="left",
)


# Preprocess metadata: replace missing values with an empty string.
# NOTE(review): average_rating and price look numeric in the displayed frame;
# filling them with "" mixes strings into those columns — confirm that the
# consumers of these columns expect that.
for blank_fill_column in ("title", "store", "description", "average_rating", "price", "main_category"):
    meta_df[blank_fill_column] = meta_df[blank_fill_column].fillna("")
# Missing image lists become the string form of an empty list
meta_df["image_urls"] = meta_df["image_urls"].fillna("[]")

# Index by ASIN for lookup
meta_df = meta_df.set_index("parent_asin")

In [31]:
# Build one free-text document per item from its title, description and store.
# Note: `df` is an alias of `meta_df`, so the new columns appear on both names.
df = meta_df
text_parts = [df[column].astype(str) for column in ('title', 'description', 'store')]
df['content'] = text_parts[0] + ' ' + text_parts[1] + ' ' + text_parts[2]
df['content'] = df['content'].fillna('')

# Tokenize each document into a list of tokens for Word2Vec
df['tokenized_content'] = df['content'].map(simple_preprocess)

# Create the Word2Vec model without training it yet
w2v_settings = dict(vector_size=100, window=5, min_count=1, workers=4)
model = Word2Vec(**w2v_settings)

# Register every token of the corpus in the model's vocabulary
model.build_vocab(df['tokenized_content'])

In [32]:
# Fit the embeddings on the tokenized item documents.
# Caution: this cell is not idempotent — running it again calls train() on the
# same model object a second time; re-run the model-creation cell first.
model.train(
    df['tokenized_content'],
    total_examples=model.corpus_count,
    epochs=10,
)

def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean embedding of the in-vocabulary tokens of one text.

    Parameters
    ----------
    words : iterable of str
        Tokens of the text.
    model : object
        Anything exposing ``model.wv[token]`` as the token's vector.
    vocabulary : container of str
        Tokens that may be looked up; tokens outside it are skipped.
    num_features : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``num_features``; all zeros when none of the
        tokens is in ``vocabulary``.
    """
    known_tokens = [token for token in words if token in vocabulary]

    # Accumulate in float64, one vector at a time
    total = np.zeros((num_features,), dtype="float64")
    for token in known_tokens:
        total = total + model.wv[token]

    if not known_tokens:
        # Nothing to average: keep the zero vector instead of dividing by zero
        return total
    return np.divide(total, float(len(known_tokens)))

def averaged_word_vectorizer(corpus, model, num_features):
    """Embed every tokenized document of ``corpus`` as an averaged word vector.

    Parameters
    ----------
    corpus : iterable of list of str
        One token list per document.
    model : object
        Model exposing ``model.wv.index_to_key`` (its vocabulary) and
        ``model.wv[token]`` (the token's vector).
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Matrix of shape ``(number of documents, num_features)``, one row per
        document in corpus order. An empty corpus yields shape
        ``(0, num_features)``.
    """
    # A set gives constant-time membership checks for every token lookup
    vocabulary = set(model.wv.index_to_key)
    print("DoneVoc")
    features = [
        average_word_vectors(tokens, model, vocabulary, num_features)
        for tokens in corpus
    ]
    print("DoneFeatures")
    if not features:
        # np.array([]) would be 1-D (shape (0,)); keep the result 2-D so that
        # downstream matrix operations still receive a well-formed array.
        return np.empty((0, num_features), dtype="float64")
    return np.array(features)



In [33]:
# Preview the first five metadata rows, including the derived text columns
df.head(n=5)

Unnamed: 0_level_0,main_category,title,average_rating,rating_number,price,store,features,description,images,categories,image_count,has_images,image_urls,category,item_id,content,tokenized_content
parent_asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
B07WFSQXL5,All Beauty,PPY Eyelash Growth Serum – Natural Ingredients...,3.9,114.0,,PPY,[],[],{'hi_res': array(['https://m.media-amazon.com/...,[],27,True,['https://m.media-amazon.com/images/I/615N6fkc...,All_Beauty,66457,PPY Eyelash Growth Serum – Natural Ingredients...,"[ppy, eyelash, growth, serum, natural, ingredi..."
B08BV6F6BC,All Beauty,Wixar Natural Sea Moss Soap - (2 PACK) - Laven...,4.4,41.0,,WIXAR NATURALS,[],[],{'hi_res': array(['https://m.media-amazon.com/...,[],21,True,['https://m.media-amazon.com/images/I/81ugegqe...,All_Beauty,71310,Wixar Natural Sea Moss Soap - (2 PACK) - Laven...,"[wixar, natural, sea, moss, soap, pack, lavend..."
B07Z818MLY,All Beauty,7 Packs Deep Wave Crochet Hair 22 Inch Deep wa...,3.4,10.0,,Yun Mei Hair,[],[],{'hi_res': array(['https://m.media-amazon.com/...,[],21,True,['https://m.media-amazon.com/images/I/71aVcpK8...,All_Beauty,67591,7 Packs Deep Wave Crochet Hair 22 Inch Deep wa...,"[packs, deep, wave, crochet, hair, inch, deep,..."
B071DY8Z4B,All Beauty,BEWAVE Hair Brush Sponge Twist With Comb Hair ...,4.2,24.0,,BEWAVE,[],[],{'hi_res': array(['https://m.media-amazon.com/...,[],17,True,['https://m.media-amazon.com/images/I/61or2jYp...,All_Beauty,52585,BEWAVE Hair Brush Sponge Twist With Comb Hair ...,"[bewave, hair, brush, sponge, twist, with, com..."
B0BTLTVR1X,All Beauty,"Zydeco Chop Chop Cajun Seasoning Base, 8 Ounce...",4.7,21.0,,BORELTH,"['All Natural blend of Dehydrated Onion, Dehyd...",['Zydeco Chop Chop is a blend of Dehydrated On...,{'hi_res': array(['https://m.media-amazon.com/...,[],3,True,['https://m.media-amazon.com/images/I/71707mY6...,All_Beauty,77706,"Zydeco Chop Chop Cajun Seasoning Base, 8 Ounce...","[zydeco, chop, chop, cajun, seasoning, base, o..."


In [40]:
# Compute the averaged word vector of every item; row k of the result
# corresponds to row k of df. The vector length is taken from the model so it
# cannot drift from the vector_size the model was created with.
w2v_feature_array = averaged_word_vectorizer(
    corpus=df['tokenized_content'],
    model=model,
    num_features=model.vector_size,
)
print(w2v_feature_array)

# def get_top_10(items):
#     # Get the user input
#     # index = 71310

#     # Find the index of the user movie
#     movie_index = df[df['item_id'].isin(items)].index    # label
#     print(movie_index)
#     movie_pos = df.index.get_loc(movie_index)            # positional index for NumPy

#     # Compute similarity
#     user_movie_vector = w2v_feature_array[movie_pos].reshape(1, -1)
#     similarity_scores = cosine_similarity(user_movie_vector, w2v_feature_array)

#     # Get top 20 most similar movies (excluding self)
#     similar_movies = list(enumerate(similarity_scores[0]))
#     sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)[1:20]
#     return sorted_similar_movies


def get_top_10(items, top_n=20):
    """Rank catalogue items by similarity to a set of seed items.

    The seed items' vectors are averaged into one profile vector, which is
    compared by cosine similarity against every row of ``w2v_feature_array``.
    Relies on the notebook globals ``df`` and ``w2v_feature_array``, whose rows
    must be in the same order.

    Parameters
    ----------
    items : list-like of int
        ``item_id`` values of the seed items.
    top_n : int, optional
        Maximum number of results. Despite the function's name the default is
        20, which is the cut-off this function has always used.

    Returns
    -------
    list of (int, float)
        ``(row position in df, cosine similarity)`` pairs, most similar first,
        never including the seed items. Empty when no seed item is found in
        ``df`` or when ``top_n`` is not positive.
    """
    # Boolean mask -> row positions. Unlike df.index.get_loc this stays correct
    # when the parent_asin index holds duplicate labels.
    seed_mask = df['item_id'].isin(items).to_numpy()
    seed_positions = np.flatnonzero(seed_mask)
    if seed_positions.size == 0 or top_n <= 0:
        # With no seeds the mean below would be NaN and cosine_similarity
        # would reject it; there is simply nothing to recommend from.
        return []

    # Average the seed vectors into a single profile vector
    user_vector = w2v_feature_array[seed_positions].mean(axis=0).reshape(1, -1)

    # Cosine similarity of the profile against every item
    similarity_scores = cosine_similarity(user_vector, w2v_feature_array)[0]

    # Rank only the non-seed rows; a stable sort keeps tied scores in row order
    candidate_positions = np.flatnonzero(~seed_mask)
    ranking = np.argsort(-similarity_scores[candidate_positions], kind="stable")[:top_n]

    return [
        (int(position), float(similarity_scores[position]))
        for position in candidate_positions[ranking]
    ]
    
# Recommend for a profile made of two seed items
sorted_similar_movies = get_top_10([71310, 66457])

# Print results as "<row position in df>: <item_id>"
item_id_column = df['item_id']
for pos, score in sorted_similar_movies:
    print(f"{pos}: {item_id_column.iloc[pos]}")


DoneVoc
DoneFeatures
[[-0.98938797 -0.63321898  0.37841503 ...  0.68543627  0.39928239
   1.06550875]
 [-0.89971217 -0.2223519   0.99634223 ...  1.22436567  0.11091638
   0.41460643]
 [-0.55975644  0.42546408 -0.12629673 ...  1.93767114  0.69386993
   0.43358062]
 ...
 [ 0.47057001 -1.15322939 -0.49671129 ...  0.24202227  0.72934528
   0.77827651]
 [ 0.20904753 -0.58179728  0.8848865  ...  0.27003405  0.38788676
   0.2032381 ]
 [ 0.16033345 -1.41082833 -0.68430082 ...  0.51842156  0.86146028
   0.54123062]]
8198: 46750
9048: 50081
13368: 62870
1331: 62363
1118: 62143
18395: 71526
7701: 62298
14898: 63108
3620: 41153
14854: 49950
25133: 48799
6078: 51297
10930: 60679
7252: 69286
10284: 45621
13481: 43758
16785: 37827
21776: 39924
12592: 67936
227: 46685


In [35]:
# Show the ten best (row position, similarity) pairs
sorted_similar_movies[0:10]

[(1118, 0.9497015259651863),
 (12592, 0.921190815781187),
 (14246, 0.9202295899954112),
 (10114, 0.9157248711687271),
 (25620, 0.9070783657161763),
 (17794, 0.9054662337818759),
 (14599, 0.9012616568329038),
 (14817, 0.9011115494482287),
 (1398, 0.8997760859739761),
 (17251, 0.899592135816312)]