In [28]:
import os
import pandas as pd
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.spatial.distance import cityblock as manhattan_distance
from scipy.spatial.distance import minkowski as minkowski_distance
from scipy.stats import spearmanr
from transformers import AutoTokenizer, AutoModel
import pymongo

In [29]:
# Load the SentenceTransformer checkpoint named 'bert-base-nli-mean-tokens'.
# `model.encode(...)` is used by the later cells to embed category strings,
# keyword lists and the user query.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [30]:
# Location of the processed product CSV. The PROCESSED_DATA_CSV environment
# variable overrides the machine-specific default so the notebook can run on
# another machine without editing this cell.
dir_path = os.environ.get(
    "PROCESSED_DATA_CSV",
    "C:\\Users\\Hassa\\OneDrive\\Desktop\\PRODUCT_SCOUT\\Back-End\\processed_data.csv",
)
# The later cells read the "Category", "Sub-Category" and "Sub-Sub-Category" columns.
df = pd.read_csv(dir_path)

In [31]:
# Load the mapping whose keys are 'Category > Sub-Category > Sub-Sub-Category'
# strings (they are split on ' > ' in the next cell) and whose values are the
# queries associated with that category path.
# NOTE: pickle.load can execute arbitrary code while deserialising, so this
# file must come from a trusted source.
with open('categoric_keywords.pkl', 'rb') as f:
    key_values = pickle.load(f)

In [32]:
# One [Category, Sub-Category, Sub-Sub-Category] list per row of `df`.
# Iterating the columns positionally avoids label-based `df[col][i]` lookups,
# which raise KeyError whenever the frame's index is not exactly 0..n-1.
cats = [
    list(row)
    for row in zip(df["Category"], df["Sub-Category"], df["Sub-Sub-Category"])
]

# Distinct category paths present in the data, as hashable tuples, so each
# membership test below is a set lookup instead of a scan over a list.
known_category_paths = {tuple(path) for path in cats}

# Keep only the keyword entries whose 'A > B > C' key names a category path
# that actually occurs in `df`; insertion order follows `key_values`.
category_dictionary = {}
for key, queries in key_values.items():
    category_path = tuple(key.split(' > '))
    if category_path in known_category_paths:
        category_dictionary[category_path] = queries

# Category paths (as lists) in the same order as `category_dictionary`.
categories = [list(key) for key in category_dictionary.keys()]

# Query lists for every retained category path, in `category_dictionary` order.
key_word_list = list(category_dictionary.values())

# SECURITY: the connection string (which embeds the username and password) is
# read from the environment instead of being written into the notebook.
# Any credentials that were ever committed with this notebook should be rotated.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this cell."
    )

client = pymongo.MongoClient(mongo_uri)
db = client["popularities"]
collection = db["popularity_data"]
# Cursor over every document in the collection; it is consumed once by the
# loop that builds `keyword_data`.
docs = collection.find()

# Flatten every popularity document into a record holding its three category
# levels and its query text.
keyword_data = []
for doc in docs:
    # Split the 'A > B > C' mapping once, on a single consistent delimiter.
    # Splitting each level with a different delimiter (' >', ' > ', '> ')
    # leaves trailing whitespace on the third level for deeper paths.
    path_parts = doc['mapping'].split(' > ')
    if len(path_parts) < 3:
        raise ValueError(
            "Expected a 'Category > Sub-Category > Sub-Sub-Category' mapping, "
            f"got {doc['mapping']!r}"
        )
    dictionary = {
        "Category": path_parts[0],
        "Sub-Category": path_parts[1],
        "SubSub-Category": path_parts[2],
        "Keyword": doc['query']
    }
    keyword_data.append(dictionary)

# Bucket the keyword records by their (Category, Sub-Category, SubSub-Category)
# triple, preserving first-seen order of the triples.
groups = {}
for record in keyword_data:
    triple = (record['Category'], record['Sub-Category'], record['SubSub-Category'])
    groups.setdefault(triple, []).append(record)

# One summary dict per triple, carrying the list of all its keywords.
result = [
    {
        'Category': top_level,
        'Sub-Category': second_level,
        'SubSub-Category': third_level,
        'Keywords': [record['Keyword'] for record in members],
    }
    for (top_level, second_level, third_level), members in groups.items()
]

In [34]:
# "Categorical" view: each category path flattened into one space-separated string.
category_query_list = [" ".join(path) for path in categories]
category_query_embeddings_categorical = model.encode(category_query_list)

# "Hierarchical" view: all first-level names, then all second-level names,
# then all third-level names, so level k of category i sits at row
# i + k * len(categories).
category_query_list_hierarchical = [
    path[level] for level in range(3) for path in categories
]
category_query_embeddings_hierarchical = model.encode(category_query_list_hierarchical)

# One embedding matrix per keyword group, aligned index-for-index with `result`.
keyword_embeddings = [model.encode(entry['Keywords']) for entry in result]

In [43]:
# Free-text query to classify; the next cell embeds it with `model.encode`
# and maps it to a [Category, Sub-Category, Sub-Sub-Category] triple.
user_query = "condoms"

In [44]:
# Map `user_query` to a category triple by majority vote over three ranking
# strategies, each of which nominates its TOP_K best-scoring triples.

# Number of top-ranked categories every strategy contributes to the vote.
TOP_K = 3

# Encode the query once; all three strategies reuse this vector.
user_query_embedding = model.encode([user_query])[0]
query_matrix = user_query_embedding.reshape(1, -1)

# --- Strategy 1: similarity to the flattened "cat sub subsub" string ---------
# A single vectorized call yields one score per category; scores are stored as
# plain floats so sorting does not rely on truth-testing 1x1 arrays.
categorical_similarities = cosine_similarity(
    query_matrix, category_query_embeddings_categorical
)[0]
category_scores = [
    (category, float(score))
    for category, score in zip(categories, categorical_similarities)
]
sorted_scores_categorical = sorted(category_scores, key=lambda x: x[1], reverse=True)

# --- Strategy 2: mean of the three per-level similarities --------------------
# The hierarchical embeddings are laid out as [all level-1 | all level-2 |
# all level-3], each slice holding len(categories) rows.
category_count = len(categories)
hierarchical_similarities = cosine_similarity(
    query_matrix, category_query_embeddings_hierarchical
)[0]
hierarchical_means = (
    hierarchical_similarities[:category_count]
    + hierarchical_similarities[category_count:2 * category_count]
    + hierarchical_similarities[2 * category_count:3 * category_count]
) / 3
category_scores_hierarchical = [
    (category, float(score))
    for category, score in zip(categories, hierarchical_means)
]
sorted_scores_hierarchical = sorted(category_scores_hierarchical, key=lambda x: x[1], reverse=True)

# Slicing (rather than indexing [0], [1], [2]) tolerates fewer than TOP_K candidates.
SCORES1 = [category for category, _ in sorted_scores_categorical[:TOP_K]]
SCORES2 = [category for category, _ in sorted_scores_hierarchical[:TOP_K]]

# --- Strategy 3: summed similarity between the query and each keyword group --
# NOTE(review): summing (instead of averaging) favours groups with more
# keywords -- confirm this is intended.
for index, d in enumerate(result):
    similarities = cosine_similarity(keyword_embeddings[index], query_matrix)
    d['Similarity'] = similarities.sum()

similarity_scores = [d['Similarity'] for d in result]
max_similarity_indices = sorted(
    range(len(similarity_scores)),
    key=lambda i: similarity_scores[i],
    reverse=True,
)[:TOP_K]

SCORES3 = [
    [
        result[i]['Category'],
        result[i]['Sub-Category'],
        result[i]['SubSub-Category'],
    ]
    for i in max_similarity_indices
]

# --- Vote ---------------------------------------------------------------------
combined_list = SCORES1 + SCORES2 + SCORES3

counts = {}
for lst in combined_list:
    key = tuple(lst)
    counts[key] = counts.get(key, 0) + 1

# sorted() is stable, so ties keep nomination order (strategy 1 first).
sorted_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)

if not sorted_counts:
    raise ValueError("No candidate categories were produced for the query.")

# Every counted key was nominated by at least one strategy, so the most
# frequent key is the answer without any further membership check.
categoric_mapping = list(sorted_counts[0][0])

print(categoric_mapping)

['Health & Beauty', 'Sexual Wellness', 'Condoms']


In [48]:
# Directory that receives the pickled artifacts. The PRODUCT_SCOUT_OUTPUT_DIR
# environment variable overrides the machine-specific default.
output_directory = os.environ.get(
    "PRODUCT_SCOUT_OUTPUT_DIR",
    "C:\\Users\\Hassa\\OneDrive\\Desktop\\PRODUCT_SCOUT\\Back-End",
)

# File name -> object to persist; one loop replaces four copy-pasted stanzas.
artifacts = {
    'categories.pkl': categories,
    'category_embeddings_categorical.pkl': category_query_embeddings_categorical,
    'category_embeddings_hierarchical.pkl': category_query_embeddings_hierarchical,
    'keyword_embeddings.pkl': keyword_embeddings,
}

for file_name, artifact in artifacts.items():
    with open(os.path.join(output_directory, file_name), 'wb') as f:
        pickle.dump(artifact, f)