In [135]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

In [136]:
import pandas as pd
import json

df = pd.read_csv("save/data.csv", encoding="utf-8")
summaries = json.load(open("./save/summaries.json", encoding="utf-8"))

In [137]:
from tqdm.auto import tqdm

embeddings_list = []
string_list = []
for i, row in tqdm(df.iterrows()):
    name_embedding = model.encode(row["name"])
    embeddings_list.append(name_embedding)
    string_list.append(row["name"])
    if row["name"] not in summaries:
        summary = row["name"]
        print(row["name"])
    else:
        summary = summaries[row["name"]]
    summary_embedding = model.encode(summary)
    embeddings_list.append(summary_embedding)
    string_list.append(summary)
    category_list = json.loads(row["categories"])
    for category in category_list:
        name_embedding = model.encode(category)
        embeddings_list.append(name_embedding)
        string_list.append(category)

0it [00:00, ?it/s]

In [138]:
import numpy as np
import os

if not os.path.exists("save"):
    os.mkdir("save")

json.dump(string_list, open("save/string_list.json", "w"))
embedding_array = np.array(embeddings_list)
np.save("save/embedding_array.npy", embedding_array)

In [139]:
loaded_string_list = json.load(open("save/string_list.json"))
loaded_embedding_array = np.load("save/embedding_array.npy")

string_to_embedding = {
    key: value
    for key, value in zip(loaded_string_list, loaded_embedding_array)
}

In [140]:
all_categories = []
name_to_categories = {}
for i, value in df.iterrows():
    name = value["name"]
    categories = json.loads(value["categories"])
    all_categories.extend(categories)
    name_to_categories[name] = categories

all_categories = list(set(all_categories))

In [141]:
all_categories_embeddings = np.array([string_to_embedding[category] for category in all_categories])

In [142]:
name_to_all_scores = {}
for i, value in df.iterrows():
    name = value["name"]
    scores = model.similarity(string_to_embedding[name], all_categories_embeddings)
    name_to_all_scores[name] = scores

In [143]:
name_to_all_summary_scores = {}
for i, value in df.iterrows():
    summary = summaries[value["name"]]
    scores = model.similarity(string_to_embedding[summary], all_categories_embeddings)
    name_to_all_summary_scores[value["name"]] = scores

In [144]:
import torch

def get_rankings(book_name: str, version: str):
    if version == "raw":
        name_to_scores = name_to_all_scores
    elif version == "summary":
        name_to_scores = name_to_all_summary_scores
    else:
        raise ValueError("Unknown version")
    _, index = torch.sort(name_to_scores[book_name], descending=True)
    index = index[0].tolist()
    specific_categories = name_to_categories[book_name]
    positions = [all_categories.index(c) for c in specific_categories]
    rankings = [index.index(p) for p in positions]
    rankings = sorted(rankings)
    return rankings



def top_k(book_name: str, version: str, k=15):
    rankings = get_rankings(book_name, version)
    count = 0
    for ranking in rankings:
        if ranking < k:
            count += 1
    return count / k


def mean_avg_precision(book_name: str, version: str):
    rankings = get_rankings(book_name, version)
    _sum = 0.0
    for i, ranking in enumerate(rankings):
        _sum += (i + 1) / (ranking + 1)
    metric = _sum / len(rankings)
    return metric

In [145]:
avg_top_k = np.mean([top_k(key, "raw") for key in name_to_categories.keys()])
avg_mean_avg_precision = np.mean([mean_avg_precision(key, "raw") for key in name_to_categories.keys()])
avg_top_k, avg_mean_avg_precision

(np.float64(0.15751096491228067), np.float64(0.14951776486845036))

In [146]:
avg_top_k = np.mean([top_k(key, "summary") for key in name_to_categories.keys()])
avg_mean_avg_precision = np.mean([mean_avg_precision(key, "summary") for key in name_to_categories.keys()])
avg_top_k, avg_mean_avg_precision

(np.float64(0.19331140350877193), np.float64(0.18494782492576187))

In [147]:
import random

def random_top_k(k=15, exp=10000):
    counts = []
    for _ in range(exp):
        rankings = sorted([random.randint(1, len(all_categories)) for _ in range(k)])
        count = 0
        for ranking in rankings:
            if ranking < k:
                count += 1
        counts.append(count)
    return np.mean(counts)

def random_mean_avg_precision(k=15, exp=10000):
    sums = []
    for _ in range(exp):
        rankings = sorted([random.randint(1, len(all_categories)) for _ in range(k)])
        _sum = 0
        for i, ranking in enumerate(rankings):
            _sum += (i + 1) / ranking
        sums.append(_sum / k)
    return np.mean(sums)

In [148]:
random_top_k(), random_mean_avg_precision()

(np.float64(0.117), np.float64(0.011795588880474205))

In [155]:
def get_top_categories(book_name: str, k=10):
    _, index = torch.sort(name_to_all_summary_scores[book_name], descending=True)
    index = index[0].tolist()
    predicted_categories = [all_categories[index[i]] for i in range(k)]
    return predicted_categories

In [156]:
get_top_categories("Two Awesome Hours: Science-Based Strategies to Harness Your Best Time and Get Your Most Important Work Done by Josh Davis")

['Time Efficiency',
 'Productivity',
 'Work Efficiency',
 'Time Management',
 'Workplace Productivity',
 'Time Optimization',
 'Efficiency',
 'Work-Life Integration',
 'Time Mastery',
 'Simplifying Life']