In [5]:
import pandas as pd

# Books to classify: the "Test dataset" worksheet of data.xlsx.
test_dataset = pd.read_excel(
    "data.xlsx",
    sheet_name="Test dataset",
)

# Candidate category names: column 0 of the header-less "Categories"
# worksheet, converted to a plain Python list.
categories = (
    pd.read_excel("data.xlsx", sheet_name="Categories", header=None)[0]
    .tolist()
)

In [7]:
import re

# Naming schemes recognised in category labels, tried in this order. The
# capture group only accepts letters, hyphens and spaces. Compiled once here
# instead of on every call.
_CATEGORY_PATTERNS = (
    re.compile("Books on ([-a-zA-Z ]+)"),
    re.compile("Books about ([-a-zA-Z ]+)"),
    re.compile("([-a-zA-Z ]+) Books"),
)


def extract_category(category):
    """Strip the "Books on/about ..." or "... Books" wrapper from a category name.

    Args:
        category: Category label as a string.

    Returns:
        The captured topic when the *entire* label fits one of the recognised
        schemes; otherwise the label unchanged.
    """
    for pattern in _CATEGORY_PATTERNS:
        # fullmatch, not match: match() only anchors at the start, so a label
        # such as "Books on Health & Wellness" used to be cut down to
        # "Health ". Labels containing other characters now fall through and
        # are returned whole.
        found = pattern.fullmatch(category)
        if found:
            return found.group(1)
    return category

In [8]:
# Run every category label through extract_category, keeping the original order.
extracted_categories = list(map(extract_category, categories))

In [10]:
# Prompt sent to the chat model for each book. `{book_name}` is the only
# str.format placeholder; the doubled braces come out as literal `{` / `}` so
# the JSON schema shown to the model survives formatting.
PROMPT_TEMPLATE = """I will give you the name of the book, your task is to search for it in the internet and construct one sentence what the book is about. I will need this sentence in order to determine the topics of this book.
If you can not find the information about the book you should tell it.

Book name:
{book_name}

Return the answer in the following JSON format:
{{
  "know": <str, 'yes' or 'no'>,
  "sentence": <str, the sentence that tells what the book is about. Do not return this key if 'know' keys is no>
}}
Do not return additional text, except json answer."""

In [12]:
import os
from openai import OpenAI
import json
from dotenv import load_dotenv


# Call load_dotenv() first, then build the API client from the
# OPENAI_API_KEY environment variable (None if it is not set).
load_dotenv()
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)


def make_completion(prompt: str, model_name: str = "gpt-4o") -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text used as the content of the only (user) message.
        model_name: Chat model to query. Defaults to "gpt-4o", the value that
            used to be hard-coded, so existing calls behave as before.

    Returns:
        The ``content`` of the first choice in the API response.
        NOTE(review): the content is presumably empty/None in some cases;
        callers should treat a falsy result as "no answer".
    """
    answer = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
    )
    # Only the first choice is used.
    return answer.choices[0].message.content


def get_summary(book_name: str):
    """Ask the chat model for a one-sentence description of a book.

    The prompt is built from PROMPT_TEMPLATE, sent through make_completion,
    and the first-``{`` to last-``}`` span of the reply is parsed as JSON.

    Args:
        book_name: Title inserted into the prompt.

    Returns:
        The value stored under the "sentence" key of the parsed reply on
        success. On failure an ``Exception`` instance is *returned* (not
        raised) describing the problem: empty reply, no JSON object in the
        reply, JSON that cannot be decoded, or a missing "sentence" key.
        Callers tell the two apart with ``isinstance(result, str)``.
    """
    prompt = PROMPT_TEMPLATE.format(book_name=book_name)
    response = make_completion(prompt)
    if not response:
        return Exception("No response")

    # Greedy match with DOTALL: everything from the first "{" to the last "}",
    # across newlines, so text around the JSON object is ignored.
    match = re.search(r"\{.*\}", response, re.DOTALL)
    if not match:
        return Exception("Wrong json output: {}".format(response))

    try:
        data = json.loads(match.group(0))
    except json.JSONDecodeError:
        # A brace-delimited span is not necessarily valid JSON. Report it the
        # same way as the other failures instead of letting the error escape
        # and abort the caller's loop.
        return Exception("Wrong json output: {}".format(response))

    if "sentence" not in data:
        return Exception("No sentence key in json: {}".format(response))
    return data["sentence"]

In [13]:
from tqdm.auto import tqdm

# Build a title -> summary mapping. get_summary returns a non-str value when
# it could not produce a sentence; such titles are printed and the title
# itself is stored in place of a summary.
summaries = {}
for book_name in tqdm(test_dataset["Title"]):
    summary = get_summary(book_name)
    failed = not isinstance(summary, str)
    if failed:
        print(book_name)
    summaries[book_name] = book_name if failed else summary

  0%|          | 0/199 [00:00<?, ?it/s]

Психологічна допомога by За матеріалами Центру громадського здоров’я, Міністерства охорони здоровʼя (МОЗ) України та платформи «ВзаємоДія»
The Parallel Parenting Solution: Eliminate Conflict with Your Ex, Create the Life You Want by Karl Knickerbocker, J.D.
Why, When, and How by based on articles by Vox, BBC, The New York Times, The Guardian, Vice, Politico, and Radio Liberty
Global Response by based on articles by Vox, BBC, The New York Times, The Guardian, Vice, Politico, and Radio Liberty
On the Brink of New Chornobyl by based on articles by Vox, BBC, The New York Times, The Guardian, Vice, Politico, and Radio Liberty
The Power of Russian Propaganda by based on articles by Vox, BBC, The New York Times, The Guardian, Vice, Politico, and Radio Liberty
How to Live: 27 Conflicting Answers and One Weird Conclusion by Derek Sivers
Техногенна небезпека by За матеріалами Міністерства охорони здоров’я України.
Усе про укриття by За матеріалами Центру стратегічних комунікацій та інформаційної

In [14]:
from sentence_transformers import SentenceTransformer

# Embedding model shared by the later cells: it encodes both the category
# names and the per-book summaries, and scores their similarity.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [15]:
# Despite the name, these are embeddings of the cleaned *category* names
# (one per entry of extracted_categories, in the same order as `categories`).
title_embeddings = model.encode(extracted_categories)

In [38]:
import torch

k = 15  # maximum number of categories suggested per book
predicted_categories = {"categories": []}

# Never ask for more entries than there are categories: the previous
# `range(k)` indexing raised IndexError when fewer than k categories existed.
top_k = min(k, len(categories))

for book_name in tqdm(test_dataset["Title"]):
    book_embedding = model.encode(summaries[book_name])
    # Similarity of this book's summary against every category embedding;
    # row 0 holds the scores for the single book.
    scores = model.similarity(book_embedding, title_embeddings)
    # topk returns the best `top_k` positions ordered from highest to lowest
    # score, without sorting the whole score vector.
    top_indices = torch.topk(scores, k=top_k, dim=-1).indices[0].tolist()
    # Positions index `categories` (the original labels), which is parallel
    # to the cleaned names that were embedded.
    book_categories = [categories[i] for i in top_indices]
    predicted_categories["categories"].append("\n".join(book_categories))

# One row per book, in the same order as test_dataset["Title"].
df_predicted = pd.DataFrame(predicted_categories)

  0%|          | 0/199 [00:00<?, ?it/s]

In [46]:
# Put the predicted "categories" column next to the input columns (rows are
# aligned on the index) and write the result without the index column.
results = pd.concat([test_dataset, df_predicted], axis=1)
results.to_excel("predicted.xlsx", index=False)