In [None]:
import pandas as pd

# Load the book table from books_cleaned.csv (the path is relative to the
# notebook's working directory).
books = pd.read_csv("books_cleaned.csv")

In [None]:
# Count the rows per raw category label (value_counts skips NaN by default)
# and turn the result into a DataFrame for display.
books["categories"].value_counts().reset_index()

In [None]:
# Keep only the raw category labels that occur more than 50 times.
# The column names are set explicitly because the implicit "count" column
# produced by value_counts().reset_index() only exists on pandas >= 2.0.
(
    books["categories"]
    .value_counts()
    .rename_axis("categories")
    .reset_index(name="count")
    .query("count > 50")
)

In [None]:
# Raw category label -> coarse bucket. The buckets are written once, each
# with the raw labels it absorbs, and then inverted into a flat lookup dict.
category_mapping = {
    raw_label: bucket
    for bucket, raw_labels in {
        "Fiction": [
            "Fiction",
            "Comics & Graphic Novels",
            "Drama",
            "Poetry",
        ],
        "Children's Fiction": ["Juvenile Fiction"],
        "Nonfiction": [
            "Biography & Autobiography",
            "History",
            "Literary Criticism",
            "Philosophy",
            "Religion",
            "Science",
        ],
        "Children's Nonfiction": ["Juvenile Nonfiction"],
    }.items()
    for raw_label in raw_labels
}

# Translate each raw category through category_mapping; labels that are not
# keys of the mapping come out as NaN in the new simple_categories column.
books["simple_categories"] = books["categories"].map(category_mapping)

In [None]:
# Display the frame to inspect the simple_categories column.
books

In [None]:
# Show only the rows that already have a simple category assigned.
books[books["simple_categories"].notna()]

In [None]:
import os
# Expose only GPU 0 to this process (use "0,1" to expose multiple GPUs).
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [None]:
from transformers import pipeline
import torch

# Candidate labels handed to the zero-shot classifier.
fiction_categories = ["Fiction", "Nonfiction"]

# Use GPU index 0 when CUDA is usable; otherwise fall back to -1.
# The device is passed as an integer index rather than the string "cuda".
if torch.cuda.is_available():
    device = 0
else:
    device = -1

classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device,
)


In [None]:
# Take the description of the first book already labelled "Fiction" as a
# sample input for the classifier.
sequence = (
    books[books["simple_categories"].eq("Fiction")]["description"]
    .reset_index(drop=True)
    .loc[0]
)

In [None]:
# Classify the sample description against the two candidate labels and show
# the raw pipeline output.
classifier(sequence, candidate_labels=fiction_categories)

In [None]:
import numpy as np

# Run the classifier once and reuse its output for both lookups; the earlier
# version ran the model twice on the same input.
sample_prediction = classifier(sequence, fiction_categories)
max_index = np.argmax(sample_prediction["scores"])
max_label = sample_prediction["labels"][max_index]
max_label

In [None]:
def generate_prediction(sequence, categories):
    """Return the highest-scoring label for ``sequence``.

    Parameters
    ----------
    sequence
        Text handed to the module-level ``classifier``.
    categories
        Candidate labels handed to ``classifier`` alongside the text.

    Returns
    -------
    The entry of the classifier's ``"labels"`` list at the position of the
    largest value in its ``"scores"`` list.
    """
    result = classifier(sequence, categories)
    best_position = np.argmax(result["scores"])
    return result["labels"][best_position]

In [None]:
from tqdm import tqdm

actual_cat = []
predicted_cat = []

# Select the evaluation sample once, outside the loop: the filter does not
# depend on the loop variable, so recomputing it per iteration was wasted work.
# head(300) also copes with fewer than 300 matching rows instead of raising.
fiction_descriptions = books.loc[
    books["simple_categories"] == "Fiction", "description"
].head(300)

for sequence in tqdm(fiction_descriptions):
    predicted_cat.append(generate_prediction(sequence, fiction_categories))
    actual_cat.append("Fiction")
    

In [None]:
# Select the evaluation sample once, outside the loop: the filter does not
# depend on the loop variable, so recomputing it per iteration was wasted work.
# head(300) also copes with fewer than 300 matching rows instead of raising.
nonfiction_descriptions = books.loc[
    books["simple_categories"] == "Nonfiction", "description"
].head(300)

# Results are appended to the existing actual_cat / predicted_cat lists, so
# re-running this cell on its own adds the same rows a second time.
for sequence in tqdm(nonfiction_descriptions):
    predicted_cat.append(generate_prediction(sequence, fiction_categories))
    actual_cat.append("Nonfiction")

In [None]:
# Pair every known label with the label the classifier predicted for it.
predictions_df = pd.DataFrame(
    dict(
        actual_categories=actual_cat,
        predicted_categories=predicted_cat,
    )
)

In [None]:
# Inspect the actual vs. predicted labels side by side.
predictions_df

In [None]:
# Flag each row with 1 when the predicted label equals the actual one, else 0.
predictions_df["corrected_prediction"] = (
    predictions_df["actual_categories"] == predictions_df["predicted_categories"]
).astype(int)

In [None]:
# Accuracy: the share of rows whose 0/1 correctness flag is 1.
predictions_df["corrected_prediction"].mean()

In [None]:
# Accumulators filled by the prediction loop over missing_cats.
isbns, predicted_cats = [], []

# Books without a simple category: keep only the join key and the text to
# classify, renumbered from 0 so they can be addressed by position.
missing_cats = (
    books[books["simple_categories"].isna()][["isbn13", "description"]]
    .reset_index(drop=True)
)

In [None]:
# Predict a label for every book that has no simple category yet, recording
# the ISBN next to each prediction so the results can be joined back later.
for row in tqdm(range(len(missing_cats))):
    sequence = missing_cats.at[row, "description"]
    predicted_cats.append(generate_prediction(sequence, fiction_categories))
    isbns.append(missing_cats.at[row, "isbn13"])

In [None]:
# One row per classified book: its ISBN and the predicted label.
missing_predictions_df = pd.DataFrame(
    dict(
        isbn13=isbns,
        predicted_categories=predicted_cats,
    )
)

In [None]:
# Inspect the predicted labels keyed by ISBN.
missing_predictions_df

In [None]:
# Left-join the predictions onto the full table by ISBN; books that were not
# classified get NaN in predicted_categories.
books = books.merge(missing_predictions_df, how="left", on="isbn13")

# Keep an existing simple category where there is one, otherwise take the
# classifier's prediction.
books["simple_categories"] = np.where(
    books["simple_categories"].isna(),
    books["predicted_categories"],
    books["simple_categories"],
)

# The helper column is no longer needed once its values have been copied over.
books = books.drop(columns="predicted_categories")

In [None]:
# Display the frame to inspect the simple_categories column.
books

In [None]:
# Show the books whose raw category label is itself a genre name
# (case-insensitive). The lowercased labels are compared directly: the earlier
# .isna() call turned the column into booleans, which never match these
# strings, so the filter always came back empty. Rows with a missing category
# stay NaN after .str.lower() and simply do not match.
books[books["categories"].str.lower().isin([
    "romance",
    "science fiction",
    "scifi",
    "fantasy",
    "horror",
    "mystery",
    "thriller",
    "comedy",
    "crime",
    "historical"
])]

In [None]:
# Write the frame, including the simple_categories column, to CSV;
# index=False leaves the pandas row index out of the file.
books.to_csv("books_with_categories.csv", index=False)