In [None]:
import pandas as pd

# Load books_cleaned.csv into a DataFrame; the path is relative to the
# notebook's working directory.
books = pd.read_csv("books_cleaned.csv")

In [None]:
# Count how many books carry each raw category label and show the result as a table.
books["categories"].value_counts().reset_index()

In [None]:
# DataFrame.query() filters rows with a string expression; here it keeps only
# the categories that occur at least 50 times.
# The label and count columns are named explicitly: pandas < 2.0 names them
# "index" and "categories" after reset_index(), so a bare query("count >= 50")
# only works on pandas >= 2.0.
(
    books["categories"]
    .value_counts()
    .rename_axis("categories")
    .reset_index(name="count")
    .query("count >= 50")
)

In [None]:
# Optional spot check: uncomment to list the books whose raw category is "Juvenile Fiction".
#books[books["categories"] == "Juvenile Fiction"]

In [None]:
# Optional spot check: uncomment to list the books whose raw category is "Juvenile Nonfiction".
#books[books["categories"] == "Juvenile Nonfiction"].reset_index()

In [None]:
# Collapse the listed raw categories into two coarse labels.
# Any raw category not listed below maps to NaN in "simple_categories".
fiction_categories = [
    'Fiction',
    'Juvenile Fiction',
    'Comics & Graphic Novels',
    'Drama',
    'Poetry',
]
nonfiction_categories = [
    'Biography & Autobiography',
    'History',
    'Literary Criticism',
    'Religion',
    'Philosophy',
    'Juvenile Nonfiction',
    'Science',
    'Literary Collections',
]
category_mapping = {
    **dict.fromkeys(fiction_categories, 'Fiction'),
    **dict.fromkeys(nonfiction_categories, 'Nonfiction'),
}
books["simple_categories"] = books["categories"].map(category_mapping)

In [None]:
# Display the books that already have a coarse label (simple_categories is not null).
books.dropna(subset=["simple_categories"])

In [None]:
# Environment check: each import is followed directly by a print, so a failure
# shows which dependency is missing. Keep the import/print order as is.
import torch
print(f"PyTorch version: {torch.__version__}")
# Reports whether PyTorch can see a CUDA device.
print(f"CUDA available: {torch.cuda.is_available()}")

import transformers
print(f"Transformers version: {transformers.__version__}")

from transformers import pipeline
print("✅ Pipeline import successful!")

In [None]:
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint.
# Run on the GPU when PyTorch can see one and fall back to the CPU otherwise,
# so the notebook still runs (more slowly) on machines without CUDA instead of
# failing while building the pipeline.
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli",
                      device="cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# The two labels passed to the zero-shot classifier as candidates.
candidate_labels = ["Fiction", "Nonfiction"]

In [None]:
# Use the description of the first book already labelled "Fiction" as a sample input.
is_known_fiction = books["simple_categories"] == "Fiction"
known_fiction_descriptions = books.loc[is_known_fiction, "description"].reset_index(drop=True)
sequence_to_classify = known_fiction_descriptions[0]

In [None]:
# Classify the sample description; the result is indexed by "labels" and
# "scores" in the cells that follow.
classifier(sequence_to_classify, candidate_labels)

In [None]:
#Postprocessing to get predicted label
import numpy as np

# Run the model once and reuse the result: calling the classifier separately
# for "scores" and for "labels" would repeat the same inference.
sample_prediction = classifier(sequence_to_classify, candidate_labels)
max_index = np.argmax(sample_prediction["scores"])
predicted_class = sample_prediction["labels"][max_index]

In [None]:
# Show the label picked for the sample description.
print(predicted_class)

In [None]:
# Reusable helper: classify a text and keep only the best-scoring label.
def generate_predictions(sequence, categories):
    """Return the candidate label with the highest classifier score.

    Args:
        sequence: Text handed to the module-level ``classifier``.
        categories: Candidate labels handed to the classifier.

    Returns:
        The entry of the classifier's ``"labels"`` list located at the
        position of the largest value in its ``"scores"`` list.
    """
    result = classifier(sequence, categories)
    top_position = int(np.argmax(result["scores"]))
    return result["labels"][top_position]

In [None]:
# Distribution of the coarse labels assigned so far.
books["simple_categories"].value_counts().reset_index()

In [None]:
# Number of books that still have no coarse label.
books["simple_categories"].isna().sum()

In [None]:
#Evaluation
from tqdm import tqdm

# Ground-truth and predicted labels for the evaluation sample; the Nonfiction
# evaluation appends to these same lists.
actual_categories = []
predicted_categories = []

# Select the Fiction descriptions once, outside the loop, instead of
# re-filtering the whole frame on every iteration. head(300) keeps the first
# 300 rows and simply returns fewer when the subset is smaller, so the loop
# cannot fail on a missing position.
fiction_descriptions = books.loc[books["simple_categories"] == "Fiction", "description"].head(300)

for sequence in tqdm(fiction_descriptions):
    predicted_categories.append(generate_predictions(sequence, candidate_labels))
    actual_categories.append("Fiction")
    

In [None]:
# Same evaluation for books already labelled "Nonfiction".
# NOTE: this cell appends to the lists created in the Fiction evaluation cell,
# so re-running it on its own adds the Nonfiction results a second time.
# The subset is selected once, outside the loop, and head(300) returns fewer
# rows when fewer than 300 Nonfiction books exist.
nonfiction_descriptions = books.loc[books["simple_categories"] == "Nonfiction", "description"].head(300)

for sequence in tqdm(nonfiction_descriptions):
    predicted_categories.append(generate_predictions(sequence, candidate_labels))
    actual_categories.append("Nonfiction")

In [None]:
# Sanity check: both lists should hold one entry per evaluated description,
# so the two lengths are expected to match.
print(f"Length of actual_categories: {len(actual_categories)}")
print(f"Length of predicted_categories: {len(predicted_categories)}")

In [None]:
# Pair every true label with the model's prediction, one row per evaluated description.
evaluation_columns = {
    "actual_categories": actual_categories,
    "predicted_categories": predicted_categories,
}
prediction_df = pd.DataFrame(evaluation_columns)

In [None]:
# Preview the first ten actual/predicted pairs.
prediction_df.head(10)

In [None]:
# Flag each row with 1 when the prediction equals the true label and 0 otherwise.
labels_match = prediction_df["actual_categories"].eq(prediction_df["predicted_categories"])
prediction_df["correct_prediction"] = np.where(labels_match, 1, 0)

In [None]:
# Accuracy in percent: correctly predicted rows divided by all evaluated rows.
n_correct = prediction_df["correct_prediction"].sum()
n_evaluated = len(prediction_df)
accuracy = n_correct / n_evaluated * 100

In [None]:
# Display the accuracy (in percent) of the zero-shot classifier on the evaluation sample.
accuracy

In [None]:
# The measured accuracy is considered good enough to label the books whose
# category is still missing.
# Collect the ISBN and description of every unlabelled book, and start with
# empty result lists (predicted_categories is reset after the evaluation step).
still_unlabelled = books["simple_categories"].isna()
missing_categories = books.loc[still_unlabelled, ["isbn13", "description"]].reset_index(drop=True)
isbns = []
predicted_categories = []

In [None]:
# Predict a coarse label for every book that is still unlabelled.
# Both lists are rebuilt from scratch in this cell, so re-running it cannot
# append a second copy of every ISBN/prediction (repeated ISBNs would later
# duplicate book rows when the predictions are merged back into `books`).
isbns = missing_categories["isbn13"].tolist()
predicted_categories = [
    generate_predictions(sequence, candidate_labels)
    for sequence in tqdm(missing_categories["description"])
]

In [None]:
# One row per previously unlabelled book: its ISBN and the predicted coarse label.
missing_prediction_columns = {
    "isbn13": isbns,
    "predicted_categories": predicted_categories,
}
missing_predictions_df = pd.DataFrame(missing_prediction_columns)

In [None]:
# Preview the first ten predicted labels for previously unlabelled books.
missing_predictions_df.head(10)

In [None]:
# Merge the predictions back into the original dataframe `books`, fill the
# missing coarse labels from the predictions, then drop the helper column.
# The prediction table is reduced to one row per ISBN first: a left merge
# against repeated keys would silently duplicate the matching book rows.
unique_predictions = missing_predictions_df.drop_duplicates(subset="isbn13")
books = pd.merge(books, unique_predictions, on="isbn13", how="left")
# Existing labels are kept; only null entries take the predicted value.
books["simple_categories"] = books["simple_categories"].fillna(books["predicted_categories"])
books = books.drop(columns=["predicted_categories"])

In [None]:
# Preview the dataframe after the predicted labels were merged in.
books.head()

In [None]:
# Final check: the first number is how many books still lack a coarse label
# (it should be 0 if every unlabelled book received a prediction).
# The second is the total row count, because count() on the boolean mask
# counts every row, not only the True ones.
label_is_missing = books["simple_categories"].isna()
print(label_is_missing.sum())
print(label_is_missing.count())

In [None]:
# Write the labelled dataset to books_with_categories.csv; index=False keeps
# the DataFrame index out of the file.
books.to_csv("books_with_categories.csv", index=False)