In [None]:
import numpy as np 
import pandas as pd 
import kagglehub
import os 
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Optional: install extra dependencies into the kernel's environment (uncomment to run).
# %pip install langchain transformers

In [None]:
# Fetch the latest version of the books dataset through kagglehub. `path` holds
# whatever `dataset_download` returns and is joined with the CSV file name when
# the data is loaded.
dataset_handle = "dylanjcastillo/7k-books-with-metadata"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
# Load the book metadata from the CSV file inside the downloaded dataset folder.
books_csv_path = os.path.join(path, 'books.csv')
books_df = pd.read_csv(books_csv_path)

# Preview the first 40 rows.
books_df.head(40)

In [None]:
# Summary statistics from pandas' default `describe()` (for a frame with mixed
# dtypes this covers the numeric columns only).
books_df.describe()

In [None]:
# Visualise where values are missing. Because the boolean mask is transposed,
# each heatmap row is one DataFrame column and each heatmap column is one
# record of `books_df`: the y-axis therefore lists the columns and the x-axis
# runs over the rows. Cells where `isna()` is True are drawn in the contrasting
# colour.
fig, ax = plt.subplots()
sns.heatmap(books_df.isna().transpose(), cbar=False, ax=ax)

# Label the axes to match the transposed layout described above.
ax.set_xlabel("Rows")
ax.set_ylabel("Columns")
ax.set_title("Missing values")

plt.show()

In [None]:
# Indicator column: 1 when the book has no description, 0 otherwise.
description_is_missing = books_df["description"].isna()
books_df["missing_description"] = np.where(description_is_missing, 1, 0)
books_df.head()

In [None]:
# Age of each book in years, measured against a fixed reference year.
reference_year = 2024
books_df["age_of_book"] = reference_year - books_df["published_year"]
books_df.head()

In [None]:
# Spearman rank correlation between the selected columns (including the
# missing-description indicator), rendered as an annotated heatmap.
columns_of_interest = [
    "num_pages",
    "age_of_book",
    "missing_description",
    "average_rating",
]
correlation_matrix = books_df[columns_of_interest].corr(method="spearman")

sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    cbar_kws={"label": "Spearman correlation"},
    ax=ax,
)
heatmap.set_title("Correlation heatmap")
plt.show()

In [None]:
# Keep only the books that have every field listed below: a description, a page
# count, an average rating and a publication year.
#
# `.copy()` makes `book_missing` an independent DataFrame rather than a slice
# of `books_df`, so adding columns to it in later cells does not trigger
# pandas' SettingWithCopyWarning.
#
# NOTE: despite its name, `book_missing` holds the rows with NO missing values
# in these columns.
required_columns = ["description", "num_pages", "average_rating", "published_year"]
book_missing = books_df.dropna(subset=required_columns).copy()
book_missing

In [None]:
# Number of books per category, ordered from the most to the least frequent.
(
    book_missing["categories"]
    .value_counts()
    .reset_index()
    .sort_values("count", ascending=False)
)

In [None]:
# Count books per category, ordered from the most to the least frequent.
category_counts = (
    book_missing["categories"]
    .value_counts()
    .reset_index()
    .sort_values("count", ascending=False)
)

# Bar chart with one bar per category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=category_counts, x='categories', y='count', palette='viridis', ax=ax)
ax.set_xlabel('categories')
ax.set_ylabel('count')
ax.set_title('Distribution of Book Categories')
plt.xticks(rotation=45, ha='right')  # slanted labels so category names stay legible
plt.tight_layout()
plt.show()

In [None]:
# Word count of each description, counting whitespace-separated tokens.
description_tokens = book_missing["description"].str.split()
book_missing["words_in_description"] = description_tokens.str.len()
book_missing

In [None]:
# Histogram of description lengths, measured in words.
# seaborn (`sns`) and matplotlib (`plt`) are already imported in the notebook's
# first cell, so they are not re-imported here.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(book_missing["words_in_description"], bins=30, kde=False, color='skyblue', ax=ax)
ax.set_xlabel('Number of Words in Description')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Word Counts in Book Descriptions')
ax.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal guide lines only
plt.tight_layout()
plt.show()

In [None]:
# Inspect the descriptions that are between one and four words long (both bounds inclusive).
is_very_short = book_missing["words_in_description"].between(1, 4)
book_missing.loc[is_very_short, "description"]

In [None]:
# Keep only the books whose description has at least `min_description_words`
# words. `.copy()` makes the result an independent DataFrame rather than a
# slice of `book_missing`, so the columns added to it in later cells do not
# trigger pandas' SettingWithCopyWarning.
min_description_words = 25
book_missing_25_words = book_missing[
    book_missing["words_in_description"] >= min_description_words
].copy()

In [None]:
# Display title: "<title>: <subtitle>" when a subtitle exists, otherwise the
# title on its own. The combined text is built with vectorised string
# concatenation rather than a row-by-row Python join; `np.where` then picks
# the plain title for rows whose subtitle is missing.
title_with_subtitle = (
    book_missing_25_words["title"].astype(str)
    + ": "
    + book_missing_25_words["subtitle"].astype(str)
)
book_missing_25_words["title_and_subtitle"] = np.where(
    book_missing_25_words["subtitle"].isna(),
    book_missing_25_words["title"],
    title_with_subtitle,
)
book_missing_25_words

In [None]:
# Prefix every description with the book's ISBN-13, separated by one space.
# Built with vectorised string concatenation rather than a row-by-row join.
# NOTE(review): presumably the ISBN prefix lets a description be traced back to
# its book later on; confirm against the downstream consumer.
book_missing_25_words["tagged_description"] = (
    book_missing_25_words["isbn13"].astype(str)
    + " "
    + book_missing_25_words["description"].astype(str)
)

book_missing_25_words

In [None]:
# Write the cleaned dataset to "books_cleaned.csv" without the row index.
# Dropped first: the three columns derived during exploration and the raw
# `subtitle` column.
columns_to_drop = ["subtitle", "missing_description", "age_of_book", "words_in_description"]
books_cleaned = book_missing_25_words.drop(columns=columns_to_drop)
books_cleaned.to_csv("books_cleaned.csv", index=False)