In [None]:
import requests
import pandas as pd
import time

import os

# --- Configuration ---
# The key is read from the environment so the credential is never stored in the
# notebook. When the variable is unset, requests are sent without a "key"
# parameter (NOTE(review): anonymous calls may get a lower quota - confirm).
API_KEY = os.environ.get("GOOGLE_BOOKS_API_KEY")
BOOKS_API_URL = "https://www.googleapis.com/books/v1/volumes"
queries = ["awarded books", "award-winning books", "literary award books"]
max_results = 40        # API max per request
target_books = 500      # number of *unique* books to collect
max_retries = 3         # consecutive failures tolerated per query before moving on
request_timeout = 10    # seconds to wait for each HTTP response
book_columns = [
    "title", "authors", "publisher", "publishedDate", "description",
    "categories", "pageCount", "language", "previewLink",
    "averageRating", "ratingsCount",
]
all_books = []
seen_books = set()      # (title, authors) pairs already stored


def _join_or_none(values):
    """Join a list of strings with ", "; return None for a missing or empty list."""
    return ", ".join(values) if values else None


def _volume_to_record(item):
    """Flatten one volume item from the API response into a flat dict of book fields."""
    info = item.get("volumeInfo", {})
    return {
        "title": info.get("title"),
        "authors": _join_or_none(info.get("authors")),
        "publisher": info.get("publisher"),
        "publishedDate": info.get("publishedDate"),
        "description": info.get("description"),
        "categories": _join_or_none(info.get("categories")),
        "pageCount": info.get("pageCount"),
        "language": info.get("language"),
        "previewLink": info.get("previewLink"),
        "averageRating": info.get("averageRating"),
        "ratingsCount": info.get("ratingsCount"),
    }


def _redact_key(message):
    """Mask the API key in a message; request errors embed the full request URL."""
    return message.replace(API_KEY, "***") if API_KEY else message


# --- Fetch books, page by page, for each query ---
for q in queries:
    start_index = 0
    failures = 0  # consecutive failed requests for the current page
    while len(all_books) < target_books:
        params = {
            "q": q,
            "startIndex": start_index,
            "maxResults": max_results,
            "printType": "books",
            "langRestrict": "en",
        }
        if API_KEY:
            params["key"] = API_KEY

        try:
            response = requests.get(BOOKS_API_URL, params=params, timeout=request_timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # Retry the same page a bounded number of times; a persistent error
            # (quota, bad request, outage) must not loop forever.
            failures += 1
            print(f"Error for query '{q}' at startIndex {start_index}: {_redact_key(str(e))}")
            if failures >= max_retries:
                print(f"Giving up on query '{q}' after {failures} consecutive errors")
                break
            time.sleep(1)
            continue

        failures = 0
        if "items" not in data:
            print(f"No more results for query '{q}' at startIndex {start_index}")
            break

        for item in data["items"]:
            record = _volume_to_record(item)
            identity = (record["title"], record["authors"])
            # De-duplicate on title + authors while collecting, so the target
            # counts unique books and later queries still get a chance to run.
            if identity in seen_books:
                continue
            seen_books.add(identity)
            all_books.append(record)
            if len(all_books) >= target_books:
                break

        start_index += max_results
        time.sleep(0.2)  # polite delay

# Explicit columns keep the frame well-formed even when nothing was fetched.
df = pd.DataFrame(all_books, columns=book_columns)

# Save enhanced CSV
df.to_csv("award_books_enhanced.csv", index=False)
print(f"Total books fetched: {len(df)}")
print(df.head())






In [None]:
# Clean the fetched books. `assign` returns a new DataFrame, so re-running this
# cell is idempotent and the previous frame object is never mutated in place.
df = df.assign(
    # Text placeholders for missing descriptive fields.
    authors=df["authors"].fillna("Unknown"),
    categories=df["categories"].fillna("Unknown"),
    publisher=df["publisher"].fillna("Unknown"),
    # Keep only a leading 4-digit year; dates that do not start with four
    # digits become NaN instead of leaking a non-numeric "year" downstream.
    publishedYear=df["publishedDate"].str.extract(r"^(\d{4})", expand=False),
    # 0 acts as the "no rating available" placeholder for both rating columns.
    averageRating=df["averageRating"].fillna(0),
    ratingsCount=df["ratingsCount"].fillna(0),
)

# Save cleaned CSV
df.to_csv("award_books_cleaned.csv", index=False)
print(df.head())



In [None]:
import requests
import pandas as pd
import time

import os

# --- Configuration ---
# The key comes from the environment so it is never stored in the notebook.
# When the variable is unset, requests are sent without a "key" parameter
# (NOTE(review): anonymous calls may get a lower quota - confirm).
API_KEY = os.environ.get("GOOGLE_BOOKS_API_KEY")
queries = ["awarded books", "award-winning books", "literary award books"]
max_results = 40        # max per API request
target_books = 500      # number of *unique* books to collect
max_retries = 3         # consecutive failures tolerated per query before moving on
request_timeout = 10    # seconds to wait for each HTTP response
book_columns = [
    "title", "authors", "publisher", "publishedDate", "description",
    "categories", "pageCount", "language", "previewLink",
    "averageRating", "ratingsCount",
]
all_books = []
seen_books = set()      # (title, authors) pairs already stored

# --- Fetch books from multiple queries ---
for q in queries:
    start_index = 0
    consecutive_errors = 0
    while len(all_books) < target_books:
        params = {
            "q": q,
            "startIndex": start_index,
            "maxResults": max_results,
            "printType": "books",
            "langRestrict": "en",
        }
        if API_KEY:
            params["key"] = API_KEY

        try:
            response = requests.get(
                "https://www.googleapis.com/books/v1/volumes",
                params=params,
                timeout=request_timeout,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # Request errors embed the full URL, so mask the key before printing.
            message = str(e).replace(API_KEY, "***") if API_KEY else str(e)
            consecutive_errors += 1
            print(f"Error for query '{q}' at startIndex {start_index}: {message}")
            # Bounded retries: a persistent failure must not loop forever.
            if consecutive_errors >= max_retries:
                print(f"Giving up on query '{q}' after {consecutive_errors} consecutive errors")
                break
            time.sleep(1)
            continue

        consecutive_errors = 0
        if "items" not in data:
            print(f"No more results for query '{q}' at startIndex {start_index}")
            break

        for item in data["items"]:
            info = item.get("volumeInfo", {})
            authors = info.get("authors")
            categories = info.get("categories")
            book = {
                "title": info.get("title"),
                "authors": ", ".join(authors) if authors else None,
                "publisher": info.get("publisher"),
                "publishedDate": info.get("publishedDate"),
                "description": info.get("description"),
                "categories": ", ".join(categories) if categories else None,
                "pageCount": info.get("pageCount"),
                "language": info.get("language"),
                "previewLink": info.get("previewLink"),
                "averageRating": info.get("averageRating"),
                "ratingsCount": info.get("ratingsCount"),
            }
            # De-duplicate on title + authors while collecting, so the target
            # counts unique books and later queries still get a chance to run.
            identity = (book["title"], book["authors"])
            if identity in seen_books:
                continue
            seen_books.add(identity)
            all_books.append(book)
            if len(all_books) >= target_books:
                break

        start_index += max_results
        time.sleep(0.2)  # polite delay

# --- Convert to DataFrame ---
# Explicit columns keep the frame well-formed even when nothing was fetched.
df = pd.DataFrame(all_books, columns=book_columns)

# --- Clean missing fields and extract the published year ---
df = df.assign(
    authors=df["authors"].fillna("Unknown"),
    categories=df["categories"].fillna("Unknown"),
    publisher=df["publisher"].fillna("Unknown"),
    # 0 acts as the "no rating available" placeholder for both rating columns.
    averageRating=df["averageRating"].fillna(0),
    ratingsCount=df["ratingsCount"].fillna(0),
    # Keep only a leading 4-digit year; anything else becomes NaN.
    publishedYear=df["publishedDate"].str.extract(r"^(\d{4})", expand=False),
)

# --- Save cleaned CSV ---
df.to_csv("award_books_cleaned.csv", index=False)

print(f"Total books fetched and cleaned: {len(df)}")
print(df.head())


In [None]:
# Display the DataFrame's dimensions as a (rows, columns) tuple.
df.shape

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Display descriptive statistics (count, mean, std, quartiles, ...) for the columns.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()


In [None]:
# --- Replace missing values, one whole-column assignment per field ---
# Maps each column to the value substituted for its missing entries;
# pageCount falls back to the column median.
fill_values = {
    'authors': "Unknown",
    'categories': "Unknown",
    'publisher': "Unknown",
    'description': "No description",
    'pageCount': df['pageCount'].median(),
    'publishedYear': "Unknown",
    'averageRating': 0,
    'ratingsCount': 0,
}
for column_name, replacement in fill_values.items():
    df[column_name] = df[column_name].fillna(replacement)

# --- Cast the numeric columns to their final dtypes ---
numeric_dtypes = {'pageCount': int, 'averageRating': float, 'ratingsCount': int}
for column_name, target_dtype in numeric_dtypes.items():
    df[column_name] = df[column_name].astype(target_dtype)

# --- Persist the fully cleaned dataset ---
df.to_csv("award_books_cleaned_final.csv", index=False)

print("Dataset cleaned successfully. Sample data:")
print(df.head())



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to the figures drawn below.
sns.set_style("whitegrid")

# One entry per histogram, in display order:
# (column, number of bins, figure title, x-axis label).
histogram_specs = [
    ('averageRating', 10, "Distribution of Average Ratings", "Average Rating"),
    ('ratingsCount', 20, "Distribution of Ratings Count", "Ratings Count"),
    ('pageCount', 20, "Distribution of Page Count", "Number of Pages"),
]

for hist_column, hist_bins, hist_title, hist_xlabel in histogram_specs:
    plt.figure(figsize=(8,5))
    sns.histplot(df[hist_column], bins=hist_bins, kde=False)
    plt.title(hist_title)
    plt.xlabel(hist_xlabel)
    plt.ylabel("Number of Books")
    plt.show()


In [None]:
# Ten most frequent values of each descriptive column.
top_categories = df['categories'].value_counts().head(10)
top_authors = df['authors'].value_counts().head(10)
top_publishers = df['publisher'].value_counts().head(10)

# One horizontal bar chart per ranking, in display order:
# (value counts, colour palette, figure title, y-axis label).
bar_specs = [
    (top_categories, "viridis", "Top 10 Book Categories", "Category"),
    (top_authors, "magma", "Top 10 Authors", "Author"),
    (top_publishers, "coolwarm", "Top 10 Publishers", "Publisher"),
]

for bar_counts, bar_palette, bar_title, bar_ylabel in bar_specs:
    plt.figure(figsize=(10,5))
    sns.barplot(x=bar_counts.values, y=bar_counts.index, palette=bar_palette)
    plt.title(bar_title)
    plt.xlabel("Number of Books")
    plt.ylabel(bar_ylabel)
    plt.show()


In [None]:
# Build a year-indexed view of the books.
# `pd.to_numeric(errors='coerce')` turns the "Unknown" placeholder and any other
# non-numeric year into NaN, which is then dropped, so the integer cast cannot
# fail. `assign` returns a new frame, so `df` is untouched and no
# SettingWithCopyWarning is raised by writing to a filtered slice.
df_year = (
    df.assign(publishedYear=pd.to_numeric(df['publishedYear'], errors='coerce'))
      .dropna(subset=['publishedYear'])
      .astype({'publishedYear': int})
)

# Count of books per year
books_per_year = df_year['publishedYear'].value_counts().sort_index()
plt.figure(figsize=(12,5))
sns.lineplot(x=books_per_year.index, y=books_per_year.values)
plt.title("Number of Award-Winning Books Over Years")
plt.xlabel("Published Year")
plt.ylabel("Number of Books")
plt.show()

# Average rating per year.
# The cleaning cells store 0 for books without a rating; those placeholders are
# excluded here so they do not drag the yearly mean towards zero.
rated_books = df_year[df_year['averageRating'] > 0]
avg_rating_per_year = rated_books.groupby('publishedYear')['averageRating'].mean()
plt.figure(figsize=(12,5))
sns.lineplot(x=avg_rating_per_year.index, y=avg_rating_per_year.values)
plt.title("Average Rating of Books Over Years")
plt.xlabel("Published Year")
plt.ylabel("Average Rating")
plt.show()
