In [102]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
df = pd.read_csv('books_cleaned.csv')
df = df.dropna(how='any')
df.columns = df.columns.str.strip()  # Remove leading/trailing spaces

# Display the first few rows of the dataset
df

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher;;;
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9.780440e+12,eng,652.0,2095690.0,27591.0,9/16/2006,Scholastic Inc.;;;
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9.780439e+12,eng,870.0,2153167.0,29221.0,9/1/2004,Scholastic Inc.;;;
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9.780440e+12,eng,352.0,6333.0,244.0,11/1/2003,Scholastic;;;
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9.780440e+12,eng,435.0,2339585.0,36325.0,5/1/2004,Scholastic Inc.;;;
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9.780440e+12,eng,2690.0,41428.0,164.0,9/13/2004,Scholastic;;;
...,...,...,...,...,...,...,...,...,...,...,...,...
11122,45631,Expelled from Eden: A William T. Vollmann Reader,William T. Vollmann/Larry McCaffery/Michael He...,4.06,1560254416,9.781560e+12,eng,512.0,156.0,20.0,12/21/2004,Da Capo Press;;;
11123,45633,You Bright and Risen Angels,William T. Vollmann,4.08,0140110879,9.780140e+12,eng,635.0,783.0,56.0,12/1/1988,Penguin Books;;;
11124,45634,The Ice-Shirt (Seven Dreams #1),William T. Vollmann,3.96,0140131965,9.780140e+12,eng,415.0,820.0,95.0,8/1/1993,Penguin Books;;;
11125,45639,Poor People,William T. Vollmann,3.72,0060878827,9.780061e+12,eng,434.0,769.0,139.0,2/27/2007,Ecco;;;


In [None]:
plt.figure(figsize=(20, 12))

# Convert to datetime with error handling
df['publication_date'] = pd.to_datetime(
    df['publication_date'],
    errors='coerce',  # Convert invalid dates to NaT
    format='mixed'    # Handle different date formats
)

# Remove rows with invalid dates
df = df.dropna(subset=['publication_date'])

# Create decade and rating bins
df['decade'] = (df['publication_date'].dt.year // 10) * 10
df['rating_bin'] = pd.cut(df['average_rating'],
                          bins=[0, 2, 3, 4, 5],
                          labels=['0-2', '2-3', '3-4', '4-5'])

# Create heatmap data
heatmap_data = df.pivot_table(index='decade',
                              columns='rating_bin',
                              aggfunc='size',
                              fill_value=0)

# Plot heatmap
sns.heatmap(heatmap_data,
            annot=True,
            fmt=',d',
            cmap='rocket_r',
            annot_kws={'fontsize':16, 'fontweight':'bold'},
            cbar_kws={'label': 'Number of Books'})

# Styling
plt.xlabel('Rating Ranges', fontsize=16, labelpad=15)
plt.ylabel('Publication Decade', fontsize=16, labelpad=15)
plt.title('Book Distribution by Decade and Rating', fontsize=18, pad=20)
plt.xticks(fontsize=16, rotation=45)
plt.yticks(fontsize=16)

# Format y-axis to show complete decades
decades = sorted(df['decade'].unique())
plt.yticks(ticks=range(len(decades)), labels=decades)

plt.tight_layout()
plt.show()

In [105]:
import plotly.express as px

fig = px.scatter(df, x='ratings_count', y='text_reviews_count',
                 size='average_rating', color='language_code',
                 log_x=True, log_y=True, size_max=60,
                 hover_name='title', hover_data=['authors', 'publisher'])

fig.update_layout(
    title='Book Popularity Bubble Chart',
    xaxis_title="Ratings Count (Log)",
    yaxis_title="Text Reviews Count (Log)",
    font=dict(size=16),
    width=1200,
    height=800,
    plot_bgcolor='rgba(240,240,240,0.9)',
    paper_bgcolor='white'
)
fig.show()

ModuleNotFoundError: No module named 'plotly'

In [None]:
# change chart type
plt.figure(figsize=(20, 12))
hb = plt.hexbin(df['num_pages'], df['average_rating'],
                gridsize=40, cmap='YlOrRd', bins='log',
                mincnt=1, edgecolors='none')

# Add colorbar
cb = plt.colorbar(hb)
cb.set_label('Log Frequency', fontsize=16)
cb.ax.tick_params(labelsize=16)


plt.xlabel('Number of Pages', fontsize=16)
plt.ylabel('Average Rating', fontsize=16)
plt.xlim(0, 1500)  # Remove outliers
plt.title('Page Count vs Rating Density', fontsize=18, pad=20)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.grid(linestyle='--', alpha=0.5, color='gray')
plt.show()

In [97]:
# quality for publishers