In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy
from sklearn import neighbors
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Database Connection
# Credentials should not be stored in the notebook itself: the connection URL is
# read from the DATABASE_URL environment variable. The original placeholder URL
# is kept only as a fallback so the cell behaves as before when the variable is
# not set.
database_connection_string = os.environ.get(
    'DATABASE_URL',
    'postgresql://your_username:your_password@your_host:your_port/your_database',
)
engine = sqlalchemy.create_engine(database_connection_string)

In [None]:
# Load the complete source table into a DataFrame through the SQLAlchemy engine.
query = "SELECT * FROM your_table_name"
df = pd.read_sql(sql=query, con=engine)

In [None]:
# Quick overview of the data: first rows, null count per column, and
# summary statistics of the numeric columns, printed in that order.
for overview in (df.head(), df.isnull().sum(), df.describe()):
    print(overview)

In [None]:
# Data Visualization - Top 10 Highly Rated Books
# Only books with more than one million ratings are considered; they are ranked
# by average rating and the ten best are kept.
top_ten = df[df['ratings_count'] > 1000000].sort_values(by='average_rating', ascending=False).head(10)
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use whichever name this installation knows.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)
plt.figure(figsize=(10, 10))
sns.barplot(x="average_rating", y="title", data=top_ten, palette='inferno')

In [None]:
# Data Visualization - Top 10 Authors with Most Books
# Count titles per author and keep the ten authors with the highest counts.
# The result stays indexed by author, with the count in the 'title' column.
most_books = (
    df.groupby('authors')['title']
    .count()
    .reset_index()
    .sort_values('title', ascending=False)
    .head(10)
    .set_index('authors')
)
plt.figure(figsize=(15, 10))
# seaborn >= 0.12 only accepts `data` positionally, so x/y are passed by keyword;
# the index is turned back into a column so both can be referenced by name.
ax = sns.barplot(x='title', y='authors', data=most_books.reset_index(), palette='inferno')
ax.set_title("Top 10 authors with most books")
ax.set_xlabel("Total number of books")
# Write each bar's value just past the end of the bar.
for i in ax.patches:
    ax.text(i.get_width() + 0.2, i.get_y() + 0.2, str(round(i.get_width())), fontsize=15, color='black')
plt.show()

In [None]:
# Data Visualization - Top 10 Most Rated Books
# The ten books with the highest ratings_count, indexed by title.
most_rated = df.sort_values('ratings_count', ascending=False).head(10).set_index('title')
plt.figure(figsize=(15, 10))
# seaborn >= 0.12 only accepts `data` positionally, so x/y are passed by keyword;
# the index is turned back into a column so both can be referenced by name.
ax = sns.barplot(x='ratings_count', y='title', data=most_rated.reset_index(), palette='inferno')
# Title and x-label added so the figure stands alone, matching the authors chart.
ax.set_title("Top 10 most rated books")
ax.set_xlabel("Total number of ratings")
# Write each bar's value just past the end of the bar.
for i in ax.patches:
    ax.text(i.get_width() + 0.2, i.get_y() + 0.2, str(round(i.get_width())), fontsize=15, color='black')
plt.show()

In [None]:
# Data Visualization - Average Rating Distribution
# Cast the column to float in place; the rating-bucket step later in the
# notebook calls pd.cut on this column and needs numeric values.
df['average_rating'] = df['average_rating'].astype(float)
fig, ax = plt.subplots(figsize=[15, 10])
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density scale is the documented replacement.
sns.histplot(df['average_rating'], kde=True, stat='density', ax=ax)
ax.set_title('Average rating distribution for all books', fontsize=20)
ax.set_xlabel('Average rating', fontsize=13)

In [None]:
# Scatter of ratings_count against average_rating for every book.
# relplot is figure-level: it builds its own figure and returns a FacetGrid,
# which is what `ax` holds here despite the name.
ax = sns.relplot(
    x="average_rating",
    y="ratings_count",
    data=df,
    marker='o',
    color='red',
    sizes=(100, 200),
    height=7,
)
plt.title("Relation between Rating counts and Average Ratings", fontsize=15)
ax.set_axis_labels("Average Rating", "Ratings Count")

In [None]:
# Data Visualization - Relationship between Average Ratings and Number of Pages
# relplot is figure-level and creates its own figure (sized through `height`),
# so no plt.figure() call is made first; that only produced an extra empty
# figure in the cell output.
ax = sns.relplot(x="average_rating", y="num_pages", data=df, color='red', sizes=(100, 200), height=7, marker='o')
ax.set_axis_labels("Average Rating", "Number of Pages")

In [None]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
df2 = df.copy(deep=True)

In [None]:
# Data Preprocessing for Recommendation System
# Start from a fresh copy of df so re-running this cell gives the same result.
df2 = df.copy()
# Bucket the average rating into five unit-wide bands. pd.cut intervals are
# right-closed, so without include_lowest=True a rating of exactly 0 would fall
# outside every band and become NaN.
df2['rating_between'] = pd.cut(
    df2['average_rating'],
    bins=[0, 1, 2, 3, 4, 5],
    labels=["0-1", "1-2", "2-3", "3-4", "4-5"],
    include_lowest=True,
)
# One-hot encode the rating band and the language code; dtype=float keeps the
# whole feature matrix numeric instead of mixing bool and float columns.
rating_df = pd.get_dummies(df2['rating_between'], dtype=float)
language_df = pd.get_dummies(df2['language_code'], dtype=float)
# Feature matrix: band indicators, language indicators, and the two raw numeric columns.
features = pd.concat([rating_df, language_df, df2['average_rating'], df2['ratings_count']], axis=1)

In [None]:
# Feature Scaling
# Fit a min-max scaler on the feature matrix, then replace `features` with the
# scaled values returned by the scaler.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(features)
features = min_max_scaler.transform(features)

## Book Recommendation System:

In [None]:
# Nearest Neighbors Model
# Ball-tree search returning six neighbours per row; the query set is the
# training set itself, so `dist` and `idlist` have one row per book.
model = neighbors.NearestNeighbors(algorithm='ball_tree', n_neighbors=6)
dist, idlist = model.fit(features).kneighbors(features)

In [None]:
# Book Recommender Function
def BookRecommender(book_name):
    """Return the titles of the nearest neighbours of the given book.

    The first row of ``df2`` whose ``title`` equals ``book_name`` is located
    and its row of the precomputed neighbour-index array ``idlist`` is mapped
    back to titles.

    Parameters
    ----------
    book_name : str
        Exact title to look up (case- and whitespace-sensitive match).

    Returns
    -------
    list
        Titles of the neighbours, in the order stored in ``idlist``. Because
        the model was queried with its own training data, the requested book
        is normally among them (presumably first; ties may reorder it).

    Raises
    ------
    IndexError
        If no row in ``df2`` has the given title.
    """
    # Locate the row by position rather than by index label: idlist rows are
    # positional, so this stays correct even if df2 does not have a 0..n-1 index.
    matches = np.flatnonzero((df2['title'] == book_name).to_numpy())
    if matches.size == 0:
        # Same exception type as before, with a message that names the title.
        raise IndexError(f"No book titled {book_name!r} was found in df2['title']")
    book_pos = matches[0]
    return df2['title'].iloc[idlist[book_pos]].tolist()

In [None]:
# Example query: list the titles returned for one Harry Potter book.
BookNames = BookRecommender(
    'Harry Potter and the Half-Blood Prince (Harry Potter  #6)'
)
print(BookNames)