# Cleaning and exploration

In [4]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import glob
import missingno as msno
import seaborn as sns

import sys

# sys.path entries must be directories (or zip archives), not individual .py
# files, so register the folder that contains nlp_module.py. The membership
# check keeps the list free of duplicates when this cell is re-run.
MODULES_DIR = "Modules"
if MODULES_DIR not in sys.path:
    sys.path.append(MODULES_DIR)

import nlp_module as nlp

ModuleNotFoundError: No module named 'click.exceptions'

In [None]:
# Apply one global seaborn theme ("ticks" style, "Pastel2" palette) to the plots drawn afterwards.
sns.set_theme(style="ticks", palette="Pastel2")

## 1) Exploring the books datasets

In [None]:
# Load every CSV found under Datasets/books/ and stack them into one DataFrame.
filepath = 'Datasets/'

# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
# the row order of the merged frame reproducible across machines and runs.
all_files = sorted(glob.glob(filepath + "books/*.csv"))
if not all_files:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files found in {filepath}books/")

# One DataFrame per file, read in the sorted order above.
li = [pd.read_csv(filename) for filename in all_files]

# ignore_index=True discards the per-file row labels and renumbers from 0.
merged_books = pd.concat(li, axis=0, ignore_index=True)
merged_books.head()

In [None]:
# Identify missing values: draw missingno's matrix plot for merged_books.
msno.matrix(merged_books)

In [None]:
# The page count is stored in two differently-cased columns. A missing value
# in either one is treated as 0, and the two are added into "NumberOfPages".
# NOTE(review): a row missing in both columns ends up with 0 pages rather
# than NaN, so it is not removed by a later dropna - confirm this is intended.
pages_upper = merged_books["PagesNumber"].fillna(0)
pages_lower = merged_books["pagesNumber"].fillna(0)
merged_books["NumberOfPages"] = pages_upper + pages_lower

In [None]:
# Remove the columns listed below: publication month/day, the RatingDist*
# columns, the text-review count, and the two raw page-count columns (their
# values were already combined into "NumberOfPages").
# Each label appears exactly once here ("RatingDist3" used to be repeated).
unused_columns = [
    "PublishMonth", "PublishDay",
    "RatingDist5", "RatingDist4", "RatingDist3",
    "RatingDist2", "RatingDist1", "RatingDistTotal",
    "Count of text reviews",
    "PagesNumber", "pagesNumber",
]
merged_books.drop(columns=unused_columns, inplace=True)

In [None]:
# Preview the first rows of merged_books to check the remaining columns.
merged_books.head()

In [None]:
# Remove the rows that contain missing values, modifying merged_books in place.
merged_books.dropna(inplace=True)

In [None]:
# Redraw the missingno matrix to check the frame after the rows were dropped.
msno.matrix(merged_books)

In [None]:
# (number of rows, number of columns) currently in merged_books.
merged_books.shape

In [None]:
# Percentage of missing values per column (mean of the boolean null mask, times 100).
merged_books.isnull().mean() * 100

### a) Numerical variables

In this part, we explore the numerical variables: the year of publication, the average rating of each book, the number of ratings, the number of pages and the number of reviews.

In [None]:
# Boxplot of the publication year, drawn before any year filtering is applied.
_, year_ax = plt.subplots()
sns.boxplot(data=merged_books, x="PublishYear", ax=year_ax)
plt.show()

In [None]:
# Remove every row whose publication year is 1820 or earlier, or 2019 or
# later: both bounds themselves are dropped, so the kept years are 1821-2018.
publish_year = merged_books["PublishYear"]
too_old = publish_year <= 1820
too_recent = publish_year >= 2019
# Collect the index labels of the rows to discard, then drop them in place.
ind = merged_books.index[(too_old | too_recent).to_numpy()].to_list()
merged_books.drop(index=ind, inplace=True)

In [None]:
# Two panels sharing the x axis: histogram with a KDE curve on the left,
# boxplot on the right, both for the publication year.
fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), sharex=True)
fig.suptitle('Description of the year of publication of each book')

sns.histplot(data=merged_books, x="PublishYear", kde=True, ax=hist_ax)
hist_ax.set_title("Distribution of the year of publication")

sns.boxplot(data=merged_books, x="PublishYear", ax=box_ax)
box_ax.set_title("Boxplot of the year of publication")

plt.show()

After cleaning, only books published strictly between 1820 and 2019 were kept (both bounds are excluded by the filter). The majority of the books were published between 1980 and 2019. The average publication year is around 2000.

In [None]:
# Two panels sharing the x axis: histogram with a KDE curve on the left,
# boxplot on the right, both for the number of reviews per book.
fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), sharex=True)
fig.suptitle('Description of the number of reviews of each book')

sns.histplot(data=merged_books, x="CountsOfReview", kde=True, ax=hist_ax)
hist_ax.set_title("Distribution of the number of reviews")

sns.boxplot(data=merged_books, x="CountsOfReview", ax=box_ax)
box_ax.set_title("Boxplot of the number of reviews")

plt.show()

In [None]:
# Two panels sharing the x axis: histogram with a KDE curve on the left,
# boxplot on the right, both for the "Rating" column.
fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), sharex=True)
fig.suptitle('Description of the average ratings of each book')

sns.histplot(data=merged_books, x="Rating", kde=True, ax=hist_ax)
hist_ax.set_title("Distribution of the average ratings")

sns.boxplot(data=merged_books, x="Rating", ax=box_ax)
box_ax.set_title("Boxplot of the average ratings")

plt.show()

### b) Categorical variables

In [None]:
# Rank the "Authors" values by how many rows of merged_books carry them and
# plot the 15 most frequent ones, most frequent first.
top_authors = merged_books["Authors"].value_counts().index[:15]
sns.countplot(y="Authors", data=merged_books, order=top_authors)
# The bars count rows (books) per author, not how often an author is read,
# so the title describes exactly that.
plt.title("Top 15 authors by number of books")
plt.show()

In [None]:
# Rank the "Publisher" values by number of rows and plot the 15 most
# frequent ones, most frequent first.
top_publishers = merged_books["Publisher"].value_counts().index[:15]
sns.countplot(data=merged_books, y="Publisher", order=top_publishers)
plt.title("Top 15 of the most present publishers")
plt.show()

In [None]:
# Rank the "Language" codes by number of rows and plot the 15 most
# frequent ones, most frequent first.
top_languages = merged_books["Language"].value_counts().index[:15]
sns.countplot(data=merged_books, y="Language", order=top_languages)
plt.title("Top 15 of the most represented languages")
plt.show()

In [None]:
# Regional English codes, each mapped to the single generic code "eng".
lang = dict.fromkeys(("en-US", "en-GB", "en-CA"), "eng")

In [None]:
# Rewrite every language code that contains one of the keys of `lang` with the
# value mapped to that key.
for key, value in lang.items():
    # Case-insensitive, literal (non-regex) substring match. na=False keeps
    # the mask strictly boolean even if a missing value is present.
    matches = merged_books["Language"].str.contains(key, case=False, regex=False, na=False)
    # Assign through a single .loc call on the frame. The chained form
    # merged_books["Language"].loc[mask] = value may write to a temporary
    # object and never updates the frame under pandas Copy-on-Write.
    merged_books.loc[matches, "Language"] = value

In [None]:
# Rank the "Language" codes by number of rows and plot the 15 most
# frequent ones, most frequent first.
top_languages = merged_books["Language"].value_counts().index[:15]
sns.countplot(data=merged_books, y="Language", order=top_languages)
plt.title("Top 15 of the most represented languages")
plt.show()

In [None]:
# Keep only the rows whose language code is exactly "eng", then preview them.
is_english = merged_books["Language"].eq("eng")
merged_books_en = merged_books.loc[is_english]
merged_books_en.head()

### c) Description of books analysis

In [None]:
# Select the four columns used for the description analysis. The explicit
# .copy() makes `desc` an independent frame: its "Description" column is
# overwritten later, and assigning into a plain column selection of
# merged_books_en raises SettingWithCopyWarning.
desc = merged_books_en[["Id", "Name", "Authors", "Description"]].copy()
desc.head(3)

In [None]:
# Replace the "Description" column with the output of nlp.normalize_corpus.
# NOTE(review): normalize_corpus lives in nlp_module (not shown here); it
# presumably returns one cleaned text per input row - confirm.
desc["Description"] = nlp.normalize_corpus(desc["Description"])

In [None]:
# Preview the first three rows of desc.
desc.head(3)