<h1 align='center'> EDA: <br> Amazon Top 50 Bestselling Books 2009 - 2019 </h1>

Este notebook apresenta uma análise exploratória de dados (EDA) univariada dos Top 50 Bestselling Books da Amazon entre 2009 e 2019.


## Importando as bibliotecas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Colour palette used by the plots. The hex codes are listed in their
# original order and the list is flipped, so '#009473' is the first entry.
paletadeCores = [
    '#F5DF4D', '#939597', '#0F4C81', '#FF6F61', '#5F4B8B', '#88B04B',
    '#92A8D1', '#F7CAC9', '#955251', '#B163A3', '#009473',
][::-1]

## Importando os dados

In [2]:
# Read the bestsellers CSV (relative path) into a DataFrame and preview the first rows
csv_path = 'bestsellers with categories.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Name,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction


### Propriedades básicas do DataFrame - describe e info

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,User Rating,Reviews,Price,Year
count,550.0,550.0,550.0,550.0
mean,4.618364,11953.281818,13.1,2014.0
std,0.22698,11731.132017,10.842262,3.165156
min,3.3,37.0,0.0,2009.0
25%,4.5,4058.0,7.0,2011.0
50%,4.7,8580.0,11.0,2014.0
75%,4.8,17253.25,16.0,2017.0
max,4.9,87841.0,105.0,2019.0


In [4]:
# Column names, non-null counts, dtypes and memory usage of the frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550 entries, 0 to 549
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         550 non-null    object 
 1   Author       550 non-null    object 
 2   User Rating  550 non-null    float64
 3   Reviews      550 non-null    int64  
 4   Price        550 non-null    int64  
 5   Year         550 non-null    int64  
 6   Genre        550 non-null    object 
dtypes: float64(1), int64(3), object(3)
memory usage: 30.2+ KB


### Transformação da coluna Year para DateTime

In [5]:
# Parse the four-digit year into a datetime64 column; each value becomes
# 1 January of that year.
df['Year'] = pd.to_datetime(df['Year'], format='%Y')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550 entries, 0 to 549
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Name         550 non-null    object        
 1   Author       550 non-null    object        
 2   User Rating  550 non-null    float64       
 3   Reviews      550 non-null    int64         
 4   Price        550 non-null    int64         
 5   Year         550 non-null    datetime64[ns]
 6   Genre        550 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 30.2+ KB
None


Unnamed: 0,Name,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016-01-01,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011-01-01,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018-01-01,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017-01-01,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019-01-01,Non Fiction


## Visualização

### Histogramas

In [None]:
# One figure / one axes, 10 x 6 inches
fig, ax = plt.subplots(figsize=(10, 6))

# Density-normalised histogram of every User Rating, bars 0.1 wide
sns.histplot(df['User Rating'], stat='density', binwidth=0.1, color='#F5DF4D', ax=ax)

# Axis captions
ax.set_xlabel("User Rating (Bin Size = 0.1)", size=12)
ax.set_ylabel("% per User Rating", size=12)

# Figure headline (trailing ';' hides the returned Text repr)
ax.set_title("Amazon Best Sellers User Ratings", size=24);

In [None]:
# One figure / one axes, 10 x 6 inches
fig, ax = plt.subplots(figsize=(10, 6))

# Draw one density histogram per genre on the same axes, in legend order
genre_colours = (('Fiction', '#F5DF4D'), ('Non Fiction', '#939597'))
for genre, colour in genre_colours:
    is_genre = df['Genre'] == genre
    sns.histplot(df.loc[is_genre]['User Rating'], stat='density', binwidth=0.1, color=colour, ax=ax)

# Legend entries follow the drawing order above
ax.legend(labels=['Fiction', 'Non Fiction'])

# Axis captions
ax.set_xlabel("User Rating (Bin Size = 0.1)", size=12)
ax.set_ylabel("% per User Rating", size=12)

# Figure headline (trailing ';' hides the returned Text repr)
ax.set_title("Amazon Best Sellers User Ratings by Genre", size=24);

In [None]:
# Overlay one semi-transparent User Rating histogram per year on a single figure.
plt.figure(figsize = (10, 6))

# Integer calendar year of every row; comparing integers avoids relying on
# pandas parsing str(year) as a timestamp.
year_of_row = df['Year'].dt.year
years = np.sort(year_of_row.unique())
for i, year in enumerate(years):
    # Colours come from paletadeCores, the list defined in the setup cell
    sns.histplot(df.loc[year_of_row == year, 'User Rating'], stat='density', binwidth=0.1,
                 color=paletadeCores[i], alpha=0.5)

# Legend: one entry per year, in drawing order
plt.legend(labels=years)

# x-axis label
plt.xlabel( "User Rating (Bin Size = 0.1)" , size=12)

# y-axis label
plt.ylabel( "% per User Rating" , size=12)

# Title
plt.title( "Amazon Best Sellers User Ratings by Year" , size=24);

In [None]:
# Overlay User Rating histograms for the first five years in the data
# (the figure is titled 2009-2013).
plt.figure(figsize = (10, 6))

# Integer calendar year of every row
year_of_row = df['Year'].dt.year
years = np.sort(year_of_row.unique())

# Slice the years instead of filtering the loop index; i stays 0..4, so each
# year keeps the colour at its position in the full sorted list.
for i, year in enumerate(years[0:5]):
    # Colours come from paletadeCores, the list defined in the setup cell
    sns.histplot(df.loc[year_of_row == year, 'User Rating'], stat='density', binwidth=0.1,
                 color=paletadeCores[i])

# Legend: one entry per plotted year, in drawing order
plt.legend(labels=years[0:5])

# x-axis label
plt.xlabel( "User Rating (Bin Size = 0.1)" , size=12)

# y-axis label
plt.ylabel( "% per User Rating" , size=12)

# Title
plt.title( "Amazon Best Sellers User Ratings by Year 2009-2013" , size=24);

In [None]:
# Overlay User Rating histograms for the years at positions 5..9 of the sorted
# year list (the figure is titled 2014-2018).
plt.figure(figsize = (10, 6))

# Integer calendar year of every row
year_of_row = df['Year'].dt.year
years = np.sort(year_of_row.unique())

# start=5 keeps i equal to the year's position in the full list, so the colour
# assigned to each year matches the other per-year figures.
for i, year in enumerate(years[5:10], start=5):
    # Colours come from paletadeCores, the list defined in the setup cell
    sns.histplot(df.loc[year_of_row == year, 'User Rating'], stat='density', binwidth=0.1,
                 color=paletadeCores[i])

# Legend: one entry per plotted year, in drawing order
plt.legend(labels=years[5:10])

# x-axis label
plt.xlabel( "User Rating (Bin Size = 0.1)" , size=12)

# y-axis label
plt.ylabel( "% per User Rating" , size=12)

# Title
plt.title( "Amazon Best Sellers User Ratings by Year 2014-2018" , size=24);

In [None]:
# Overlay User Rating histograms for the years from position 8 onward of the
# sorted year list (the figure is titled 2017-2019).
plt.figure(figsize = (10, 6))

# Integer calendar year of every row
year_of_row = df['Year'].dt.year
years = np.sort(year_of_row.unique())

# start=8 keeps i equal to the year's position in the full list, so the colour
# assigned to each year matches the other per-year figures. The slice simply
# stops at the end of the array if fewer than 12 years exist.
for i, year in enumerate(years[8:12], start=8):
    # Colours come from paletadeCores, the list defined in the setup cell
    sns.histplot(df.loc[year_of_row == year, 'User Rating'], stat='density', binwidth=0.1,
                 color=paletadeCores[i])

# Legend: one entry per plotted year, in drawing order
plt.legend(labels=years[8:12])

# x-axis label
plt.xlabel( "User Rating (Bin Size = 0.1)" , size=12)

# y-axis label
plt.ylabel( "% per User Rating" , size=12)

# Title
plt.title( "Amazon Best Sellers User Ratings by Year 2017-2019" , size=24);

In [6]:
# Summary statistics of the Price column, shown as a one-column table
df['Price'].describe().to_frame()

Unnamed: 0,Price
count,550.0
mean,13.1
std,10.842262
min,0.0
25%,7.0
50%,11.0
75%,16.0
max,105.0


In [None]:
# Bucket every book into one of four price quantile bins; labels=False stores
# the integer bin index rather than an interval label. The column is kept on
# df because later cells group by it.
df['Price Quantile'] = pd.qcut(df['Price'], 4, labels=False)

# Set figure size (width, height) in inches
plt.figure(figsize = (10, 6))

prices = np.sort(df['Price Quantile'].unique())
for i, price in enumerate(prices):
    # One density histogram per quantile bin; colours come from paletadeCores,
    # the list defined in the setup cell
    sns.histplot(df.loc[df['Price Quantile'] == price, 'User Rating'], stat='density', binwidth=0.1,
                 color=paletadeCores[i])

# Legend: one entry per quantile bin, in drawing order
plt.legend(labels=prices)

# Set label for x-axis
plt.xlabel( "User Rating (Bin Size = 0.1)" , size=12)

# Set label for y-axis
plt.ylabel( "% per User Rating" , size=12)

# Set title for figure
plt.title( "Amazon Best Sellers User Ratings by Price Quantile" , size=24);

In [None]:
# Bucket every book into one of four review-count quantile bins; labels=False
# stores the integer bin index rather than an interval label. The column is
# kept on df because later cells group by it.
df['Reviews Quantile'] = pd.qcut(df['Reviews'], 4, labels=False)

# Set figure size (width, height) in inches
plt.figure(figsize = (10, 6))

reviews = np.sort(df['Reviews Quantile'].unique())
for i, review in enumerate(reviews):
    # One density histogram per quantile bin; colours come from paletadeCores,
    # the list defined in the setup cell
    sns.histplot(df.loc[df['Reviews Quantile'] == review, 'User Rating'], stat='density', binwidth=0.1,
                 color=paletadeCores[i])

# Legend: one entry per quantile bin, in drawing order
plt.legend(labels=reviews)

# Set label for x-axis
plt.xlabel( "User Rating (Bin Size = 0.1)" , size=12)

# Set label for y-axis
plt.ylabel( "% per User Rating" , size=12)

# Set title for figure
plt.title( "Amazon Best Sellers User Ratings by Review Quantile" , size=24);

In [None]:
# One figure / one axes, 10 x 6 inches
fig, ax = plt.subplots(figsize=(10, 6))

# Single seaborn box plot of the User Rating column
sns.boxplot(data=df['User Rating'], color='#F5DF4D', ax=ax)

# x-axis caption
ax.set_xlabel("User Rating", size=12)

# Figure headline (trailing ';' hides the returned Text repr)
ax.set_title("Amazon Best Sellers User Ratings", size=24);

In [None]:
# pandas box plot of User Rating, one box per Genre
ax = df.boxplot(column='User Rating', by='Genre', figsize=(10, 6), fontsize=12)
# Label the y-axis and blank the axes title in one call
ax.set(ylabel='User Rating', title='');

In [None]:
# Collapse the datetime Year column to plain integer years before grouping the
# box plot by it. The dtype guard makes the cell safe to re-run: once Year
# holds integers, the .dt accessor would raise AttributeError.
if pd.api.types.is_datetime64_any_dtype(df['Year']):
    df['Year'] = df['Year'].dt.year

# pandas box plot of User Rating, one box per Year, tick labels rotated 35 degrees
ax = df.boxplot('User Rating', 'Year', figsize=(10,6), fontsize=16, rot=35)
ax.set_ylabel('User Rating')
# Blank the axes title
ax.set_title('');

In [None]:
# pandas box plot of User Rating, one box per Price Quantile bin
ax = df.boxplot(column='User Rating', by='Price Quantile', figsize=(10, 6), fontsize=16, rot=35)
# Label the y-axis and blank the axes title in one call
ax.set(ylabel='User Rating', title='');

In [None]:
# pandas box plot of User Rating, one box per Reviews Quantile bin
ax = df.boxplot(column='User Rating', by='Reviews Quantile', figsize=(10, 6), fontsize=16, rot=35)
# Label the y-axis and blank the axes title in one call
ax.set(ylabel='User Rating', title='');