# Getting to know a dataset

## Initial exploration

In [None]:
# Preview the first rows of `books` to see which columns it holds.
books.head()

In [None]:
# Print a structural summary of `books` (columns, non-null counts, dtypes).
books.info()

In [None]:
# Count how many rows fall under each distinct `genre` value.
books.value_counts('genre')

In [None]:
# Descriptive statistics for `books`.
books.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram of `rating` using narrow bins (0.1 wide).
sns.histplot(data=books, x="rating", binwidth=.1)
# For comparison, swap in the call below to see the same data with bins 1 wide:
# sns.histplot(data=books, x="rating", binwidth=1)
plt.show()

## Data validation

In [None]:
# Inspect the data type of every column before validating individual ones.
books.dtypes

In [None]:
# Replace the `year` column with an integer-typed version of itself.
# NOTE(review): astype(int) fails on missing values — confirm `year` has none.
books['year'] = books['year'].astype(int)

In [None]:
# Boolean mask: True where `genre` is either 'Fiction' or 'Non Fiction'.
books['genre'].isin(['Fiction', 'Non Fiction'])

In [None]:
# Negate the mask with `~`: True where `genre` is NOT one of the two listed values.
~books['genre'].isin(['Fiction', 'Non Fiction'])

In [None]:
# Use the mask to keep only rows whose genre is one of the listed values, then preview them.
books[books['genre'].isin(['Fiction', 'Non Fiction'])].head()

In [None]:
# Restrict `books` to its numeric columns and preview them.
books.select_dtypes("number").head()

In [None]:
# Range check on `year`. Print both bounds explicitly: a notebook cell only
# echoes its final expression, so bare `.min()` / `.max()` lines followed by
# other statements would never be displayed.
print("min year:", books['year'].min())
print("max year:", books['year'].max())

# Distribution of `year` over the whole dataset.
sns.boxplot(data=books, x='year')
plt.show()

# Same distribution, one box per genre. Call plt.show() here too so the
# figure is rendered the same way as every other plot in this file.
sns.boxplot(data=books, x='year', y='genre')
plt.show()

## Data summarization

### Aggregating functions
- sum: ```.sum()```
- count: ```.count()```
- minimum: ```.min()```
- maximum: ```.max()```
- mean: ```.mean()```
- median: ```.median()```
- variance: ```.var()```
- standard deviation: ```.std()```

In [None]:
# Mean of every numeric column within each genre.
# numeric_only=True skips text columns; without it pandas >= 2.0 raises a
# TypeError as soon as the frame contains a non-numeric column.
books.groupby('genre').mean(numeric_only=True)

In [None]:
# Column-wise mean and standard deviation.
# Select the numeric columns first: these reductions are undefined for text
# columns and pandas >= 2.0 raises a TypeError instead of dropping them.
books.select_dtypes("number").agg(['mean', 'std'])

In [None]:
# Per-column aggregations via a dict: mean and std for `rating`, median for `year`.
books.agg({'rating': ['mean', 'std'], 'year': ['median']})

In [None]:
# Named aggregation per genre: each keyword becomes an output column and is
# given as a (source column, aggregation function) pair.
books.groupby('genre').agg(
    mean_rating=('rating', 'mean'),
    std_rating=('rating', 'std'),
    median_year=('year', 'median')
)

In [None]:
# Visualize a categorical summary: one bar per `genre`, summarising `rating`.
sns.barplot(data=books, x='genre', y='rating')
plt.show()