# Bar chart as an effective alternative to a pie chart

In this exercise, we will visualize the distribution of different categories in a data set. We'll start with a pie chart, understand its problems, and evolve it into a neat summary graph.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the movies table from a CSV file (path is relative to the notebook's
# working directory) and preview its first rows.
tbl_movies = pd.read_csv('data/tmdb_5000_movies.csv')
tbl_movies.head()

In [None]:
import json

In [None]:
def get_genre(genres):
    """Pick one representative genre name from a JSON-encoded genre list.

    Parameters
    ----------
    genres : str
        JSON text holding a list of objects, each with an ``'id'`` and a
        ``'name'`` key.

    Returns
    -------
    str
        The ``'name'`` of the entry with the smallest ``'id'`` (the first
        such entry if ids repeat), or ``'N/A'`` when the list is empty.
    """
    # `default=None` makes the empty-list case explicit instead of raising.
    lowest = min(json.loads(genres), key=lambda entry: entry['id'], default=None)
    return 'N/A' if lowest is None else lowest['name']
# Add a 'genre' column by applying get_genre to every value of the
# JSON-encoded 'genres' column.
tbl_movies['genre'] = tbl_movies['genres'].apply(get_genre)

In [None]:
# Number of movies per genre, ordered alphabetically by genre name.
# The Series method is used because the top-level `pd.value_counts`
# function is deprecated in recent pandas releases.
genre_counts = tbl_movies['genre'].value_counts().sort_index()
genre_counts

In [None]:
# First attempt: a pie chart with one labelled wedge per genre.
fig, ax = plt.subplots()
ax.pie(genre_counts, labels=genre_counts.index)
# The trailing semicolon belongs on the cell's last expression: that is the
# one whose repr would otherwise be echoed below the figure.
ax.set_title("Please don't");

In [None]:
# Same pie chart, with the axes aspect ratio pinned to 1 so that the pie is
# drawn as a circle rather than an ellipse.
fig, ax = plt.subplots()
ax.set_aspect(1)
ax.pie(genre_counts, labels=genre_counts.index)
# The trailing semicolon belongs on the cell's last expression: that is the
# one whose repr would otherwise be echoed below the figure.
ax.set_title("Still don't");

In [None]:
# Make a bar chart using the same data
fig, ax = plt.subplots(figsize=(12, 4), dpi=240)
# Draw on the axes created above explicitly rather than relying on the
# implicit "current axes"; the semicolon hides the returned Axes repr.
sns.barplot(x=genre_counts.index, y=genre_counts, ax=ax);

In [None]:
#! Sorting improves readability


In [None]:
# "vanilla matplotlib" version (sort of)
fig, ax = plt.subplots(figsize=(12, 4), dpi=240)
y = list(range(len(genre_counts)))  # integer bar positions along the x axis
bars = ax.bar(y, genre_counts)

# Give each bar the next color of the default property cycle ("C0", "C1", ...).
# The index is wrapped explicitly so the result does not depend on whether the
# installed Matplotlib accepts specs past the end of the cycle (e.g. "C10"),
# and so no exception has to be swallowed to restart the sequence.
n_cycle_colors = len(plt.rcParams['axes.prop_cycle'])
for i, b in enumerate(bars):
    b.set_color(f'C{i % n_cycle_colors}')

ax.set_xticks(y)
ax.set_xticklabels(genre_counts.index, rotation=90)
ax.set_ylabel('# of movies')
ax.set_title("For in much color is much vexation, and he who increases noise, increases pain.");

In [None]:
fig, ax = plt.subplots(figsize=(12,4), dpi=240)
y = list(range(len(genre_counts)))  # integer bar positions along the x axis
bars = ax.bar(y, genre_counts)
ax.set_xticks(y)
ax.set_xticklabels(genre_counts.index)
# Rotated labels are hard to read. A better option is to alternate label
# heights: every second label is pushed down by 0.05 so that neighbouring
# labels do not collide.
for tk in ax.get_xticklabels()[1::2]:
    tk.set_y(tk.get_position()[1] - 0.05)
ax.set_ylabel('# of movies');

In [None]:
#! Your turn -- use ax.barh to create a horizontal bar chart. Use only one color


Look at what's important. Let's combine the small genres into a single category.

In [None]:
# One-column table of movie counts per genre. The column is named 'n'
# explicitly: the name pandas gives to a value_counts() result differs between
# versions ('genre' before 2.0, 'count' from 2.0 on), so renaming a
# hard-coded column name would silently do nothing on newer releases.
tmp = pd.DataFrame({'n': genre_counts})
# A genre is "small" when it holds less than 5% of all movies.
tmp['is_other'] = tmp['n'] < (tmp['n'].sum() * 0.05)
# Keep the large genres as they are ...
tbl_genres = tmp.loc[~tmp['is_other'], ['n']].copy()
# ... and append a single 'Other' row holding the combined small-genre count.
tbl_genres.loc['Other', 'n'] = tmp.loc[tmp['is_other'], 'n'].sum()
# Appending a row may upcast the column, so cast it back to integers.
tbl_genres['n'] = tbl_genres['n'].astype(int)
tbl_genres

In [None]:
# Horizontal bar chart of the condensed genre table, one bar per row.
fig, ax = plt.subplots()
y = list(range(len(tbl_genres)))  # integer bar positions along the y axis
ax.barh(y, tbl_genres['n'])
ax.set_yticks(y)
ax.set_yticklabels(tbl_genres.index)
# Swap the y limits so the first row of the table is drawn at the top.
ax.set_ylim(ax.get_ylim()[::-1]);

In [None]:
# Share of each row in the overall total, expressed in percent.
tbl_genres['percent'] = tbl_genres['n'] / tbl_genres['n'].sum() * 100

In [None]:
#! text labels


In [None]:
#! summary table 


In [None]:
#! better alignment
