# VISUALIZATION WITH PANDAS & MATPLOTLIB

### OUTLINE

* STEP 1: HISTOGRAM - SHOW THE DISTRIBUTION OF A NUMERICAL VARIABLE
* STEP 2: SCATTER PLOT - SHOW THE RELATIONSHIP BETWEEN TWO NUMERICAL VARIABLES
* STEP 3: BAR PLOT - SHOW A NUMERICAL COMPARISON ACROSS DIFFERENT CATEGORIES
* STEP 4: BOX PLOT - SHOW QUARTILES (AND OUTLIERS) FOR ONE OR MORE NUMERICAL VARIABLES
* STEP 5: LINE PLOT - SHOW THE TREND OF A NUMERICAL VARIABLE OVER TIME
* STEP 6: GROUPED BOX PLOTS AND GROUPED HISTOGRAMS - SHOW ONE PLOT FOR EACH GROUP
* STEP 7: ASSORTED FUNCTIONALITY

**HISTOGRAM**
- drinks.beer.plot(`kind='hist'`, bins=20)

**SMOOTH VERSION OF HISTOGRAM**
- drinks.beer.plot(`kind='density'`, xlim=(0, 500)) 

**STACKED HISTOGRAM**
- drinks[['beer', 'spirit', 'wine']].plot(`kind='hist'`, stacked=True)

**SCATTER**
- drinks.plot(`kind='scatter'`, x='beer', y='wine')

**SCATTER MATRIX**
- pd.plotting.scatter_matrix(drinks[['beer', 'spirit', 'wine']])

**NUMERICAL COMPARISON ACROSS MULTIPLE CATEGORIES**
- drinks.groupby('continent').mean(numeric_only=True).drop('liters', axis=1).plot(`kind='bar'`, stacked=True)

**QUARTILES & OUTLIERS**
- drinks.beer.plot(`kind='box'`)


In [None]:
# render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

In [None]:
# print IPython's built-in help for the magic-command system
%magic

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

## READ IN THE DRINKS DATA

In [None]:
drink_cols = ['country', 'beer', 'spirit', 'wine', 'liters', 'continent']
drinks = pd.read_csv('../data/drinks.csv', header=0, names=drink_cols, na_filter=False)

In [None]:
drinks.head()

## STEP 1: HISTOGRAM - SHOW THE DISTRIBUTION OF A NUMERICAL VARIABLE

In [None]:
# sort the beer column and split it into 3 groups
drinks.beer.sort_values().values

In [None]:
# compare with histogram
drinks.beer.plot(kind='hist', bins=3)

In [None]:
# try more bins
drinks.beer.plot(kind='hist', bins=10)

In [None]:
drinks[drinks.beer < 10].spirit.plot(kind='hist', bins=3)

In [None]:
# add title and labels
drinks.beer.plot(kind='hist', bins=30, title='Histogram of Beer Servings')
plt.xlabel('Beer Servings')
plt.ylabel('Frequency')

In [None]:
# compare with density plot (smooth version of a histogram)
drinks.beer.plot(kind='density', xlim=(0, 385))

In [None]:
# stacked histogram with multiple variables
drinks[['wine', 'spirit', 'beer']].plot(kind='hist', stacked=True, bins=3, figsize=(10,5))

## STEP 2: SCATTER PLOT - SHOW THE RELATIONSHIP BETWEEN TWO NUMERICAL VARIABLES

In [None]:
# select the beer and wine columns and sort by beer
drinks[['beer', 'wine']].sort_values('beer').values

In [None]:
len(set(drinks.country))

In [None]:
# compare with scatter plot
drinks.plot(kind='scatter', x='beer', y='wine')

In [None]:
# add transparency
drinks.plot(kind='scatter', x='beer', y='wine', alpha=0.3)

In [None]:
# vary point color by spirit servings
drinks.plot(kind='scatter', x='beer', y='wine', c='spirit', colormap='Blues')

In [None]:
# scatter matrix of three numerical columns
pd.scatter_matrix(drinks[['beer', 'spirit', 'wine']])

In [None]:
# increase figure size
pd.scatter_matrix(drinks[['beer', 'spirit', 'wine']], figsize=(10, 8))

## STEP 3: BAR PLOT - SHOW A NUMERICAL COMPARISON ACROSS DIFFERENT CATEGORIES

In [None]:
# count the number of countries in each continent
drinks.continent.value_counts()

In [None]:
# compare with bar plot
drinks.continent.value_counts().plot(kind='bar')

In [None]:
# calculate the average beer/spirit/wine amounts for each continent
drinks.groupby('continent').mean().drop('liters', axis=1)

In [None]:
drinks.columns

In [None]:
drinks2 = drinks.drop('liters', axis=1)
grp_mean = drinks2.groupby('continent').mean()
grp_mean.plot(kind='bar', colormap='Greens')

In [None]:
# side-by-side bar plots
drinks.drop('liters', axis=1).groupby('continent').mean().plot(kind='bar', colormap='Greens')

In [None]:
# stacked bar plots
drinks.groupby('continent').mean().drop('liters', axis=1).plot(kind='bar', stacked=True)

## STEP 4: BOX PLOT - SHOW QUARTILES (AND OUTLIERS) FOR ONE OR MORE NUMERICAL VARIABLES

In [None]:
# show "five-number summary" for beer
drinks.beer.describe()

In [None]:
# compare with box plot
drinks.beer.plot(kind='box')

In [None]:
import numpy as np
print (np.median(drinks.beer))
print (drinks.beer.median())

In [None]:
# include multiple variables
drinks.drop('liters', axis=1).plot(kind='box', showmeans=True)

In [None]:
drinks.wine.describe()

In [None]:
drinks[drinks.wine > 145]

## STEP 5: LINE PLOT - SHOW THE TREND OF A NUMERICAL VARIABLE OVER TIME

In [None]:
# read in the ufo data
ufo = pd.read_csv('../data/ufo.csv')
ufo['Time'] = pd.to_datetime(ufo.Time)
ufo['Year'] = ufo.Time.dt.year

In [None]:
ufo.info()

In [None]:
# count the number of ufo reports each year (and sort by year)
ufo.Year.value_counts().sort_index()

In [None]:
ufo.Time.dt.dayofweek.plot()

In [None]:
ufo['dayofweek'] = ufo.Time.dt.dayofweek

In [None]:
ufo.head()

# Heading 1
## Heading 2
### Heading 3
__BOLD__
1. One
2. Two
3. Three

### Extract dates into days, months, years, etc. - a good feature engineering technique

In [None]:
ufo.dayofweek.value_counts().sort_index().plot()

In [None]:
# compare with line plot
ufo.Year.value_counts().sort_index().plot()

In [None]:
# compare with line plot
ufo.Year.value_counts().sort_index().plot()

In [None]:
# don't use a line plot when there is no logical ordering
drinks.continent.value_counts().plot()

## STEP 6: GROUPED BOX PLOTS AND GROUPED HISTOGRAMS - SHOW ONE PLOT FOR EACH GROUP

In [None]:
# reminder: box plot of beer servings
drinks.beer.plot(kind='box')

In [None]:
# reminder: histogram of beer servings
drinks.beer.plot(kind='hist')

In [None]:
# box plot of beer servings grouped by continent
drinks.boxplot(column='beer', by='continent')

In [None]:
drinks[(drinks.continent == 'OC') & (drinks.beer > 245)]

In [None]:
# histogram of beer servings grouped by continent
drinks.beer.hist(by=drinks.continent)

In [None]:
# share the x axes
drinks.beer.hist(by=drinks.continent, sharex=True, figsize=(10,10))

In [None]:
# share the x and y axes
drinks.beer.hist(by=drinks.continent, sharex=True, sharey=True, figsize=(10,10))

In [None]:
drinks.head()

In [None]:
# change the layout
drinks.beer.hist(by=drinks.continent, layout=(2, 3), sharex=True, sharey=True, figsize=(10,10));

In [None]:
# box plot of all numeric columns grouped by continent
drinks.boxplot(by='continent')

## STEP 7: ASSORTED FUNCTIONALITY

In [None]:
# saving a plot to a file: run all four lines at once
# (savefig must run in the same cell as the plotting calls)
drinks.beer.plot(kind='hist', bins=20, title='Histogram of Beer Servings')
plt.xlabel('Beer Servings')
plt.ylabel('Frequency')
plt.savefig('beer_histogram.png')

In [None]:
# display the image file written by the previous cell
from IPython.display import Image
Image(filename='beer_histogram.png') 

In [None]:
# list available plot styles
plt.style.available

In [None]:
# change to a different style
plt.style.use('ggplot')

# redraw the same histogram in the new style (this cell does not save a file)
drinks.beer.plot(kind='hist', bins=20, title='Histogram of Beer Servings')
plt.xlabel('Beer Servings')
plt.ylabel('Frequency')

In [None]:
# Nate Silver's plotting / color scheme
plt.style.use('fivethirtyeight')

# redraw the same histogram in the new style (this cell does not save a file)
drinks.beer.plot(kind='hist', bins=20, title='Histogram of Beer Servings')
plt.xlabel('Beer Servings')
plt.ylabel('Frequency')

In [None]:
# NOTE(review): seaborn is imported but never called in the visible cells;
# presumably it is here to compare the look of the next plot - confirm
import seaborn as sn

In [None]:
# redraw the same histogram once more after the seaborn import (no file is saved)
drinks.beer.plot(kind='hist', bins=20, title='Histogram of Beer Servings')
plt.xlabel('Beer Servings')
plt.ylabel('Frequency')

## RESOURCES

- [Pandas Visualization](https://pandas.pydata.org/docs/user_guide/visualization.html)