# Data Visualization Starter Notebook

Visualization packages in Python can be opaque, and they are sometimes more difficult to work with than the model development itself. This notebook contains many common plot types with example code for both matplotlib and seaborn. Googling is generally your best bet for finding code for more complicated graphs, but the resources below are excellent, more complete tutorials on the two packages.

https://matplotlib.org/tutorials/introductory/sample_plots.html

https://seaborn.pydata.org/tutorial.html

In [1]:
import pandas as pd
# Download the diamonds CSV and print the first rows of a handful of its columns.
diamonds = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/diamonds.csv')
user_cols = ['carat', 'cut', 'x', 'y', 'z']
preview = diamonds[user_cols].head(6)
print("First 6 rows:")
print(preview)

First 6 rows:
   carat        cut     x     y     z
0   0.23      Ideal  3.95  3.98  2.43
1   0.21    Premium  3.89  3.84  2.31
2   0.23       Good  4.05  4.07  2.31
3   0.29    Premium  4.20  4.23  2.63
4   0.31       Good  4.34  4.35  2.75
5   0.24  Very Good  3.94  3.96  2.48


In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib as mpl

In [None]:
# Violin plot of price for each diamond color; `hue` is mapped to the same column as `x`.
sns.violinplot(data=diamonds, x="color", y="price", hue="color")

In [None]:
# Both wine files use ';' as the field separator.
df_red = pd.read_csv('../data/winequality-red.csv', sep=';')
df_white = pd.read_csv('../data/winequality-white.csv', sep=';')

In [None]:
# Label each source frame with its wine type before combining them,
# so the origin of every row is still known afterwards.
df_red['type'] = 'red'
print(f"{df_red.shape[0]} red wines.")

df_white['type'] = 'white'
print(f"{df_white.shape[0]} white wines.")

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the
# row labels of the red and white frames are both kept, and the resulting duplicate
# index labels can break later index-aligned operations.
df = pd.concat([df_red, df_white], ignore_index=True)

# Numeric flag: 1 for red wines, 0 for white wines.
df['red_bool'] = (df['type'] == 'red') * 1

df.head()

## Bar Charts

### Standard

#### Matplotlib

In [None]:
# Number of wines per type: the index holds the type labels, the values hold the counts.
df_vc = df['type'].value_counts()
# Integer bar positions (0, 1, ...), one per category.
x_list = list(range(len(df_vc)))

plt.bar(x_list,           # bar positions along the x axis
        df_vc,            # bar heights
        width=2/3,
        align='center')

# Replace the numeric positions with the category names.
plt.xticks(x_list, df_vc.index)

plt.title('Count of wines by color')
plt.ylabel('Count')

plt.show()

In [None]:
# Horizontal version of the bar chart: positions go on the y axis, counts on the x axis.
plt.barh(x_list,          # bar positions along the y axis
         df_vc,           # bar lengths
         height=2/3,
         align='center')

# Replace the numeric positions with the category names.
plt.yticks(x_list, df_vc.index)

plt.title('Count of wines by color')
plt.xlabel('Count')

plt.show()

#### Subplots

We often want to see two graphs side by side to get a clearer understanding of the data.

Further reading:
https://matplotlib.org/devdocs/gallery/subplots_axes_and_figures/subplots_demo.html

In [None]:
# Two panels in one figure: vertical bars in the first, horizontal bars in the second.
# Each panel is configured through its own Axes object, so there is no need to
# switch pyplot's "current axes" back and forth.
fig, (ax1, ax2) = plt.subplots(2)

# First panel: vertical bar chart.
ax1.bar(x_list,           # bar positions along the x axis
        df_vc,            # bar heights
        align='center',
        width=2/3)
ax1.set_xticks(x_list)
ax1.set_xticklabels(df_vc.index)
ax1.set_ylabel('Count')
ax1.set_title('Count of wines by color')

# Second panel: the same counts as horizontal bars.
ax2.barh(x_list,          # bar positions along the y axis
         df_vc,           # bar lengths
         align='center',
         height=2/3)
ax2.set_yticks(x_list)
ax2.set_yticklabels(df_vc.index)
ax2.set_xlabel('Count')

# Keep the first panel's tick labels from colliding with the second panel.
fig.tight_layout()

plt.show()

#### Seaborn

In [None]:
# Vertical seaborn bar chart: wine types on x, counts on y.
bp = sns.barplot(x=df_vc.index, y=df_vc.values)
bp.set_title('Wine by color')
bp.set(xlabel='Type of Wine', ylabel='Count')
plt.show()

In [None]:
# Horizontal seaborn bar chart: swapping x and y flips the orientation.
bp = sns.barplot(x=df_vc.values, y=df_vc.index)
bp.set_title('Wine by color')
bp.set(ylabel='Type of Wine', xlabel='Count')
plt.show()

### Stacked Bar

####  Matplotlib

Matplotlib doesn't have a clean way to produce stacked bar charts, but the code below is functional.

In [None]:
# Count wines per (type, quality) pair, then pivot the type level into columns:
# one row per quality score, one column per wine type.
# Combinations that never occur are filled with 0.
wine_gb = (
    df.groupby(['type', 'quality'])['quality']
      .count()
      .unstack('type')
      .fillna(0)
)
wine_gb

In [None]:
wid = 2/3

fig, ax = plt.subplots()

# One bar position per column of wine_gb (i.e. per wine type).
positions = list(range(len(wine_gb.columns)))

# Running totals down the quality rows: row i-1 of `tops` is the height at which
# the segment for row i has to start.
tops = wine_gb.cumsum()

# One grayscale shade per quality level, evenly spaced from black ('0.00') to '0.60'.
shades = [f"{s:.2f}" for s in np.linspace(0, 0.6, len(wine_gb))]

# Draw one segment per quality level instead of hard-coding a fixed number of rows.
handles = []
for i, shade in enumerate(shades):
    bottom = tops.iloc[i - 1] if i > 0 else None  # first segment starts at zero
    bars = ax.bar(positions, wine_gb.iloc[i], bottom=bottom, color=shade, width=wid)
    handles.append(bars[0])  # one patch per segment is enough for the legend

ax.set_xticks(positions)
ax.set_xticklabels([str(col).capitalize() for col in wine_gb.columns])
ax.set_ylabel('Count')

ax.legend(handles, wine_gb.index, title='Quality')

plt.show()

In [None]:
## Rescale columns

# Divide every column by its own total so that each wine type sums to 1.
# The division builds a new DataFrame, so the raw counts in `wine_gb` are left
# untouched and both the absolute and the relative tables stay available.
wine_gb_100 = wine_gb / wine_gb.sum()

In [None]:
fig, ax = plt.subplots()

# One bar position per column of wine_gb_100 (i.e. per wine type).
positions = list(range(len(wine_gb_100.columns)))

# Running totals down the quality rows: row i-1 of `tops` is the height at which
# the segment for row i has to start.
tops = wine_gb_100.cumsum()

# One grayscale shade per quality level, evenly spaced from black ('0.00') to '0.60'.
shades = [f"{s:.2f}" for s in np.linspace(0, 0.6, len(wine_gb_100))]

# Every segment uses the same width, so the stack has straight edges.
handles = []
for i, shade in enumerate(shades):
    bottom = tops.iloc[i - 1] if i > 0 else None  # first segment starts at zero
    bars = ax.bar(positions, wine_gb_100.iloc[i], bottom=bottom, color=shade, width=wid)
    handles.append(bars[0])  # one patch per segment is enough for the legend

ax.set_xticks(positions)
ax.set_xticklabels([str(col).capitalize() for col in wine_gb_100.columns])
ax.set_ylabel('Share of wines')

ax.legend(handles, wine_gb_100.index, title='Quality')

plt.show()

### Seaborn

## Scatter Chart

In [None]:
# Plain scatter: volatile acidity on x, chlorides on y.
plt.scatter(x=df['volatile acidity'], y=df['chlorides'])

In [None]:
# One color name per row: maroon for red wines, gold for everything else.
colors = np.where(df['type'] == 'red', 'maroon', 'gold')
plt.scatter(df['volatile acidity'], df['chlorides'], c=colors)

In [None]:
# Same scatter, with the marker size taken from each wine's free sulfur dioxide value.
plt.scatter(df['volatile acidity'], df['chlorides'], s=df['free sulfur dioxide'], c=colors)

#### Seaborn

In [None]:
# Seaborn scatter, referring to the columns by name through `data=`.
sns.scatterplot(data=df, x='volatile acidity', y='chlorides')

In [None]:
# Same scatter, colored by wine type.
sns.scatterplot(data=df, x='volatile acidity', y='chlorides', hue='type')

In [None]:
# Grid of pairwise plots over the columns of df, colored by wine type.
sns.pairplot(data=df, hue='type')

## Histograms

#### Matplotlib

In [None]:
# Histogram of pH with matplotlib's default number of bins.
plt.hist(x=df['pH'])
plt.show()

In [None]:
# Same histogram with an explicit bin count.
plt.hist(x=df['pH'], bins=15)
plt.show()

#### Seaborn

In [None]:
# Histogram only (the KDE curve is switched off).
# NOTE(review): `distplot` is deprecated in newer seaborn releases in favour of
# `histplot`/`displot` — confirm against the installed seaborn version.
sns.distplot(a=df['pH'], kde=False)

In [None]:
# Histogram only (the KDE curve is switched off), with an explicit bin count.
# NOTE(review): `distplot` is deprecated in newer seaborn releases in favour of
# `histplot`/`displot` — confirm against the installed seaborn version.
sns.distplot(a=df['pH'], bins=15, kde=False)

## Line Charts

In [None]:
import numpy as np

def fake_date(days, base_date=np.datetime64('2014-01-01'), rng=None):
    """Return a random date in the half-open window [base_date, base_date + days).

    Parameters
    ----------
    days : int
        Width of the window in days; must be at least 1.
    base_date : numpy.datetime64, optional
        Earliest date that can be returned.
    rng : numpy.random.Generator, optional
        Source of randomness. Pass a seeded generator to get reproducible dates.
        When omitted, NumPy's global random state is used.

    Returns
    -------
    numpy.datetime64
        ``base_date`` shifted forward by a random whole number of days in
        ``[0, days)``.

    Raises
    ------
    ValueError
        If ``days`` is smaller than 1, because there is no date to choose from.
    """
    if days < 1:
        raise ValueError("days must be at least 1")
    if rng is None:
        # Default path: draw the offset from NumPy's global random state.
        offset = np.random.choice(np.arange(0, days))
    else:
        offset = rng.integers(0, days)
    return base_date + offset

# Give every row its own random date within a 365-day window, so there is a time axis to plot.
df['date'] = df.apply(lambda _row: fake_date(days=365), axis=1)

In [None]:
# Mean quality per (date, type) pair ...
daily_mean = df.groupby(['date', 'type'])['quality'].agg(['mean'])
# ... pivoted so that each wine type becomes its own column.
df_daily = daily_mean.unstack()
# agg(['mean']) leaves an outer 'mean' column level; drop it so the columns are just the types.
df_daily.columns = df_daily.columns.droplevel(0)

#### Matplotlib

In [None]:
# One line per wine type: daily mean quality over time.
plt.plot(df_daily.index, df_daily['red'], color='maroon')
plt.plot(df_daily.index, df_daily['white'], color='gold')
plt.show()

In [None]:
# Month number taken from each row's date.
df['month'] = df['date'].dt.month
# Mean quality per (month, type) pair, pivoted so each wine type is a column.
monthly_mean = df.groupby(['month', 'type'])['quality'].agg(['mean'])
df_monthly = monthly_mean.unstack()
# Drop the outer 'mean' column level left behind by agg(['mean']).
df_monthly.columns = df_monthly.columns.droplevel(0)

In [None]:
# One line per wine type: monthly mean quality.
plt.plot(df_monthly.index, df_monthly['red'], color='maroon')
plt.plot(df_monthly.index, df_monthly['white'], color='gold')
plt.xlabel('Month')
plt.show()

#### Seaborn

In [None]:
# Long-format table for seaborn: one row per (month, type) with its mean quality
# in a column named 'mean'.
df_monthly_stacked = (
    df.groupby(['month', 'type'])['quality']
      .agg(['mean'])
      .reset_index()
)

In [None]:
# seaborn draws one line per value of `hue` straight from the long-format table.
ax = sns.lineplot(data=df_monthly_stacked, x='month', y='mean', hue='type')

## Area Charts

In [None]:
# Month number taken from each row's date.
df['month'] = df['date'].dt.month
# Number of wines per (month, type) pair, pivoted so each wine type is a column.
monthly_count = df.groupby(['month', 'type'])['quality'].agg(['count'])
df_monthly_count = monthly_count.unstack()
# Drop the outer 'count' column level left behind by agg(['count']).
df_monthly_count.columns = df_monthly_count.columns.droplevel(0)
df_monthly_count.head()

In [None]:
# Use the month index itself for the x axis, so the ticks read as the actual month
# numbers and x always has exactly as many points as there are rows to plot.
months = df_monthly_count.index

plt.fill_between(months, df_monthly_count['red'], color='maroon', alpha=0.5, label='red')
plt.fill_between(months, df_monthly_count['white'], color='gold', alpha=0.5, label='white')

plt.xlabel('Month')
plt.ylabel('Count')
plt.legend()
plt.show()

In [None]:
# Divide each row by its row total so the wine types sum to 1 within every month.
df_monthly_norm = df_monthly_count.divide(df_monthly_count.sum(axis=1), axis=0)

plt.stackplot(df_monthly_norm.index,
              [df_monthly_norm['white'], df_monthly_norm['red']],
              labels=['white', 'red'],
              colors=['gold', 'maroon'],
              alpha=0.8)

plt.xlabel('Month')
plt.ylabel('Share of wines')
# The labels passed to stackplot only become visible once a legend is drawn.
plt.legend(loc='upper right')
plt.show()


## Heat Maps

#### Matplotlib

2D histograms are a special type of heat map that is helpful when our data is too dense to understand from individual points.

In [None]:
# Two panels: white wines in the first, red wines in the second.
fig, (whites, reds) = plt.subplots(nrows=2)

# Same 50x50-bin 2D histogram of volatile acidity against pH in each panel.
for panel, wines in ((whites, df_white), (reds, df_red)):
    panel.hist2d(wines['volatile acidity'], wines['pH'], bins=50)

plt.show()

#### Seaborn

In [None]:
# Hexbin joint plot of volatile acidity against pH.
# x and y are passed by keyword: newer seaborn releases no longer accept them as
# positional arguments, while the keyword form works on old and new versions alike.
g = sns.jointplot(x='volatile acidity', y='pH', data=df, kind="hex")

In [None]:
# Joint plot of volatile acidity against pH with a regression fit.
# x and y are passed by keyword: newer seaborn releases no longer accept them as
# positional arguments, while the keyword form works on old and new versions alike.
g = sns.jointplot(x='volatile acidity', y='pH', data=df, kind="reg")

## Styles/Color Palettes

In [None]:
# Show the names of the matplotlib styles that can be passed to style.use().
plt.style.available

In [None]:
# Switch matplotlib to the 'ggplot' style, then redraw the scatter to see the effect.
plt.style.use('ggplot')
plt.scatter(x=df['volatile acidity'], y=df['chlorides'])
plt.show()

In [None]:
# Switch matplotlib to the 'fivethirtyeight' style, then redraw the scatter to see the effect.
plt.style.use('fivethirtyeight')
plt.scatter(x=df['volatile acidity'], y=df['chlorides'])
plt.show()

#### Seaborn

In [None]:
# Select seaborn's 'darkgrid' style, then redraw the scatter to see the effect.
sns.set_style('darkgrid')
sns.scatterplot(data=df, x='volatile acidity', y='chlorides')

In [None]:
# Select seaborn's 'ticks' style, then redraw the scatter to see the effect.
sns.set_style('ticks')
sns.scatterplot(data=df, x='volatile acidity', y='chlorides')

In [None]:
# Hex codes for a hand-picked palette.
custom_palette = [
    "#9b59b6",
    "#3498db",
    "#95a5a6",
    "#e74c3c",
    "#34495e",
    "#2ecc71",
]
# Turn the hex codes into a seaborn palette and draw it as a row of swatches.
palette = sns.color_palette(custom_palette)
sns.palplot(palette)