# Visualizing the distribution of a dataset

In [None]:
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes = True)

In [None]:
# Fix the global NumPy RNG so the plots are reproducible; the seed is the sum
# of the character codes of the word 'distributions'.
np.random.seed(sum(ord(ch) for ch in 'distributions'))

## Plotting univariate distributions

`distplot()` - draws a histogram, and fits a kernel density estimate (KDE)

In [None]:
x = np.random.normal(size = 100)
sns.distplot(x)

### Histograms

`hist` function exists in matplotlib

Remove density curve, add rug plot

In [None]:
sns.distplot(x, kde = False, rug = True)

In [None]:
sns.distplot(x, kde = False, rug = True, vertical = True)

In [None]:
sns.distplot(x, bins = 20, kde = False, rug = True)

In [None]:
sns.distplot(x, hist = False, rug = True)

### Aside: Calculate density plot manually

#### Calculate each individual kernel

In [None]:
# Draw 30 samples and place one Gaussian kernel on each observation.
x = np.random.normal(0, 1, size = 30)
# Bandwidth: 1.06 * sample std * n^(-1/5)
bandwidth = 1.06 * x.std() * x.size ** (-1 / 5.)
support = np.linspace(-4, 4, 200)

# One kernel per observation, each evaluated on the shared grid `support`
kernels = [stats.norm(center, bandwidth).pdf(support) for center in x]
for curve in kernels:
    plt.plot(support, curve, color = 'b')

sns.rugplot(x, color = '.2', linewidth = 3);

#### Sum up the kernels, then normalize

In [None]:
density = np.sum(kernels, axis = 0)

In [None]:
# Normalize the summed kernels so the curve integrates to 1 over `support`.
# The area is computed with an explicit trapezoidal rule because
# `scipy.integrate.trapz` was removed in SciPy 1.14.
area = np.sum(np.diff(support) * (density[1:] + density[:-1]) / 2.0)
density /= area

In [None]:
plt.plot(support, density)

### Or, just get density plot using `kdeplot`

In [None]:
sns.kdeplot(x, shade = True)

In [None]:
sns.kdeplot(x)
sns.kdeplot(x, bw = 0.2, label = 'bw: 0.2')
sns.kdeplot(x, bw = 2, label = 'bw: 2')
plt.legend()

### Use `cut` to limit how much is drawn (doesn't affect fit)

In [None]:
sns.kdeplot(x, shade = True, cut = 0)
sns.rugplot(x)

## Fitting parametric distributions

In [None]:
x = np.random.gamma(6, size = 200)
sns.distplot(x, kde = False, fit = stats.gamma)

## Plotting bivariate distributions - `jointplot()`

In [None]:
# Parameters of a 2-D normal distribution: mean vector and covariance matrix
mean = [0, 1]
cov = [(1, .5), (.5, 1)]
# 200 draws, wrapped in a DataFrame with columns 'x' and 'y'
data = np.random.multivariate_normal(mean, cov, size = 200)
df = pd.DataFrame(data, columns = ['x', 'y'])

### Scatterplots

In [None]:
plt.scatter?

In [None]:
plt.scatter(x = 'x', y = 'y', data = df)

In [None]:
sns.jointplot(x = 'x', y = 'y', data = df)

### Hexbin plots

Bivariate analogue of a histogram is a hexbin plot
* shows counts of observations that fall within a hexbin
* best with a white background
* plt.hexbin OR jointplot() style

In [None]:
x, y = np.random.multivariate_normal(mean, cov, 1000).T

In [None]:
plt.hexbin(x, y)

In [None]:
with sns.axes_style('white'):
    sns.jointplot(x = x, y = y, kind = 'hex', color = 'g')

### Kernel density estimation

In [None]:
sns.jointplot(x = 'x', y = 'y', data = df, kind = 'kde')

Use `kdeplot` to draw a 2-D kernel density plot
* allows you to draw this onto an existing plot

In [None]:
f, ax = plt.subplots(figsize = (6, 6))
sns.kdeplot(df.x, df.y, color = 'b', ax = ax) # color has no effect?
# Rug plots on the same axes: x values in green, y values in red (drawn vertically)
sns.rugplot(df.x, color = 'g', ax = ax)
sns.rugplot(df.y, color = 'r', vertical = True, ax = ax)

Increase the number of contour levels to draw the density more continuously

In [None]:
f, ax = plt.subplots(figsize = (6, 6))
cmap = sns.cubehelix_palette(
    as_cmap = True, 
    dark = 0, 
    light = 1,
    reverse = True)
sns.kdeplot(df.x, df.y, cmap = cmap, n_levels = 60, shade = True)

#### Use `JointGrid` directly so you can add more layers or tweak

In [None]:
g = sns.jointplot(x = 'x', y = 'y', data = df, kind = 'kde', color = 'm')

Can't add this in a separate cell

In [None]:
g.plot_joint(plt.scatter, c = 'w', s = 30, linewidth = 1, marker = '+')

In [None]:
g = sns.jointplot(x = 'x', y = 'y', data = df, kind = 'kde', color = 'm')
g.plot_joint(plt.scatter, c = 'w', s = 30, linewidth = 1, marker = '+')

In [None]:
g = sns.jointplot(x = 'x', y = 'y', data = df, kind = 'kde', color = 'm')
g.plot_joint(plt.scatter, c = 'w', s = 30, linewidth = 1, marker = '+')
g.ax_joint.collections[0].set_alpha(0)

In [None]:
# KDE joint plot with a white '+' scatter overlay and LaTeX axis labels.
g = sns.jointplot(x = 'x', y = 'y', data = df, kind = 'kde', color = 'm')
g.plot_joint(plt.scatter, c = 'w', s = 30, linewidth = 1, marker = '+')
# Make the first collection on the joint axes fully transparent
g.ax_joint.collections[0].set_alpha(0)
# Raw string: `\m` is not a valid Python escape sequence and triggers a
# warning in a normal string literal; the label text itself is unchanged.
g.set_axis_labels('$X$', r'$Y_\mu$')

Note: I used LaTeX in the previous label

## Visualizing pairwise relationships in a dataset

Plot multiple bivariate distributions in a dataset using `pairplot`

In [None]:
iris = sns.load_dataset('iris')
sns.pairplot(iris)

#### `pairplot` is built on top of a `PairGrid` object

In [None]:
g = sns.PairGrid(iris)

In [None]:
type(g)

In [None]:
# only use 3 columns so that the entire grid fits on my laptop screen better
# copy() otherwise we just have a view
# `.iloc` selects the first three columns by position (`.ix` was removed in pandas 1.0)
iris3 = iris.iloc[:, :3].copy()

In [None]:
iris.columns

In [None]:
g = sns.PairGrid(iris3)
g.map_diag(sns.kdeplot)

In [None]:
g = sns.PairGrid(iris3)
g.map_diag(sns.kdeplot, shade = True)
g.map_offdiag(sns.kdeplot, cmap = 'Blues_d', n_levels = 6)

##### This gives an error - not sure how to change the color on the diagonals

In [None]:
g = sns.PairGrid(iris3)
g.map_diag(sns.kdeplot, shade = True, color = 'r')
g.map_offdiag(sns.kdeplot, cmap = 'Blues_d', n_levels = 6)

In [None]:
sns.kdeplot(iris3, cmap = 'Reds')

In [None]:
# KDE of the first column only, selected by position
# (`.iloc` replaces `.ix`, which was removed in pandas 1.0)
sns.kdeplot(iris3.iloc[:, 0], color = 'r')

In [None]:
# Overlay bivariate KDEs of sepal width vs. sepal length for two species:
# setosa with the "Reds" colormap, virginica with "Blues".
setosa = iris.loc[iris.species == "setosa"]
virginica = iris.loc[iris.species == "virginica"]
# Each call returns the axes it drew on; the second assignment rebinds `ax`
ax = sns.kdeplot(setosa.sepal_width, setosa.sepal_length,
                 cmap="Reds", shade=True, shade_lowest=False)
ax = sns.kdeplot(virginica.sepal_width, virginica.sepal_length,
                 cmap="Blues", shade=True, shade_lowest=False)

In [None]:
sns.kdeplot(iris3, cmap = 'BuGn_d')
plt.title('Seaborn kdeplot in bokeh')