# Penguins

"Data were collected and made available by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER, a member of the Long Term Ecological Research Network."
-- https://github.com/allisonhorst/palmerpenguins

In [None]:
# NOTE
# For the moment, we need to update seaborn on the JupyterHub
# which I do explicitly here
!pip install -U seaborn

In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

In [None]:
penguins = sns.load_dataset("penguins")

In [None]:
# Save a local copy of the dataset in the current user's home directory.
# Path.home() gives the original /home/jovyan location when run as the
# "jovyan" user, and still works on machines where that directory does
# not exist.
penguins.to_csv(Path.home() / 'penguins.csv', index=False)

In [None]:
penguins.info()

In [None]:
penguins

### Histograms for numerical data, conditioned on categorical values

In [None]:
sns.histplot(data=penguins, 
             x="flipper_length_mm")

In [None]:
sns.histplot(data=penguins, 
             x="flipper_length_mm", 
             hue="species")

In [None]:
sns.histplot(data=penguins, 
             x="flipper_length_mm", 
             hue="species", 
             multiple="stack")

In [None]:
sns.histplot(data=penguins, 
             x="flipper_length_mm", 
             hue="species", 
             multiple="fill")

In [None]:
sns.histplot(data=penguins, 
             x="flipper_length_mm", 
             hue="species", 
             multiple="dodge")

## FacetGrid

The Seaborn FacetGrid "class maps a dataset onto multiple axes arrayed in a grid of rows and columns that correspond to levels of variables in the dataset. The plots it produces are often called 'lattice', 'trellis', or 'small-multiple' graphics." -- https://seaborn.pydata.org/generated/seaborn.FacetGrid.html

If you want to use facet plots, I highly recommend reading the above page for more information.

In [None]:
g = sns.FacetGrid(penguins, col="species")
g.map_dataframe(sns.histplot, x="flipper_length_mm")

"catplot" is useful for drawing categorical plots onto a FacetGrid.  https://seaborn.pydata.org/generated/seaborn.catplot.html

In [None]:
sns.catplot(data=penguins,
            x="flipper_length_mm",
            hue="species",
            col="species")

In [None]:
sns.catplot(data=penguins,
            x="flipper_length_mm",
            hue="species",
            col="species",
            kind='box')

In [None]:
sns.catplot(data=penguins,
            x="flipper_length_mm",
            hue="species",
            row="species",
            kind='box', height=1, aspect=4)

"displot" is useful for drawing distribution plots onto a FacetGrid.  https://seaborn.pydata.org/generated/seaborn.displot.html

In [None]:
sns.displot(data=penguins,
            x="flipper_length_mm",
            hue="species",
            col="species")

In [None]:
sns.displot(data=penguins,
            x="flipper_length_mm",
            hue="species",
            col="species", kind='kde')

You can also call `kdeplot` directly, without going through `displot`.

In [None]:
sns.kdeplot(data=penguins, 
             x="flipper_length_mm", 
             hue="species")

In [None]:
sns.kdeplot(data=penguins, 
             x="flipper_length_mm", 
             hue="species", 
             multiple="stack")

In [None]:
sns.kdeplot(data=penguins, 
             x="flipper_length_mm", 
             hue="species", 
             multiple="fill")

In [None]:
sns.kdeplot(data=penguins, 
             x="flipper_length_mm", 
             hue="species", 
             multiple="layer")

In [None]:
g = sns.FacetGrid(penguins, col="species")
g.map_dataframe(sns.kdeplot, x="flipper_length_mm")

## Pairplot

A pairplot is extremely useful for getting a snapshot, all at once, of the relationships between pairs of variables.
https://seaborn.pydata.org/generated/seaborn.pairplot.html

In [None]:
sns.pairplot(data=penguins)

In [None]:
sns.scatterplot(data=penguins, x='bill_length_mm', y='bill_depth_mm')

In [None]:
sns.scatterplot(data=penguins, x='body_mass_g', y='flipper_length_mm')

## Regression

"lmplot" and "regplot" are useful for looking at regression analysis.
* "regplot" will plot data and a linear regression model fit.
* "lmplot": regplot + FacetGrid.  Plot data and regression model fits across a FacetGrid.

In [None]:
sns.regplot(data=penguins, x='body_mass_g', y='flipper_length_mm')

In [None]:
sns.lmplot(data=penguins, x='body_mass_g', y='flipper_length_mm')

In [None]:
# this will give an error!
# regplot has no `hue` parameter, so the call below raises a TypeError.
# The error is caught and printed so that it is still shown, while
# "Restart Kernel -> Run All" can continue past this cell.

try:
    sns.regplot(data=penguins, x='body_mass_g', y='flipper_length_mm', hue='species')
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

In [None]:
sns.lmplot(data=penguins, x='body_mass_g', y='flipper_length_mm', hue='species')

In [None]:
sns.lmplot(data=penguins, x='body_mass_g', y='flipper_length_mm', col='species')

In [None]:
sns.lmplot(data=penguins, x='body_mass_g', y='flipper_length_mm', hue='species', col='sex')

In [None]:
sns.lmplot(data=penguins, x='body_mass_g', y='flipper_length_mm', col='species', hue='sex')

### Residuals

Plotting residuals (difference of data relative to the regression fit) allows us to see if there are missed patterns in the regression fit.

In [None]:
import numpy as np

In [None]:
# Synthetic data: a quadratic trend plus Gaussian noise.
# The noise is 100 draws from a normal distribution with mean 0 and
# standard deviation 3.5; seeding first makes the draws repeatable.
np.random.seed(42)
noise = np.random.normal(loc=0, scale=3.5, size=100)

x = np.linspace(start=0, stop=10, num=100)
y = noise + x**2

In [None]:
sns.regplot(x=x,y=y)

In [None]:
sns.residplot(x=x,y=y)

In [None]:
sns.regplot(x=x,y=y,order=2)

In [None]:
sns.residplot(x=x,y=y,order=2)

In [None]:
sns.regplot(data=penguins, x='body_mass_g', y='flipper_length_mm')

In [None]:
sns.residplot(data=penguins, x='body_mass_g', y='flipper_length_mm')

In [None]:
sns.jointplot(data=penguins, x='body_mass_g', y='flipper_length_mm')

In [None]:
sns.jointplot(data=penguins, x='body_mass_g', y='flipper_length_mm', kind='resid')

In [None]:
g = sns.FacetGrid(penguins, col="species")
g.map_dataframe(sns.residplot, x="flipper_length_mm", y="bill_depth_mm")

In [None]:
g = sns.FacetGrid(penguins, col="species", hue='species')
g.map_dataframe(sns.regplot, x="flipper_length_mm", y="bill_depth_mm")

## Back to scatter plots...

In [None]:
sns.scatterplot(data=penguins, x='body_mass_g', y='species')

In [None]:
sns.boxplot(data=penguins, x='body_mass_g', y='species')

In [None]:
sns.boxplot(data=penguins, y='body_mass_g', x='species')

Let's return to the pairplot and see how faceting can help us zero in on understanding correlations.

In [None]:
sns.pairplot(data=penguins)

In [None]:
penguins.corr(numeric_only=True)

Something seems fishy with the negative correlations in this matrix (for example, between `bill_length_mm` and `bill_depth_mm`).  We can make faceted plots or even faceted correlation matrices to understand variation across another variable, say species.

In [None]:
sns.pairplot(data=penguins, hue='species')

It can be ideal to follow a pairplot with a heatmap, because the heatmap uses color to represent the correlation coefficients.

In [None]:
penguins.corr(numeric_only=True)

In [None]:
sns.heatmap(data=penguins.corr(numeric_only=True))

In [None]:
# straightening out some useful parameters
sns.heatmap(data=penguins.corr(numeric_only=True),
            cmap='RdBu_r',annot=True,vmin=-1,vmax=1)

In [None]:
# Draw one annotated correlation heatmap per species.

# note that the following does a bad job: no new figure is created inside
# the loop, so the heatmaps end up overlapping graphical features

for s in penguins.species.unique():
    sns.heatmap(data=penguins[penguins['species']==s].corr(numeric_only=True),
                cmap='RdBu_r',annot=True,vmin=-1,vmax=1)

In [None]:
# Per-species correlation heatmaps, one figure each.
# Opening a fresh figure at the top of every iteration keeps each heatmap
# from being drawn over the previous one.

for s in penguins.species.unique():
    plt.figure()
    species_corr = penguins.loc[penguins['species'] == s].corr(numeric_only=True)
    sns.heatmap(data=species_corr, vmin=-1, vmax=1, cmap='RdBu_r', annot=True)

In [None]:
# ERROR (demonstration)
# ideally we would like to put the above into columns or rows
# similar to what we do with a facetgrid
# but the below will NOT work: the frame handed to FacetGrid is the
# correlation matrix from penguins.corr(numeric_only=True), and
# col="species" asks for a column named "species" in that frame.
# The resulting KeyError is caught and printed so that the failure is still
# shown without stopping "Restart Kernel -> Run All".

try:
    g = sns.FacetGrid(penguins.corr(numeric_only=True), col="species")
    g.map_dataframe(sns.heatmap)
except KeyError as err:
    print(f"{type(err).__name__}: {err}")

In [None]:
# enumerate() yields (position, item) pairs; print every pair on its own line
for ix, val in enumerate([20, 453, 1.34]):
    print(f"{ix} {val}")

In [None]:
# First attempt at laying the per-species heatmaps out in one row.

# here we start to use matplotlib itself to make one column per species
# this WILL NOT work because it does not correctly use the axes object
# when using sns.heatmap: the call below is never given an `ax` argument,
# and assigning its return value to ax[ix] only overwrites that array entry

fig,ax = plt.subplots(1,3)
for ix,s in enumerate(penguins.species.unique()):
    ax[ix] = sns.heatmap(data=penguins[penguins['species']==s].corr(numeric_only=True),
                         cmap='RdBu_r',annot=True,vmin=-1,vmax=1)

In [None]:
# Same one-row layout, but each heatmap is now told which subplot to
# draw into through the `ax` argument of sns.heatmap.

fig, ax = plt.subplots(1, 3)
for ix, s in enumerate(penguins.species.unique()):
    species_corr = penguins.loc[penguins['species'] == s].corr(numeric_only=True)
    sns.heatmap(data=species_corr, ax=ax[ix],
                vmin=-1, vmax=1, cmap='RdBu_r', annot=True)

In [None]:
# One-row layout with each heatmap drawn into its own subplot via `ax`.

# this further uses "constrained_layout=True" to separate the graphics

fig,ax = plt.subplots(1,3,constrained_layout=True)
for ix,s in enumerate(penguins.species.unique()):
    sns.heatmap(data=penguins[penguins['species']==s].corr(numeric_only=True),
                cmap='RdBu_r',annot=True,vmin=-1,vmax=1,ax=ax[ix])

In [None]:
# One-row layout with each heatmap drawn into its own subplot via `ax`.

# and this changes the figsize so that the interior graphics are more properly
# sized relative to the overall figsize
# (cbar=False is also passed here, so every panel is drawn without a colorbar)

fig,ax = plt.subplots(1,3,constrained_layout=True,figsize=(12,4))
for ix,s in enumerate(penguins.species.unique()):
    sns.heatmap(data=penguins[penguins['species']==s].corr(numeric_only=True),
                cmap='RdBu_r',annot=True,vmin=-1,vmax=1,ax=ax[ix],cbar=False)

In [None]:
# Final version: one annotated correlation heatmap per species in a single row.
#
# - the number of panels (and the figure width) follows the number of species
#   instead of being hard-coded to 3; with three species the figure is still
#   12 x 4 inches
# - y tick labels are drawn only on the left-most panel and the colorbar only
#   on the right-most panel, so repeated decorations do not crowd the row

# dropna() keeps the list of species consistent with the panel count even if
# the column contained missing values
species_names = penguins.species.dropna().unique()
numspecies = len(species_names)

# squeeze=False makes subplots() return a 2-D array of axes even for a single
# panel; taking row 0 gives a 1-D array that can be indexed with ax[ix]
fig,ax = plt.subplots(1,numspecies,squeeze=False,
                      constrained_layout=True,figsize=(4*numspecies,4))
ax = ax[0]

for ix,s in enumerate(species_names):
    sns.heatmap(data=penguins[penguins['species']==s].corr(numeric_only=True),
                cmap='RdBu_r',annot=True,vmin=-1,vmax=1,ax=ax[ix],
                cbar=(ix==numspecies-1),   # colorbar on the right-most panel only
                yticklabels=(ix==0))       # row labels on the left-most panel only

## More on categories

In [None]:
# Bar plot -> but this will fail!
# Only the categorical `x` is given, so barplot has no numerical values to
# aggregate. The error is caught and printed so that it is still shown,
# while "Restart Kernel -> Run All" can continue past this cell.
# NOTE(review): the exception is assumed to be a TypeError or ValueError;
# confirm against the installed seaborn version.
try:
    sns.barplot(data=penguins, x='species')
except (TypeError, ValueError) as err:
    print(f"{type(err).__name__}: {err}")

Why does the above fail?  
-> because we need to tell it what numerical values to plot
* a count of values of something
* an aggregate calculation (like average, median, ...)
* an aggregate of another column

In [None]:
sns.countplot(data=penguins, x='species')

In [None]:
penguins.groupby('species').count()

In [None]:
penguins['species'].value_counts()

In [None]:
penguins['species'].value_counts().reset_index()

In [None]:
# Build an explicit (species, count) table and plot it as bars.
# rename_axis / reset_index(name=...) pin the column names to 'species' and
# 'count' instead of relying on whatever names value_counts().reset_index()
# produces, which differ between pandas versions.
a = (
    penguins['species']
    .value_counts()
    .rename_axis('species')
    .reset_index(name='count')
)
sns.barplot(data=a, x='species', y='count')
plt.xlabel('species')
plt.ylabel('count')

Bar plot will work in the cell below because when you pass in a numerical column for `y`, it calculates the mean of that column for each `x` category (the bar height) along with a confidence interval (the error bar).

In [None]:
sns.barplot(data=penguins, x='species', y='body_mass_g')

"countplot" is not appropriate in that same case.  If you pass in "y='body_mass_g'" it will give an error because y should normally just be a count related to the x-feature.  If you pass in "hue='body_mass_g'", it will produce output without error, but it outputs a count for every single unique value of the x-y pairs.

In [None]:
sns.countplot(data=penguins, x='species', hue='body_mass_g')

With "countplot", hue should be used for more clearly discrete data with a small number of values.

In [None]:
sns.countplot(data=penguins, x='species', hue='sex')

In [None]:
sns.countplot(data=penguins, y='species', hue='sex')

In [None]:
sns.countplot(data=penguins, y='species', hue='sex', dodge=True)

In [None]:
sns.countplot(data=penguins, y='species', hue='sex', dodge=False)

In [None]:
g = sns.FacetGrid(penguins, col="island")
g.map_dataframe(sns.countplot, x="species")

In [None]:
g = sns.FacetGrid(penguins, col="species")
g.map_dataframe(sns.countplot, x="island", hue="sex")

Beware that barplot shows the same odd behavior when `hue` is given a numerical column: it draws a separate bar for every distinct value.

In [None]:
sns.barplot(data=penguins, x='species',y='flipper_length_mm')

In [None]:
sns.barplot(data=penguins, x='species',y='flipper_length_mm', hue='bill_depth_mm')

In [None]:
sns.barplot(data=penguins, x='species',y='flipper_length_mm', hue='island')

In [None]:
sns.barplot(data=penguins, x='species',y='flipper_length_mm', hue='sex')

In [None]:
sns.catplot(data=penguins, x='species',y='flipper_length_mm', hue='sex', kind='bar')

### Illustrative examples
Here's a fuller set of categorical+numerical plot examples:

In [None]:
# Draw the same species / flipper-length / sex comparison once per catplot
# kind. The loop iterates over the kind names directly; the enumerate index
# was never used.
for plot_kind in ['strip','box','violin','boxen','point','bar']:
    sns.catplot(data=penguins, x='species',y='flipper_length_mm', hue='sex', kind=plot_kind)

In [None]:
sns.catplot(data=penguins, 
            x='species',
            y='flipper_length_mm', 
            hue='sex', 
            kind='violin', 
            col='island')

The categorizations can be tailored visually to emphasize the relationships that you want to focus on.

In [None]:
sns.catplot(data=penguins, hue='species',y='flipper_length_mm', x='sex', kind='violin')

In [None]:
sns.catplot(data=penguins, col='species',y='flipper_length_mm', x='sex', kind='violin')

In [None]:
sns.catplot(data=penguins, x='species',y='flipper_length_mm', hue='sex', kind='violin', 
            split=False)

In [None]:
sns.catplot(data=penguins, x='species',y='flipper_length_mm', hue='sex', kind='violin', 
            split=True)

In [None]:
# Note that the following will not give an error but will
# not quite be as straightforward to interpret

sns.catplot(data=penguins, hue='species',y='flipper_length_mm', x='sex', kind='violin', split=True)

In [None]:
sns.catplot(data=penguins, col='species',y='flipper_length_mm', x='sex', kind='point')

In [None]:
sns.catplot(data=penguins, hue='species',y='flipper_length_mm', x='sex', kind='point')

In [None]:
sns.catplot(data=penguins, hue='species',y='body_mass_g', x='sex', kind='point')

In [None]:
sns.catplot(data=penguins, hue='species',y='body_mass_g', x='sex', kind='bar')

In [None]:
sns.catplot(data=penguins, x='species',y='body_mass_g', col='sex', kind='box')

In [None]:
sns.catplot(data=penguins, x='species',y='body_mass_g', hue='sex', kind='box')

In [None]:
sns.catplot(data=penguins, y='species',x='body_mass_g', hue='sex', kind='box')

In [None]:
sns.catplot(data=penguins, hue='species',y='body_mass_g', x='sex', kind='box')

In [None]:
sns.catplot(data=penguins, x='species',y='body_mass_g', hue='sex', kind='point')

In [None]:
sns.catplot(data=penguins, x='species',y='body_mass_g', hue='sex', kind='violin',split=True)

In [None]:
sns.catplot(data=penguins, x='species',y='body_mass_g', hue='sex', kind='point', col='island')

In [None]:
sns.catplot(data=penguins, col='species',y='body_mass_g', hue='sex', kind='point', x='island')

In [None]:
sns.catplot(data=penguins, hue='species',y='body_mass_g', col='sex', kind='point', x='island')