In [None]:
%matplotlib inline

import readline
import altair as alt
import pandas as pd
import seaborn as sns
from matplotlib import pyplot
from plotnine import *
import numpy as np

from plotly import figure_factory
from plotly import graph_objects
import plotly.express as px
from IPython.core.magic import Magics, magics_class, cell_magic

from IPython.display import Image

from pylab import rcParams

# Base font size (in points) from which every text size below is derived.
size = 20

# Matplotlib defaults shared by the pandas / matplotlib / seaborn cells.
# Each key is listed exactly once.  The earlier version of this literal
# repeated "figure.figsize" and "axes.titlesize"; Python keeps only the last
# occurrence of a duplicated key, so the first (15, 5) and `size` entries
# never took effect.  The values kept here are the ones that were effective.
params = {
    "legend.fontsize": size,
    "figure.figsize": (12, 12),
    "axes.labelsize": size,
    "axes.titlesize": 1.5 * size,
    "xtick.labelsize": size,
    "ytick.labelsize": size,
}
# Apply the shared sizes to matplotlib's global defaults.
rcParams.update(params)

# plotnine keeps its own theme, so the figure and text sizes are set
# separately for it.
plotnine_theme = dict(
    figure_size=(9, 9),
    title=element_text(size=size),
    text=element_text(size=0.6 * size),
)
theme_update(**plotnine_theme)


import plotly.io as pio

# Make the static PNG renderer plotly's default and fix its output size.
pio.renderers.default = "png"
png_renderer = pio.renderers["png"]
png_renderer.width = 750
png_renderer.height = 750

# Altair: render charts as PNG as well, using Firefox as the webdriver.
alt.renderers.enable("png", webdriver="firefox")

In [None]:
# Load the rpy2 IPython extension so the `%%R` cells below can run R code.
%load_ext rpy2.ipython

In [None]:
%%R
# Attach ggplot2 and enlarge the base font of the default grey theme.
library(ggplot2)
base_theme <- theme_grey(base_size = 24)
theme_set(base_theme)

## Outline

* Plots 
    * Bar 
    * Histogram
    * Scatter 
    * Box and whisker
    * Line 
        * Time series
* Extensions
    * Faceting on categorical
    * Stacked bar/histogram
    * Line over scatter
    * Multi color/symbol scatter
    * Multiple line plots
* Modifications
    * Log scaled axes
    * Time labeled axes
    * Histogram bin-width
    * Aspect ratio
    * Axis boundaries


## Load Data

Data is from [ggplot2](http://ggplot2.tidyverse.org/reference/mpg.html).

Many ggplot examples taken from [R for data science](http://r4ds.had.co.nz/visualize.html) by Garrett Grolemund and Hadley Wickham.

In [None]:
%%R
# Per-column summary of the `mpg` data set on the R side.
summary(mpg)

In [None]:
# Load the same data sets on the Python side, straight from the ggplot2
# repository's raw CSV files (requires network access).
mpg = pd.read_csv('https://raw.githubusercontent.com/tidyverse/ggplot2/master/data-raw/mpg.csv')
diamonds = pd.read_csv('https://raw.githubusercontent.com/tidyverse/ggplot2/master/data-raw/diamonds.csv')

# Synthetic daily time series: a 1000-step Gaussian random walk starting on
# 2000-01-01, reshaped into a two-column frame (`date`, `value`).
# A dedicated, fixed-seed generator is used so that "Restart & Run All"
# reproduces the same series (and therefore the same time-series plots)
# without touching NumPy's global random state.
TS_SEED = 0
ts = (pd
      .Series(np.random.RandomState(TS_SEED).randn(1000),
              index=pd.date_range('1/1/2000', periods=1000))
      .cumsum()
      .to_frame()
      .reset_index()
     )
ts.columns = ['date', 'value']

In [None]:
from IPython.display import display, HTML

# Preview the first five rows of the time series as an index-free HTML table.
ts_preview_html = ts.head(5).to_html(index=False)
display(HTML(ts_preview_html))

In [None]:
from IPython.display import display, HTML

# Preview the first five rows of `mpg` as an index-free HTML table.
mpg_preview_html = mpg.head(5).to_html(index=False)
display(HTML(mpg_preview_html))

In [None]:
from IPython.display import display, HTML

# Preview the first five rows of `diamonds` as an index-free HTML table.
diamonds_preview_html = diamonds.head(5).to_html(index=False)
display(HTML(diamonds_preview_html))

## Plots

### Basic Plots

#### Bar Charts of Counts

In [None]:
%%R -w 10 -h 10 -u in
# ggplot2: count of cars per manufacturer, drawn as horizontal bars.
ggplot(mpg, aes(x = manufacturer)) +
    geom_bar() +
    coord_flip() +
    labs(title = "Number of Cars by Make")

In [None]:
# pandas: count the rows per manufacturer (unsorted) and draw horizontal bars.
make_counts = mpg['manufacturer'].value_counts(sort=False)
make_axes = make_counts.plot.barh()
make_axes.set_title('Number of Cars by Make');

In [None]:
# Author's note: plotnine gives an error on `ggplot(data=mpg)`, so the
# frame is passed positionally here.
make_bars = (
    ggplot(mpg)
    + aes(x="manufacturer")
    + geom_bar(size=20)
    + coord_flip()
    + ggtitle("Number of Cars by Make")
)
make_bars

In [None]:
# plotly express: putting the category on `y` makes histogram() count rows
# per manufacturer and draw the bars horizontally.
make_hist = px.histogram(mpg, y="manufacturer", title='Number of Cars by Make')
make_hist

In [None]:
# Altair: the `count()` aggregate on x gives one horizontal bar per make.
make_chart = alt.Chart(mpg, title="Number of Cars by Make")
make_chart.mark_bar().encode(x="count()", y=alt.Y("manufacturer"))

#### Histogram of Continuous Values

In [None]:
%%R -w 10 -h 10 -u in
# ggplot2: histogram of city MPG using bins two units wide.
ggplot(mpg, aes(x = cty)) +
    geom_histogram(binwidth = 2)

In [None]:
# matplotlib: with `data=mpg`, the string 'cty' is looked up as a column.
pyplot.hist(
    'cty',
    bins=12,
    data=mpg,
);

In [None]:
# pandas: histogram of city MPG with 12 bins.
city_mpg = mpg['cty']
city_mpg.plot.hist(bins=12);

In [None]:
# plotnine: same grammar as the ggplot2 cell, with column names as strings.
cty_hist = (
    ggplot(mpg)
    + aes(x="cty")
    + geom_histogram(binwidth=2)
)
cty_hist

In [None]:
# plotly express: histogram of city MPG with plotly's automatic binning.
cty_fig = px.histogram(mpg, x="cty")
cty_fig

In [None]:
# Altair: bin city MPG in steps of 2 on x and count the rows per bin on y.
cty_binned = alt.X("cty", bin=alt.Bin(step=2))
alt.Chart(mpg).mark_bar().encode(cty_binned, y="count()")

#### Scatter Plot

In [None]:
%%R -w 10 -h 10 -u in 
# ggplot2: engine displacement against highway MPG, one point per car.
ggplot(mpg, aes(x = displ, y = hwy)) +
    geom_point() +
    labs(title = "Engine Displacement in Liters vs Highway MPG",
         x = "Engine Displacement in Liters",
         y = "Highway MPG")

In [None]:
# pandas: scatter of displacement vs highway MPG; `Axes.set` applies the
# title and both axis labels in a single call.
# The trailing semicolon suppresses the list-of-Text repr that `.set(...)`
# returns, matching the other matplotlib-backed cells in this notebook.
(mpg
 .plot
 .scatter(x='displ', y='hwy')
 .set(title='Engine Displacement in Liters vs Highway MPG',
      xlabel='Engine Displacement in Liters',
      ylabel='Highway MPG'));

In [None]:
# plotnine: displacement vs highway MPG with an explicit title and axis labels.
displ_hwy_points = (
    ggplot(mpg)
    + aes(x="displ", y="hwy")
    + geom_point()
    + ggtitle("Engine Displacement in Liters vs Highway MPG")
    + xlab("Engine Displacement in Liters")
    + ylab("Highway MPG")
)
displ_hwy_points

In [None]:
# Altair: displacement vs highway MPG.
# The chart title and the y-axis label use the same wording as the ggplot2,
# pandas, plotnine and plotly versions of this plot, so the five outputs can
# be compared like for like (the title previously omitted "vs Highway MPG"
# and the y label read "Highway Miles per Gallon").
alt.Chart(mpg).mark_circle().encode(
    alt.X(
        "displ",
        title="Engine Displacement in Liters",
    ),
    alt.Y(
        "hwy",
        title="Highway MPG",
    ),
).properties(
    title="Engine Displacement in Liters vs Highway MPG"
)

In [None]:
# plotly express: `labels` maps column names to the axis titles to display.
displ_hwy_labels = dict(
    displ='Engine Displacement in Liters',
    hwy='Highway MPG',
)
px.scatter(
    mpg,
    x="displ",
    y="hwy",
    title='Engine Displacement in Liters vs Highway MPG',
    labels=displ_hwy_labels,
)

### Advanced Scatter Plotting

In [None]:
%%R -w 10 -h 10 -u in 
# ggplot2: scatter with a linear-model fit drawn on top.
ggplot(mpg, aes(x = displ, y = hwy)) +
    geom_point() +
    geom_smooth(method = lm)

In [None]:
# plotnine: scatter with a linear-model smoother drawn on top.
displ_hwy_lm = (
    ggplot(mpg)
    + aes("displ", "hwy")
    + geom_point()
    + geom_smooth(method="lm")
)
displ_hwy_lm

In [None]:
# seaborn: scatter plus a fitted regression line in a single call.
# `height` sets the facet height in inches.  It replaces the `size` keyword,
# which seaborn renamed in 0.9 and later removed, so `size=12` now fails on
# current releases.
sns.lmplot(x="displ", y="hwy",
           data=mpg, height=12)

In [None]:
# Plotly has no built-in method to calculate and display confidence intervals,
# so the regression and its band are computed with statsmodels and the band is
# assembled from existing trace features.
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import summary_table

# Ordinary least squares of highway MPG on displacement plus an intercept.
y = mpg.hwy
x = mpg.displ
X = sm.add_constant(x)
res = sm.OLS(y, X).fit()

# summary_table returns (table, raw data, column headers); the headers contain
# newlines, which are flattened to spaces so they can be used as column names.
st, data, ss2 = summary_table(res, alpha=0.05)
pred_columns = [header.replace('\n', ' ') for header in ss2]
preds = pd.DataFrame.from_records(data, columns=pred_columns)
preds['displ'] = mpg.displ
# Sort by x so the line traces are drawn left to right.
preds = preds.sort_values(by='displ')

fig = graph_objects.Figure(layout=dict(
    title='Engine Displacement in Liters vs Highway MPG',
    xaxis=dict(title='Engine Displacement in Liters'),
    yaxis=dict(title='Highway MPG'),
))

# Raw observations.
p1 = graph_objects.Scatter(
    mode='markers',
    x=mpg.displ,
    y=mpg.hwy,
    name='Points',
)
# Fitted regression line.
p2 = graph_objects.Scatter(dict(
    mode='lines',
    x=preds['displ'],
    y=preds['Predicted Value'],
    name='Regression',
))
# Lower bound of the confidence interval: a white line, hidden from the legend.
p3 = graph_objects.Scatter(dict(
    mode='lines',
    x=preds['displ'],
    y=preds['Mean ci 95% low'],
    name='Lower 95% CI',
    showlegend=False,
    line=dict(color='white'),
))
# Upper bound: a white line whose 'tonexty' fill shades the area down to the
# previously added trace (the lower bound), forming the band.
p4 = graph_objects.Scatter(dict(
    type='scatter',
    mode='lines',
    x=preds['displ'],
    y=preds['Mean ci 95% upp'],
    name='95% CI',
    fill='tonexty',
    line=dict(color='white'),
    fillcolor='rgba(255, 127, 14, 0.3)',
))

# Trace order matters: p4 fills towards p3, so p3 must be added just before it.
for trace in (p1, p2, p3, p4):
    fig.add_trace(trace)
Image(fig.to_image(format="png", width=750, height=750))

#### Scatter Plot with Colors

In [None]:
%%R -w 10 -h 10 -u in
# ggplot2: mapping `class` to colour gives one colour per vehicle class.
ggplot(mpg, aes(x = displ, y = hwy, color = class)) +
    geom_point() +
    labs(title = "Engine Displacement in Liters vs Highway MPG",
         x = "Engine Displacement in Liters",
         y = "Highway MPG")

In [None]:
# matplotlib: draw one scatter call per vehicle class so that each class gets
# its own colour and legend entry.
fig, ax = pyplot.subplots()
for class_name, class_rows in mpg.groupby('class'):
    ax.scatter(class_rows['displ'], class_rows['hwy'], label=class_name)
ax.legend()
ax.set(
    title='Engine Displacement in Liters vs Highway MPG',
    xlabel='Engine Displacement in Liters',
    ylabel='Highway MPG',
);

In [None]:
# Altair: colour-encode the vehicle class.
# The x-axis title previously read "Displacament"; it is spelled correctly
# here so it matches the chart title and the other libraries' labels.
(
    alt.Chart(
        mpg,
        title="Engine Displacement in Liters vs Highway MPG",
    )
    .mark_circle()
    .encode(
        alt.X(
            "displ",
            title="Engine Displacement in Liters",
        ),
        alt.Y("hwy", title="Highway MPG"),
        color="class",
    )
)

In [None]:
# seaborn.FacetGrid overrides the `rcParams['figure.figsize']` global
# parameter, so the figure size has to be set on the grid itself with
# `height=` (in inches).  `height` replaces the old `size` keyword, which
# seaborn renamed in 0.9 and later removed.
(sns
 .FacetGrid(mpg, hue="class", height=10)
 .map(pyplot.scatter, "displ", "hwy")
 .add_legend()
 .set(
    title="Engine Displacement in Liters vs Highway MPG",
    xlabel="Engine Displacement in Liters",
    ylabel="Highway MPG"
));

In [None]:
# plotnine: colour the points by vehicle class.
displ_hwy_by_class = (
    ggplot(mpg)
    + aes(x="displ", y="hwy", color="class")
    + geom_point()
    + ggtitle("Engine Displacement in Liters vs Highway MPG")
    + xlab("Engine Displacement in Liters")
    + ylab("Highway MPG")
)
displ_hwy_by_class

In [None]:
# plotly express: `color="class"` colours the points by vehicle class.
class_axis_labels = dict(
    displ='Engine Displacement in Liters',
    hwy='Highway MPG',
)
px.scatter(
    mpg,
    x="displ",
    y="hwy",
    color="class",
    title='Engine Displacement in Liters vs Highway MPG',
    labels=class_axis_labels,
)

#### Scatter Plot with Sizes and Transparency

In [None]:
%%R -w 10 -h 10 -u in
# ggplot2: point size encodes cylinder count; alpha makes overlaps visible.
ggplot(mpg, aes(x = cty, y = hwy, size = cyl)) +
    geom_point(alpha = .5)

In [None]:
# Altair: semi-transparent circles sized by cylinder count.
cty_axis = alt.X("cty", axis=alt.Axis(title="City MPG"))
hwy_axis = alt.Y("hwy", axis=alt.Axis(title="Highway MPG"))
(
    alt.Chart(mpg, title="City MPG vs Highway MPG")
    .mark_circle(opacity=0.3)
    .encode(x=cty_axis, y=hwy_axis, size="cyl")
)

In [None]:
# pandas: marker area is ten times the cylinder count; alpha shows overlaps.
marker_areas = 10 * mpg['cyl']
ax = mpg.plot.scatter(x='cty', y='hwy', s=marker_areas, alpha=.5)
ax.set(
    title='City MPG vs Highway MPG',
    xlabel='City MPG',
    ylabel='Highway MPG',
);

In [None]:
# plotnine: point size encodes cylinder count; alpha makes overlaps visible.
cty_hwy_sized = (
    ggplot(mpg)
    + aes(x="cty", y="hwy", size="cyl")
    + geom_point(alpha=.5)
)
cty_hwy_sized

In [None]:
# plotly express: marker size follows `cyl`, capped by `size_max`.
cty_hwy_labels = dict(cty='City MPG', hwy='Highway MPG')
px.scatter(
    mpg,
    x="cty",
    y="hwy",
    size="cyl",
    size_max=10,
    title='City MPG vs Highway MPG',
    labels=cty_hwy_labels,
)

#### Scatter Plot with Single Facet

In [None]:
# Altair: one panel per vehicle class, wrapped into rows of four panels.
class_facet = alt.Facet("class:O", columns=4)
(
    alt.Chart(mpg)
    .mark_circle()
    .encode(x=alt.X("displ"), y=alt.Y("hwy"), facet=class_facet)
    .properties(width=200, height=300)
)

In [None]:
%%R -w 10 -h 10 -u in
# ggplot2: one panel per vehicle class, wrapped onto two rows.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~ class, nrow = 2)

In [None]:
# seaborn: one panel per vehicle class, four panels per row.
# `height` (inches per facet) replaces the old `size` keyword, which seaborn
# renamed in 0.9 and later removed.
(mpg
 .pipe(sns.FacetGrid, 
       col="class", 
       col_wrap=4, 
       aspect=.5, 
       height=6)
 .map(pyplot.scatter, "displ", "hwy", s=20)
 .fig.subplots_adjust(wspace=.2, hspace=.2)
);

In [None]:
# plotnine: the `class` column is copied to `c` and the panels are wrapped on
# that copy (presumably because `class` is a Python keyword and cannot be used
# in the facet formula -- TODO confirm).
mpg_with_c = mpg.assign(c=mpg["class"])
class_panels = (
    ggplot(mpg_with_c)
    + aes(x="displ", y="hwy")
    + geom_point()
    + facet_wrap(" ~ c", nrow=2)
)
class_panels

In [None]:
# plotly express: one panel per vehicle class, wrapped after four columns.
class_facets = px.scatter(
    mpg,
    x="displ",
    y="hwy",
    facet_col="class",
    facet_col_wrap=4,
)
class_facets

#### Scatter Plot with Facets

In [None]:
%%R -w 10 -h 10 -u in
# ggplot2: grid of panels, drive train on rows and cylinder count on columns.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)

In [None]:
# Altair: build a small base chart, then facet it by cylinder count (columns)
# and drive train (rows).
panel = (
    alt.Chart(mpg)
    .mark_circle()
    .encode(x="displ", y="hwy")
    .properties(width=100, height=150)
)
panel.facet(column="cyl", row="drv")

In [None]:
# seaborn: grid of panels, cylinder count on columns and drive train on rows.
# `height` (inches per facet) replaces the old `size` keyword, which seaborn
# renamed in 0.9 and later removed.
(mpg
 .pipe(sns.FacetGrid, 
       col="cyl", 
       row="drv", 
       aspect=.9, 
       height=4)
 .map(pyplot.scatter, "displ", "hwy", s=20)
 .fig.subplots_adjust(wspace=.02, hspace=.02)
);

In [None]:
# plotnine: grid of panels, drive train on rows and cylinder count on columns.
drv_cyl_grid = (
    ggplot(mpg)
    + aes(x="displ", y="hwy")
    + geom_point()
    + facet_grid("drv ~ cyl")
)
drv_cyl_grid

In [None]:
# plotly express: grid of panels; `category_orders` fixes the column order of
# the cylinder counts.
cyl_order = dict(cyl=[4, 5, 6, 8])
px.scatter(
    mpg,
    x="displ",
    y="hwy",
    facet_col="cyl",
    facet_row="drv",
    category_orders=cyl_order,
)

#### Stacked Scatter and Smoothed Line Plot

In [None]:
%%R -w 10 -h 10 -u in
# ggplot2: all cars as points coloured by class, plus a loess curve fitted to
# the subcompact rows only (drawn without a standard-error band).
subcompact <- mpg[mpg$`class` == "subcompact", ]
ggplot(mpg) +
  aes(x = displ, y = hwy) +
  geom_point(aes(color = class)) +
  geom_smooth(data = subcompact, se = FALSE, method = 'loess')

In [None]:
# Altair: layer a loess line (subcompact cars only) over the class-coloured
# scatter of all cars.
# The x-axis title previously read "Displacament"; it is spelled correctly
# here so it matches the chart title.
scatter = (
    alt.Chart(
        mpg,
        title="Engine Displacement in Liters vs Highway MPG",
    )
    .mark_circle()
    .encode(
        x=alt.X(
            "displ",
            axis=alt.Axis(
                title="Engine Displacement in Liters"
            ),
        ),
        y=alt.Y(
            "hwy",
            axis=alt.Axis(
                title="Highway MPG"
            ),
        ),
        color="class",
    )
)

# transform_loess replaces the data with the smoothed (displ, hwy) pairs.
line = (
    alt.Chart(
        mpg[mpg["class"] == "subcompact"]
    )
    .transform_loess("displ", "hwy")
    .mark_line()
    .encode(x=alt.X("displ"), y=alt.Y("hwy"))
)

# `+` layers the two charts on shared axes.
scatter + line

In [None]:
# Notice the smoothed line isn't as smooth as it is in ggplot2.
subcompact_rows = mpg[mpg['class'] == "subcompact"]
points_and_loess = (
    ggplot(data=mpg, mapping=aes(x="displ", y="hwy"))
    + geom_point(mapping=aes(color="class"))
    + geom_smooth(data=subcompact_rows, se=False, method='loess')
)
points_and_loess

In [None]:
# Plotly's builtin smoothing function is very weak: the "smoothing" here is
# only a spline drawn through the sorted subcompact points.

# One marker trace per vehicle class, so each class gets its own colour and
# legend entry.
traces = [
    graph_objects.Scatter(dict(
        mode='markers',
        x=mpg.displ[mpg['class'] == cls],
        y=mpg.hwy[mpg['class'] == cls],
        name=cls,
    ))
    for cls in mpg['class'].unique()
]

# Sort by x so the spline is drawn left to right.
subcompact = mpg[mpg['class'] == 'subcompact'].sort_values(by='displ')

smoothing_trace = graph_objects.Scatter(dict(
    mode='lines',
    x=subcompact.displ,
    y=subcompact.hwy,
    name='smoothing',
    line=dict(shape='spline', smoothing=1.3),
))
traces.append(smoothing_trace)

fig = graph_objects.Figure(
    data=traces,
    layout=dict(
        title='Engine Displacement in Liters vs Highway MPG',
        xaxis=dict(title='Engine Displacement in Liters'),
        yaxis=dict(title='Highway MPG'),
    ),
)
Image(fig.to_image(format="png", width=750, height=750))

In [None]:
%%R -w 10 -h 10 -u in
# ggplot2: stacked bars -- one bar per cut, segmented by clarity.
ggplot(diamonds, aes(x = cut, fill = clarity)) +
  geom_bar()

In [None]:
# plotnine: stacked bars -- one bar per cut, segmented by clarity.
cut_by_clarity = (
    ggplot(diamonds)
    + aes(x="cut", fill="clarity")
    + geom_bar()
)
cut_by_clarity

In [None]:
# Altair: lift the default row limit first so the full `diamonds` frame can
# be embedded in the chart.
alt.data_transformers.disable_max_rows()
(
    alt.Chart(diamonds)
    .mark_bar()
    .encode(x="cut", y="count(cut)", color="clarity")
    .properties(width=300)
)

In [None]:
# pandas: count rows per (cut, clarity), pivot clarity into columns, then
# stack the clarity counts within each cut's bar.
cut_clarity_counts = diamonds.groupby(['cut', 'clarity']).size().unstack()
cut_clarity_counts.plot.bar(stacked=True);

In [None]:
# plotly express: stacked counts per cut; `category_orders` puts the cuts in
# quality order on the x axis.
cut_order = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
px.histogram(
    diamonds,
    x="cut",
    color="clarity",
    category_orders=dict(cut=cut_order),
)

In [None]:
%%R -w 10 -h 10 -u in
# ggplot2: the same counts with the clarity bars placed side by side.
ggplot(diamonds, aes(x = cut, fill = clarity)) +
  geom_bar(position = "dodge")

In [None]:
# Altair: grouped bars are built by faceting on `cut` (one column per cut)
# with clarity on the x axis of each panel.
alt.data_transformers.disable_max_rows()
(
    alt.Chart(diamonds)
    .mark_bar()
    .encode(
        x="clarity",
        y="count(cut)",
        color="clarity",
        column="cut",
    )
    .properties(width=100)
)

In [None]:
# plotnine: the same counts with the clarity bars placed side by side.
cut_by_clarity_dodged = (
    ggplot(diamonds)
    + aes(x='cut', fill='clarity')
    + geom_bar(position="dodge")
)
cut_by_clarity_dodged


In [None]:
# pandas: without `stacked=True` the clarity columns are drawn side by side
# within each cut.
grouped_counts = diamonds.groupby(['cut', 'clarity']).size().unstack()
grouped_counts.plot.bar();

In [None]:
# plotly express: `barmode="group"` places the clarity bars side by side.
grouped_cut_order = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
px.histogram(
    diamonds,
    x="cut",
    color="clarity",
    barmode="group",
    category_orders=dict(cut=grouped_cut_order),
)

In [None]:
%%R -w 10 -h 10 -u in
# ggplot2: overlaid, lightly filled density curves of depth, one per cut,
# with the x axis limited to 55-70.
ggplot(diamonds, aes(x = depth, fill = cut, colour = cut)) +
  geom_density(alpha = 0.1) +
  xlim(55, 70)

In [None]:
# Author's note: it is unclear whether pandas can fill a KDE curve.
# Some matplotlib is needed to overlay the curves on one Axes and to get a
# legend: each cut's depth values are plotted as a separate KDE on `ax`.
fig, ax = pyplot.subplots()
ax.set_xlim(55, 70)
for cut in diamonds['cut'].unique():
    s = diamonds[diamonds["cut"] == cut]['depth']
    s.plot.kde(ax=ax, label=cut)
# The trailing semicolon suppresses the Legend repr, matching the other
# matplotlib-backed cells in this notebook.
ax.legend();
In [None]:
# seaborn: one filled KDE of depth per cut, overlaid via `hue`.
# `height` (inches) replaces the old `size` keyword, which seaborn renamed in
# 0.9 and later removed.
# NOTE(review): `shade=True` is deprecated in favour of `fill=True` in newer
# seaborn releases -- switch once the minimum supported version is confirmed.
(sns
  .FacetGrid(diamonds, 
             hue="cut", 
             height=10, 
             xlim=(55, 70))
  .map(sns.kdeplot, 'depth', shade=True)
 .add_legend()
);

In [None]:
"""`+ xlim(55, 70)` results in an error."""
(ggplot(diamonds) +
  aes('depth', fill='cut', color='cut') +
  geom_density(alpha=0.1))

In [None]:
# Altair: estimate a density of depth per cut over [55, 70], then draw the
# estimates as semi-transparent areas outlined in the same colour.
alt.data_transformers.disable_max_rows()
depth_density_chart = alt.Chart(diamonds).transform_density(
    "depth",
    as_=["depth", "density"],
    groupby=["cut"],
    extent=[55, 70],
)
depth_density_chart.mark_area(fillOpacity=0.3).encode(
    x="depth",
    y="density:Q",
    color="cut",
    stroke="cut",
)

In [None]:
# plotly: create_distplot takes one array of values per group plus the group
# labels; histograms and rug marks are switched off so only KDE curves remain.
cut_labels = diamonds.cut.unique()
depth_by_cut = [
    diamonds["depth"][diamonds["cut"] == c].values
    for c in cut_labels
]
fig = figure_factory.create_distplot(
    depth_by_cut,
    cut_labels,
    show_hist=False,
    show_rug=False,
)
# Fill the area under each curve down to y = 0.
for curve in fig["data"]:
    curve.update({"fill": "tozeroy"})
Image(fig.to_image(format="png", width=750, height=750))

In [None]:
%%R -i ts -w 10 -h 10 -u in 
# ggplot2: `-i ts` imports the pandas frame into R; draw value against date.
ggplot(ts, aes(x = date, y = value)) +
    geom_line()

In [None]:
# plotnine: line plot of the time series, value against date.
ts_line = (
    ggplot(ts)
    + aes("date", "value")
    + geom_line()
)
ts_line

In [None]:
# pandas: with the dates as the index, Series.plot draws value against date.
# The trailing semicolon suppresses the Axes repr, matching the other
# matplotlib-backed cells in this notebook.
ts.set_index('date')['value'].plot();

In [None]:
# plotly express: line plot of the time series, value against date.
ts_fig = px.line(ts, x="date", y="value")
ts_fig

In [None]:
# Altair: line plot of the time series, value against date.
ts_chart = alt.Chart(ts).mark_line()
ts_chart.encode(x="date", y="value")