In [None]:
%matplotlib inline
import matplotlib
import seaborn as sns
# Switch on seaborn's default plot styling for every figure in the notebook.
sns.set()
# Raise the figure resolution (dots per inch) used when figures are rendered.
matplotlib.rcParams['figure.dpi'] = 144

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import simplejson as json

# Read the GOOG price table from its JSON dump.
with open('small_data/goog.json') as raw_f:
    raw_data = raw_f.read()
json_data = json.loads(raw_data)

# The file carries the rows under 'data' and the column labels under 'column_names'.
goog = pd.DataFrame(data=json_data['data'], columns=json_data['column_names'])
# Keep the frame's current index values as an ordinary column before the
# index is replaced below (later cells plot against 'Day').
goog['Day'] = goog.index.values
# Re-index the rows by their 'Date' column, parsed as datetimes.
goog = goog.set_index(pd.DatetimeIndex(goog['Date']))

# Visualization Theory
<!-- requirement: small_data/goog.json -->
<!-- requirement: images/visual_variables.jpg -->
<!-- requirement: images/visual_precision.jpg -->
<!-- requirement: images/color_blindness_illustration.jpg -->
<!-- requirement: images/size_illusion.png -->
<!-- requirement: images/mach_bands.png -->
<!-- requirement: images/adelson_checkerboard.jpg -->
<!-- requirement: images/us_evapo_map.jpg -->
<!-- requirement: images/colormap_comparison.png -->
<!-- requirement: images/colormap_nonlinearity.png -->
<!-- requirement: images/colormap_linear.webp -->
<!-- requirement: images/preattentive_1.jpg -->
<!-- requirement: images/preattentive_2.jpg -->
<!-- requirement: images/preattentive_3.jpg -->
<!-- requirement: images/preattentive_4.jpg -->
<!-- requirement: images/attentive_5.png -->
<!-- requirement: images/preattentive_5.png -->


## Different types of data for visualization purposes


For visualization purposes, there are four types of data: **Nominal**, **Ordered** (also called ordinal), **Interval**, and **Ratio**.  The table below lists each type together with the operations that can meaningfully be applied to it: **Equality**, **Comparison**, **Difference**, and **Ratio**:
  
<table>

<tr>
<th>Data Type</th>
<th>Example</th>
<th>Equality</th>
<th>Comparison</th>
<th>Difference</th>
<th>Ratio</th>
</tr>

<tr>
<td><b>Nominal</b></td>
<td>e.g. countries of the world</td>
<td><code>=</code>, <code>!=</code></td>
<td></td>
<td></td>
<td></td>
</tr>

<tr>
<td><b>Ordered</b></td>
<td>e.g. bond ratings: A, AA</td>
<td><code>=</code>, <code>!=</code></td>
<td><code>&gt;</code>, <code>&lt;=</code></td>
<td></td>
<td></td>
</tr>

<tr>
<td><b>Interval</b> (location of 0 is arbitrary)</td>
<td>e.g. dates and time, lat-long</td>
<td><code>=</code>, <code>!=</code></td>
<td><code>&gt;</code>, <code>&lt;=</code></td>
<td><code>-</code></td>
<td></td>
</tr>

<tr>
<td><b>Ratio</b></td>
<td>e.g. physical measures or values</td>
<td><code>=</code>, <code>!=</code></td>
<td><code>&gt;</code>, <code>&lt;=</code></td>
<td><code>-</code></td>
<td><code>/</code></td>
</tr>

</table>




## Seven categories of visual cues

There are seven well-recognized ways of visually encoding data: **Position**, **Size**, **Value** (saturation or density of a color), **Texture**, **Color** (hue), **Orientation**, and **Shape**.

| Label      | Nominal  | Ordinal | Quantitative (Interval or Ratio) |
| ---------- |:-:|:-:|:-:|
|Position    | N | O | Q |
|Size        |   | O | Q |
|Value       | N | O | Q?|
|Texture     | N | O |   |
|Color       | N | O |   |
|Orientation | N |   |   |
|Shape       | N |   |&nbsp;|


![Categories of Visual Cues](images/visual_variables.jpg)

As indicated, certain cues are not well-suited to representing certain types of variables. Similarly our eye has greater precision in discriminating between certain visual cues.

![Illustration of visual precision](images/visual_precision.jpg)

Below, we plot some of Google's stock prices using various visual cues.  Which are easiest for you to read?  Which give you the most precision in determining the numeric values?

In [None]:
# Work on the last 50 rows of `goog` (fewer if the table is shorter).  Take an
# explicit copy so that adding a column writes to an independent frame instead
# of to a slice of `goog`, which raises SettingWithCopyWarning and leaves it
# ambiguous whether `goog` itself is modified.
d = goog[-50:].copy()
# Constant y-coordinate (1.0) used by the plots that lay the points out on a
# single row.  Sized from the frame so it also works with fewer than 50 rows.
d['Ones'] = np.ones(len(d))

In [None]:
# Default size (width, height in inches) for figures created from here on: wide and short.
matplotlib.rcParams['figure.figsize'] = [10, 3]

In [None]:
# Shape: bin Close into four 20-wide ranges (500-520, ..., 560-580] and draw
# each bin with its own marker shape.  One marker character per bin.
shapes = 'osd*'
bin_edges = [500, 520, 540, 560, 580]

ax = None  # the first plot call creates the axes; later bins are drawn onto it
for low, high, marker in zip(bin_edges[:-1], bin_edges[1:], shapes):
    in_bin = d[(low < d['Close']) & (d['Close'] <= high)]
    ax = in_bin.plot(kind='scatter', x='Day', y='Ones', marker=marker, ax=ax)

# Hand-drawn legend: one sample marker per bin at Day 370-380, slightly above
# the data row (y=1.01 vs. y=1), labelled with the bin midpoint (500 + ss).
x = np.linspace(370,380,4)
s = [10,30,50,70]
for xx, ss, sh in zip(x, s, shapes):
    # Use `color=` rather than `c=`: a bare RGB tuple passed as `c` is
    # ambiguous with value-mapped colours and makes matplotlib warn.
    plt.scatter(xx, [1.01], marker=sh, color=sns.color_palette()[0])
    plt.text(xx, 1.008, 500 + ss, horizontalalignment='center',
             verticalalignment='top')
# The y-axis carries no information here, so hide its ticks and label.
plt.yticks([])
plt.ylabel('')
plt.text(375, 1.005, 'Close', horizontalalignment='center', verticalalignment='top')
plt.gcf().set_figheight(2)
plt.title('Shape');

In [None]:
def angle(close):
    """Map a closing price linearly onto degrees: 500 -> 0, 580 -> 360."""
    fraction_of_range = (close - 500) / 80
    return fraction_of_range * 360

# Orientation: draw every day as a short stroke whose tilt encodes Close.
# The marker tuple is (number of sides, style, rotation in degrees); a
# two-sided marker appears to render as a line segment.  angle() is divided
# by 3, so the 500-580 price range maps onto a 0-120 degree rotation.
stroke_color = sns.color_palette()[0]
for day, close in zip(d['Day'], d['Close']):
    # Use `color=` rather than `c=`: a bare RGB tuple passed as `c` is
    # ambiguous with value-mapped colours and makes matplotlib warn.
    plt.scatter(day, [0], marker=(2, 0, -angle(close)/3.), s=200, color=stroke_color)

# Hand-drawn legend: four reference strokes at Day 370-380, slightly above the
# data row (y=0.01 vs. y=0), each labelled with the price it represents.
x = np.linspace(370,380,4)
s = [510,530,550,570]
for xx, ss in zip(x, s):
    # 1.0*ss keeps the division inside angle() in floating point.
    plt.scatter(xx, [0.01], marker=(2, 0, -angle(1.0*ss)/3.), s=200, color=stroke_color)
    plt.text(xx, 0.008, ss, horizontalalignment='center',
             verticalalignment='top')
# The y-axis carries no information here, so hide its ticks and label.
plt.yticks([])
plt.ylabel('')
plt.text(375, 0.005, 'Close', horizontalalignment='center', verticalalignment='top')
plt.gcf().set_figheight(2)
plt.title('Orientation');

In [None]:
# Color: encode Close as hue by colouring the points through the rainbow colormap.
hue_options = dict(x='Day', y='Ones', c='Close', cmap=plt.cm.rainbow)
# Hack for vanishing x-axis: draw onto the current pyplot axes explicitly.
d.plot(kind='scatter', ax=plt.gca(), **hue_options)
# The y-axis carries no information here, so hide its label and ticks.
plt.ylabel('')
plt.yticks([])
plt.gcf().set_figheight(2)
plt.title('Color');

In [None]:
# Value: encode Close as the intensity of a single hue (the Blues colormap).
value_options = dict(x='Day', y='Ones', c='Close', cmap=plt.cm.Blues)
# Hack for vanishing x-axis: draw onto the current pyplot axes explicitly.
d.plot(kind='scatter', ax=plt.gca(), **value_options)
# The y-axis carries no information here, so hide its label and ticks.
plt.ylabel('')
plt.yticks([])
plt.gcf().set_figheight(2)
plt.title('Value');

In [None]:
# Size: the scatter size argument of each point is its Close price minus 500.
marker_sizes = d['Close'] - 500
d.plot(kind='scatter', x='Day', y='Ones', s=marker_sizes)

# Hand-drawn legend: four reference markers at Day 370-380, slightly above the
# data row (y=1.01 vs. y=1), each labelled with the price its size stands for.
x = np.linspace(370,380,4)
s = [10,30,50,70]
legend_y = [1.01]*4
plt.scatter(x, legend_y, s=s)
for xx, ss in zip(x, s):
    plt.text(xx, 1.008, 500 + ss,
             horizontalalignment='center', verticalalignment='top')
# The y-axis carries no information here, so hide its label and ticks.
plt.ylabel('')
plt.yticks([])
plt.text(375, 1.005, 'Close', horizontalalignment='center', verticalalignment='top')
plt.gcf().set_figheight(2)
plt.title('Size');

In [None]:
# Position: Close is plotted on the y-axis against Day on the x-axis.
position_options = dict(x='Day', y='Close')
d.plot(kind='scatter', **position_options)
plt.title('Position');

## Generic algorithm for creating a visualization

Here is the rough algorithm for how to draw a visualization.

1. Express your message in terms of a few quantitative relationships (probably no more than 2 and definitely not more than 3).
1. Rank those quantitative relationships.
1. Use the "accuracy of visual perception" and the table above to think about how to encode the data.

In reality, you probably can't follow this algorithm literally but hopefully this can help you better understand what's going on.

## Portability & Accessibility

Colors are not produced the same way in all media. For instance, electronic devices use an RGB additive color system since on screens the colors are produced by mixing wavelengths of light. Meanwhile, printers use a CMYK subtractive color system since printed materials produce color by mixing pigments which absorb wavelengths of light. Even different electronic devices may render certain colors robustly and others only weakly. These factors can strongly limit or distort contrast between colors.

Similarly, many men and some women have some form of color blindness, limiting their ability to distinguish between certain colors. When designing graphics to be accessible for color blind individuals, you may want to mark categorical differences with differing texture or shape in addition to or instead of color.

![Color blindness illustration](images/color_blindness_illustration.jpg)

It is a good idea to use the intensity of a monochrome palette as a visual cue instead of hue/color where possible.

## Perception and Visual Response

Our eyes and brain don't respond linearly to changes in color, intensity, or size. Many of these factors interact in ways that can mislead us concerning the underlying data, perceiving larger or smaller contrasts than are accurate.

![Relative size distortions](images/size_illusion.png)

Relative sizes can lead us to believe that two identical objects differ in size.

![Mach bands](images/mach_bands.png)

Contrast in color or intensity is often dependent on what colors surround the objects. Our mind infers shadows or difference in lighting conditions when interpreting colors, [which can dramatically skew our perception](https://en.wikipedia.org/wiki/The_dress).

![Adelson checkerboard](images/adelson_checkerboard.jpg)



### Perceptually uniform colormaps

One pitfall many visualizations suffer from is choosing colors that have high contrast even where there are no significant differences in the data.

![US evapotranspiration map](images/us_evapo_map.jpg)

In this example there is a perceived sharp divide in values between the eastern and western half of the United States. However, inspecting the legend reveals that there is very little difference along this apparent divide. Instead, the perceived difference is entirely due to the difference in brightness as the dark green colors shift towards bright yellow in the scale.

![colormap comparison](images/colormap_comparison.png)

Our eye is more sensitive to changes in brightness than changes in hue. Many commonly used colormaps (such as the previous default in `matplotlib`, "jet") are not "perceptually uniform," meaning their perceived brightness shows nonlinearity or even reversals as the scale advances from low to high values.

![colormap nonlinearity](images/colormap_nonlinearity.png)

![linear colormap](images/colormap_linear.webp)

Fortunately `matplotlib` has replaced its default colormap with the perceptually uniform `viridis` colormap. However, when representing categorical data, periodic data, or data where we want to highlight a strong divergence (for example differentiating values below and above an average), the `viridis` colormap may not be a good choice. [There are a number of excellent choices for these purposes available in `matplotlib`](https://matplotlib.org/tutorials/colors/colormaps.html#lightness-of-matplotlib-colormaps), and they provide plots of the perceived brightness for each colormap in the library.

### Sizing markers


When using size as a cue, pay attention to whether you are scaling linear dimension or area.  Usually you want area, but if you are using an elongated shape you may want a linear dimension.  In `matplotlib` the size argument of `scatter` is an area, in units of points$^2$ (a typographic point is 1/72 inch).  Not all plotting libraries use the same convention.

## Attention and Memory

Our sense organs (such as our eyes) collect information in tremendous volume at tremendous speed. Much of this information either leads the eye in a [*pre-attentive process*](https://en.wikipedia.org/wiki/Pre-attentive_processing) or is discarded as irrelevant. The visual cues listed at the beginning of the notebook are targets for pre-attentive processing. We can think of these cues as organizing the viewer's conscious processing. Compare how quickly you can complete the following tasks.

**Find the red circle.**

![pre-attentive task 1](images/preattentive_1.jpg)
![pre-attentive task 2](images/preattentive_2.jpg)

**Find the filled circle.**

![pre-attentive task 3](images/preattentive_3.jpg)

**Find the boundary between circles and squares.**

![pre-attentive task 4](images/preattentive_4.jpg)

**Count the number of 5s in the image below.**

![attentive task 5](images/attentive_5.png)

![pre-attentive task 5](images/preattentive_5.png)

Notice how pre-attentive processing dramatically improves the speed at which we can identify the desired information. Therefore we should use visual cues to highlight the most important data feature, and avoid encoding any other data features using visual cues that may interfere with the primary pre-attentive processing.

Memory is another limited resource when processing information. In analogy with computers, we can think of our mind as having short-term, ready-access memory (like RAM) and long-term, slower-recall memory (disk storage). However, just like in a computer, our short-term/working memory [is limited in size (quotes range from 3-10 "items" or "chunks")](https://en.wikipedia.org/wiki/Short-term_memory). This means we can only compare a few groups and/or values at a time. The implication for visualization is that we should also structure our visual cues in a way that aggregates multiple data sources or implies a possible aggregation the user could make by eye.

The following graph requires the viewer to frequently reference the legend because there are too many categories to remember which colors represent them.

In [None]:
# Fixed seed so the simulated trials are the same on every run.
np.random.seed(42)

x = np.arange(100)
trials = np.empty((8, 100))
for trial in range(8):
    # The random draws happen in this order: scale, slope, sign, then noise.
    scale = np.random.rand()
    ramp = x / x.max()  # rises linearly from 0 to 1 across x
    slope = np.random.rand()
    sign = np.random.choice([1, -1])
    trend = scale * (ramp * slope * sign + 1)
    # Gaussian noise whose amplitude is 5% of the local trend value.
    noise = .05 * np.random.randn(x.size) * trend
    signal = trend + noise
    trials[trial, :] = signal
    plt.plot(x, signal, label='trial {}'.format(trial))

plt.legend()

In the above example, if we are representing a sensor reading from subsequent experiments, we may wish to aggregate these trials into a single signal with some uncertainty.

In [None]:
# Collapse the trials into one curve: the per-x median across trials,
# with the 25th-75th percentile band drawn as the uncertainty.
median = np.median(trials, axis=0)
lower_quartile = np.percentile(trials, 25, axis=0)
upper_quartile = np.percentile(trials, 75, axis=0)
errors = np.vstack([lower_quartile, upper_quartile])

plt.plot(x, median, 'k', label='median among trials')
plt.fill_between(x, lower_quartile, upper_quartile, alpha=.5, label='IQR')
plt.legend()

## Visual storytelling

* Narrative is about establishing a baseline, then disrupting it
* The "actionable" is what is needed to restore balance
* Why does the audience care?
* Make everyone a stakeholder - "the conflict between what is and what could be" -Nancy Duarte

*Copyright &copy; 2019 The Data Incubator.  All rights reserved.*