In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%load_ext ipython_unittest
%load_ext ipython_nose
%load_ext ipython_pytest
import termios, fcntl, struct
# Override the window size reported for file descriptor 1 so that test-runner
# output is laid out for the slide width. The four packed shorts are presumably
# rows=57, cols=102 and two unused pixel sizes (struct winsize order) - confirm.
fcntl.ioctl(1, termios.TIOCSWINSZ, struct.pack('hhhh', 57, 102, 0, 0))  # terminal width correction
from IPython.display import HTML
# Returning this HTML object as the cell result injects the presentation
# stylesheet and the keyboard-shortcut script into the notebook page.
HTML('''<link rel="stylesheet" href="eniram-theme/eniram-theme.css" type="text/css"></link>
        <script type="text/javascript" src="eniram-theme/rise-shortcuts.js"></script>''')

# Assertions
- goal: verify that the code under test outputs the expected values
- typically a strict equality comparison<br/>
  `assert result == expected`
- frameworks support basic types well
  - `int`, `str`, `None`, `list`, `dict`, `set`, `class`
- numeric and vector types are tricky

# Numeric data structures

<div class="data-table" style="float: left"><table style="width: 9em">
<caption>NumPy 1-D array</caption>
<tr><td>60.153</td></tr>
<tr><td>NaN</td></tr>
<tr><td>&hellip;</td></tr>
<tr><td>60.177</td></tr>
</table></div>

<div class="data-table" style="float: left; margin-left: 3em"><table style="width: 11em">
<caption>Pandas DatetimeIndex</caption>
<tr><th>time</th></tr>
<tr><td>2016-10-31 13:30:00</td></tr>
<tr><td>2016-10-31 13:31:00</td></tr>
<tr><td>&hellip;</td></tr>
<tr><td>2016-10-31 13:32:00</td></tr>
</table></div>

<div class="data-table" style="margin-left: 3em"><table style="width: 11em">
<caption>Pandas Series</caption>
<tr><th colspan="2" style="text-align: center">latitude</th></tr>
<tr><th>time</th><th></th></tr>
<tr><td>2016-10-31 13:30:00</td><td>NaN</td></tr>
<tr><td>2016-10-31 13:31:00</td><td>60.161</td></tr>
<tr><td>&hellip;</td><td>&hellip;</td></tr>
<tr><td>2016-10-31 13:32:00</td><td>60.153</td></tr>
</table></div>

<div class="data-table" style="margin-top: 2em"><table>
<caption>Pandas DataFrame</caption>
<tr><th>time</th><th>lon</th><th>lat</th></tr>
<tr><td>2016-10-31 13:30:00</td><td>24.903</td><td>60.161</td></tr>
<tr><td>2016-10-31 13:31:00</td><td>24.877</td><td>60.153</td></tr>
<tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr>
<tr><td>2016-10-31 13:32:00</td><td>24.948</td><td>60.177</td></tr>
</table>
</div>

## Vector equality
- pure Python list equality is simple:

In [None]:
# Two separately built lists holding the same values.
a = [60.153, 60.177]
b = [60.153, 60.177]
a == b

In [None]:
# Same as the previous cell except that the first elements differ.
a = [60.153, 60.177]
b = [60.161, 60.177]
a == b

- NumPy does an element-wise comparison:

In [None]:
import numpy as np

# The same data as NumPy arrays: `==` now yields one boolean per element pair
# instead of a single True/False.
a = np.array([60.153, 60.177])
b = np.array([60.161, 60.177])
a == b

- `assert` can't convert a boolean vector to a boolean scalar:

In [None]:
%%nose -v --expand-tracebacks

def test_array_equality():
    a = np.array([60.153, 60.177])
    b = np.array([60.161, 60.177])
    # Deliberate pitfall: `a == b` is an element-wise boolean array, which
    # `assert` cannot reduce to a single truth value.
    assert a == b

In [None]:
# The same array -> bool conversion that `assert a == b` attempts, in isolation.
bool(np.array([True, False]))

Use `array.all()` instead:

In [None]:
# `.all()` reduces the boolean array to a single value.
np.array([True, True]).all()

Pure Python `all(my_boolean_array)` works too:

In [None]:
# The builtin `all()` iterates over the array elements; one of them is False here.
all(np.array([True, False]))

Using `.all()` we now get an `AssertionError` as expected:

In [None]:
%%nose -v --expand-tracebacks

def test_numpy_array_all():
    a = np.array([60.153, 60.177])
    b = np.array([60.161, 60.177])
    # Reduce the element-wise comparison to one bool. The first elements
    # differ, so this is meant to fail with an AssertionError.
    assert (a == b).all()

# NaN – Not A Number?
- NumPy arrays are typed
- can't use `None` to denote a missing value
- use NaN instead
  - `np.nan` or
  - `float('nan')`

- gotcha: NaN doesn't equal anything, even itself

In [None]:
# Two NaN floats compared with `==` (the gotcha described above).
float('nan') == float('nan')

In [None]:
# The same comparison using NumPy's NaN constant.
np.nan == np.nan

- NaNs in arrays cause unexpected assertion failures:

In [None]:
%%nose -v

def test_ignored_nan():
    a = np.array([1.0, np.nan, 2.0])
    b = np.array([1.0, np.nan, 2.0])
    # Both arrays are built from identical values, but the NaN slot compares
    # unequal element-wise, so `.all()` is False: an unexpected failure.
    assert (a == b).all()

...stay tuned for solutions!

# NumPy assertion helpers
- for docs, search ["numpy.testing"](http://docs.scipy.org/doc/numpy/reference/routines.testing.html)
- useful for comparisons of
  - arrays
  - special values (NaNs, infinities)
  - floating point values with tolerance

## `assert_equal` is NaN-aware:

In [None]:
from numpy.testing import assert_equal

In [None]:
%%nose -v

def test_assert_equal_nan_array():
    # Build two distinct arrays from the same values, with a NaN in the
    # middle slot; the test passes when assert_equal does not raise.
    values = [1.0, np.nan, 2.0]
    left, right = np.array(values), np.array(values)
    assert_equal(left, right)

It works for scalars as well:

In [None]:
%%nose -v

def test_assert_equal_nan_scalar():
    # Scalar counterpart of the array test: NaN against NaN via assert_equal.
    assert_equal(np.nan, np.nan)

## Floating point numbers
- the same computation with a different algorithm may give slightly different values
- can happen e.g. when optimizing code
- you'll want to ignore minuscule inaccuracies

In [None]:
%%nose -v --expand-tracebacks

def test_float_inaccuracy():
    # Both differences are nominally 0.2 but are computed from different
    # operands, so their floating point results differ slightly.
    a = 1.2 - 1.0
    b = 2.2 - 2.0
    # assert_equal requires floats to match exactly; meant to fail.
    assert_equal(a, b)

## `allclose()` and `assert_allclose()`
- NumPy provides these helpers for comparing floats

In [None]:
# Two values that differ only by floating point error (presumably the results
# of 1.2 - 1.0 and 2.2 - 2.0 from the previous test - confirm).
np.allclose(0.19999999999999996, 0.20000000000000018)

In [None]:
from numpy.testing import assert_allclose

In [None]:
%%nose -v --expand-tracebacks

def test_assert_allclose_scalar():
    # Two nominally equal differences (0.2) computed from different operands;
    # the test passes when assert_allclose accepts them with its defaults.
    first_diff, second_diff = 1.2 - 1.0, 2.2 - 2.0
    assert_allclose(first_diff, second_diff)

## `allclose`/`assert_allclose` – custom tolerance
- use `atol=` for absolute tolerance
- use `rtol=` for additional tolerance relative to the second argument
- formula: $\left\lvert a - b \right\rvert \le atol + rtol \cdot \left\lvert b \right\rvert$
- to illustrate using large numbers: "are 2 and 8 close according to these tolerances?"

In [None]:
# Allowed difference: atol + rtol * |b| = 4 + 0.25 * 8 = 6; |2 - 8| = 6 is within it.
np.allclose(2, 8, atol=4, rtol=0.25)

passes because $4 + 0.25 \cdot 8 = 6.00$ and $\left\lvert 2 - 8 \right\rvert \le 6.00$

In [None]:
# Allowed difference: atol + rtol * |b| = 4 + 0.24 * 8 = 5.92; |2 - 8| = 6 exceeds it.
np.allclose(2, 8, atol=4, rtol=0.24)

fails because $4 + 0.24 \cdot 8 = 5.92$ and $\left\lvert 2 - 8 \right\rvert \not\le 5.92$

## `allclose`/`assert_allclose` gotcha
- different default tolerances:
  - `allclose(atol=1e-8, rtol=1e-5)`
  - `assert_allclose(atol=0, rtol=1e-7)`

In [None]:
# With allclose's defaults the absolute tolerance alone (1e-8) covers the
# difference |1e-8 - 2e-8| = 1e-8.
np.allclose(1e-8, 2e-8)  # atol=1e-8, rtol=1e-5

In [None]:
%%nose -v

def test_assert_allclose_tolerance():
    # Same values as the allclose() cell, but assert_allclose defaults to
    # atol=0, leaving only rtol * |b| = 1e-7 * 2e-8 of slack; meant to fail.
    assert_allclose(1e-8, 2e-8)  # atol=0, rtol=1e-7

## `allclose`/`assert_allclose` with NaNs
- use the `equal_nan=True` argument

In [None]:
%%nose -v

def test_assert_allclose_array_with_nans_and_custom_tolerance():
    # The finite elements differ by 1e-4 and the last slot is NaN in both
    # arrays; the test passes when assert_allclose accepts this with an
    # absolute tolerance of 1e-3 and NaNs treated as equal.
    actual = np.array([1.0001, 1.0002, np.nan])
    desired = np.array([1.0002, 1.0003, np.nan])
    assert_allclose(actual, desired, atol=1e-3, equal_nan=True)

## A slight detour to a NumPy bug
While preparing this talk, I noticed that `allclose` and `assert_allclose` behaved differently with `equal_nan=False`:

In [None]:
# Explicitly ask allclose NOT to treat NaN as equal to NaN.
np.allclose(np.nan, np.nan, equal_nan=False)

In [None]:
# The same request through assert_allclose. Per the issue linked below, the
# affected NumPy versions (<= 1.11.2) behave differently from allclose here.
assert_allclose(np.nan, np.nan, equal_nan=False)  # should raise an AssertionError

- I filed an issue: https://github.com/numpy/numpy/issues/8145
- created some [failing tests](https://github.com/EniramLtd/numpy/commit/4b1281296dce87e85576314636c7ed0326c3f77f)
- submitted [a patch](https://github.com/EniramLtd/numpy/commit/91140502d4a3b775bb22952ee81310897e236236)
- ...which was [merged](https://github.com/numpy/numpy/pull/8165) in 5 days!
  Open Source at its best!
- in NumPy <=1.11.2, don't use `assert_allclose` if you *don't* need NaNs to be equal<br/>
  (rare in testing code)

## Assertions: NumPy recap
- `numpy.testing.assert_equal(a, b)`
  - NaNs are equal
  - floats must match exactly
- `numpy.testing.assert_allclose(a, b, equal_nan=True)`
  - NaNs are equal
  - floats with tolerance
  - adjust tolerance using `atol=` and `rtol=`
- other helpers in `numpy.testing`
  - not so relevant for asserting unit test results
  - similar functionality found in test frameworks

# Pandas data structures

<div class="data-table" style="float: left; margin-left: 3em"><table style="width: 11em">
<caption>DatetimeIndex</caption>
<tr><th>time</th></tr>
<tr><td>2016-10-31 13:30:00</td></tr>
<tr><td>2016-10-31 13:31:00</td></tr>
<tr><td>&hellip;</td></tr>
<tr><td>2016-10-31 13:32:00</td></tr>
</table></div>

<div class="data-table" style="margin-left: 3em"><table style="width: 11em">
<caption>Series</caption>
<tr><th colspan="2" style="text-align: center">latitude</th></tr>
<tr><th>time</th><th></th></tr>
<tr><td>2016-10-31 13:30:00</td><td>NaN</td></tr>
<tr><td>2016-10-31 13:31:00</td><td>60.161</td></tr>
<tr><td>&hellip;</td><td>&hellip;</td></tr>
<tr><td>2016-10-31 13:32:00</td><td>60.153</td></tr>
</table></div>

<div class="data-table" style="margin-top: 2em"><table>
<caption>DataFrame</caption>
<tr><th>time</th><th>lon</th><th>lat</th></tr>
<tr><td>2016-10-31 13:30:00</td><td>24.903</td><td>60.161</td></tr>
<tr><td>2016-10-31 13:31:00</td><td>24.877</td><td>60.153</td></tr>
<tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr>
<tr><td>2016-10-31 13:32:00</td><td>24.948</td><td>60.177</td></tr>
</table>
</div>

## Index objects
- simple index objects behave like NumPy arrays – just use `==`
- use `pandas.util.testing.assert_index_equal` to make sure names match, too

In [None]:
%%pytest --tb=short -q
import pandas as pd

def test_assert_index_equal_names():
    # The two indexes hold identical values and differ only in `name`.
    a = pd.Index([1.0, 2.0], name='Jeff')
    b = pd.Index([1.0, 2.0], name='Wes')
    # assert_index_equal also compares names, so this is presumably meant to
    # fail and show the resulting error.
    pd.util.testing.assert_index_equal(a, b)

## Series and DataFrame objects
- `pandas.util.testing` provides highly customizable helpers for both
- can check index names
- "exact", "more" precise or "less" precise tolerances only
- consider NaNs equal

In [None]:
import pandas as pd
# Signature showcase: compares two empty Series while spelling out the keyword
# arguments assert_series_equal accepts (presumably at their default values for
# the pandas version this talk targets - confirm).
# NOTE(review): newer pandas exposes these helpers as `pandas.testing` and has
# dropped some of these keywords - confirm before running on a current install.
pd.util.testing.assert_series_equal(
    pd.Series(), pd.Series(), check_dtype=True, check_index_type='equiv', check_series_type=True,
    check_less_precise=False, check_names=True, check_exact=False, check_datetimelike_compat=False,
    check_categorical=True, obj='Series')

In [None]:
# Signature showcase: compares two empty DataFrames while spelling out the
# keyword arguments assert_frame_equal accepts (presumably at their default
# values for the pandas version this talk targets - confirm).
pd.util.testing.assert_frame_equal(
    pd.DataFrame(), pd.DataFrame(), check_dtype=True, check_index_type='equiv', check_column_type='equiv', 
    check_frame_type=True, check_less_precise=False, check_names=True, by_blocks=False, check_exact=False, 
    check_datetimelike_compat=False, check_categorical=True, check_like=False, obj='DataFrame')

In [None]:
%%nose -v

def test_assert_series_equal():
    # The Series differ in three ways: the first values are 1e-11 apart, both
    # contain a NaN, and the index names differ ('Wes' vs 'Jeff').
    # check_names=False switches off the name comparison; the value difference
    # and the NaNs are left to the helper's default handling.
    pd.util.testing.assert_series_equal(
        pd.Series([1.00000000001, np.nan], index=pd.Index([5, 6], name='Wes')),
        pd.Series([1.00000000002, np.nan], index=pd.Index([5, 6], name='Jeff')),
        check_names=False)

# Vectors inside complex data structures

In [None]:
import pandas as pd

# Two dicts with identical contents, each holding its own Series with a NaN.
ds1 = {'description': 'A dictionary containing a Series',
       'series': pd.Series([1.0, np.nan])}
ds2 = {'description': 'A dictionary containing a Series',
       'series': pd.Series([1.0, np.nan])}
# Plain dict equality has to compare the two Series with `==`; presumably this
# hits the same "vector is not a single bool" problem as arrays - confirm.
ds1 == ds2

## `pd.util.testing.assert_dict_equal`
- NumPy and Pandas objects in dicts compared sanely
- can't tweak comparison details

In [None]:
from pandas.util.testing import assert_dict_equal

In [None]:
%%nose -v

def test_assert_dict_equal_series_without_index():
    # Same data as the plain `ds1 == ds2` cell: Series built without an
    # explicit index, each containing a NaN.
    ds1 = {'description': 'A dictionary containing a Series',
           'series': pd.Series([1.0, np.nan])}
    ds2 = {'description': 'A dictionary containing a Series',
           'series': pd.Series([1.0, np.nan])}
    assert_dict_equal(ds1, ds2)

## `assert_dict_equal` broken with a custom index

In [None]:
# Identical dicts again, but the Series now carry an explicit index [10, 20].
# The slide title flags assert_dict_equal as broken for this case.
ds3 = {'description': 'A dictionary containing a Series with a custom index',
       'series': pd.Series([1.0, np.nan], index=[10, 20])}
ds4 = {'description': 'A dictionary containing a Series with a custom index',
       'series': pd.Series([1.0, np.nan], index=[10, 20])}
assert_dict_equal(ds3, ds4)

## NumPy/Pandas-aware equality assertion helper

- example on next slide supports
  - `dict` (but not subclasses)
  - `list`, `tuple` (but not subclasses)
  - `np.array`, `pd.Series` and `pd.DataFrame` (with default tolerances)

- design for testability – remember to implement `__eq__()` properly in your classes if they can contain vectors

- if needed, include a custom implementation for the test suite in each application
  - different data structures
  - different tolerance needs
  - additional data structures, e.g. `OrderedDict` or lists of floats
- if all else fails, compare dicts part by part

## NumPy/Pandas-aware equality assertion helper

In [None]:
import pandas as pd
import numpy as np
from numpy.testing import assert_allclose
from pandas.util.testing import assert_series_equal, assert_frame_equal

def assert_data_equal(a, b):
    """Assert that two (possibly nested) data structures are equal.

    Containers are walked recursively and vector types are delegated to the
    NumPy/Pandas assertion helpers with their default tolerances.

    Supported types:

    - ``dict`` and ``list``/``tuple`` (exact types only, not subclasses)
    - ``np.ndarray``: ``assert_allclose`` with NaNs considered equal
    - ``pd.Series``/``pd.DataFrame``: ``assert_series_equal``/
      ``assert_frame_equal``
    - float scalars (Python ``float`` and NumPy floating types): compared
      like arrays, so a float nested in a list or dict gets the same NaN and
      tolerance handling as one stored in an array
    - anything else: plain ``a == b``

    Raises:
        AssertionError: if the types, keys, lengths or values differ.
    """
    # Deliberately strict: e.g. a list never equals a tuple or an ndarray.
    assert type(a) is type(b), 'type mismatch: %s != %s' % (
        type(a).__name__, type(b).__name__)

    if type(a) is dict:
        assert set(a) == set(b), 'dict keys differ: %r' % (set(a) ^ set(b),)
        for key in a:
            assert_data_equal(a[key], b[key])

    elif type(a) in (list, tuple):
        # Check lengths first; zip() would silently drop the extra items.
        assert len(a) == len(b), 'length mismatch: %d != %d' % (len(a), len(b))
        for item_a, item_b in zip(a, b):
            assert_data_equal(item_a, item_b)

    elif isinstance(a, np.ndarray):
        assert_allclose(a, b, equal_nan=True)

    elif isinstance(a, pd.Series):
        assert_series_equal(a, b)

    elif isinstance(a, pd.DataFrame):
        assert_frame_equal(a, b)

    elif isinstance(a, (float, np.floating)):
        # A bare `a == b` is False for NaN vs NaN and for values that differ
        # only by rounding error, so use the array comparison for scalars too.
        assert_allclose(a, b, equal_nan=True)

    else:
        assert a == b, '%r != %r' % (a, b)

In [None]:
%%nose -v

def test_dict_equality():
    # The two dicts differ only by floating point error in the first Series
    # value (1.2 - 1.0 vs 2.2 - 2.0); both Series contain a NaN and share the
    # custom index [10, 20].
    assert_data_equal({'description': 'A dictionary containing a Series object',
                       'series': pd.Series([1.2 - 1.0, np.nan], index=[10, 20])},

                      {'description': 'A dictionary containing a Series object',
                       'series': pd.Series([2.2 - 2.0, np.nan], index=[10, 20])})

In [None]:
%%nose -v --expand-tracebacks

def test_dict_equality():
    # Same data as the previous test, except that the Series indexes now have
    # different names ('Wes' vs 'Jeff'). assert_data_equal hands Series to
    # assert_series_equal, so this presumably fails on the name check - confirm.
    assert_data_equal({'description': 'A dictionary containing a Series object',
                       'series': pd.Series([1.2 - 1.0, np.nan], index=pd.Index([10, 20], name='Wes'))},

                      {'description': 'A dictionary containing a Series object',
                       'series': pd.Series([2.2 - 2.0, np.nan], index=pd.Index([10, 20], name='Jeff'))})

# Assertions: Pandas & data structures recap
- index comparison: `"=="` usually ok
- `pandas.util.testing`:
  - `assert_series_equal()`
  - `assert_frame_equal()`
  - `assert_dict_equal()` (with index caveat)
- design your data structures for testing
- create your own helpers