## Unit Testing in Python


#### Function life cycle

   - A function is tested after the first implementation and then any time the function is modified, which happens mainly when new bugs are found, new features are implemented or the code is refactored.

<img src="assets/unit_testing/life_cycle.png" style="height: 280px;"/>


   - Module structure with unit tests
<img src="assets/unit_testing/module_structure.png" style="height: 200px;"/>


   - Unit testing libraries:
       - pytest
       - unittest
       - nosetests
       - doctests
   
   
   - pytest is one of the most popular
   
   
   - Correct IPython console command to run the tests: !pytest test_convert_to_int.py
   
   
   - An exception is raised when running the unit test. This could be an AssertionError raised by the assert statement or another exception, e.g. NameError, which is raised before the assert statement can run.
   
   - Unit tests:
       - A unit is a small, independent piece of code, such as a function or class
       - Benefits
           - Time savings, leading to faster development of new features.
           - Improved documentation, which will help new colleagues understand the code base better.
           - More user trust in the software product.
           - Better user experience due to reduced downtime.
   - Integration tests:
       - Check if multiple units work well together when they are connected, not just independently

In [4]:
# Import the pytest package
import pytest

def convert_to_int(string_with_comma):
    """Return the int spelled by a string that may contain commas.

    Every "," is dropped before the remaining text is passed to int(),
    so "2,081" becomes 2081.
    """
    text_without_commas = "".join(string_with_comma.split(","))
    return int(text_without_commas)

# Complete the unit test name by adding a prefix
def test_convert_to_int():
    """Check that convert_to_int() turns "2,081" into the int 2081."""
    returned = convert_to_int("2,081")
    assert returned == 2081


# Call the test directly; a failing assert raises AssertionError
test_convert_to_int()

#### Print a message if assert does not pass

In [9]:
def test_on_string_with_one_comma():
    """Compare convert_to_int("2,082") with 2081 and report the actual value on failure.

    NOTE(review): the argument is "2,082" while the expected value and the
    message refer to 2081 -- presumably deliberate, to demonstrate the
    failure message; confirm before changing it.
    """
    returned = convert_to_int("2,082")
    # The failure text embeds whatever convert_to_int() actually returned
    failure_text = "convert_to_int('2,081') should return the int 2081, but it actually returned {}".format(returned)
    # The text after the comma is shown when the assertion fails
    assert 2081 == returned, failure_text


test_on_string_with_one_comma()

AssertionError: convert_to_int('2,081') should return the int 2081, but it actually returned 2082

#### Testing exceptions

<img src="assets/unit_testing/testing_exceptions.png" style="height: 200px;"/>


In [10]:
import pytest

try:
    # pytest.raises(OSError) only intercepts OSError and its subclasses.
    # The ValueError raised below is a different type, so the context manager
    # does not swallow it and it propagates out of the with block.
    with pytest.raises(OSError):
        raise ValueError
except ValueError:
    # Catch only the exception this demo triggers; a bare except would also
    # hide unrelated problems such as a NameError or a KeyboardInterrupt.
    print("pytest raised an exception because no OSError was raised in the context.")

pytest raised an exception because no OSError was raised in the context.


In [11]:
# Record the ValueError raised inside the with block on exc_info
with pytest.raises(ValueError) as exc_info:
    raise ValueError("Silence me!")
    
# Check if the raised ValueError contains the correct message
assert exc_info.match("Silence me!")

In [None]:
import numpy as np
import pytest
from train import split_into_training_and_testing_sets

def test_on_one_row():
    """A one-row array must make split_into_training_and_testing_sets() raise ValueError."""
    single_row = np.array([[1382.0, 390167.0]])
    expected_error_msg = "Argument data_array must have at least 2 rows, it actually has just 1"

    # exc_info records the ValueError raised inside the with block
    with pytest.raises(ValueError) as exc_info:
        split_into_training_and_testing_sets(single_row)

    # The recorded exception has to carry the expected message
    assert exc_info.match(expected_error_msg)

#### Unit testing with boundary values

In [None]:
import pytest
from preprocessing_helpers import row_to_list

def test_on_no_tab_no_missing_value():    # (0, 0) boundary value
    # A row without any tab is expected to make row_to_list() return None
    returned = row_to_list("123\n")
    failure_text = "Expected: None, Actual: {0}".format(returned)
    assert returned is None, failure_text
    
def test_on_two_tabs_no_missing_value():    # (2, 0) boundary value
    # A row with two tabs is expected to make row_to_list() return None
    row = "123\t4,567\t89\n"
    returned = row_to_list(row)
    failure_text = "Expected: None, Actual: {0}".format(returned)
    assert returned is None, failure_text
    
def test_on_one_tab_with_missing_value():    # (1, 1) boundary value
    # One tab with nothing in front of it: row_to_list() is expected to return None
    row = "\t4,567\n"
    returned = row_to_list(row)
    failure_text = "Expected: None, Actual: {0}".format(returned)
    assert returned is None, failure_text

#### Unit testing with values triggering special logic

In [None]:
import pytest
from preprocessing_helpers import row_to_list

def test_on_no_tab_with_missing_value():    # (0, 1) case
    # A row holding only a newline is expected to make row_to_list() return None
    returned = row_to_list("\n")
    failure_text = "Expected: None, Actual: {0}".format(returned)
    assert returned is None, failure_text
    
def test_on_two_tabs_with_missing_value():    # (2, 1) case
    # Two tabs with nothing between them: row_to_list() is expected to return None
    row = "123\t\t89\n"
    returned = row_to_list(row)
    failure_text = "Expected: None, Actual: {0}".format(returned)
    assert returned is None, failure_text

#### Unit testing with normal arguments

In [None]:
import pytest
from preprocessing_helpers import row_to_list

def test_on_normal_argument_1():
    # A well-formed row is expected to come back as its two tab-separated entries
    wanted = ["123", "4,567"]
    returned = row_to_list("123\t4,567\n")
    failure_text = "Expected: {0}, Actual: {1}".format(wanted, returned)
    assert returned == wanted, failure_text
    
def test_on_normal_argument_2():
    # Second normal argument: both entries contain a thousands comma
    wanted = ["1,059", "186,606"]
    returned = row_to_list("1,059\t186,606\n")
    failure_text = "Expected: {0}, Actual: {1}".format(wanted, returned)
    assert returned == wanted, failure_text

#### Test driven development (TDD)
    
  - In TDD, you write the tests first and implement the function later
  - Write unit tests before implementation

<img src="assets/unit_testing/tdd.png" style="height: 400px;"/>

In TDD, you write the tests first and implement the function later.

Normal arguments for convert_to_int() are integer strings with commas as thousands separators. Since the best practice is to test a function with two or three normal arguments, here are three examples with no comma, one comma and two commas respectively.

In [5]:
def test_with_no_comma():
    # Normal argument without any comma
    returned = convert_to_int("756")
    failure_text = "Expected: 756, Actual: {0}".format(returned)
    assert returned == 756, failure_text
    
def test_with_one_comma():
    # Normal argument with a single thousands comma
    returned = convert_to_int("2,081")
    failure_text = "Expected: 2081, Actual: {0}".format(returned)
    assert returned == 2081, failure_text
    
def test_with_two_commas():
    # Normal argument with two thousands commas
    returned = convert_to_int("1,034,891")
    failure_text = "Expected: 1034891, Actual: {0}".format(returned)
    assert returned == 1034891, failure_text

# Call the three tests directly; a failing assert raises AssertionError
test_with_no_comma()
test_with_one_comma()
test_with_two_commas()

What should convert_to_int() do if the arguments are not normal? In particular, there are three special argument types:

Arguments that are missing a comma e.g. "178100,301".<br>
Arguments that have the comma in the wrong place e.g. "12,72,891".<br>
Float valued strings e.g. "23,816.92".<br>
Also, should convert_to_int() raise an exception for specific argument values?

In [11]:
# Give a name to the test for an argument with missing comma
def test_on_string_with_missing_comma():
    # "178100,301" lacks a thousands comma, so None is the expected result
    returned = convert_to_int("178100,301")
    failure_text = "Expected: None, Actual: {0}".format(returned)
    assert returned is None, failure_text


test_on_string_with_missing_comma()

In [12]:
def test_on_string_with_incorrectly_placed_comma():
    # "12,72,891" has a comma in the wrong place, so None is the expected result
    returned = convert_to_int("12,72,891")
    failure_text = "Expected: None, Actual: {0}".format(returned)
    assert returned is None, failure_text


test_on_string_with_incorrectly_placed_comma()

In [13]:
def test_on_float_valued_string():
    # "23,816.92" is a float valued string, so None is the expected result
    returned = convert_to_int("23,816.92")
    failure_text = "Expected: None, Actual: {0}".format(returned)
    assert returned is None, failure_text


test_on_float_valued_string()

Implement the function

convert_to_int() returns None for the following:

Arguments with missing thousands comma e.g. "178100,301". If you split the string at the comma using "178100,301".split(","), then the resulting list ["178100", "301"] will have at least one entry with length greater than 3 e.g. "178100".

Arguments with incorrectly placed comma e.g. "12,72,891". If you split this at the comma, then the resulting list is ["12", "72", "891"]. Note that the first entry is allowed to have any length between 1 and 3. But if any other entry has a length other than 3, like "72", then there's an incorrectly placed comma.

Float valued strings e.g. "23,816.92". If you remove the commas and call int() on this string i.e. int("23816.92"), you will get a ValueError.

Re-run the tests above

In [10]:
def convert_to_int(integer_string_with_commas):
    """Convert an integer string with thousands commas to an int.

    Args:
        integer_string_with_commas: Text such as "2,081" or "1,034,891".

    Returns:
        The parsed int, or None when a group between commas has the wrong
        length (for example "178100,301", "12,72,891" or ",123") or when the
        text without commas is rejected by int().
    """
    comma_separated_parts = integer_string_with_commas.split(",")
    first_part = comma_separated_parts[0]
    other_parts = comma_separated_parts[1:]
    # The leading group may hold 1 to 3 characters: a longer one is missing a
    # comma, an empty one means the string starts with a stray comma.
    if not 1 <= len(first_part) <= 3:
        return None
    # Every later group must hold exactly 3 characters, otherwise a comma is
    # missing or incorrectly placed.
    if any(len(part) != 3 for part in other_parts):
        return None
    try:
        return int("".join(comma_separated_parts))
    except ValueError:
        # int() rejects text that is not an integer, e.g. "1.5"
        return None

#### Code structure with unit tests

<img src="assets/unit_testing/code_structure.png" style="height: 400px;"/>

<img src="assets/unit_testing/test_classes.png" style="height: 400px;"/>




Test classes are containers inside test modules. They help separate tests for different functions within the test module, and serve as a structuring tool in the pytest framework.

Test classes are written in CamelCase e.g. TestMyFunction as opposed to tests, which are written using underscores e.g. test_something().

In [15]:
import pytest
import numpy as np

from models.train import split_into_training_and_testing_sets

# Declare the test class
class TestSplitIntoTrainingAndTestingSets(object):
    """Test class grouping the tests for split_into_training_and_testing_sets()."""

    # Test methods take self as their mandatory first argument
    def test_on_one_row(self):
        """A one-row array must raise ValueError carrying the expected message."""
        test_argument = np.array([[1382.0, 390167.0]])
        # exc_info records the ValueError raised inside the with block
        with pytest.raises(ValueError) as exc_info:
            split_into_training_and_testing_sets(test_argument)

        expected_error_msg = "Argument data_array must have at least 2 rows, it actually has just 1"

        assert exc_info.match(expected_error_msg)

#### Running the tests
<img src="assets/unit_testing/run_tests_folder.png" style="height: 400px;"/>

<img src="assets/unit_testing/run_tests.png" style="height: 320px;"/>

What is the correct command to run all the tests in this test class using node IDs?

In [None]:
!pytest models/test_train.py::TestSplitIntoTrainingAndTestingSets

#### Expected failures and conditional skipping

In [None]:
# Mark the whole test class as "expected to fail"
@pytest.mark.xfail
class TestModelTest(object):
    """Tests for model_test(); the decorator marks every test in the class as expected to fail."""

    def test_on_linear_data(self):
        # model_test() called with this array, 2.0 and 1.0 should give approximately 1.0
        data = np.array([[1.0, 3.0], [2.0, 5.0], [3.0, 7.0]])
        wanted = 1.0
        returned = model_test(data, 2.0, 1.0)
        failure_text = "model_test({0}) should return {1}, but it actually returned {2}".format(data, wanted, returned)
        assert returned == pytest.approx(wanted), failure_text

    def test_on_one_dimensional_array(self):
        # A one-dimensional array must make model_test() raise ValueError
        flat_data = np.array([1.0, 2.0, 3.0, 4.0])
        with pytest.raises(ValueError):
            model_test(flat_data, 1.0, 1.0)

In [None]:
# Mark the whole test class as "expected to fail"

# Add a reason for the expected failure
@pytest.mark.xfail(reason="Using TDD, model_test() has not yet been implemented")
class TestModelTest(object):
    """Tests for model_test(); the whole class is marked as expected to fail, with a reason."""

    def test_on_linear_data(self):
        """model_test() with this array, 2.0 and 1.0 should return approximately 1.0."""
        linear_data = np.array([[1.0, 3.0], [2.0, 5.0], [3.0, 7.0]])
        result = model_test(linear_data, 2.0, 1.0)
        target = 1.0
        text_on_failure = "model_test({0}) should return {1}, but it actually returned {2}".format(linear_data, target, result)
        assert result == pytest.approx(target), text_on_failure

    def test_on_one_dimensional_array(self):
        """A one-dimensional array must make model_test() raise ValueError."""
        one_dimensional = np.array([1.0, 2.0, 3.0, 4.0])
        with pytest.raises(ValueError):
            model_test(one_dimensional, 1.0, 1.0)

In [None]:
# Import the sys module
import sys

class TestGetDataAsNumpyArray(object):
    """Test class for get_data_as_numpy_array()."""

    # Only (major, minor) is compared. The full sys.version_info of a 2.7.x
    # interpreter, e.g. (2, 7, 18, 'final', 0), is a longer tuple and compares
    # greater than (2, 7), which would skip the test on Python 2.7 itself.
    @pytest.mark.skipif(sys.version_info[:2] > (2, 7), reason="Works only on Python 2.7 or lower")
    def test_on_clean_file(self):
        """Reading "example_clean_data.txt" with 2 columns must give the expected array."""
        expected = np.array([[2081.0, 314942.0],
                             [1059.0, 186606.0],
                             [1148.0, 206186.0],
                             ])
        actual = get_data_as_numpy_array("example_clean_data.txt", num_columns=2)
        message = "Expected return value: {0}, Actual return value: {1}".format(expected, actual)
        # pytest.approx compares the float arrays within a tolerance
        assert actual == pytest.approx(expected), message

Teardown: the step in which you clean up any modifications made to the environment and restore it to its initial state

Setup -> Assert -> Teardown

The setup and teardown are placed outside the test in a function called a fixture. A fixture is a function decorated with pytest.fixture. It uses the yield keyword instead of the return keyword: the code before yield is the setup and the code after yield is the teardown.

<img src="assets/unit_testing/fixture.png" style="height: 320px;"/>

Use a fixture for a clean data file

In [None]:
# Add a decorator to make this function a fixture
@pytest.fixture
def clean_data_file():
    """Fixture that writes a small tab-separated data file and yields its path.

    Setup creates "clean_data_file.txt" with three rows; teardown removes the
    file again after the test has used it.
    """
    # Imported here so the teardown does not rely on an os import made elsewhere
    import os

    file_path = "clean_data_file.txt"
    with open(file_path, "w") as f:
        f.write("201\t305671\n7892\t298140\n501\t738293\n")
    # Everything above is setup; the test receives file_path
    yield file_path
    # Teardown: delete the file created during setup
    os.remove(file_path)
    
# Pass the correct argument so that the test can use the fixture
def test_on_clean_file(clean_data_file):
    # clean_data_file is the path yielded by the fixture of the same name
    wanted = np.array([[201.0, 305671.0], [7892.0, 298140.0], [501.0, 738293.0]])
    returned = get_data_as_numpy_array(clean_data_file, 2)
    failure_text = "Expected: {0}, Actual: {1}".format(wanted, returned)
    assert returned == pytest.approx(wanted), failure_text

Write a fixture for an empty data file

In [None]:
@pytest.fixture
def empty_file():
    """Fixture that creates "empty.txt", yields its path and deletes it afterwards."""
    # Imported here so the teardown does not rely on an os import made elsewhere
    import os

    file_path = "empty.txt"
    # Opening in write mode creates (or truncates) the file; nothing is written
    with open(file_path, "w"):
        pass
    # Everything above is setup; the test receives file_path
    yield file_path
    # Teardown: remove the file created during setup
    os.remove(file_path)
    
def test_on_empty_file(empty_file):
    """get_data_as_numpy_array() on the empty file is expected to give an empty (0, 2) array.

    The only parameter is the empty_file fixture. This is a module-level test
    function, so a "self" parameter would be treated by pytest as the name of
    a fixture that does not exist.
    """
    expected = np.empty((0, 2))
    actual = get_data_as_numpy_array(empty_file, 2)
    assert actual == pytest.approx(expected), "Expected: {0}, Actual: {1}".format(expected, actual)

Fixture chaining using tmpdir

The built-in tmpdir fixture is very useful when dealing with files in setup and teardown. tmpdir combines seamlessly with user-defined fixtures via fixture chaining.

In [None]:
import pytest

@pytest.fixture
def empty_file(tmpdir):
    """Fixture chained with tmpdir: yields the path of an empty "empty.txt" inside it."""
    # Taking tmpdir as an argument is what chains this fixture with the built-in one
    empty_path = tmpdir.join("empty.txt")
    # Opening in write mode creates the file; nothing is written, so it stays empty
    with open(empty_path, "w"):
        pass
    yield empty_path

In what order will the setup and teardown of empty_file() and tmpdir be executed?

setup of tmpdir 
→
 setup of empty_file() 
→
 teardown of empty_file() 
→
 teardown of tmpdir