In [None]:
!pip install wandb

In [None]:
!pip install pytest pytest-sugar

In [3]:
import wandb

In [4]:
# Log in to Weights & Biases. This step is interactive: the CLI prompts for an
# API key and stores it in the netrc file. `--relogin` asks for the key again
# even when credentials are already stored.
!wandb login --relogin

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
%%file test_data.py
import pytest
import wandb
import pandas as pd

# The run is created at module scope (i.e. when pytest imports this file) so
# that every test below shares it and all checks are collected under the same
# W&B run.
run = wandb.init(project="decision_tree", job_type="data_checks")

@pytest.fixture(scope="session")
def data():
    """
    Session-scoped fixture that provides the dataset under test.

    Resolves the "decision_tree/preprocessed_data.csv:latest" artifact through
    the module-level W&B run and loads the resulting file with pandas.

    Returns:
        The DataFrame produced by ``pd.read_csv`` on the artifact file.
    """
    artifact = run.use_artifact("decision_tree/preprocessed_data.csv:latest")
    return pd.read_csv(artifact.file())

def test_data_length(data):
    """
    We test that we have enough data to continue
    """
    assert len(data) > 1000


def test_number_of_columns(data):
    """
    We test that we have enough data to continue
    """
    assert data.shape[1] == 17

def test_column_presence_and_type(data):

    required_columns = {
        "age": pd.api.types.is_int64_dtype,
        "job": pd.api.types.is_object_dtype,
        "marital": pd.api.types.is_object_dtype,
        "education": pd.api.types.is_object_dtype,
        "default": pd.api.types.is_object_dtype,
        "balance": pd.api.types.is_int64_dtype,
        "housing": pd.api.types.is_object_dtype,
        "loan": pd.api.types.is_object_dtype,
        "contact": pd.api.types.is_object_dtype,
        "day": pd.api.types.is_int64_dtype,
        "month": pd.api.types.is_object_dtype,
        "duration": pd.api.types.is_int64_dtype,
        "campaign": pd.api.types.is_int64_dtype,
        "pdays": pd.api.types.is_int64_dtype,
        "previous": pd.api.types.is_int64_dtype,
        "poutcome": pd.api.types.is_object_dtype,
        "y": pd.api.types.is_object_dtype
    }

    # Check column presence
    assert set(data.columns.values).issuperset(set(required_columns.keys()))

    for col_name, format_verification_funct in required_columns.items():

        assert format_verification_funct(data[col_name]), f"Column {col_name} failed test {format_verification_funct}"

def test_column_ranges(data):

    ranges = {
        "age": (18, 95),
        "balance": (-8019, 102127),
        "day": (1, 31),
        "duration": (0, 4918),
        "campaign": (1, 63),
        "pdays": (-1, 871),
        "previous": (0, 275)
    }

    for col_name, (minimum, maximum) in ranges.items():

        assert data[col_name].dropna().between(minimum, maximum).all(), (
            f"Column {col_name} failed the test. Should be between {minimum} and {maximum}, "
            f"instead min={data[col_name].min()} and max={data[col_name].max()}"
        )

Writing test_data.py


In [6]:
# Run the tests pytest discovers under the current directory (which includes
# the test_data.py written above); -vv requests verbose, per-test output.
!pytest . -vv

[1mTest session starts (platform: linux, Python 3.7.13, pytest 3.6.4, pytest-sugar 0.9.4)[0m
cachedir: .pytest_cache
rootdir: /content, inifile:
plugins: typeguard-2.7.1, sugar-0.9.4

 [36mtest_data.py[0m::test_data_length[0m [32m✓[0m                                 [32m25% [0m[40m[32m█[0m[40m[32m█▌       [0m
 [36mtest_data.py[0m::test_number_of_columns[0m [32m✓[0m                           [32m50% [0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m██     [0m
 [36mtest_data.py[0m::test_column_presence_and_type[0m [32m✓[0m                    [32m75% [0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m██[0m[40m[32m█[0m[40m[32m█▌  [0m
 [36mtest_data.py[0m::test_column_ranges[0m [32m✓[0m                              [32m100% [0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m██[0m[40m[32m█[0m[40m[32m█[0m[40m[32m█[0m[40m[32m██[0m

Results (6.98s):
[32m       4 passed[0m
