In [19]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

In [20]:
# Download the red-wine quality data (semicolon-separated CSV) into a DataFrame
csv_url = (
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
)

data = pd.read_csv(csv_url, sep=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## Simple inspection

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) per column, using pandas
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [22]:
# Build the Pandas Profiling report object for the wine data frame
profile = ProfileReport(data, title="Example of summarization of wine data")


In [23]:
# Generate the dataset profile
# This is a nice and simple way to document the data
#profile.to_notebook_iframe()

## Unit tests

### Basic examples - function tests

We will learn how unit tests work on a simple function. First, we will define a function `square`, which returns the square of a number. Then, we will test it by writing assertions (the expected correct answers) in a test function.

In [24]:
import pytest

# install the following to be able to run the tests in notebook
import ipytest
ipytest.autoconfig()

In [25]:
# A simple function: calculate square of a number
def square(x):
    """Return ``x`` multiplied by itself."""
    result = x * x
    return result

In [26]:
%%run_pytest[clean]

# Let's test the function
# Think about the limit cases
def test_square():
    """Check ``square`` on a positive, a zero and a negative input."""
    # (input, expected square)
    cases = ((2, 4), (0, 0), (-2, 4))
    for value, expected in cases:
        assert square(value) == expected


[32m.[0m[32m                                                                                            [100%][0m
[32m[32m[1m1 passed[0m[32m in 0.00s[0m[0m


%%run_pytest[clean] and %%run_pytest are deprecated in favor of %%ipytest. %%ipytest will clean tests, evaluate the cell and then run pytest. To disable cleaning, configure ipytest with ipytest.config(clean=False).


Now make the test fail on purpose (for example, by changing one of the expected values) to be sure you understand how it works.

### Basic examples - data tests

As we did for the function, we can also write assertions for the data. In the following example we will define a data frame on the fly and test for null values in it.

In [27]:
%%run_pytest[clean]

def test_column_is_null():
    df = pd.DataFrame(data = [(1, 0), (2, None)],
                      columns = ['a', 'b'])
    
    assert np.all(pd.notna(df))

[31mF[0m[31m                                                                                            [100%][0m
[31m[1m_______________________________________ test_column_is_null ________________________________________[0m

    [94mdef[39;49;00m [92mtest_column_is_null[39;49;00m():
        df = pd.DataFrame(data = [([94m1[39;49;00m, [94m0[39;49;00m), ([94m2[39;49;00m, [94mNone[39;49;00m)],
                          columns = [[33m'[39;49;00m[33ma[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mb[39;49;00m[33m'[39;49;00m])
    
>       [94massert[39;49;00m np.all(pd.notna(df))
[1m[31mE       assert False[0m
[1m[31mE        +  where False = <function all at 0x7f6890029ef0>(      a      b\n0  True   True\n1  True  False)[0m
[1m[31mE        +    where <function all at 0x7f6890029ef0> = np.all[0m
[1m[31mE        +    and         a      b\n0  True   True\n1  True  False = <function notna at 0x7f6858e0b320>(   a    b\n0  1  0.0\n1  2  NaN)[0m
[1m[

%%run_pytest[clean] and %%run_pytest are deprecated in favor of %%ipytest. %%ipytest will clean tests, evaluate the cell and then run pytest. To disable cleaning, configure ipytest with ipytest.config(clean=False).


## Test the wine data

Previously, we generated the data frame inside the test function. If we want to run multiple tests on the same df, we would rather pass it to each function as an argument (as usual in programming). To do that in testing, we need to define the data as **fixtures**. They look like ordinary function definitions, preceded by the decorator `@pytest.fixture`.

### Raw data tests

In [28]:
# Define fixtures
@pytest.fixture
def input_schema():
    """Expected value range and type for each column of the wine data.

    Returns a dict keyed by column name; every entry holds the 'min' and
    'max' bounds and the expected 'type' of that column.
    """
    # (column name, lower bound, upper bound, type)
    column_specs = (
        ('fixed acidity', 1.0, 17.0, float),
        ('volatile acidity', 0.0, 2.0, float),
        ('citric acid', 0.0, 2.0, float),
        ('residual sugar', 0.5, 17.0, float),
        ('chlorides', 0.0, 17.0, float),
        ('free sulfur dioxide', 0.0, 80.0, float),
        ('total sulfur dioxide', 0.0, 300.0, float),
        ('density', 0.8, 1.1, float),
        ('pH', 1.0, 10.0, float),
        ('sulphates', 0.0, 2.0, float),
        ('alcohol', 7.0, 17.0, float),
        ('quality', 1, 10, int),
    )
    return {
        name: {'min': lower, 'max': upper, 'type': kind}
        for name, lower, upper, kind in column_specs
    }


# Download the data
@pytest.fixture
def input_data():
    """Fetch the red-wine quality CSV and return it as a DataFrame."""
    return pd.read_csv(
        'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
        sep=';',
    )

Write the following tests:
- is the number of columns in the data frame the same as in schema definition?
- are the values within defined ranges?
- are the types of the columns correct?

In [29]:
%%run_pytest[clean]

def test_number_of_columns(input_data, input_schema):
    
    assert len(input_data.columns) == len(input_schema.keys())


def test_input_data_ranges(input_data, input_schema):
    
    for column in input_data.columns:
        min_val = input_data.loc[:, column].min()
        max_val = input_data.loc[:, column].max()

        min_schema = input_schema[column]['min']
        max_schema = input_schema[column]['max']
        
        #assert True
        #assert False
        assert min_val >= min_schema
        assert max_val <= max_schema
        
        
def test_input_types(input_data, input_schema):
    
    for column in input_data.columns:
        input_type = input_data.dtypes[column]
        schema_type = input_schema[column]['type']
        
        assert input_type == schema_type
    

%%run_pytest[clean] and %%run_pytest are deprecated in favor of %%ipytest. %%ipytest will clean tests, evaluate the cell and then run pytest. To disable cleaning, configure ipytest with ipytest.config(clean=False).


[32m.[0m[32m.[0m[32m.[0m[32m                                                                                          [100%][0m
[32m[32m[1m3 passed[0m[32m in 4.43s[0m[0m


### Feature engineering tests

**NOTE:** A data transformer should be fitted only on the train dataset. You fit the transformer on the train dataset and then apply it to the test dataset. Since we are only illustrating how unit testing works, we will do it on the whole dataset.

In [30]:
from sklearn.preprocessing import StandardScaler
from numpy import mean, std

In [31]:
# Standardise the 'alcohol' column with scikit-learn's StandardScaler

scaler = StandardScaler()
alcohol_column = data[['alcohol']]  # double brackets keep a 2-D frame
scaled = scaler.fit_transform(alcohol_column)
print(scaled)

[[-0.96024611]
 [-0.58477711]
 [-0.58477711]
 ...
 [ 0.54162988]
 [-0.20930812]
 [ 0.54162988]]


In [32]:
# Report the mean and standard deviation of the scaled values
for label, statistic in (('mean:', mean), ('std:', std)):
    print(label, statistic(scaled))

mean: 1.0664806540489309e-16
std: 1.0


In [33]:
@pytest.fixture
def scaled_alcohol():
    """Download the wine data and return its 'alcohol' column after StandardScaler."""
    wine = pd.read_csv(
        'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
        sep=';',
    )

    # Fit the scaler on the single column and transform it in one step
    return StandardScaler().fit_transform(wine[['alcohol']])

In [34]:
%%run_pytest[clean]
# Test: is mean around zero and std around one?

def test_scaled_mean_zero(scaled_alcohol):
    """Check that the scaled alcohol values have mean ~0 and std ~1.

    Parameters
    ----------
    scaled_alcohol : array-like
        Output of the ``scaled_alcohol`` fixture.

    Raises
    ------
    AssertionError
        If the mean is not approximately 0 or the standard deviation is not
        approximately 1.
    """
    mean_val = mean(scaled_alcohol)
    std_val = std(scaled_alcohol)

    # approx() wraps the *expected* value so the tolerance is derived from it,
    # not from the measured one. Around zero only the absolute tolerance is
    # meaningful, so it is spelled out (1e-12 is also pytest's default).
    assert mean_val == pytest.approx(0.0, abs=1e-12)
    assert std_val == pytest.approx(1.0)


%%run_pytest[clean] and %%run_pytest are deprecated in favor of %%ipytest. %%ipytest will clean tests, evaluate the cell and then run pytest. To disable cleaning, configure ipytest with ipytest.config(clean=False).


[32m.[0m[32m                                                                                            [100%][0m
[32m[32m[1m1 passed[0m[32m in 1.17s[0m[0m


## Additional exercises:

- implement and test MinMaxScaler
- test null on 'quality'
- repeat the tests on synthetic data

In [35]:
%%run_pytest[clean]

def test_number_of_columns(input_data, input_schema):
    
    print(len(input_data.columns))
    #assert len(input_data.columns) == len(input_schema)

%%run_pytest[clean] and %%run_pytest are deprecated in favor of %%ipytest. %%ipytest will clean tests, evaluate the cell and then run pytest. To disable cleaning, configure ipytest with ipytest.config(clean=False).


[32m.[0m[32m                                                                                            [100%][0m
[32m[32m[1m1 passed[0m[32m in 1.20s[0m[0m
