## Aula 9 - Validação de Esquema com Pandera

In [2]:
!pip install pandera

Collecting pandera
  Downloading pandera-0.8.0-py3-none-any.whl (186 kB)
[?25l[K     |█▊                              | 10 kB 24.1 MB/s eta 0:00:01[K     |███▌                            | 20 kB 27.2 MB/s eta 0:00:01[K     |█████▎                          | 30 kB 11.9 MB/s eta 0:00:01[K     |███████                         | 40 kB 8.9 MB/s eta 0:00:01[K     |████████▉                       | 51 kB 5.1 MB/s eta 0:00:01[K     |██████████▋                     | 61 kB 5.6 MB/s eta 0:00:01[K     |████████████▎                   | 71 kB 5.4 MB/s eta 0:00:01[K     |██████████████                  | 81 kB 6.1 MB/s eta 0:00:01[K     |███████████████▉                | 92 kB 4.6 MB/s eta 0:00:01[K     |█████████████████▋              | 102 kB 5.0 MB/s eta 0:00:01[K     |███████████████████▍            | 112 kB 5.0 MB/s eta 0:00:01[K     |█████████████████████▏          | 122 kB 5.0 MB/s eta 0:00:01[K     |██████████████████████▉         | 133 kB 5.0 MB/s eta 0:00:01[K

In [1]:
import numpy as np
import pandas as pd
import pandera as pa

# Seed NumPy's global random state so the synthetic data below is reproducible.
np.random.seed(100)

# Labels the two categorical columns are sampled from.
categories = ["A", "B", "C"]

# Draw each column separately, in the same order as the columns of the frame:
# every draw advances the seeded global state, so the order matters.
n_rows = 100
cat_var_1_values = np.random.choice(categories, size=n_rows)
cat_var_2_values = np.random.choice(categories, size=n_rows)
num_var_1_values = np.random.uniform(0, 10, size=n_rows)
num_var_2_values = np.random.uniform(20, 30, size=n_rows)

dataframe = pd.DataFrame(
    {
        "cat_var_1": cat_var_1_values,
        "cat_var_2": cat_var_2_values,
        "num_var_1": num_var_1_values,
        "num_var_2": num_var_2_values,
    }
)
dataframe



Unnamed: 0,cat_var_1,cat_var_2,num_var_1,num_var_2
0,A,A,6.804147,24.743304
1,A,C,3.684308,22.774633
2,A,C,5.911288,28.416588
3,C,A,4.790627,21.951250
4,C,B,4.504166,28.563142
...,...,...,...,...
95,C,B,8.918212,20.537053
96,A,B,9.925908,26.377857
97,C,C,2.802506,20.730084
98,C,A,4.465055,29.914306


In [3]:
# Rule applied to every column whose name matches the regex "num_var_.+":
# float dtype, and every value must be >= 0.
numeric_rule = pa.Column(
    float,
    checks=pa.Check.greater_than_or_equal_to(0),
    regex=True,
)

# Rule applied to every column whose name matches the regex "cat_var_.+":
# values are coerced to the categorical dtype and must be in `categories`.
categorical_rule = pa.Column(
    pa.Category,
    checks=pa.Check.isin(categories),
    coerce=True,
    regex=True,
)

schema = pa.DataFrameSchema({
    "num_var_.+": numeric_rule,
    "cat_var_.+": categorical_rule,
})

# Validate the synthetic frame and display the validated result.
validated_df = schema.validate(dataframe)
validated_df

Unnamed: 0,cat_var_1,cat_var_2,num_var_1,num_var_2
0,A,A,6.804147,24.743304
1,A,C,3.684308,22.774633
2,A,C,5.911288,28.416588
3,C,A,4.790627,21.951250
4,C,B,4.504166,28.563142
...,...,...,...,...
95,C,B,8.918212,20.537053
96,A,B,9.925908,26.377857
97,C,C,2.802506,20.730084
98,C,A,4.465055,29.914306


In [6]:
# Two identical records: every column, including the ("a", "c") pair,
# repeats across the two rows.
duplicate_records = [
    {"a": 1, "b": 2, "c": 3},
    {"a": 1, "b": 2, "c": 3},
]
df = pd.DataFrame.from_records(duplicate_records)

# Each listed column gets its own integer Column; ["a", "c"] is passed as the
# schema's `unique` constraint.
int_column_names = ["a", "b", "c"]
schema = pa.DataFrameSchema(
    columns={label: pa.Column(int) for label in int_column_names},
    unique=["a", "c"],
)

df

Unnamed: 0,a,b,c
0,1,2,3
1,1,2,3


In [7]:
# Both rows of `df` carry the same ("a", "c") values, so validating against the
# schema's `unique=["a", "c"]` constraint is expected to fail. Catch the error
# and print it so the failure is shown without aborting a "Run All".
try:
    schema.validate(df)
except pa.errors.SchemaError as exc:
    print(f"SchemaError: {exc}")

SchemaError: ignored