<center>

# Great Expectations demo

### "Always know what to expect from your data."

### https://greatexpectations.io/
</center>

### Import modules.

In [44]:
import great_expectations as ge
import pandas as pd
import json
from uuid import uuid4

### Create a sample dataframe.

In [45]:
df = pd.DataFrame({"TIMESTAMP": ["26-12-2020 00:00:00",
                                 "27-01-1990 00:00:00",
                                 "28-12-2020 00:00:00",
                                 "29-12-2020",
                                 "30-12-2020 00:00:00"],
                   "ISO_COUNTRY": [pd.np.NaN,
                                   "US",
                                   "MX",
                                   "IT",
                                   "CA"],
                   "EMAIL": ["test1@ibm.com",
                             "test.4_invalid.email.gov",
                             "test.2@mx1.ibm.com",
                             "test-3@nsa.gov",
                             "test5@us.ibm.com"]})
uuid_series = df.index.to_series().map(lambda x: uuid4())
df.insert(loc=0, column="UNIQUE_ID", value=uuid_series)
print (df.to_markdown())

|    | UNIQUE_ID                            | TIMESTAMP           | ISO_COUNTRY   | EMAIL                    |
|---:|:-------------------------------------|:--------------------|:--------------|:-------------------------|
|  0 | 0e0b6cf2-b80b-410d-aa5b-7ad98b6a46e7 | 26-12-2020 00:00:00 | nan           | test1@ibm.com            |
|  1 | e7501c1f-70ae-4b11-93bc-b1af7099293e | 27-01-1990 00:00:00 | US            | test.4_invalid.email.gov |
|  2 | 76865200-73f6-4316-9b26-d10bcf4ea6ea | 28-12-2020 00:00:00 | MX            | test.2@mx1.ibm.com       |
|  3 | e7082b6d-f820-4fb8-96f4-0e1c537fcd38 | 29-12-2020          | IT            | test-3@nsa.gov           |
|  4 | 82e0941e-3492-4c0f-8611-b93caa3c17d2 | 30-12-2020 00:00:00 | CA            | test5@us.ibm.com         |


### Convert our dataframe to a GE dataframe.

In [46]:
# Wrap the sample frame in a Great Expectations PandasDataset; the expect_*
# calls in the cells below are all made on this object.
df_ge = ge.dataset.PandasDataset(df)

### Assert that there are no duplicate values in a column.

In [47]:
# Show the data, then check UNIQUE_ID for duplicate values.
print(df.to_markdown())
df_ge.expect_column_values_to_be_unique(
    "UNIQUE_ID",
    result_format={"result_format": "COMPLETE"},
)

|    | UNIQUE_ID                            | TIMESTAMP           | ISO_COUNTRY   | EMAIL                    |
|---:|:-------------------------------------|:--------------------|:--------------|:-------------------------|
|  0 | 0e0b6cf2-b80b-410d-aa5b-7ad98b6a46e7 | 26-12-2020 00:00:00 | nan           | test1@ibm.com            |
|  1 | e7501c1f-70ae-4b11-93bc-b1af7099293e | 27-01-1990 00:00:00 | US            | test.4_invalid.email.gov |
|  2 | 76865200-73f6-4316-9b26-d10bcf4ea6ea | 28-12-2020 00:00:00 | MX            | test.2@mx1.ibm.com       |
|  3 | e7082b6d-f820-4fb8-96f4-0e1c537fcd38 | 29-12-2020          | IT            | test-3@nsa.gov           |
|  4 | 82e0941e-3492-4c0f-8611-b93caa3c17d2 | 30-12-2020 00:00:00 | CA            | test5@us.ibm.com         |


{
  "meta": {},
  "exception_info": null,
  "result": {
    "element_count": 5,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": [],
    "partial_unexpected_index_list": [],
    "partial_unexpected_counts": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  },
  "success": true
}

### Assert that all the values in a column belong to a set.

In [48]:
# Show the data, then check every ISO_COUNTRY value against a fixed set
# of three country codes.
print(df.to_markdown())
df_ge.expect_column_values_to_be_in_set(
    "ISO_COUNTRY",
    ["MX", "JP", "IT"],
    result_format={"result_format": "COMPLETE"},
)

|    | UNIQUE_ID                            | TIMESTAMP           | ISO_COUNTRY   | EMAIL                    |
|---:|:-------------------------------------|:--------------------|:--------------|:-------------------------|
|  0 | 0e0b6cf2-b80b-410d-aa5b-7ad98b6a46e7 | 26-12-2020 00:00:00 | nan           | test1@ibm.com            |
|  1 | e7501c1f-70ae-4b11-93bc-b1af7099293e | 27-01-1990 00:00:00 | US            | test.4_invalid.email.gov |
|  2 | 76865200-73f6-4316-9b26-d10bcf4ea6ea | 28-12-2020 00:00:00 | MX            | test.2@mx1.ibm.com       |
|  3 | e7082b6d-f820-4fb8-96f4-0e1c537fcd38 | 29-12-2020          | IT            | test-3@nsa.gov           |
|  4 | 82e0941e-3492-4c0f-8611-b93caa3c17d2 | 30-12-2020 00:00:00 | CA            | test5@us.ibm.com         |


{
  "meta": {},
  "exception_info": null,
  "result": {
    "element_count": 5,
    "missing_count": 1,
    "missing_percent": 20.0,
    "unexpected_count": 2,
    "unexpected_percent": 40.0,
    "unexpected_percent_nonmissing": 50.0,
    "partial_unexpected_list": [
      "US",
      "CA"
    ],
    "partial_unexpected_index_list": [
      1,
      4
    ],
    "partial_unexpected_counts": [
      {
        "value": "CA",
        "count": 1
      },
      {
        "value": "US",
        "count": 1
      }
    ],
    "unexpected_list": [
      "US",
      "CA"
    ],
    "unexpected_index_list": [
      1,
      4
    ]
  },
  "success": false
}

### We can assert the row counts.

In [50]:
# Show the data, then check that the table's row count lies between 1 and 3.
print(df.to_markdown())
df_ge.expect_table_row_count_to_be_between(
    1,
    3,
    result_format={"result_format": "COMPLETE"},
)

|    | UNIQUE_ID                            | TIMESTAMP           | ISO_COUNTRY   | EMAIL                    |
|---:|:-------------------------------------|:--------------------|:--------------|:-------------------------|
|  0 | 0e0b6cf2-b80b-410d-aa5b-7ad98b6a46e7 | 26-12-2020 00:00:00 | nan           | test1@ibm.com            |
|  1 | e7501c1f-70ae-4b11-93bc-b1af7099293e | 27-01-1990 00:00:00 | US            | test.4_invalid.email.gov |
|  2 | 76865200-73f6-4316-9b26-d10bcf4ea6ea | 28-12-2020 00:00:00 | MX            | test.2@mx1.ibm.com       |
|  3 | e7082b6d-f820-4fb8-96f4-0e1c537fcd38 | 29-12-2020          | IT            | test-3@nsa.gov           |
|  4 | 82e0941e-3492-4c0f-8611-b93caa3c17d2 | 30-12-2020 00:00:00 | CA            | test5@us.ibm.com         |


{
  "meta": {},
  "exception_info": null,
  "result": {
    "observed_value": 5
  },
  "success": false
}

### We can assert that a specific column exists.

In [52]:
# Show the data, then check that a column named ISO_COUNTRY is present.
print(df.to_markdown())
df_ge.expect_column_to_exist(
    "ISO_COUNTRY",
    result_format={"result_format": "COMPLETE"},
)

|    | UNIQUE_ID                            | TIMESTAMP           | ISO_COUNTRY   | EMAIL                    |
|---:|:-------------------------------------|:--------------------|:--------------|:-------------------------|
|  0 | 0e0b6cf2-b80b-410d-aa5b-7ad98b6a46e7 | 26-12-2020 00:00:00 | nan           | test1@ibm.com            |
|  1 | e7501c1f-70ae-4b11-93bc-b1af7099293e | 27-01-1990 00:00:00 | US            | test.4_invalid.email.gov |
|  2 | 76865200-73f6-4316-9b26-d10bcf4ea6ea | 28-12-2020 00:00:00 | MX            | test.2@mx1.ibm.com       |
|  3 | e7082b6d-f820-4fb8-96f4-0e1c537fcd38 | 29-12-2020          | IT            | test-3@nsa.gov           |
|  4 | 82e0941e-3492-4c0f-8611-b93caa3c17d2 | 30-12-2020 00:00:00 | CA            | test5@us.ibm.com         |


{
  "meta": {},
  "exception_info": null,
  "result": {},
  "success": false
}

### We can assert that the column list matches an exact order.

In [55]:
# Show the data, then compare the column order against an expected list.
# The list ends with "testcolumn", which the sample frame does not contain.
print(df.to_markdown())
df_ge.expect_table_columns_to_match_ordered_list(
    ["UNIQUE_ID", "TIMESTAMP", "ISO_COUNTRY", "EMAIL", "testcolumn"],
    result_format={"result_format": "COMPLETE"},
)

|    | UNIQUE_ID                            | TIMESTAMP           | ISO_COUNTRY   | EMAIL                    |
|---:|:-------------------------------------|:--------------------|:--------------|:-------------------------|
|  0 | 0e0b6cf2-b80b-410d-aa5b-7ad98b6a46e7 | 26-12-2020 00:00:00 | nan           | test1@ibm.com            |
|  1 | e7501c1f-70ae-4b11-93bc-b1af7099293e | 27-01-1990 00:00:00 | US            | test.4_invalid.email.gov |
|  2 | 76865200-73f6-4316-9b26-d10bcf4ea6ea | 28-12-2020 00:00:00 | MX            | test.2@mx1.ibm.com       |
|  3 | e7082b6d-f820-4fb8-96f4-0e1c537fcd38 | 29-12-2020          | IT            | test-3@nsa.gov           |
|  4 | 82e0941e-3492-4c0f-8611-b93caa3c17d2 | 30-12-2020 00:00:00 | CA            | test5@us.ibm.com         |


{
  "meta": {},
  "exception_info": null,
  "result": {
    "observed_value": [
      "UNIQUE_ID",
      "TIMESTAMP",
      "ISO_COUNTRY",
      "EMAIL"
    ],
    "details": {
      "mismatched": [
        {
          "Expected Column Position": 4,
          "Expected": "testcolumn",
          "Found": null
        }
      ]
    }
  },
  "success": false
}

### We can assert that a column does not have any null values.

In [56]:
# Show the data, then check that the EMAIL column has no null entries.
print(df.to_markdown())
df_ge.expect_column_values_to_not_be_null(
    "EMAIL",
    result_format={"result_format": "COMPLETE"},
)

|    | UNIQUE_ID                            | TIMESTAMP           | ISO_COUNTRY   | EMAIL                    |
|---:|:-------------------------------------|:--------------------|:--------------|:-------------------------|
|  0 | 0e0b6cf2-b80b-410d-aa5b-7ad98b6a46e7 | 26-12-2020 00:00:00 | nan           | test1@ibm.com            |
|  1 | e7501c1f-70ae-4b11-93bc-b1af7099293e | 27-01-1990 00:00:00 | US            | test.4_invalid.email.gov |
|  2 | 76865200-73f6-4316-9b26-d10bcf4ea6ea | 28-12-2020 00:00:00 | MX            | test.2@mx1.ibm.com       |
|  3 | e7082b6d-f820-4fb8-96f4-0e1c537fcd38 | 29-12-2020          | IT            | test-3@nsa.gov           |
|  4 | 82e0941e-3492-4c0f-8611-b93caa3c17d2 | 30-12-2020 00:00:00 | CA            | test5@us.ibm.com         |


{
  "meta": {},
  "exception_info": null,
  "result": {
    "element_count": 5,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "unexpected_list": [],
    "unexpected_index_list": []
  },
  "success": true
}

### Assert that a column matches a regex (for example, email).

In [57]:
# Show the data, then check each EMAIL value against an e-mail-shaped
# pattern: local part, "@", then a domain containing at least one dot.
print(df.to_markdown())
df_ge.expect_column_values_to_match_regex(
    "EMAIL",
    r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)",
    result_format={"result_format": "COMPLETE"},
)

|    | UNIQUE_ID                            | TIMESTAMP           | ISO_COUNTRY   | EMAIL                    |
|---:|:-------------------------------------|:--------------------|:--------------|:-------------------------|
|  0 | 0e0b6cf2-b80b-410d-aa5b-7ad98b6a46e7 | 26-12-2020 00:00:00 | nan           | test1@ibm.com            |
|  1 | e7501c1f-70ae-4b11-93bc-b1af7099293e | 27-01-1990 00:00:00 | US            | test.4_invalid.email.gov |
|  2 | 76865200-73f6-4316-9b26-d10bcf4ea6ea | 28-12-2020 00:00:00 | MX            | test.2@mx1.ibm.com       |
|  3 | e7082b6d-f820-4fb8-96f4-0e1c537fcd38 | 29-12-2020          | IT            | test-3@nsa.gov           |
|  4 | 82e0941e-3492-4c0f-8611-b93caa3c17d2 | 30-12-2020 00:00:00 | CA            | test5@us.ibm.com         |


{
  "meta": {},
  "exception_info": null,
  "result": {
    "element_count": 5,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 1,
    "unexpected_percent": 20.0,
    "unexpected_percent_nonmissing": 20.0,
    "partial_unexpected_list": [
      "test.4_invalid.email.gov"
    ],
    "partial_unexpected_index_list": [
      1
    ],
    "partial_unexpected_counts": [
      {
        "value": "test.4_invalid.email.gov",
        "count": 1
      }
    ],
    "unexpected_list": [
      "test.4_invalid.email.gov"
    ],
    "unexpected_index_list": [
      1
    ]
  },
  "success": false
}

### Assert that a column matches a strftime format.

In [59]:
# Show the data, then check TIMESTAMP values against the date-only format
# "%d-%m-%Y". The result is kept in `test` because the next cell reads
# test.result["unexpected_index_list"].
print(df.to_markdown())
test = df_ge.expect_column_values_to_match_strftime_format(
    "TIMESTAMP",
    "%d-%m-%Y",
    result_format={"result_format": "COMPLETE"},
)
print(test)

|    | UNIQUE_ID                            | TIMESTAMP           | ISO_COUNTRY   | EMAIL                    |
|---:|:-------------------------------------|:--------------------|:--------------|:-------------------------|
|  0 | 0e0b6cf2-b80b-410d-aa5b-7ad98b6a46e7 | 26-12-2020 00:00:00 | nan           | test1@ibm.com            |
|  1 | e7501c1f-70ae-4b11-93bc-b1af7099293e | 27-01-1990 00:00:00 | US            | test.4_invalid.email.gov |
|  2 | 76865200-73f6-4316-9b26-d10bcf4ea6ea | 28-12-2020 00:00:00 | MX            | test.2@mx1.ibm.com       |
|  3 | e7082b6d-f820-4fb8-96f4-0e1c537fcd38 | 29-12-2020          | IT            | test-3@nsa.gov           |
|  4 | 82e0941e-3492-4c0f-8611-b93caa3c17d2 | 30-12-2020 00:00:00 | CA            | test5@us.ibm.com         |
{
  "expectation_config": {
    "kwargs": {
      "column": "TIMESTAMP",
      "strftime_format": "%d-%m-%Y",
      "result_format": {
        "result_format": "COMPLETE"
      }
    },
    "expectation_type": "expect_colum

### Drop the "unexpected" values from the dataframe.

In [60]:
# Remove the rows whose index labels the previous expectation reported as
# unexpected. The drop mutates `df` in place, so re-running this cell would
# otherwise raise KeyError (the labels are already gone); errors="ignore"
# makes the cell safe to execute more than once.
df.drop(index=test.result["unexpected_index_list"], inplace=True, errors="ignore")
print(df.to_markdown())

|    | UNIQUE_ID                            | TIMESTAMP   | ISO_COUNTRY   | EMAIL          |
|---:|:-------------------------------------|:------------|:--------------|:---------------|
|  3 | e7082b6d-f820-4fb8-96f4-0e1c537fcd38 | 29-12-2020  | IT            | test-3@nsa.gov |


### Assert that the values are in a specific range (even works with dates!).

In [21]:
# Show the data, then check that TIMESTAMP values fall between the given
# bounds. parse_strings_as_datetimes=True is passed so the string values
# are compared as datetimes rather than as plain text.
print(df.to_markdown())
test = df_ge.expect_column_values_to_be_between(
    "TIMESTAMP",
    parse_strings_as_datetimes=True,
    min_value="01-01-2020 00:00:00",
    max_value="30-12-2020 00:00:00",
    result_format={"result_format": "COMPLETE"},
)
print(test)

|    | UNIQUE_ID                            | TIMESTAMP           | ISO_COUNTRY   | EMAIL                    |
|---:|:-------------------------------------|:--------------------|:--------------|:-------------------------|
|  0 | 1af2ecff-4cc7-4522-80b0-d7f65d213a49 | 26-12-2020 00:00:00 | nan           | test1@ibm.com            |
|  1 | 90f62a96-95b3-4647-a790-8ac5cb40029c | 27-01-1990 00:00:00 | US            | test.4_invalid.email.gov |
|  2 | 4f47b60e-7ac1-46c3-bceb-f840bb449630 | 28-12-2020 00:00:00 | MX            | test.2@mx1.ibm.com       |
|  4 | 574389ea-375c-4e3d-ab3d-43d27c941fd7 | 30-12-2020 00:00:00 | CA            | test5@us.ibm.com         |
{
  "expectation_config": {
    "kwargs": {
      "column": "TIMESTAMP",
      "parse_strings_as_datetimes": true,
      "min_value": "01-01-2020 00:00:00",
      "max_value": "30-12-2020 00:00:00",
      "result_format": {
        "result_format": "COMPLETE"
      }
    },
    "expectation_type": "expect_column_values_to_be_between

### Since expectations rarely change, we can save them to a configuration file and re-use them.

In [33]:
# Write df_ge's expectation suite to a JSON file; the validate() cell further
# down loads it from this same path.
# NOTE(review): this is an absolute, user-specific path. Consider a relative
# or configurable location, changing it together with the validate() cell.
df_ge.save_expectation_suite('/home/alan/saved_config.json')

### Create a new dataset.

In [61]:
df2 = pd.DataFrame({"TIMESTAMP": ["26-12-2020 00:00:00",
                                  "27-01-2020 00:00:00",
                                  "28-12-2020 00:00:00",
                                  "29-12-2020",
                                  "30-12-2020 00:00:00"],
                    "ISO_COUNTRY": [pd.np.NaN,
                                    "CN",
                                    "US",
                                    "JP",
                                    "UK"],
                    "EMAIL": ["test1@ibm.com",
                              "test.2@valid.email.gov",
                              "test.3@mx1.ibm.com",
                              "test-4@nsa.gov",
                              "test5@us.ibm.com"]})
uuid_series = df2.index.to_series().map(lambda x: uuid4())
df2.insert(loc=0, column="UNIQUE_ID", value=uuid_series)
print (df2.to_markdown())

|    | UNIQUE_ID                            | TIMESTAMP           | ISO_COUNTRY   | EMAIL                  |
|---:|:-------------------------------------|:--------------------|:--------------|:-----------------------|
|  0 | aeb7cd90-9e5f-4e7f-990f-138928a3dc6d | 26-12-2020 00:00:00 | nan           | test1@ibm.com          |
|  1 | e52e8030-5f26-4bec-941f-72db13d08c7f | 27-01-2020 00:00:00 | CN            | test.2@valid.email.gov |
|  2 | 4273f4b6-46da-432e-af57-009988a3838f | 28-12-2020 00:00:00 | US            | test.3@mx1.ibm.com     |
|  3 | 60c34e1a-b208-4398-80ed-f9e9ec68f3a6 | 29-12-2020          | JP            | test-4@nsa.gov         |
|  4 | 95b91829-7ba1-4f0e-84fa-785f098dad65 | 30-12-2020 00:00:00 | UK            | test5@us.ibm.com       |


### Using expectations files, we can easily validate any dataset against them.

In [62]:
# Show the second frame, wrap it as a PandasDataset, and validate it
# against the expectation suite stored in the JSON file written earlier.
print(df2.to_markdown())
df_ge2 = ge.dataset.PandasDataset(df2)
validation_results = df_ge2.validate(
    expectation_suite="/home/alan/saved_config.json",
    result_format={"result_format": "COMPLETE"},
)
print(validation_results)

|    | UNIQUE_ID                            | TIMESTAMP           | ISO_COUNTRY   | EMAIL                  |
|---:|:-------------------------------------|:--------------------|:--------------|:-----------------------|
|  0 | aeb7cd90-9e5f-4e7f-990f-138928a3dc6d | 26-12-2020 00:00:00 | nan           | test1@ibm.com          |
|  1 | e52e8030-5f26-4bec-941f-72db13d08c7f | 27-01-2020 00:00:00 | CN            | test.2@valid.email.gov |
|  2 | 4273f4b6-46da-432e-af57-009988a3838f | 28-12-2020 00:00:00 | US            | test.3@mx1.ibm.com     |
|  3 | 60c34e1a-b208-4398-80ed-f9e9ec68f3a6 | 29-12-2020          | JP            | test-4@nsa.gov         |
|  4 | 95b91829-7ba1-4f0e-84fa-785f098dad65 | 30-12-2020 00:00:00 | UK            | test5@us.ibm.com       |
{
  "results": [
    {
      "expectation_config": {
        "kwargs": {
          "column": "UNIQUE_ID",
          "result_format": {
            "result_format": "COMPLETE"
          }
        },
        "expectation_type": "expect_col