# Create the expectations and test data

We will use expectations that are already implemented and shipped with Great Expectations:
* expect_table_columns_to_match_set
* expect_column_values_to_not_be_null
* expect_column_values_to_be_unique
* expect_column_values_to_be_in_set

There are 298 expectations already available for us to use: https://greatexpectations.io/expectations/. If they don't fit your needs, you can create a custom one: https://docs.greatexpectations.io/docs/guides/expectations/creating_custom_expectations/overview/

In [1]:
import great_expectations as ge
import pandas as pd
import json
import pipeline

In [2]:
# Paths handed to the project pipeline: the CSV it reads and the CSV path
# passed as its output location.
input_path = './titanic_data.csv'
output_path = './titanic_data_processed.csv'
# Run the pipeline and keep its return value; it is wrapped with
# ge.from_pandas in the next cell.
# NOTE(review): the third positional argument (True) is not named here —
# confirm its meaning against pipeline.run_pipeline and consider passing it
# by keyword.
df = pipeline.run_pipeline(input_path, output_path,True)


In [3]:
# Wrap the pipeline output in a Great Expectations dataset so that the
# expect_* methods used in the following cells are available on it.
gdf = ge.from_pandas(df)
# Preview the first rows (shown as the cell output).
gdf.head()

Unnamed: 0,passengerid,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked
0,1,1,1,female,29,0,0,211.3375,B5,S
1,2,1,1,male,0,1,2,151.55,C22 C26,S
2,3,1,0,female,2,1,2,151.55,C22 C26,S
3,4,1,0,male,30,1,2,151.55,C22 C26,S
4,5,1,0,female,25,1,2,151.55,C22 C26,S


In [4]:
# Column names the processed dataset is expected to expose.
expected_columns = [
    'passengerid',
    'pclass',
    'survived',
    'sex',
    'age',
    'sibsp',
    'parch',
    'fare',
    'cabin',
    'embarked',
]
# Compare the dataset's columns against the expected names; the comparison is
# set-based, so column order is not checked.
gdf.expect_table_columns_to_match_set(column_set=expected_columns)

{
  "result": {
    "observed_value": [
      "passengerid",
      "pclass",
      "survived",
      "sex",
      "age",
      "sibsp",
      "parch",
      "fare",
      "cabin",
      "embarked"
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [5]:
# Expect every row to have a non-null value in the 'age' column.
gdf.expect_column_values_to_not_be_null(column = 'age')

{
  "result": {
    "element_count": 1309,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [6]:
# Expect 'passengerid' to contain no duplicate values.
gdf.expect_column_values_to_be_unique(column = 'passengerid')

{
  "result": {
    "element_count": 1309,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [7]:
# Expect 'sex' to take only the two listed values.
gdf.expect_column_values_to_be_in_set(column = 'sex', value_set=['male', 'female'])

{
  "result": {
    "element_count": 1309,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

# Create Expectations Suite

In [8]:
# Collect the expectations evaluated on gdf in the cells above into a suite.
# discard_failed_expectations=False keeps every expectation in the suite,
# including any that did not pass on this data.
expectation_suite = gdf.get_expectation_suite(discard_failed_expectations=False)

We then save the expectation suite to a JSON file.

In [9]:
# Persist the suite as pretty-printed JSON with sorted keys.
with open("./my_expectation_file.json", "w") as my_file:
    # Convert the suite to a JSON-serialisable dict, render it as text, then
    # write the text in a single call.
    suite_as_dict = expectation_suite.to_json_dict()
    serialized_suite = json.dumps(suite_as_dict, sort_keys=True, indent=4)
    my_file.write(serialized_suite)