In [1]:
# Dependency: great-expectations. Uncomment and run once in a fresh environment.
# NOTE(review): the cells below rely on `FileDataContext.create` and
# `context.sources`, which presumably require a pre-1.0 release of the
# library — confirm and pin the version (e.g. `great-expectations==<version>`).
# %pip install -q great-expectations

In [2]:
# Set up a file-backed Great Expectations data context whose project root is
# the current working directory.

from great_expectations.data_context import FileDataContext

context = FileDataContext.create(
    project_root_dir='./',
)

In [4]:
# Register the pandas datasource. Datasource names must be unique within the
# data context, and the file-backed context keeps them between sessions, so
# `add_or_update_pandas` is used instead of `add_pandas`: the latter raises
# once the name already exists, which breaks a second "Restart & Run All".
datasource_name = 'Income Data'
datasource = context.sources.add_or_update_pandas(datasource_name)

# Attach the cleaned CSV file as a data asset of that datasource.
# NOTE(review): the preview printed by `validator.head()` shows
# `Unnamed: 0.1` / `Unnamed: 0` columns, which suggests the CSV was exported
# with its index more than once — consider re-exporting with `index=False`.
asset_name = 'Total Income Data'
path_to_data = 'P2M3_adhy_arya_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator uses to load the asset.
batch_request = asset.build_batch_request()

In [5]:
# Create the expectation suite, or update it if it already exists.
# NOTE(review): the suite name mentions "trip" while the data asset is income
# data — confirm the name is intentional before relying on it elsewhere.
expectation_suite_name = 'expectation-trip-dataset'
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name,
)

# Bind the batch request to the suite; every expectation run through this
# validator is evaluated against the batch.
validator = context.get_validator(
    expectation_suite_name=expectation_suite_name,
    batch_request=batch_request,
)

# Preview the first rows to confirm the batch loaded as expected.
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,age,working_class,final_weight,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,id
0,0,39,self emp not inc,327120,hs grad,9,married civ spouse,craft repair,husband,white,male,0,0,40,portugal,1
1,1,32,private,123253,assoc acdm,12,married civ spouse,craft repair,husband,white,male,0,0,42,united states,2
2,2,47,private,232628,hs grad,9,married civ spouse,craft repair,husband,black,male,0,0,40,united states,3
3,3,19,private,374262,12th,8,never married,handlers cleaners,own child,white,male,0,0,20,united states,4
4,4,46,self emp not inc,311231,hs grad,9,married civ spouse,farming fishing,husband,white,male,0,0,40,united states,5


In [6]:
# Expectation 1: every value in column `id` must be unique.

validator.expect_column_values_to_be_unique(column='id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 899,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# Expectation 2: values in column `age` must lie between 0 and 100.
# NOTE(review): no strict_min/strict_max is passed, so the bounds are
# presumably inclusive (an age of exactly 100 passes) — confirm this is intended.

validator.expect_column_values_to_be_between(
    column='age',
    min_value=0,
    max_value=100,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 899,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# Expectation 3: column `gender` may only contain one of these two values:
#   - male
#   - female

validator.expect_column_values_to_be_in_set(
    column='gender',
    value_set=['male', 'female'],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 899,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# Expectation 4: column `final_weight` must be stored with dtype int64.

validator.expect_column_values_to_be_in_type_list(
    column='final_weight',
    type_list=['int64'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# Expectation 5: column `educational_num` must not contain missing values.

validator.expect_column_values_to_not_be_null(column='educational_num')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 899,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [11]:
# Expectation 6: the standard deviation of column `final_weight` must fall
# between 0 and 1,000,000.

validator.expect_column_stdev_to_be_between(
    column='final_weight',
    min_value=0,
    max_value=1000000,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 104945.49434888133
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [12]:
# Expectation 7: values in column `id` are expected to increase from one row
# to the next.
validator.expect_column_values_to_be_increasing(column='id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 899,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [13]:
# Expectation 8: the distinct values of column `race` must contain all of:
#   - white
#   - black
#   - amer indian eskimo
#   - other
#   - asian pac islander

validator.expect_column_distinct_values_to_contain_set(
    column='race',
    value_set=[
        'white',
        'black',
        'amer indian eskimo',
        'other',
        'asian pac islander',
    ],
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      "amer indian eskimo",
      "asian pac islander",
      "black",
      "other",
      "white"
    ],
    "details": {
      "value_counts": [
        {
          "value": "amer indian eskimo",
          "count": 10
        },
        {
          "value": "asian pac islander",
          "count": 19
        },
        {
          "value": "black",
          "count": 90
        },
        {
          "value": "other",
          "count": 5
        },
        {
          "value": "white",
          "count": 775
        }
      ]
    }
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}