# Library

In [1]:
# Import Great Expectations and the file-backed data context class used below
import great_expectations
from great_expectations.data_context import FileDataContext

import warnings
# Suppress every Python warning so cell outputs stay short.
# NOTE(review): this also hides deprecation notices from Great Expectations.
warnings.filterwarnings("ignore")

# Instantiate Data Context

In [5]:
# Create a file-backed data context whose project root is the current working directory

context = FileDataContext.create(project_root_dir='./')

# Connect to Datasource

In [6]:
# Register a pandas datasource under this name (the name must be unique in the data context)
datasource_name = 'csv-data-day1'
datasource = context.sources.add_pandas(datasource_name)

# Register the cleaned CSV file as a data asset of that datasource.
# The path is written with a forward slash so it resolves on Windows, Linux and macOS;
# a backslash before 'P' would be an invalid string escape and a Windows-only separator.
asset_name = 'day1'
path_to_data = 'data/P2M3_betara_candra_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator cell consumes
batch_request = asset.build_batch_request()

# Create an Expectation Suite

In [7]:
# Register (or update) the expectation suite that collects the checks defined below
expectation_suite_name = 'expectation-cleaning-dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Obtain a validator that ties the batch request to the suite registered above
validator = context.get_validator(batch_request=batch_request, expectation_suite_name=expectation_suite_name)

# Preview the first rows of the batch through the validator
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,dimensions_height,dimensions_length,dimensions_width,engine_information_driveline,engine_information_engine_type,engine_information_hybrid,engine_information_number_of_forward_gears,engine_information_transmission,fuel_information_city_mpg,fuel_information_fuel_type,fuel_information_highway_mpg,identification_classification,identification_id,identification_make,identification_model_year,identification_year,engine_information_engine_statistics_horsepower,engine_information_engine_statistics_torque
0,0,140,143,202,All-wheel drive,Audi 3.2L 6 cylinder 250hp 236ft-lbs,True,6,6 Speed Automatic Select Shift,18,Gasoline,25,Automatic transmission,2009 Audi A3 3.2,Audi,2009 Audi A3,2009,250,236
1,1,140,143,202,Front-wheel drive,Audi 2.0L 4 cylinder 200 hp 207 ft-lbs Turbo,True,6,6 Speed Automatic Select Shift,22,Gasoline,28,Automatic transmission,2009 Audi A3 2.0 T AT,Audi,2009 Audi A3,2009,200,207
2,2,140,143,202,Front-wheel drive,Audi 2.0L 4 cylinder 200 hp 207 ft-lbs Turbo,True,6,6 Speed Manual,21,Gasoline,30,Manual transmission,2009 Audi A3 2.0 T,Audi,2009 Audi A3,2009,200,207
3,3,140,143,202,All-wheel drive,Audi 2.0L 4 cylinder 200 hp 207 ft-lbs Turbo,True,6,6 Speed Automatic Select Shift,21,Gasoline,28,Automatic transmission,2009 Audi A3 2.0 T Quattro,Audi,2009 Audi A3,2009,200,207
4,4,140,143,202,All-wheel drive,Audi 2.0L 4 cylinder 200 hp 207 ft-lbs Turbo,True,6,6 Speed Automatic Select Shift,21,Gasoline,28,Automatic transmission,2009 Audi A3 2.0 T Quattro,Audi,2009 Audi A3,2009,200,207


## Expectation Unique column

In [8]:
# Expectation 1: every value in column `id` must be unique
validator.expect_column_values_to_be_unique(column='id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 5076,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Expectation values to be between

In [9]:
# Expectation 2: values in `engine_information_number_of_forward_gears` must range from 4 to 8
validator.expect_column_values_to_be_between(
    column='engine_information_number_of_forward_gears',
    min_value=4,
    max_value=8,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 5076,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Expectation to be in set

In [10]:
# Expectation 3: every value in `engine_information_driveline` must be one of the listed drivelines
validator.expect_column_values_to_be_in_set(
    column='engine_information_driveline',
    value_set=['All-wheel drive', 'Front-wheel drive', 'Rear-wheel drive', 'Four-wheel drive'],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 5076,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Expectation to be in type list

In [26]:
# Expectation 4: column `dimensions_width` must have a dtype from the type list (only 'int64' here)
# NOTE(review): only `dimensions_width` is type-checked and floats are not accepted — confirm this is the intended column and type list
validator.expect_column_values_to_be_in_type_list(
    column='dimensions_width',
    type_list=['int64'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Expectation Exist

In [29]:
# Expectation 5: column `fuel_information_fuel_type` must exist in the table
validator.expect_column_to_exist(column='fuel_information_fuel_type')

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{
  "success": true,
  "result": {},
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Expectation ordered column list

In [31]:
# Expectation 6: the table's columns must match this list exactly, in this order
validator.expect_table_columns_to_match_ordered_list(
    column_list=[
        'id',
        'dimensions_height',
        'dimensions_length',
        'dimensions_width',
        'engine_information_driveline',
        'engine_information_engine_type',
        'engine_information_hybrid',
        'engine_information_number_of_forward_gears',
        'engine_information_transmission',
        'fuel_information_city_mpg',
        'fuel_information_fuel_type',
        'fuel_information_highway_mpg',
        'identification_classification',
        'identification_id',
        'identification_make',
        'identification_model_year',
        'identification_year',
        'engine_information_engine_statistics_horsepower',
        'engine_information_engine_statistics_torque',
    ],
)

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      "id",
      "dimensions_height",
      "dimensions_length",
      "dimensions_width",
      "engine_information_driveline",
      "engine_information_engine_type",
      "engine_information_hybrid",
      "engine_information_number_of_forward_gears",
      "engine_information_transmission",
      "fuel_information_city_mpg",
      "fuel_information_fuel_type",
      "fuel_information_highway_mpg",
      "identification_classification",
      "identification_id",
      "identification_make",
      "identification_model_year",
      "identification_year",
      "engine_information_engine_statistics_horsepower",
      "engine_information_engine_statistics_torque"
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Expectation Most common value

In [32]:
# Expectation 7: the most common value in `fuel_information_fuel_type` must be 'Gasoline'
validator.expect_column_most_common_value_to_be_in_set(
    column='fuel_information_fuel_type',
    value_set=['Gasoline'],
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      "Gasoline"
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [33]:
# Persist the expectations recorded on the validator into the expectation suite;
# discard_failed_expectations=False keeps expectations even if they did not pass

validator.save_expectation_suite(discard_failed_expectations=False)

# Checkpoints

In [34]:
# Register (or update) a checkpoint named 'checkpoint_1' that is bound to the validator

checkpoint_1 = context.add_or_update_checkpoint(name='checkpoint_1', validator=validator)

In [35]:
# Run the checkpoint and keep the result object for inspection

checkpoint_result = checkpoint_1.run()

Calculating Metrics:   0%|          | 0/25 [00:00<?, ?it/s]

# Data Docs

In [36]:
# Build the Data Docs site; the cell output is a mapping of site name to the index page location

context.build_data_docs()

{'local_site': 'file://c:\\Users\\betarac\\Documents\\GitHub\\Milestones\\M3\\airflow\\gx\\uncommitted/data_docs/local_site/index.html'}