# A. Instantiate Data Context and Connect Datasource

In [19]:
# Create the Great Expectations data context (file-backed project)
from great_expectations.data_context import FileDataContext
context = FileDataContext.create(project_root_dir='./') # './' roots the project in the current working directory, so the validation configuration and results are kept there

In [20]:
# Register the pandas Datasource. Datasource names must be unique within the
# data context, and the file-backed context keeps them on disk between runs.
# `add_or_update_pandas` is therefore used instead of `add_pandas`, so that
# re-running this notebook against an existing project does not fail on the
# already-registered name.
datasource_name = 'milestone3-csv'
datasource = context.sources.add_or_update_pandas(datasource_name)

# Register the CSV file as a data asset of that Datasource.
# NOTE(review): 'aribnb' looks like a typo for 'airbnb'; it is left unchanged
# here because the asset name is persisted in the project configuration.
asset_name = 'aribnb-dataset'
path_to_data = 'airbnb.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request used to fetch this asset's data for validation
batch_request = asset.build_batch_request()

In [21]:
# Create (or overwrite) the expectation suite that will collect the expectations
expectation_suite_name = 'expectation-airbnb-dataset'
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

# Bind the batch request to that suite through a validator
validator = context.get_validator(batch_request=batch_request, expectation_suite_name=expectation_suite_name)

# Preview the first rows to confirm the validator can read the data
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,latitude,longitude,country,...,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,house_rules,license,unique_listing_key
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,...,10,9,2021-10-19,0.21,4,6,286,Clean up and treat the home the way you'd like...,Unknown,80014485718_40.64749_-73.97237_Clean & quiet a...
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,...,30,45,2022-05-21,0.38,4,2,228,Pet friendly but please confirm with me if the...,Unknown,52335172823_40.75362_-73.98377_Skylit Midtown ...
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,Unknown,Elise,Manhattan,Harlem,40.80902,-73.9419,United States,...,3,0,2019-06-23,0.0,5,1,352,"I encourage you to use my kitchen, cooking and...",Unknown,78829239556_40.80902_-73.9419_THE VILLAGE OF H...
3,1002755,Unknown,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,...,30,270,2019-07-05,4.64,4,1,322,Unknown,Unknown,85098326012_40.68514_-73.95976_Unknown
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,United States,...,10,9,2018-11-19,0.1,3,1,289,"Please no smoking in the house, porch or on th...",Unknown,92037596077_40.79851_-73.94399_Entire Apt: Spa...


# B. Create an Expectation Suite

## a. expectations

In [22]:
# Expectation 1: every value in column 'unique_listing_key' must be unique

validator.expect_column_values_to_be_unique(column='unique_listing_key')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 102058,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [23]:
# Expectation 2: values in column 'price' must lie between 0 and 2000 (USD)

validator.expect_column_values_to_be_between(
    column='price',
    min_value=0,
    max_value=2000,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 102058,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [24]:
# Expectation 3: column 'review_rate_number' may only hold one of these ratings:
#   5.0 = Very good
#   4.0 = Good
#   3.0 = Needs attention
#   2.0 = Poor
#   1.0 = Very poor

validator.expect_column_values_to_be_in_set(
    column='review_rate_number',
    value_set=[5.0, 4.0, 3.0, 2.0, 1.0],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 102058,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [25]:
# Expectation 4: column 'service_fee' must be stored as an integer or a float.
# The type names are resolved as NumPy/pandas dtype names for a pandas-backed
# validator. 'integer' is not such a name, so it could never match an integer
# column. The explicit 64-bit names are listed as well because the plain 'int'
# alias can resolve to a 32-bit dtype on some platforms.

validator.expect_column_values_to_be_in_type_list('service_fee', ['int', 'int64', 'float', 'float64'])

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [26]:
# Expectation 5: the distinct values of column 'host_identity_verified' must be
# 'verified', 'unconfirmed' and 'Unknown' (the placeholder used for missing values).
# NOTE(review): judging by its name, this expectation requires the observed distinct
# values to equal the set, not merely to be contained in it - confirm that is intended.

validator.expect_column_distinct_values_to_equal_set(
    column='host_identity_verified',
    value_set=['verified', 'unconfirmed', 'Unknown']
    )

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      "Unknown",
      "unconfirmed",
      "verified"
    ],
    "details": {
      "value_counts": [
        {
          "value": "Unknown",
          "count": 289
        },
        {
          "value": "unconfirmed",
          "count": 50944
        },
        {
          "value": "verified",
          "count": 50825
        }
      ]
    }
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [27]:
# Expectation 6: in every row, 'host_id' must be greater than 'id'

validator.expect_column_pair_values_a_to_be_greater_than_b(
    column_A='host_id',
    column_B='id',
)

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 102058,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [28]:
# Expectation 7: the table's columns must match the expected set of column names
expected_columns = [
    'id',
    'name',
    'host_id',
    'host_identity_verified',
    'host_name',
    'neighbourhood_group',
    'neighbourhood',
    'latitude',
    'longitude',
    'country',
    'country_code',
    'instant_bookable',
    'cancellation_policy',
    'room_type',
    'construction_year',
    'price',
    'service_fee',
    'minimum_nights',
    'number_of_reviews',
    'last_review',
    'reviews_per_month',
    'review_rate_number',
    'calculated_host_listings_count',
    'availability_365',
    'house_rules',
    'license',
    'unique_listing_key',
]

validator.expect_table_columns_to_match_set(column_set=expected_columns)

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      "id",
      "name",
      "host_id",
      "host_identity_verified",
      "host_name",
      "neighbourhood_group",
      "neighbourhood",
      "latitude",
      "longitude",
      "country",
      "country_code",
      "instant_bookable",
      "cancellation_policy",
      "room_type",
      "construction_year",
      "price",
      "service_fee",
      "minimum_nights",
      "number_of_reviews",
      "last_review",
      "reviews_per_month",
      "review_rate_number",
      "calculated_host_listings_count",
      "availability_365",
      "house_rules",
      "license",
      "unique_listing_key"
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [39]:
# Save the expectations recorded on the validator into the expectation suite;
# discard_failed_expectations=False asks for failed expectations to be kept as well.

validator.save_expectation_suite(discard_failed_expectations=False)

## Checkpoint

In [29]:
# Create (or overwrite) the checkpoint 'checkpoint_1', bound to the validator above
checkpoint_1 = context.add_or_update_checkpoint(name='checkpoint_1', validator=validator)

In [30]:
# Run the checkpoint and keep the returned result in `checkpoint_result`
checkpoint_result = checkpoint_1.run()

Calculating Metrics: 0it [00:00, ?it/s]

## Data docs


In [32]:
# Build the Data Docs site; the call returns a mapping of site name to the index page URL

context.build_data_docs()

{'local_site': 'file://c:\\Users\\ASUS\\Documents\\coda_005\\p2-coda005-rmt-m3-adizbean\\gx\\uncommitted/data_docs/local_site/index.html'}