# **I. Perkenalan**

***
Milestone 3

Nama  : Catherine Kezia Wijaya
Batch : RMT-037

Notebook ini adalah file untuk melakukan validasi data menggunakan beberapa kriteria Expectation
***

# **II. Import Library**

In [1]:
import pandas as pd
import numpy as np
from great_expectations.data_context import FileDataContext

# **III. Instantiate Data Context**

Menyimpan data context di suatu path

In [2]:
# Initialise the file-backed Data Context, using the current working directory as the project root.
context = FileDataContext.create(
    project_root_dir='./',
)

# **IV. Connect to A `Datasource`**

In [3]:
# Name of the Datasource; it must be unique among the Datasources of this context.
datasource_name = 'csv-data-sept'

# Use `add_or_update_pandas` rather than `add_pandas`: the file-backed context persists
# the Datasource to disk, so a plain `add_pandas` raises on every run after the first
# because the name is already registered. The add-or-update variant keeps this cell
# re-runnable (Restart Kernel -> Run All).
datasource = context.sources.add_or_update_pandas(name=datasource_name)

# Name of the data asset and path of the cleaned CSV (relative to the working directory).
asset_name = 'revenue-september'
path_to_data = 'dags/P2M3_catherine_kezia_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator below uses to fetch the data.
batch_request = asset.build_batch_request()

# **V. Create an Expectation Suite**

In [4]:
# Register the expectation suite under this name (added if new, updated otherwise).
expectation_suite_name = 'expectation-revenue-dataset'
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

# Tie the batch request to the suite so expectations can be evaluated interactively.
validator = context.get_validator(batch_request=batch_request, expectation_suite_name=expectation_suite_name)

# Preview the first rows to confirm the validator can read the data.
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,name,releasedate,copiessold,price,revenue,avgplaytime,reviewscore,publisherclass,publishers,developers,steamid
0,WWE 2K24,07-03-2024,165301,99.99,8055097.0,42.36514,71,AAA,2K,Visual Concepts,2315690
1,EARTH DEFENSE FORCE 6,25-07-2024,159806,59.99,7882151.0,29.651061,57,Indie,D3PUBLISHER,SANDLOT,2291060
2,Sins of a Solar Empire II,15-08-2024,214192,49.99,7815247.0,12.452593,88,Indie,Stardock Entertainment,"Ironclad Games Corporation,Stardock Entertainment",1575940
3,Legend of Mortal,14-06-2024,440998,19.99,7756399.0,24.797817,76,Indie,"Paras Games,Obb Studio Inc.",Obb Studio Inc.,1859910
4,Shin Megami Tensei V: Vengeance,13-06-2024,141306,59.99,7629252.0,34.258496,96,AA,SEGA,ATLUS,1875830


## A. Expectations

### Expectation 1

In [None]:
# Every value in the `name` column must appear only once (no duplicate game titles).
validator.expect_column_values_to_be_unique(
    column='name',
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true,
  "result": {
    "element_count": 1500,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  }
}

### Expectation 2

In [None]:
# Column `price` must lie between 0 and 100; both bounds are inclusive by default,
# so a price of exactly 100 is still accepted.
validator.expect_column_values_to_be_between(
    column='price',
    min_value=0,
    max_value=100,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true,
  "result": {
    "element_count": 1500,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  }
}

### Expectation 3

In [None]:
# Column `publisherclass` may only hold one of these 4 categories:
# AAA, Indie, AA, Hobbyist
validator.expect_column_values_to_be_in_set(
    column='publisherclass',
    value_set=['AAA', 'Indie', 'AA', 'Hobbyist'],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true,
  "result": {
    "element_count": 1500,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  }
}

### Expectation 4

In [None]:
# Column `revenue` must be stored as an integer or a float.
# With the pandas backend the type names are NumPy dtype names or native Python type
# names. 'integer' is neither, so it was silently ignored and an integer-typed column
# would have failed; 'int'/'int64' express the intended integer case. 'float' is kept
# and 'float64' added so the explicit dtype name is accepted as well.
validator.expect_column_values_to_be_in_type_list(
    column='revenue',
    type_list=['int', 'int64', 'float', 'float64'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true,
  "result": {
    "observed_value": "float64"
  }
}

### Expectation 5

In [None]:
# The smallest value found in `reviewscore` must fall within the range 0 to 10.
validator.expect_column_min_to_be_between(column='reviewscore', min_value=0, max_value=10)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true,
  "result": {
    "observed_value": 0
  }
}

### Expectation 6

In [None]:
# The table must contain exactly 1500 rows.
validator.expect_table_row_count_to_equal(value=1500)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true,
  "result": {
    "observed_value": 1500
  }
}

### Expectation 7

In [None]:
# The mode of `publisherclass` must be 'Indie' or 'AA'; a tie between modes is tolerated.
validator.expect_column_most_common_value_to_be_in_set(column='publisherclass', value_set=['Indie', 'AA'], ties_okay=True)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true,
  "result": {
    "observed_value": [
      "Indie"
    ]
  }
}

### Save

In [14]:
# Persist the expectations defined on the validator into the Expectation Suite.
# discard_failed_expectations=False keeps expectations in the suite even if they
# did not pass on the current batch.

validator.save_expectation_suite(discard_failed_expectations=False)

## B. Checkpoint

In [15]:
# Register a checkpoint named 'checkpoint_1' (added if new, updated otherwise)
# that is built from the validator defined above.
checkpoint_1 = context.add_or_update_checkpoint(name='checkpoint_1', validator=validator)

In [16]:
# Run the checkpoint and keep the result object in `checkpoint_result`.

checkpoint_result = checkpoint_1.run()

Calculating Metrics:   0%|          | 0/26 [00:00<?, ?it/s]

## C. Data Docs

In [17]:
# Build the Data Docs site; as the last expression of the cell, the returned
# mapping of site name to index page location is displayed as the cell output.

context.build_data_docs()

{'local_site': 'file://e:\\Kezia\\hacktiv8\\project-m3\\gx\\uncommitted/data_docs/local_site/index.html'}