# 1. Introduction

Name: Audrey Wanto

Batch: BSD 002

Objective: Validate the cleaned data using Python's Great Expectation

# 2. Import Libraries

In [1]:
# Mount to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install the library

!pip install -q great-expectations

In [3]:
# Import Libraries

from great_expectations.data_context import FileDataContext

# 3. Instantiate Data Context

In [4]:
# Create data context
context = FileDataContext.create(project_root_dir='./')

  and should_run_async(code)

    - No action was taken.

    - No action was taken.



# 4. Connect to Data Source

In [5]:
# Give a name to a Datasource. This name must be unique between Datasources.
datasource_name = 'csv-cleaned-dataset'
datasource = context.sources.add_pandas(datasource_name)

# Give a name to a data asset
asset_name = 'cleaned-dataset'
path_to_data = '/content/drive/MyDrive/P2M3_Audrey_Wanto_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build batch request
batch_request = asset.build_batch_request()

# 5. Create an Expectation Suite

In [6]:
# Creat an expectation suite
expectation_suite_name = 'expectation-cleaned-data'
context.add_or_update_expectation_suite(expectation_suite_name)

# Create a validator using above expectation suite
validator = context.get_validator(
    batch_request = batch_request,
    expectation_suite_name = expectation_suite_name
)

# Check the validator
validator.head()




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
0,0,1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,1,2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,4,5,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


# 6. Expectations

In [7]:
# Expectation 1 : Column `name` must be unique

validator.expect_column_values_to_be_unique('name')

  and should_run_async(code)




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_unique",
    "kwargs": {
      "column": "name",
      "batch_id": "csv-cleaned-dataset-cleaned-dataset"
    },
    "meta": {}
  },
  "result": {
    "element_count": 16291,
    "unexpected_count": 7682,
    "unexpected_percent": 47.1548707875514,
    "partial_unexpected_list": [
      "Super Mario Bros.",
      "Tetris",
      "Grand Theft Auto V",
      "Grand Theft Auto: San Andreas",
      "Super Mario World",
      "Super Mario Bros. 3",
      "Grand Theft Auto V",
      "Grand Theft Auto: Vice City",
      "Call of Duty: Modern Warfare 3",
      "Call of Duty: Black Ops",
      "Call of Duty: Black Ops 3",
      "Call of Duty: Black Ops II",
      "Call of Duty: Black Ops II",
      "Call of Duty: Modern Warfare 2",
      "Call of Duty: Modern Warfare 3",
      "Grand Theft Auto III",
      "Call of Duty: Black Ops",
      "Grand Theft Auto V",
      "Super Mario 64",
      "Grand 

In [8]:
# Expectation 2 : Column `year` must be less than 2021 and above 1980

validator.expect_column_values_to_be_between(
    column='year', min_value=1980, max_value=2020
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "year",
      "min_value": 1980,
      "max_value": 2020,
      "batch_id": "csv-cleaned-dataset-cleaned-dataset"
    },
    "meta": {}
  },
  "result": {
    "element_count": 16291,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# Expectation 3 : Column `genre` must contain one of the following 12 genres:
# Sports
# Platform
# Racing
# Role-Playing
# Puzzle
# Misc
# Shooter
# Simulation
# Action
# Fighting
# Adventure
# Strategy

validator.expect_column_values_to_be_in_set('genre', ['Sports', 'Platform', 'Racing', 'Role-Playing', 'Puzzle', 'Misc',
                                            'Shooter', 'Simulation', 'Action', 'Fighting', 'Adventure', 'Strategy'])

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_set",
    "kwargs": {
      "column": "genre",
      "value_set": [
        "Sports",
        "Platform",
        "Racing",
        "Role-Playing",
        "Puzzle",
        "Misc",
        "Shooter",
        "Simulation",
        "Action",
        "Fighting",
        "Adventure",
        "Strategy"
      ],
      "batch_id": "csv-cleaned-dataset-cleaned-dataset"
    },
    "meta": {}
  },
  "result": {
    "element_count": 16291,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# Expectation 4 : Column `global_sales` must in form of integer or float

validator.expect_column_values_to_be_in_type_list('global_sales', ['integer', 'float'])

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_type_list",
    "kwargs": {
      "column": "global_sales",
      "type_list": [
        "integer",
        "float"
      ],
      "batch_id": "csv-cleaned-dataset-cleaned-dataset"
    },
    "meta": {}
  },
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [11]:
# Expectation 5 : Column `global_sales` must equal the sum of 8811.97

validator.expect_column_sum_to_be_between(column='global_sales', min_value=8811.97, max_value=8811.97)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_sum_to_be_between",
    "kwargs": {
      "column": "global_sales",
      "min_value": 8811.97,
      "max_value": 8811.97,
      "batch_id": "csv-cleaned-dataset-cleaned-dataset"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 8811.97
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [12]:
# Expectation 6 : Column `publisher` must exist

validator.expect_column_to_exist('publisher')

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_to_exist",
    "kwargs": {
      "column": "publisher",
      "batch_id": "csv-cleaned-dataset-cleaned-dataset"
    },
    "meta": {}
  },
  "result": {},
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [13]:
# Expectation 7 : Column `rank` should be of 16291 rows

validator.expect_table_row_count_to_be_between(column='rank', min_value=16291, max_value=16291)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_table_row_count_to_be_between",
    "kwargs": {
      "column": "rank",
      "min_value": 16291,
      "max_value": 16291,
      "batch_id": "csv-cleaned-dataset-cleaned-dataset"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 16291
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [14]:
# Save into Expectation Suite

validator.save_expectation_suite(discard_failed_expectations=False)