===============================================================================================
# **Coffee Bean Quality Analysis Validation File**

This notebook validates the cleaned dataset by checking that it passes eight Great Expectations checks.
Make sure to get the correct file path for loading the cleaned csv data.

===============================================================================================

In [None]:
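# Install the library if you do not have it.
# This notebook uses the pre-1.0 Great Expectations API (`context.sources`,
# `FileDataContext.create`), so keep the version below 1.0.

# %pip install -q "great-expectations<1.0"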

In [1]:
# import library
from great_expectations.data_context import FileDataContext

In [2]:
# Initialise (or reuse) a file-backed Great Expectations project rooted in
# the directory this notebook runs from.
context = FileDataContext.create(
    project_root_dir="./",
)

In [3]:
# Register the pandas Datasource. Datasource names must be unique within the
# context, and this FileDataContext is persisted on disk, so a plain
# `add_pandas` fails when the notebook is re-run (the name already exists).
# `add_or_update_pandas` keeps this cell idempotent, in line with the
# `add_or_update_expectation_suite` call used for the suite.
ds_name = 'cleaned_csv'
datasource = context.sources.add_or_update_pandas(ds_name)

# Name of the data asset inside the Datasource
asset_name = 'coffee_data'
# Path to the cleaned dataset (CSV), relative to the working directory
file_path = 'coffee_bean_cleaned_data.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=file_path)

# Build the batch request that is later handed to the validator
batch_request = asset.build_batch_request()

In [4]:
# Create (or overwrite) the expectation suite that will collect every check
expectation_suite_cleandata_name = 'expectation-coffee-dataset'
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_cleandata_name
)

# Bind the batch of cleaned data to that suite
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_cleandata_name,
)

# Preview the first rows to confirm the data loaded as expected
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,country_of_origin,farm_name,average_altitude_m,region,number_of_bags,bag_weight_kg,variety,processing_method,aroma,...,uniformity,overall,defects,total_cup_points,moisture_percentage,category_one_defects,quakers,color,category_two_defects,total_bag_weight
0,0,Colombia,Finca El Paraiso,1815.0,"Piendamo,Cauca",1,35.0,castillo,double anaerobic washed,8.58,...,10.0,8.58,0,69.33,11.8,0,0,green,3,35.0
1,1,Taiwan,Royal Bean Geisha Estate,1200.0,Chiayi,1,80.0,gesha,washed / wet,8.5,...,10.0,8.5,0,67.59,10.5,0,0,blue-green,0,80.0
2,2,Laos,OKLAO coffee farms,1300.0,Laos Borofen Plateau,19,25.0,java,semi washed,8.33,...,10.0,8.33,0,67.42,10.4,0,0,yellowish,2,475.0
3,3,Costa Rica,La Cumbre,1900.0,"Los Santos,Tarrazu",1,22.0,gesha,washed / wet,8.08,...,10.0,8.25,0,67.17,11.8,0,0,green,0,22.0
4,4,Colombia,Finca Santuario,1975.0,"Popayan,Cauca",2,24.0,red bourbon,"honey,mossto",8.33,...,10.0,8.25,0,67.08,11.6,0,2,yellow-green,2,48.0


### First Expectation: column data has no missing values (Completeness)

In [5]:
# Expectation 1: `variety` must not contain any missing values
validator.expect_column_values_to_not_be_null(column='variety')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 206,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "success": true,
  "meta": {}
}

### Second Expectation: the maximum value of a column data should be within a certain range (Validity)

In [6]:
# Expectation 2: the maximum of `total_cup_points` must lie between 69 and 70
validator.expect_column_max_to_be_between(
    column='total_cup_points',
    min_value=69,
    max_value=70,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 69.33
  },
  "success": true,
  "meta": {}
}

### Third Expectation: column data contains unique values (Validity and Completeness)

In [7]:
# Expectation 3: every value in `id` must be unique
validator.expect_column_values_to_be_unique(column='id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 206,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true,
  "meta": {}
}

### Fourth Expectation: column data range should lie within a certain range of values. (Accuracy)

In [8]:
# Expectation 4: every `average_altitude_m` value must fall between 0 and 5800 m
validator.expect_column_values_to_be_between(
    column="average_altitude_m",
    min_value=0,
    max_value=5800,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 206,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true,
  "meta": {}
}

### Fifth Expectation: a column must exist so that the values of another column can be calculated (Consistency)

In [9]:
# Expectation 5: `number_of_bags` must exist, because it is needed to
# calculate `total_bag_weight`
validator.expect_column_to_exist(
    column="number_of_bags",
)

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {},
  "success": true,
  "meta": {}
}

### Sixth Expectation: a column data should contain a fixed number of categorical values (Consistency)

In [10]:
# Expectation 6: every `processing_method` value must be one of the
# categories listed in `value_set` below
validator.expect_column_values_to_be_in_set(
    column='processing_method',
    value_set=[
        'double anaerobic washed',
        'washed / wet',
        'semi washed',
        'honey,mossto',
        'natural / dry',
        'others',
        'double carbonic maceration / natural',
        'wet hulling',
        'anaerobico 1000h',
        'semi-lavado',
    ],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 206,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "success": true,
  "meta": {}
}

### Seventh Expectation: a column data should have a consistent data type (Consistency)

In [11]:
# Expectation 7: `total_bag_weight` must be stored as a float type
validator.expect_column_values_to_be_in_type_list(
    column='total_bag_weight',
    type_list=['float'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": "float64"
  },
  "success": true,
  "meta": {}
}

### Eighth Expectation: a column should follow a certain pattern (Relevance)

In [12]:
# Expectation 8: at least 25% (`mostly=0.25`) of the `country_of_origin`
# values must match the pattern 'Taiwan'. Rows that do not match are still
# reported as "unexpected" in the result, but the check succeeds as long as
# the matching share reaches the threshold.
validator.expect_column_values_to_match_regex(
    column="country_of_origin",
    regex="Taiwan",
    mostly=0.25,
)


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 206,
    "unexpected_count": 145,
    "unexpected_percent": 70.3883495145631,
    "partial_unexpected_list": [
      "Colombia",
      "Laos",
      "Costa Rica",
      "Colombia",
      "Guatemala",
      "Tanzania",
      "Ethiopia",
      "Guatemala",
      "Ethiopia",
      "Colombia",
      "Ethiopia",
      "Tanzania",
      "Guatemala",
      "Thailand",
      "Colombia",
      "Guatemala",
      "Brazil",
      "United States (Hawaii)",
      "Ethiopia",
      "Ethiopia"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 70.3883495145631,
    "unexpected_percent_nonmissing": 70.3883495145631
  },
  "success": true,
  "meta": {}
}

In [13]:
# Persist every expectation defined above into the suite, keeping any
# expectation that failed rather than discarding it
validator.save_expectation_suite(
    discard_failed_expectations=False,
)

### Conclusion
- The dataset passed all of the validation tests. The output of each validation test shows "success": true.
- The dataset itself contains no date column and no HTML tags, so timeliness (information about when the dataset was created) and accessibility (HTML tags) cannot be validated with an expectation.
- However, the dataset source states that the data was created in May 2023, so timeliness is considered satisfied based on that external information.