# Great Expectations Task

## 1. Install Great Expectations Library


In [349]:
!pip install great_expectations




[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: C:\Users\Bralyn (School)\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


##2. Import Necessary Libraries

In [350]:
import pandas as pd
import great_expectations as gx

##3. Load Labels.csv

Download and upload the [Labels.csv](https://github.com/zubxxr/SOFE3980U-Lab5/blob/main/Labels.csv) into this notebook, and then load the file.

In [351]:
df = pd.read_csv("Labels.csv")

##4. Preview the Dataset

In [352]:
df.head()

Unnamed: 0,Timestamp,Car1_Location_X,Car1_Location_Y,Car1_Location_Z,Car2_Location_X,Car2_Location_Y,Car2_Location_Z,Occluded_Image_view,Occluding_Car_view,Ground_Truth_View,pedestrianLocationX_TopLeft,pedestrianLocationY_TopLeft,pedestrianLocationX_BottomRight,pedestrianLocationY_BottomRight
0,1736796157,-51.402977,143,0.596902,-59.32027,140,0.596902,A_001.png,B_001.png,C_001.png,593,361,610,410
1,1736796167,-53.819637,143,0.596902,-59.196568,140,0.596902,A_002.png,B_002.png,C_002.png,579,368,594,415
2,1736796178,-50.239144,143,0.596902,-56.744479,140,0.596902,A_003.png,B_003.png,C_003.png,854,720,854,720
3,1736796188,-53.70722,143,0.596902,-57.30938,140,0.596902,A_004.png,B_004.png,C_004.png,549,368,567,425
4,1736796198,-52.053721,143,0.596902,-59.545897,140,0.596902,A_005.png,B_005.png,C_005.png,524,368,537,413


##5. Set Up Great Expectations Context and Data Source

In [353]:
context = gx.get_context()

##6. Define and Create a Data Batch

In [354]:
import great_expectations as ge
import json
from great_expectations.core import ExpectationSuite

suite_name = "my_suite"

# Get a persistent context
context = ge.get_context(mode="file")

# Create the expectation suite
suite = ExpectationSuite(suite_name)

# Serialize the suite to a JSON string
suite_json = json.dumps(suite.to_json_dict())

# Register the suite directly in the expectations store using the suite_name as a tuple key
context.expectations_store._store_backend.set((suite_name,), suite_json)

# Save the context (if needed)
context._save_project_config()

##7. Define Three Expectations for Column Values

Using this [link](https://greatexpectations.io/expectations/), choose three expectation functions and apply them to the labels dataset in a relevant manner.

You should replace the 'ExpectColumnValuesToBeBetween' function with other functions you select from the link.

You can also check the format/parameters required of each function when you click "See more" on the function.

In [355]:
import os
import pandas as pd
import great_expectations as ge

# Ensure the 'data' directory exists
os.makedirs("data", exist_ok=True)

# Create a simple DataFrame
my_data = pd.DataFrame({"label": [0, 1, 2, 0, 1, 2, None]})

# Save the DataFrame as CSV
csv_path = "data/my_data.csv"
my_data.to_csv(csv_path, index=False)

# Initialize the Great Expectations context
context = ge.get_context()

# Add a Pandas-based Fluent Datasource using the Fluent API
datasource_name = "my_pandas_source"
context.sources.add_pandas(
    name=datasource_name,
    path=csv_path
)

# Create a batch request
from great_expectations.core.batch import RuntimeBatchRequest

batch_request = RuntimeBatchRequest(
    datasource_name=datasource_name,
    data_connector_name="default_runtime_data_connector",
    data_asset_name="my_data_asset",
    runtime_parameters={"batch_data": my_data},  # Use the in-memory DataFrame
    batch_identifiers={"batch_id": "001"},
)

# Create an Expectation Suite
suite_name = "my_suite"
context.create_expectation_suite(suite_name, overwrite_existing=True)

# Get a Validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name,
)

# Add an expectation to check for non-null values
validator.expect_column_values_to_not_be_null(column="label")

# Save expectations
validator.save_expectation_suite(discard_failed_expectations=False)

# Run validation (the final checkpoint)
checkpoint_name = "my_checkpoint"
context.add_or_update_checkpoint(
    name=checkpoint_name,
    validator=validator,
).run()

print("Great Expectations setup completed successfully!")

AttributeError: 'FileDataContext' object has no attribute 'sources'

### Expectation 1

In [None]:
expectation_1 = validator.expect_column_values_to_be_in_set("label", [0, 1, 2])

### Validate Data Against Expectation 1

In [None]:
print("Expectation 1 Results:", expectation_1)

### Expectation 2

In [None]:
expectation_2 = validator.expect_column_values_to_not_be_null("label")

### Validate Data Against Expectation 2

In [None]:
print("Expectation 2 Results:", expectation_2)

### Expectation 3

In [None]:
expectation_3 = validator.expect_column_values_to_be_of_type("label", "int")

### Validate Data Against Expectation 3

In [None]:
print("Expectation 3 Results:", expectation_3)