In [55]:
import pandas as pd
from great_expectations import dataset
import json
from great_expectations.dataset.pandas_dataset import PandasDataset

In [56]:
import os

from great_expectations.data_context import DataContext
import great_expectations as ge
import pandas as pd

# Filesystem locations used by this notebook. The defaults are the original
# development paths; set GE_PROJECT_DIR and/or RAW_DATA_CSV in the environment
# to run the notebook elsewhere without editing this cell.
ge_project_directory = os.environ.get('GE_PROJECT_DIR', '/home/sachin/DSP/gx')
raw_data_csv = os.environ.get(
    'RAW_DATA_CSV',
    '/home/sachin/DSP/airflow-spike-master/input_data/raw-data/data_chunk_608.csv',
)

# Create the DataContext rooted at the Great Expectations project directory.
context = DataContext(context_root_dir=ge_project_directory)

# Load the CSV chunk that the expectations below are run against.
df = pd.read_csv(raw_data_csv)

# Wrap the frame in a PandasDataset so expect_* methods can be called on it.
ge_dataset = ge.dataset.PandasDataset(df)


In [57]:
# Display the column labels of the loaded frame.
df.columns

Index(['name', 'year', 'km_driven', 'fuel', 'seller_type', 'transmission',
       'owner', 'mileage', 'engine', 'max_power', 'seats', 'car_company_name'],
      dtype='object')

In [58]:
# Columns that receive a not-null expectation (a subset of df.columns).
columns = ['name', 'year', 'car_company_name']

# Register one "values must not be null" expectation per listed column.
for required_column in columns:
    ge_dataset.expect_column_values_to_not_be_null(
        column=required_column,
    )


In [59]:
# 'year' must have one of the listed types (here only 'int').
ge_dataset.expect_column_values_to_be_in_type_list(
    column='year',
    type_list=['int'],
)

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [60]:
# 'fuel' may only take one of the four listed values.
ge_dataset.expect_column_values_to_be_in_set(
    column='fuel',
    value_set=['Petrol', 'Diesel', 'CNG', 'Electric'],
)

{
  "success": true,
  "result": {
    "element_count": 9,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [61]:
# 'name' must consist solely of ASCII letters, digits and whitespace.
# NOTE(review): the recorded result shows this pattern rejecting names that
# contain '.', '-', '(' or ')' (e.g. "Hyundai Verna 1.4 CRDi") — confirm
# whether such names are really meant to be flagged as invalid.
ge_dataset.expect_column_values_to_match_regex(
    column='name',
    regex=r'^[a-zA-Z0-9\s]+$',
)

{
  "success": false,
  "result": {
    "element_count": 9,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 5,
    "unexpected_percent": 55.55555555555556,
    "unexpected_percent_total": 55.55555555555556,
    "unexpected_percent_nonmissing": 55.55555555555556,
    "partial_unexpected_list": [
      "Fiat Linea Emotion (Diesel)",
      "Honda City 2017-2020 GXi",
      "Hyundai Verna Xi (Petrol)",
      "Hyundai Verna 1.4 CRDi",
      "Hyundai Xcent 1.2 VTVT S"
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [62]:
# 'seller_type' may only be 'Individual' or 'Dealer'.
ge_dataset.expect_column_values_to_be_in_set(
    column='seller_type',
    value_set=['Individual', 'Dealer'],
)

{
  "success": false,
  "result": {
    "element_count": 9,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 2,
    "unexpected_percent": 22.22222222222222,
    "unexpected_percent_total": 22.22222222222222,
    "unexpected_percent_nonmissing": 22.22222222222222,
    "partial_unexpected_list": [
      "Partnership",
      "Partnership"
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [63]:
# 'transmission' may only be 'Manual' or 'Automatic'.
ge_dataset.expect_column_values_to_be_in_set(
    column='transmission',
    value_set=['Manual', 'Automatic'],
)

{
  "success": false,
  "result": {
    "element_count": 9,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 1,
    "unexpected_percent": 11.11111111111111,
    "unexpected_percent_total": 11.11111111111111,
    "unexpected_percent_nonmissing": 11.11111111111111,
    "partial_unexpected_list": [
      "Ai"
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [64]:
# 'owner' must consist solely of ASCII letters and whitespace.
ge_dataset.expect_column_values_to_match_regex(
    column='owner',
    regex=r'^[a-zA-Z\s]+$',
)

{
  "success": true,
  "result": {
    "element_count": 9,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [65]:
# 'km_driven' must have one of the listed types (here only 'int').
ge_dataset.expect_column_values_to_be_in_type_list(
    column='km_driven',
    type_list=['int'],
)

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [66]:
# Cramer's phi between 'car_company_name' and 'fuel' must stay below 0.5.
ge_dataset.expect_column_pair_cramers_phi_value_to_be_less_than(
    column_A='car_company_name',
    column_B='fuel',
    threshold=0.5,
)

{
  "success": false,
  "result": {
    "observed_value": 0.8366600265340756,
    "element_count": 9,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [67]:
# NOTE(review): range check on 'seats' is currently disabled — confirm whether
# it should be re-enabled or this cell removed.
#ge_dataset.expect_column_values_to_be_between(column='seats', min_value=1, max_value=10)

In [68]:
# 'car_company_name' must consist solely of ASCII letters and whitespace.
ge_dataset.expect_column_values_to_match_regex(
    column='car_company_name',
    regex=r'^[a-zA-Z\s]+$',
)

{
  "success": true,
  "result": {
    "element_count": 9,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [71]:
# Name under which the suite is saved by the DataContext.
expectation_suite_name = "my_ex"

# By default get_expectation_suite() discards every expectation whose most
# recent run failed. Several expectations defined above fail on this sample
# (name regex, seller_type, transmission, Cramer's phi), so they are kept
# explicitly; otherwise the saved suite would silently omit those checks.
expectation_suite = ge_dataset.get_expectation_suite(
    discard_failed_expectations=False,
)

# Save the suite through the DataContext. The call is the cell's last
# expression so that its return value is displayed as the cell output.
context.save_expectation_suite(expectation_suite, expectation_suite_name)


'/home/sachin/DSP/gx/expectations/my_ex.json'

In [70]:
# NOTE(review): disabled alternative that saves the suite from the dataset
# object; presumably superseded by the context.save_expectation_suite(...)
# cell — confirm, then delete this cell.
# # Save the expectation suite to the specified folder
# #expectation_suite_path = "/home/sachin/DSP/car_price_prediction/gx/" + expectation_suite_name
# ge_dataset.save_expectation_suite(expectation_suite_name)