In [63]:
from pathlib import Path
from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import great_expectations as gx
from great_expectations.profile.user_configurable_profiler import UserConfigurableProfiler
from great_expectations.data_context import DataContext
from great_expectations.core.expectation_configuration import ExpectationConfiguration

# Resolve the Kedro project root. NOTE: this is two levels above the kernel's
# current working directory, not relative to the notebook file itself, so the
# notebook must be started from its own folder.
project_path = Path.cwd().parents[1]

# Bootstrap project (loads pyproject.toml and settings)
bootstrap_project(project_path)

# Pass project_path explicitly: KedroSession.create() otherwise defaults to the
# current working directory, which is not the project root computed above.
# The `with` block closes the session once the data has been loaded.
with KedroSession.create(project_path=project_path) as session:
    # Named `kedro_context` so it cannot be confused with the Great Expectations
    # data context that this notebook later binds to `context`.
    kedro_context = session.load_context()

    # Load the "loans" dataset from the Kedro catalog.
    df = kedro_context.catalog.load("loans")

In [64]:
# Remove stray whitespace around the column names (done in place on `df`).
clean_column_names = df.columns.str.strip()
df.columns = clean_column_names

# Seeded random split: 70% of the rows become the reference set and the
# remaining rows (everything whose index label was not sampled) the analysis set.
reference_fraction, split_seed = 0.7, 42
df_ref = df.sample(frac=reference_fraction, random_state=split_seed)
df_ana = df.drop(index=df_ref.index)

In [65]:
# Open the Great Expectations data context rooted at the "gx" directory
# (a relative path). From here on `context` refers to this GX context.
gx_root_dir = "gx"
context = gx.get_context(context_root_dir=gx_root_dir)

In [66]:
# Register a pandas-based datasource, reusing it when the context already has one
# with this name. An explicit membership check replaces the former bare `except:`,
# which reported *any* failure (including KeyboardInterrupt) as "already exists".
datasource_name = "example"
if datasource_name in context.datasources:
    print("Data Source already exists.")
    datasource = context.datasources[datasource_name]
else:
    datasource = context.sources.add_pandas(datasource_name)

Data Source already exists.


In [67]:
# Display the Great Expectations data context; the cell output is its configuration.
context


[1m{[0m
  [32m"anonymous_usage_statistics"[0m: [1m{[0m
    [32m"explicit_id"[0m: true,
    [32m"explicit_url"[0m: false,
    [32m"usage_statistics_url"[0m: [32m"https://stats.greatexpectations.io/great_expectations/v1/usage_statistics"[0m,
    [32m"enabled"[0m: true,
    [32m"data_context_id"[0m: [32m"62b31bc3-528b-4a42-8087-dddee0817143"[0m
  [1m}[0m,
  [32m"checkpoint_store_name"[0m: [32m"checkpoint_store"[0m,
  [32m"config_variables_file_path"[0m: [32m"uncommitted/config_variables.yml"[0m,
  [32m"config_version"[0m: [1;36m3.0[0m,
  [32m"data_docs_sites"[0m: [1m{[0m
    [32m"local_site"[0m: [1m{[0m
      [32m"class_name"[0m: [32m"SiteBuilder"[0m,
      [32m"show_how_to_buttons"[0m: true,
      [32m"store_backend"[0m: [1m{[0m
        [32m"class_name"[0m: [32m"TupleFilesystemStoreBackend"[0m,
        [32m"base_directory"[0m: [32m"uncommitted/data_docs/local_site/"[0m
      [1m}[0m,
      [32m"site_index_builder"[0m: [1m{

In [68]:
# List the datasources currently registered in the GX context.
registered_datasources = context.list_datasources()
print(f"data source {registered_datasources}")

data source [{'type': 'pandas', 'name': 'example'}]


In [69]:
# Print the datasource obtained above (its text form shows name and type).
print(datasource)

name: example
type: pandas



In [70]:
# Create (or update, if it already exists) the expectation suite that the
# following cells fill with expectations.
# NOTE(review): called with only a name, this appears to leave the stored suite
# without expectations (see the empty "expectations" list displayed next) — confirm
# before relying on previously saved expectations.
suite_name = "example_suite"
suite_bank = context.add_or_update_expectation_suite(expectation_suite_name=suite_name)

In [71]:
# Inspect the suite object before any expectation has been added.
suite_bank


[1m{[0m
  [32m"expectation_suite_name"[0m: [32m"example_suite"[0m,
  [32m"ge_cloud_id"[0m: null,
  [32m"expectations"[0m: [1m[[0m[1m][0m,
  [32m"data_asset_type"[0m: null,
  [32m"meta"[0m: [1m{[0m
    [32m"great_expectations_version"[0m: [32m"0.18.12"[0m
  [1m}[0m
[1m}[0m

In [72]:
# Show the column names of the reference split.
df_ref.columns


[1;35mIndex[0m[1m([0m[1m[[0m[32m'loan_id'[0m, [32m'no_of_dependents'[0m, [32m'education'[0m, [32m'self_employed'[0m,
       [32m'income_annum'[0m, [32m'loan_amount'[0m, [32m'loan_term'[0m, [32m'cibil_score'[0m,
       [32m'residential_assets_value'[0m, [32m'commercial_assets_value'[0m,
       [32m'luxury_assets_value'[0m, [32m'bank_asset_value'[0m, [32m'loan_status'[0m[1m][0m,
      [33mdtype[0m=[32m'object'[0m[1m)[0m

In [73]:
# Summary statistics (count / unique / top / freq) of the `education` column.
df_ref["education"].describe()


count          [1;36m2988[0m
unique            [1;36m2[0m
top        Graduate
freq           [1;36m1498[0m
Name: education, dtype: object

In [74]:
# List the distinct raw `education` labels to check for formatting problems.
df_ref["education"].unique()

[1;35marray[0m[1m([0m[1m[[0m[32m' Graduate'[0m, [32m' Not Graduate'[0m[1m][0m, [33mdtype[0m=[35mobject[0m[1m)[0m

In [75]:
# The raw labels carry a leading space (' Graduate', ' Not Graduate'); strip it so
# the values match the value_set used by the education expectation.
df_ref['education'] = df_ref['education'].str.strip()

# Clean the analysis split the same way: it was taken from the same raw frame, so
# without this step it would still hold the space-prefixed labels and disagree
# with the reference split.
df_ana['education'] = df_ana['education'].str.strip()

# Display the cleaned labels of the reference split.
df_ref["education"].unique()

[1;35marray[0m[1m([0m[1m[[0m[32m'Graduate'[0m, [32m'Not Graduate'[0m[1m][0m, [33mdtype[0m=[35mobject[0m[1m)[0m

In [76]:
# Expectation: every distinct value found in `education` must belong to the
# two cleaned labels below.
education_kwargs = {
    "column": "education",
    "value_set": ["Graduate", "Not Graduate"],
}
expectation_education = ExpectationConfiguration(
    expectation_type="expect_column_distinct_values_to_be_in_set",
    kwargs=education_kwargs,
)

# Attach the expectation to the in-memory suite; the returned configuration is
# shown as the cell output.
suite_bank.add_expectation(expectation_configuration=expectation_education)

[1m{[0m[32m"expectation_type"[0m: [32m"expect_column_distinct_values_to_be_in_set"[0m, [32m"kwargs"[0m: [1m{[0m[32m"column"[0m: [32m"education"[0m, [32m"value_set"[0m: [1m[[0m[32m"Graduate"[0m, [32m"Not Graduate"[0m[1m][0m[1m}[0m, [32m"meta"[0m: [1m{[0m[1m}[0m[1m}[0m

In [77]:
# Inspect the suite again; it now lists the education expectation.
suite_bank


[1m{[0m
  [32m"expectation_suite_name"[0m: [32m"example_suite"[0m,
  [32m"ge_cloud_id"[0m: null,
  [32m"expectations"[0m: [1m[[0m
    [1m{[0m
      [32m"expectation_type"[0m: [32m"expect_column_distinct_values_to_be_in_set"[0m,
      [32m"kwargs"[0m: [1m{[0m
        [32m"column"[0m: [32m"education"[0m,
        [32m"value_set"[0m: [1m[[0m
          [32m"Graduate"[0m,
          [32m"Not Graduate"[0m
        [1m][0m
      [1m}[0m,
      [32m"meta"[0m: [1m{[0m[1m}[0m
    [1m}[0m
  [1m][0m,
  [32m"data_asset_type"[0m: null,
  [32m"meta"[0m: [1m{[0m
    [32m"great_expectations_version"[0m: [32m"0.18.12"[0m
  [1m}[0m
[1m}[0m

In [78]:
# Hand the populated suite back to the data context (add it, or update the one
# already registered under the same name). The returned suite is the cell output.
context.add_or_update_expectation_suite(expectation_suite=suite_bank)


[1m{[0m
  [32m"expectation_suite_name"[0m: [32m"example_suite"[0m,
  [32m"ge_cloud_id"[0m: null,
  [32m"expectations"[0m: [1m[[0m
    [1m{[0m
      [32m"expectation_type"[0m: [32m"expect_column_distinct_values_to_be_in_set"[0m,
      [32m"kwargs"[0m: [1m{[0m
        [32m"column"[0m: [32m"education"[0m,
        [32m"value_set"[0m: [1m[[0m
          [32m"Graduate"[0m,
          [32m"Not Graduate"[0m
        [1m][0m
      [1m}[0m,
      [32m"meta"[0m: [1m{[0m[1m}[0m
    [1m}[0m
  [1m][0m,
  [32m"data_asset_type"[0m: null,
  [32m"meta"[0m: [1m{[0m
    [32m"great_expectations_version"[0m: [32m"0.18.12"[0m
  [1m}[0m
[1m}[0m