In [1]:
import pandas as pd

# Toy frame used by the profiling examples below: one label column and one
# integer column, written out row by row.
rows = [
    ("A", 10),
    ("B", 25),
    ("A", 15),
    ("C", 30),
    ("B", 20),
    ("A", 12),
]
df = pd.DataFrame(rows, columns=["Category", "Value"])
df

Unnamed: 0,Category,Value
0,A,10
1,B,25
2,A,15
3,C,30
4,B,20
5,A,12


## [`describe()`](https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#summarizing-data-describe)


In [2]:
# include="all" puts the non-numeric column (count/unique/top/freq) in the same
# summary table as the numeric statistics.
summary_stats = df.describe(include="all")
summary_stats

Unnamed: 0,Category,Value
count,6,6.0
unique,3,
top,A,
freq,3,
mean,,18.666667
std,,7.788881
min,,10.0
25%,,12.75
50%,,17.5
75%,,23.75


## [YData Profiling](https://docs.profiling.ydata.ai/latest/)


In [3]:
from ydata_profiling import ProfileReport

# Build the report object for the toy frame, then render it as notebook widgets.
report_title = "Pandas Profiling Report"
profile = ProfileReport(df, title=report_title)
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

## [Great Expectations Core](https://docs.greatexpectations.io/docs/core/introduction/)

The examples below use the MTA's [Congestion Pricing data](https://data.ny.gov/Transportation/MTA-Congestion-Relief-Zone-Vehicle-Entries-Beginni/t6yz-b64h/about_data) (Congestion Relief Zone vehicle entries) from NY Open Data.


In [4]:
# CSV endpoint for the Congestion Relief Zone vehicle-entries dataset.
# NOTE(review): the recorded run returned exactly 1000 rows, which looks like
# the endpoint's default page size rather than the full dataset — confirm.
entries_url = "https://data.ny.gov/resource/t6yz-b64h.csv"

# Columns to parse as datetimes instead of leaving them as strings.
timestamp_columns = ["toll_date", "toll_hour", "toll_10_minute_block", "toll_week"]

entries = pd.read_csv(entries_url, parse_dates=timestamp_columns)
entries

Unnamed: 0,toll_date,toll_hour,toll_10_minute_block,minute_of_hour,hour_of_day,day_of_week_int,day_of_week,toll_week,time_period,vehicle_class,detection_group,detection_region,crz_entries,excluded_roadway_entries
0,2025-02-08,2025-02-08 23:00:00,2025-02-08 23:50:00,50,23,7,Saturday,2025-02-02,Overnight,"1 - Cars, Pickups and Vans",Brooklyn Bridge,Brooklyn,59,52
1,2025-02-08,2025-02-08 23:00:00,2025-02-08 23:50:00,50,23,7,Saturday,2025-02-02,Overnight,"1 - Cars, Pickups and Vans",Hugh L. Carey Tunnel,Brooklyn,46,11
2,2025-02-08,2025-02-08 23:00:00,2025-02-08 23:50:00,50,23,7,Saturday,2025-02-02,Overnight,"1 - Cars, Pickups and Vans",Manhattan Bridge,Brooklyn,49,0
3,2025-02-08,2025-02-08 23:00:00,2025-02-08 23:50:00,50,23,7,Saturday,2025-02-02,Overnight,"1 - Cars, Pickups and Vans",Queens Midtown Tunnel,Queens,41,0
4,2025-02-08,2025-02-08 23:00:00,2025-02-08 23:50:00,50,23,7,Saturday,2025-02-02,Overnight,"1 - Cars, Pickups and Vans",Holland Tunnel,New Jersey,115,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2025-02-08,2025-02-08 21:00:00,2025-02-08 21:40:00,40,21,7,Saturday,2025-02-02,Overnight,3 - Multi-Unit Trucks,Holland Tunnel,New Jersey,0,0
996,2025-02-08,2025-02-08 21:00:00,2025-02-08 21:40:00,40,21,7,Saturday,2025-02-02,Overnight,TLC Taxi/FHV,Brooklyn Bridge,Brooklyn,54,13
997,2025-02-08,2025-02-08 21:00:00,2025-02-08 21:40:00,40,21,7,Saturday,2025-02-02,Overnight,5 - Motorcycles,Brooklyn Bridge,Brooklyn,0,0
998,2025-02-08,2025-02-08 21:00:00,2025-02-08 21:40:00,40,21,7,Saturday,2025-02-02,Overnight,5 - Motorcycles,Hugh L. Carey Tunnel,Brooklyn,0,0


In [5]:
# Print each column's dtype and non-null count for the downloaded frame.
entries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   toll_date                 1000 non-null   datetime64[ns]
 1   toll_hour                 1000 non-null   datetime64[ns]
 2   toll_10_minute_block      1000 non-null   datetime64[ns]
 3   minute_of_hour            1000 non-null   int64         
 4   hour_of_day               1000 non-null   int64         
 5   day_of_week_int           1000 non-null   int64         
 6   day_of_week               1000 non-null   object        
 7   toll_week                 1000 non-null   datetime64[ns]
 8   time_period               1000 non-null   object        
 9   vehicle_class             1000 non-null   object        
 10  detection_group           1000 non-null   object        
 11  detection_region          1000 non-null   object        
 12  crz_entries          

In [18]:
import great_expectations as gx

context = gx.get_context()

# Wire the in-memory frame into Great Expectations:
# pandas data source -> dataframe asset -> whole-dataframe batch definition.
data_source = context.data_sources.add_pandas(name="NY Open Data")
data_asset = data_source.add_dataframe_asset(
    name="Congestion Relief Zone Vehicle Entries"
)
batch_definition = data_asset.add_batch_definition_whole_dataframe(name="all")

# The frame itself is only supplied here, when the batch is requested.
batch_parameters = {"dataframe": entries}
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

# Expect no toll_date earlier than 1 January 2025; no upper bound is given.
expected_start = pd.Timestamp(year=2025, month=1, day=1)
expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="toll_date",
    min_value=expected_start,
)

# Last expression of the cell, so the validation result is displayed.
batch.validate(expectation)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_between",
    "kwargs": {
      "batch_id": "NY Open Data-Congestion Relief Zone Vehicle Entries",
      "column": "toll_date",
      "min_value": "2025-01-01T00:00:00"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_counts": [],
    "partial_unexpected_index_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [None]:
import os


site_name = "congestion_pricing"

# The context won't let the same site be added twice, so remove any earlier
# registration first. A missing site (e.g. the first run) is expected and ignored.
try:
    context.delete_data_docs_site(site_name)
except gx.exceptions.InvalidKeyError:
    pass

# Register a filesystem-backed Data Docs site that writes into a
# "great_expectations" folder under the current working directory.
context.add_data_docs_site(
    site_name=site_name,
    site_config={
        "class_name": "SiteBuilder",
        "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"},
        "store_backend": {
            "class_name": "TupleFilesystemStoreBackend",
            "base_directory": os.path.join(os.getcwd(), "great_expectations"),
        },
    },
)

# `site_names` is documented as a list of site names, so pass a one-element
# list rather than a bare string.
# NOTE(review): no cell shown here saves a suite or validation result to the
# context, so the built site may have nothing to display — confirm.
context.build_data_docs(site_names=[site_name])
context.open_data_docs()