## Monitoring Setup
In this notebook, we set up data quality monitoring using the Evidently library. We'll track metrics such as summary statistics and quantiles for the `fare_amount` column in our dataset.

# Homework

In [1]:
import datetime
import os

import pandas as pd
import requests

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric
from evidently.metric_preset import DataDriftPreset, DataQualityPreset

from evidently.ui.workspace import Workspace
from evidently.ui.dashboards import DashboardPanelCounter, DashboardPanelPlot, CounterAgg, PanelValue, PlotType, ReportFilter
from evidently.renderers.html_widgets import WidgetSize

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [None]:
# (file name, destination directory) pairs to download
files = [('gr_data_2024-03.parquet', './data')]

print("Download files:")
for file, path in files:
    url=f"ADDURL/tr-data/{file}" # TODO: Modify to access properly
    save_path=f"{path}/{file}"
    # A fresh checkout may not have the destination directory yet
    os.makedirs(path, exist_ok=True)
    # The context manager releases the streamed connection even if writing fails
    with requests.get(url, stream=True) as resp:
        # Stop on 4xx/5xx instead of saving an error page as a .parquet file
        resp.raise_for_status()
        # Content-Length is optional; with total=None tqdm shows an open-ended bar
        content_length = resp.headers.get("Content-Length")
        with open(save_path, "wb") as handle, tqdm(
            desc=f"{file}",
            postfix=f"save to {save_path}",
            total=int(content_length) if content_length is not None else None,
            unit="B",
            unit_scale=True,
        ) as progress:
            # iter_content() defaults to 1-byte chunks; read 64 KiB at a time
            # and advance the bar by the number of bytes actually written
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                handle.write(chunk)
                progress.update(len(chunk))

Download files:


green_tripdata_2024-03.parquet: 100%|██████████| 1372372/1372372 [00:09<00:00, 151064.55it/s, save to ./data/green_tripdata_2024-03.parquet]


In [None]:
# Load the parquet file saved by the download cell into a DataFrame
hw_data = pd.read_parquet('data/gr_data_2024-03.parquet')

In [4]:
# Summary statistics of the raw frame, before any filtering
hw_data.describe()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
count,57457.0,57457,57457,55360.0,57457.0,57457.0,55360.0,57457.0,57457.0,57457.0,57457.0,57457.0,57457.0,0.0,57457.0,57457.0,55360.0,55353.0,55360.0
mean,1.877334,2024-03-16 04:02:52.405399,2024-03-16 04:21:00.076039,1.179986,95.524688,138.629149,1.309538,13.522828,17.313474,0.904472,0.57741,2.386255,0.192537,,0.979378,22.904832,1.321062,1.038047,0.73773
min,1.0,2008-12-31 23:02:24,2008-12-31 23:02:30,1.0,1.0,1.0,0.0,0.0,-295.08,-2.5,-0.5,-1.56,0.0,,-1.0,-296.08,1.0,1.0,-2.75
25%,2.0,2024-03-08 13:53:56,2024-03-08 14:13:49,1.0,74.0,74.0,1.0,1.1,9.3,0.0,0.5,0.0,0.0,,1.0,13.44,1.0,1.0,0.0
50%,2.0,2024-03-15 22:49:01,2024-03-15 23:09:52,1.0,75.0,138.0,1.0,1.79,13.5,0.0,0.5,2.0,0.0,,1.0,18.5,1.0,1.0,0.0
75%,2.0,2024-03-23 20:11:25,2024-03-23 20:34:48,1.0,97.0,220.0,1.0,3.1,19.8,1.0,0.5,3.61,0.0,,1.0,27.05,2.0,1.0,2.75
max,2.0,2024-04-01 00:01:45,2024-04-01 16:11:00,99.0,265.0,265.0,9.0,125112.2,841.6,10.0,4.25,150.0,26.76,,1.0,856.98,5.0,2.0,2.75
std,0.328056,,,1.356719,57.285088,76.295346,0.967749,770.416255,14.958249,1.382446,0.366916,3.159273,1.184551,,0.154253,17.013735,0.497858,0.191311,1.218039


In [5]:
# Column dtypes of the raw frame
hw_data.dtypes

VendorID                          int32
lpep_pickup_datetime     datetime64[us]
lpep_dropoff_datetime    datetime64[us]
store_and_fwd_flag               object
RatecodeID                      float64
PULocationID                      int32
DOLocationID                      int32
passenger_count                 float64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                    float64
trip_type                       float64
congestion_surcharge            float64
dtype: object

In [6]:
# (rows, columns) of the raw frame, before any filtering
hw_data.shape

(57457, 20)

### Initial Report
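Before generating a report, we inspect the metrics that Evidently offers, in particular `ColumnSummaryMetric` and `ColumnQuantileMetric`, and prepare the data. These metrics help us understand how the `fare_amount` column behaves. `ColumnDriftMetric` is imported as well, but this notebook runs its reports on current data only, without a reference dataset.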

In [7]:
import evidently.metrics

In [8]:
# Exploratory: IPython "??" help showing the source of the evidently.metrics module
evidently.metrics??

[31mType:[39m        module
[31mString form:[39m <module 'evidently.metrics' from '/home/maxkaizo/miniconda3/envs/py11/lib/python3.11/site-packages/evidently/metrics/__init__.py'>
[31mFile:[39m        ~/miniconda3/envs/py11/lib/python3.11/site-packages/evidently/metrics/__init__.py
[31mSource:[39m     
[33m"""[39m
[33mAvailable metrics for Reports and Tests.[39m
[33mAll metrics is grouped into modules.[39m
[33mFor specific group see module documentation.[39m
[33m"""[39m

[38;5;28;01mfrom[39;00m . [38;5;28;01mimport[39;00m _registry
[38;5;28;01mfrom[39;00m .classification_performance.class_balance_metric [38;5;28;01mimport[39;00m ClassificationClassBalance
[38;5;28;01mfrom[39;00m .classification_performance.class_separation_metric [38;5;28;01mimport[39;00m ClassificationClassSeparationPlot
[38;5;28;01mfrom[39;00m .classification_performance.classification_dummy_metric [38;5;28;01mimport[39;00m ClassificationDummyMetric
[38;5;28;01mfrom[39;00m .classif

In [9]:
# Exploratory: IPython "?" help showing the constructor signature of ColumnSummaryMetric
evidently.metrics.ColumnSummaryMetric?

[31mInit signature:[39m
evidently.metrics.ColumnSummaryMetric(
    column_name: Union[str, evidently.base_metric.ColumnName],
    options: Union[evidently.options.base.Options, evidently.options.option.Option, dict, List[evidently.options.option.Option], NoneType] = [38;5;28;01mNone[39;00m,
) -> [38;5;28;01mNone[39;00m
[31mFile:[39m           ~/miniconda3/envs/py11/lib/python3.11/site-packages/evidently/metrics/data_integrity/column_summary_metric.py
[31mType:[39m           WithResultFieldPathMetaclass
[31mSubclasses:[39m     

In [10]:
# Exploratory: print the class docstring of ColumnSummaryMetric
print(evidently.metrics.ColumnSummaryMetric.__doc__)

None


In [11]:
# Exploratory: IPython "??" help showing the signature and source of ColumnQuantileMetric
evidently.metrics.ColumnQuantileMetric??

[31mInit signature:[39m
evidently.metrics.ColumnQuantileMetric(
    column_name: Union[str, evidently.base_metric.ColumnName],
    quantile: float,
    options: Union[evidently.options.base.Options, evidently.options.option.Option, dict, List[evidently.options.option.Option], NoneType] = [38;5;28;01mNone[39;00m,
) -> [38;5;28;01mNone[39;00m
[31mSource:[39m        
[38;5;28;01mclass[39;00m ColumnQuantileMetric(Metric[ColumnQuantileMetricResult]):
    [38;5;28;01mclass[39;00m Config:
        type_alias = [33m"evidently:metric:ColumnQuantileMetric"[39m

    [33m"""Calculates quantile with specified range"""[39m

    column_name: ColumnName
    quantile: float

    [38;5;28;01mdef[39;00m __init__(self, column_name: Union[str, ColumnName], quantile: float, options: AnyOptions = [38;5;28;01mNone[39;00m) -> [38;5;28;01mNone[39;00m:
        self.quantile = quantile
        self.column_name = ColumnName.from_any(column_name)
        super().__init__(options=options)

    

In [12]:
# Exploratory: print the class docstring of ColumnQuantileMetric
print(evidently.metrics.ColumnQuantileMetric.__doc__)

None


In [13]:
# Display the raw frame as the cell output
hw_data

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2024-03-01 00:10:52,2024-03-01 00:26:12,N,1.0,129,226,1.0,1.72,12.80,1.0,0.5,3.06,0.00,,1.0,18.36,1.0,1.0,0.00
1,2,2024-03-01 00:22:21,2024-03-01 00:35:15,N,1.0,130,218,1.0,3.25,17.70,1.0,0.5,0.00,0.00,,1.0,20.20,2.0,1.0,0.00
2,2,2024-03-01 00:45:27,2024-03-01 01:04:32,N,1.0,255,107,2.0,4.58,23.30,1.0,0.5,3.50,0.00,,1.0,32.05,1.0,1.0,2.75
3,1,2024-03-01 00:02:00,2024-03-01 00:23:45,N,1.0,181,71,1.0,0.00,22.50,0.0,1.5,0.00,0.00,,1.0,24.00,1.0,1.0,0.00
4,2,2024-03-01 00:16:45,2024-03-01 00:23:25,N,1.0,95,135,1.0,1.15,8.60,1.0,0.5,1.00,0.00,,1.0,12.10,1.0,1.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57452,2,2024-03-31 21:19:00,2024-03-31 21:30:00,,,25,61,,1.45,12.08,0.0,0.0,2.52,0.00,,1.0,15.60,,,
57453,2,2024-03-31 22:30:00,2024-03-31 22:35:00,,,41,42,,1.13,12.24,0.0,0.0,0.00,0.00,,1.0,13.24,,,
57454,2,2024-03-31 22:43:00,2024-03-31 22:48:00,,,223,7,,13062.08,12.08,0.0,0.0,3.77,0.00,,1.0,16.85,,,
57455,2,2024-03-31 22:48:00,2024-03-31 23:12:00,,,42,249,,7.96,40.52,0.0,0.0,8.75,0.00,,1.0,53.02,,,


In [14]:
# create target
# create target
# Trip duration in minutes (drop-off minus pick-up), computed with the
# vectorized .dt accessor instead of a per-row Python lambda.
trip_duration = hw_data.lpep_dropoff_datetime - hw_data.lpep_pickup_datetime
hw_data["duration_min"] = trip_duration.dt.total_seconds() / 60

In [15]:
# filter out outliers
# filter out outliers
# Keep trips lasting 0-60 minutes (both ends included) with more than 0 and
# at most 8 passengers; both conditions are combined into a single row mask.
duration_ok = hw_data.duration_min.between(0, 60)
passengers_ok = hw_data.passenger_count.between(0, 8, inclusive="right")
hw_data = hw_data.loc[duration_ok & passengers_ok]

In [16]:
# data labeling
# data labeling
# Name of the trip-duration column (in minutes) created earlier in the notebook
target = "duration_min"
# Columns handed to ColumnMapping as numerical features
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
# Columns handed to ColumnMapping as categorical features
cat_features = ["PULocationID", "DOLocationID"]

### Adding Custom Metrics
Next, we expand our monitoring by adding the `ColumnQuantileMetric` with a quantile value of 0.5 (the median) for the `fare_amount` column. The `ColumnSummaryMetric` is imported as well, but the reports below are built with the quantile metric only.

In [17]:
# (rows, columns) after the outlier filters and the added duration_min column
hw_data.shape

(54135, 21)

In [18]:
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric, ColumnSummaryMetric, ColumnQuantileMetric

# Here we can get the quantile for a single day

In [19]:
# Tell Evidently which columns are numerical and which are categorical.
# target is left as None, so this mapping declares no target column.
column_mapping = ColumnMapping(
    target=None,
    numerical_features=num_features,
    categorical_features=cat_features
)

In [20]:
# A report holding a single metric: the 0.5 quantile (median) of fare_amount
median_fare_metric = ColumnQuantileMetric(column_name='fare_amount', quantile=0.5)
report = Report(metrics=[median_fare_metric])

In [21]:
# Run the report on trips picked up on 2024-03-01 only: the window includes
# its left bound and excludes its right bound, and no reference data is given.
first_day_mask = hw_data.lpep_pickup_datetime.between('2024-03-01', '2024-03-02', inclusive="left")
first_day_data = hw_data.loc[first_day_mask]
report.run(reference_data=None, current_data=first_day_data, column_mapping=column_mapping)

In [22]:
from pprint import pprint

# Convert the finished report into a plain dictionary and pretty-print it
result = report.as_dict()
pprint(result)

{'metrics': [{'metric': 'ColumnQuantileMetric',
              'result': {'column_name': 'fare_amount',
                         'column_type': 'num',
                         'current': {'value': np.float64(13.5)},
                         'quantile': 0.5,
                         'reference': None}}]}


### Building the Dashboard
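We create a monitoring dashboard from the Evidently project's dashboard configuration (`project.dashboard`), configured with our selected metric; this happens further below. First, we check how to extract the metric value from the report dictionary.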

In [23]:
# Drill into the report dictionary down to the first metric's "current" result
result['metrics'][0]['result']['current']

{'value': np.float64(13.5)}

# Loop to get a report for each day

In [None]:
from datetime import datetime, timedelta

# Daily median of fare_amount over the half-open window [start_date, end_date).
# We assume that the dates are in datetime64 format.
start_date = datetime(2024, 3, 1)
end_date = datetime(2024, 4, 1)
one_day = timedelta(days=1)

# Maps "YYYY-MM-DD" -> median fare_amount of that day
daily_quantiles = {}

for day_offset in range((end_date - start_date).days):
    current_date = start_date + day_offset * one_day
    next_date = current_date + one_day

    # Filter by day (left bound included, right bound excluded)
    day_data = hw_data.loc[
        hw_data.lpep_pickup_datetime.between(current_date, next_date, inclusive="left")
    ]

    # A day without trips has no median; skipping it keeps NaN values out of
    # daily_quantiles, where they would make a later max() over the medians unreliable.
    if day_data.empty:
        continue

    # Create and run the report (a fresh Report per day, as no reference data is used)
    report = Report(metrics=[
        ColumnQuantileMetric(column_name='fare_amount', quantile=0.5)
    ])
    report.run(reference_data=None, current_data=day_data, column_mapping=column_mapping)

    # Extract value
    result = report.as_dict()
    median_value = result['metrics'][0]['result']['current']['value']

    # Save
    daily_quantiles[current_date.strftime("%Y-%m-%d")] = median_value

In [25]:
# Pick the (date, median) pair with the largest median; on ties max() keeps
# the first one in insertion order.
max_day, max_median = max(daily_quantiles.items(), key=lambda item: item[1])
print(f"fare_amount max quantile: {max_median} on date: {max_day}")

fare_amount max quantile: 14.2 on date: 2024-03-03


# Report and dashboard creation

In [26]:
# Open the Evidently workspace stored under the relative path "workspace"
# NOTE(review): presumably the directory is created when missing — confirm
ws = Workspace("workspace")

### Saving Dashboard Configuration
Finally, we discuss where to store the dashboard configuration for future use. Based on best practices and the project folder structure, the `dashboards/` directory is chosen.

In [27]:
# Register the project in the workspace, set its description and persist it.
# NOTE(review): create_project runs on every execution of this cell, so a
# re-run presumably registers another project with the same name — confirm.
project = ws.create_project("NYC Taxi Data Quality Project")
project.description = "My project description"
project.save()

Project(id=UUID('01979af7-60ab-7eb2-85af-f7d3754dad7f'), name='NYC Taxi Data Quality Project', description='My project descriotion', dashboard=DashboardConfig(name='NYC Taxi Data Quality Project', panels=[], tabs=[], tab_id_to_panel_ids={}), team_id=None, org_id=None, date_from=None, date_to=None, created_at=datetime.datetime(2025, 6, 22, 22, 6, 32, 875793), version='1')

In [None]:
from datetime import datetime, timedelta

# Add one timestamped report per day of [start_date, end_date) to the project.
# We assume that the dates are in datetime64 format.
start_date = datetime(2024, 3, 1)
end_date = datetime(2024, 4, 1)
one_day = timedelta(days=1)

for day_offset in range((end_date - start_date).days):
    current_date = start_date + day_offset * one_day
    next_date = current_date + one_day

    # Filter by day (left bound included, right bound excluded)
    day_data = hw_data.loc[
        hw_data.lpep_pickup_datetime.between(current_date, next_date, inclusive="left")
    ]

    # A day without trips has no median to report; skip it rather than
    # storing a report computed on an empty frame.
    if day_data.empty:
        continue

    # Create and run the report; the timestamp ties it to the day it describes
    report = Report(
        metrics=[ColumnQuantileMetric(column_name='fare_amount', quantile=0.5)],
        timestamp=current_date,
    )
    report.run(reference_data=None, current_data=day_data, column_mapping=column_mapping)

    ws.add_report(project.id, report)

### Add dashboard

In [29]:
#configure the dashboard
#configure the dashboard
# Title-only counter panel; its filter has no metadata or tag restrictions
title_panel = DashboardPanelCounter(
    title="NYC taxi data dashboard (Solo titulo2)",
    agg=CounterAgg.NONE,
    filter=ReportFilter(metadata_values={}, tag_values=[]),
)

# Series plotted in the bar chart: the "current.value" field of ColumnQuantileMetric
median_series = PanelValue(
    metric_id="ColumnQuantileMetric",
    field_path="current.value",
    legend="value",
)

# Half-width bar plot of the daily 0.5 quantile
median_panel = DashboardPanelPlot(
    title="Daily Quantile 0.5",
    filter=ReportFilter(metadata_values={}, tag_values=[]),
    values=[median_series],
    plot_type=PlotType.BAR,
    size=WidgetSize.HALF,
)

# Panels are added in display order: title first, then the plot
for panel in (title_panel, median_panel):
    project.dashboard.add_panel(panel)

project.save()

Project(id=UUID('01979af7-60ab-7eb2-85af-f7d3754dad7f'), name='NYC Taxi Data Quality Project', description='My project descriotion', dashboard=DashboardConfig(name='NYC Taxi Data Quality Project', panels=[DashboardPanelCounter(type='evidently:dashboard_panel:DashboardPanelCounter', id=UUID('01979af7-629c-7736-bcdc-14bd63bc1639'), title='NYC taxi data dashboard (Solo titulo2)', filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=False), size=<WidgetSize.FULL: 2>, agg=<CounterAgg.NONE: 'none'>, value=None, text=None), DashboardPanelPlot(type='evidently:dashboard_panel:DashboardPanelPlot', id=UUID('01979af7-629d-7e2a-8328-0a5ba17d7244'), title='Daily Quantile 0.5', filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=False), size=<WidgetSize.HALF: 1>, values=[PanelValue(field_path='current.value', metric_id='ColumnQuantileMetric', metric_fingerprint=None, metric_args={}, legend='value')], plot_type=<PlotType.BAR: 'bar'>)], tabs=[], tab_id_to_pa