In [1]:
import requests
import datetime
import pandas as pd

from evidently import DataDefinition
from evidently import Dataset
from evidently import Report
from evidently.presets import DataSummaryPreset
from evidently.metrics import (
    QuantileValue,
    RowCount,
    EmptyRowsCount,
    ValueDrift,
    DriftedColumnsCount,
    MissingValueCount,
)
from evidently.ui.workspace import Workspace
from evidently.sdk.panels import (
    PanelMetric,
    text_panel,
    bar_plot_panel,
    DashboardPanelPlot,
)
from joblib import dump
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm

  np_bool = np.bool  # type: ignore[attr-defined]


In [2]:
# Location and name of the NYC green-taxi trip file used throughout the notebook.
path = "./data"
file = "green_tripdata_2024-03.parquet"

print("Download files:")

url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
save_path = f"{path}/{file}"

# Stream the file to disk. The response is used as a context manager so the
# connection is released even if writing fails part-way through.
with requests.get(url, stream=True, timeout=60) as resp:
    # Fail fast on HTTP errors instead of saving an error page as a parquet file.
    resp.raise_for_status()

    # Content-Length can be absent; tqdm accepts total=None in that case.
    content_length = resp.headers.get("Content-Length")
    total_bytes = int(content_length) if content_length is not None else None

    with open(save_path, "wb") as handle:
        with tqdm(
            desc=f"{file}",
            postfix=f"save to {save_path}",
            total=total_bytes,
            unit="B",
            unit_scale=True,
        ) as progress:
            # iter_content() defaults to 1-byte chunks, i.e. one Python loop
            # iteration per byte; read in 64 KiB chunks and advance the bar by
            # the number of bytes actually written.
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                handle.write(chunk)
                progress.update(len(chunk))

Download files:


green_tripdata_2024-03.parquet: 100%|██████████| 1372372/1372372 [00:06<00:00, 222208.76it/s, save to ./data/green_tripdata_2024-03.parquet]


In [3]:
# Load the downloaded trip records from the location they were saved to.
march_data = pd.read_parquet(save_path)

In [4]:
# Quick size check of the raw frame: (number of rows, number of columns).
march_data.shape

(57457, 20)

In [5]:
# create target
# Trip duration in minutes: drop-off time minus pick-up time. The conversion
# uses the vectorized `.dt` accessor instead of a row-by-row `.apply`, and the
# column is assigned with bracket syntax throughout.
march_data["duration_min"] = (
    march_data.lpep_dropoff_datetime - march_data.lpep_pickup_datetime
).dt.total_seconds() / 60


In [6]:
# filter out outliers
# Keep trips lasting between 0 and 60 minutes (both bounds inclusive) that
# carried more than 0 and at most 8 passengers.
plausible_duration = march_data.duration_min.between(0, 60)
plausible_passengers = march_data.passenger_count.gt(0) & march_data.passenger_count.le(8)
march_data = march_data[plausible_duration & plausible_passengers]


In [7]:
# data labeling
# Column roles used for training and monitoring: the regression target, the
# numerical inputs and the categorical inputs.
target = "duration_min"
num_features = [
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "total_amount",
]
cat_features = [
    "PULocationID",
    "DOLocationID",
]

In [8]:
# Positional split: the first 30000 filtered rows are used for training, the
# remainder for validation. `.copy()` gives each split its own frame, so
# adding the "prediction" column later does not write into a slice of
# `march_data` (which triggers pandas' SettingWithCopyWarning).
train_data = march_data.iloc[:30000].copy()
val_data = march_data.iloc[30000:].copy()

In [9]:
# Fit a linear regression on the numerical and categorical feature columns
# of the training split, predicting the target column.
model = LinearRegression()
model.fit(
    X=train_data[[*num_features, *cat_features]],
    y=train_data[target],
)

In [10]:
# Persist the fitted model with joblib by writing it to a binary file handle.
with open("models/lin_reg.bin", "wb") as model_file:
    dump(model, model_file)

In [11]:
# Score each split with the fitted model and store the result in a
# "prediction" column alongside the features and the target.
train_preds = model.predict(train_data[[*num_features, *cat_features]])
train_data["prediction"] = train_preds

val_preds = model.predict(val_data[[*num_features, *cat_features]])
val_data["prediction"] = val_preds

In [12]:
# Mean absolute error of the predictions against the actual durations,
# reported separately for the training and validation splits.
train_mae = mean_absolute_error(train_data["duration_min"], train_data["prediction"])
val_mae = mean_absolute_error(val_data["duration_min"], val_data["prediction"])

print("Train mean absolute error:", train_mae)
print("Val mean absolute error:", val_mae)


Train mean absolute error: 3.772473239359446
Val mean absolute error: 3.7168145679293674


In [13]:
# Save the validation split (including its "prediction" column) to disk as
# data/reference.parquet.
val_data.to_parquet("data/reference.parquet")


# Evidently Report

In [14]:
# Evidently workspace handle, created with the argument "workspace"
# (presumably a local directory relative to the notebook; confirm against the Evidently docs).
ws = Workspace("workspace")

In [15]:
# Create the monitoring project inside the workspace and save it.
# NOTE(review): this calls create_project unconditionally, so re-running the
# cell presumably creates another project with the same name; verify and
# clean up the workspace if needed.
project = ws.create_project("NYC Taxi Data Quality Project")
project.save()

In [16]:
# Describe the column roles for Evidently: the numerical features plus the
# model's "prediction" column are numerical, the location IDs are categorical.
data_definition = DataDefinition(
    numerical_columns=[*num_features, "prediction"],
    categorical_columns=cat_features,
)

# Wrap both splits as Evidently datasets that share the same definition.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame, data_definition) for frame in (train_data, val_data)
)

In [17]:
# Metrics evaluated in a single report run.
report_metrics = [
    DataSummaryPreset(),
    ValueDrift(column="prediction"),
    DriftedColumnsCount(),
    MissingValueCount(column="prediction"),
    QuantileValue(column="fare_amount", quantile=0.5),  # 50th percentile of the fare
    RowCount(),
    EmptyRowsCount(),
]
regular_report = Report(metrics=report_metrics)

# Run the report with the validation split as current data and the training
# split as reference data, stamped with a fixed timestamp of 2024-03-31.
regular_snapshot = regular_report.run(
    current_data=val_dataset,
    reference_data=train_dataset,
    timestamp=datetime.datetime(2024, 3, 31),
)


In [18]:
# Register the report snapshot with the workspace under this project's id.
ws.add_run(project.id, regular_snapshot)

In [19]:
# Dashboard layout, added in display order: a title panel, two half-width
# bar plots for the row counters, and a plot of the median fare amount.
dashboard = project.dashboard

dashboard.add_panel(text_panel(title="NYC taxi data dashboard"))

# The two counter panels differ only in their title and the metric they show.
for panel_title, metric_name in (
    ("Number of Rows", "RowCount"),
    ("Number of Empty Rows", "EmptyRowsCount"),
):
    count_series = PanelMetric(metric=metric_name, legend="count")
    dashboard.add_panel(
        bar_plot_panel(title=panel_title, values=[count_series], size="half")
    )

# The labels select the QuantileValue series for fare_amount at quantile 0.5.
median_fare_series = PanelMetric(
    metric="QuantileValue",
    metric_labels={"column": "fare_amount", "quantile": 0.5},
)
dashboard.add_panel(
    DashboardPanelPlot(
        title="50th Percentile of Fare Amount",
        values=[median_fare_series],
    )
)

project.save()