# Baseline model for batch monitoring example

In [5]:
import requests 
import pandas as pd 
import datetime 

from joblib import load, dump
from tqdm import tqdm 

from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [8]:
# Download the raw trip-data files from the internet.
# Each entry is (file name on the server, local directory to save it into).
files = [('green_tripdata_2022-02.parquet', '../data'),
         ('green_tripdata_2022-01.parquet', '../data')]

# Stream in 1 MiB chunks: iter_content() defaults to chunk_size=1, which means
# one Python-level loop iteration and one write() call per byte.
CHUNK_SIZE = 1024 * 1024

print('Download files:')
for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"{path}/{file}"
    # The context manager releases the connection even if the download fails;
    # the timeout stops the cell from hanging forever on a stalled connection.
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail loudly on 4xx/5xx instead of saving an error page as .parquet.
        resp.raise_for_status()
        # Content-Length may be absent; tqdm accepts total=None and then shows
        # a running byte count without a percentage.
        content_length = resp.headers.get("Content-Length")
        total = int(content_length) if content_length is not None else None
        with open(save_path, "wb") as handle, tqdm(desc=f"{file}",
                                                   postfix=f"save to {save_path}",
                                                   total=total,
                                                   unit="B",
                                                   unit_scale=True) as progress:
            for data in resp.iter_content(chunk_size=CHUNK_SIZE):
                handle.write(data)
                # Advance by the bytes actually received, not by iterations.
                progress.update(len(data))

Download files:


green_tripdata_2022-02.parquet: 100%|██████████| 1428262/1428262 [00:03<00:00, 464731.63it/s, save to ../data/green_tripdata_2022-02.parquet]
green_tripdata_2022-01.parquet: 100%|██████████| 1254291/1254291 [00:02<00:00, 474111.20it/s, save to ../data/green_tripdata_2022-01.parquet]


In [10]:
# Load the 2022-01 trip data file that the download cell saved under ../data.
jan_data = pd.read_parquet('../data/green_tripdata_2022-01.parquet')

In [13]:
# Create the target: trip duration in minutes (dropoff time minus pickup time).
# `.dt.total_seconds()` converts the whole timedelta column in one vectorized
# call instead of running a Python lambda on every row via `.apply`.
jan_data["duration_min"] = (
    jan_data.lpep_dropoff_datetime - jan_data.lpep_pickup_datetime
).dt.total_seconds() / 60

In [14]:
# Drop outliers: keep only trips whose duration is within 0-60 minutes
# (both ends inclusive) and whose passenger count is above 0 and at most 8.
jan_data = jan_data[
    (0 <= jan_data["duration_min"]) & (jan_data["duration_min"] <= 60)
    & (0 < jan_data["passenger_count"]) & (jan_data["passenger_count"] <= 8)
]