In [44]:
import pandas as pd

In [45]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

import pickle

In [46]:
import mlflow

In [47]:
# Store run metadata in the SQLite file `mlruns.db` instead of MLflow's default store.
mlflow.set_tracking_uri(uri="sqlite:///mlruns.db")

# Make this the active experiment for every run started below. Kept as the
# last expression of the cell so the resulting Experiment object is displayed.
mlflow.set_experiment(experiment_name='nyc-experiment-tracker')

<Experiment: artifact_location='/home/rohit/mlops-zoomcamp/02-Experiment-Tracking/mlruns/1', creation_time=1726390207628, experiment_id='1', last_update_time=1726390207628, lifecycle_stage='active', name='nyc-experiment-tracker', tags={}>

In [48]:
def read_data(filename):
    """Load a trip-record parquet file and prepare it for duration modelling.

    Steps performed:
      1. Read the parquet file at ``filename``.
      2. Add a ``duration`` column: drop-off time minus pickup time, in minutes.
      3. Keep only trips whose duration is strictly greater than 1 minute and
         at most 60 minutes.
      4. Cast ``PULocationID`` and ``DOLocationID`` to ``str``.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.
        The file must provide ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        The filtered trips with the extra ``duration`` column. The original
        row index labels of the surviving rows are preserved.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    elapsed = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Explicit copy: the boolean-mask selection is otherwise a slice of the
    # full frame, and the column assignment below would be a write to that slice.
    in_range = (df.duration > 1) & (df.duration <= 60)
    df = df[in_range].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [49]:
# Feature / label configuration.
# 'PU_DO' replaces the two separate zone columns ('PULocationID', 'DOLocationID').
categorical = ['PU_DO']
numerical = ['trip_distance']
target = 'duration'

# --- Training split: January 2021 ---
train_df = read_data('../data/green_tripdata_2021-01.parquet')
# Combined pickup/drop-off key, e.g. "<pickup>_<dropoff>".
train_df['PU_DO'] = train_df['PULocationID'] + '_' + train_df['DOLocationID']

# --- Validation split: February 2021 ---
val_df = read_data('../data/green_tripdata_2021-02.parquet')
val_df['PU_DO'] = val_df['PULocationID'] + '_' + val_df['DOLocationID']

# The vectorizer is fitted on the training records only; the validation
# records are merely transformed with that already-fitted vectorizer.
dv = DictVectorizer()

train_dicts = train_df[categorical + numerical].to_dict(orient='records')
train_x = dv.fit_transform(train_dicts)
train_y = train_df[target].values

val_dicts = val_df[categorical + numerical].to_dict(orient='records')
val_x = dv.transform(val_dicts)
val_y = val_df[target].values

In [50]:
with mlflow.start_run():
    # Run metadata: who produced the run and which files it was built from.
    mlflow.set_tag(key='developer', value='rohit')
    mlflow.log_param(key='train-data-path', value='../data/green_tripdata_2021-01.parquet')
    mlflow.log_param(key='val-data-path', value='../data/green_tripdata_2021-02.parquet')

    # Fit the linear-regression baseline on the vectorised training split.
    lr = LinearRegression().fit(train_x, train_y)

    # Score on the validation split and record the error with the run.
    pred_y = lr.predict(val_x)
    rmse = root_mean_squared_error(y_true=val_y, y_pred=pred_y)
    mlflow.log_metric(key='rmse', value=rmse)
