In [8]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error

Set the MLflow tracking URI and the experiment.

In [9]:
# Tracking backend (a SQLite URI) and the experiment that runs are grouped under.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(TRACKING_URI)
# Left as the final expression so the cell displays the Experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location=('/Users/andreanicolas/Library/CloudStorage/GoogleDrive-andreanicolas91@gmail.com/My '
 'Drive/ASU_ComputerScience/MLOps_studies/02-experiment-tracking/mlruns/1'), creation_time=1716340686867, experiment_id='1', last_update_time=1716340686867, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

Read in taxi data using pandas.

In [10]:
def read_dataframe(filename):
    """Load a taxi-trip parquet file and prepare it for modelling.

    Steps:
      1. Read the file with ``pd.read_parquet``.
      2. Add a ``duration`` column: ``lpep_dropoff_datetime`` minus
         ``lpep_pickup_datetime``, expressed in minutes.
      3. Keep only trips whose duration is between 1 and 60 minutes
         (both bounds inclusive).
      4. Cast ``PULocationID`` and ``DOLocationID`` to strings so they are
         treated as categories rather than numbers downstream.

    Parameters
    ----------
    filename : str or path-like
        Passed unchanged to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows, so callers can add or
        overwrite columns without triggering ``SettingWithCopyWarning``.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes (no per-row Python lambda).
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Keep trips of 1-60 minutes. The explicit .copy() detaches the result
    # from the unfiltered frame so the column assignment below (and any
    # later ones made by callers) write to a real frame, not a slice.
    in_range = df['duration'].between(1, 60)
    df = df.loc[in_range].copy()

    # Pickup and drop-off zones are identifiers, not quantities.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [11]:
# One parquet file per split: 2023-01 for training, 2023-02 for validation.
train_path = '../01-intro/data/green_tripdata_2023-01.parquet'
val_path = '../01-intro/data/green_tripdata_2023-02.parquet'

df_train = read_dataframe(train_path)
df_val = read_dataframe(val_path)

# Row counts of both splits, shown as the cell output.
(len(df_train), len(df_val))

(65946, 62574)

Use `DictVectorizer` to build the training and validation feature matrices (X).

In [12]:
print('combining pickup location ID and dropoff location ID...')

# Build the same "<pickup>_<dropoff>" string feature on both splits.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

combining pickup location ID and dropoff location ID...


In [13]:
# Model inputs: the combined pickup/drop-off key plus the trip distance.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_cols = categorical + numerical

# One dict per row, which is the input format DictVectorizer works on.
train_dicts = df_train[feature_cols].to_dict(orient='records')
val_dicts = df_val[feature_cols].to_dict(orient='records')

# Fit the vectorizer on the training rows only; the validation rows are
# transformed with that already-fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

Obtaining Y values (labels)

In [14]:
# Regression target for each split: the trip duration column as an array.
y_train, y_val = (frame['duration'].values for frame in (df_train, df_val))

Fit a linear regression and compute the training and validation RMSE.

In [15]:
# Baseline model; fit() returns the estimator, so it can be chained.
lr = LinearRegression().fit(X_train, y_train)

# Error on the data the model was fitted on.
y_pred_train = lr.predict(X_train)
train_rmse = root_mean_squared_error(y_train, y_pred_train)

# Error on the held-out validation split.
y_pred_val = lr.predict(X_val)
val_rmse = root_mean_squared_error(y_val, y_pred_val)

print(f'RMSE from train data is : {train_rmse:.2f}')
print(f'RMSE from validation data is : {val_rmse:.2f}')

RMSE from train data is : 4.80
RMSE from validation data is : 6.04


Save the vectorizer and the model together as a pickle file.

In [16]:
# Persist the fitted vectorizer together with the model, so the same
# feature encoding can be reapplied when the model is loaded.
model_path = Path('./models/lin_reg.bin')

# Create the output directory if it is missing; otherwise opening the file
# for writing fails with FileNotFoundError on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as f_out:
    pickle.dump((dv, lr), f_out)

Lasso regression, tracked as an MLflow run:

In [19]:
with mlflow.start_run():
    mlflow.set_tag("developer", "andrea")

    # Record which files the train / validation splits came from.
    data_paths = {
        "train-data-path": "../01-intro/data/green_tripdata_2023-01.parquet",
        "valid-data-path": "../01-intro/data/green_tripdata_2023-02.parquet",
    }
    for param_name, data_path in data_paths.items():
        mlflow.log_param(param_name, data_path)

    # Regularisation strength, logged so runs can be compared on it.
    alpha = 0.1
    mlflow.log_param("alpha", alpha)

    # Note: this rebinds `lr`, which previously held the LinearRegression model.
    lr = Lasso(alpha).fit(X_train, y_train)
    y_pred_val = lr.predict(X_val)

    # Validation error is the single metric tracked for this run.
    val_rmse = root_mean_squared_error(y_val, y_pred_val)
    mlflow.log_metric("rmse", val_rmse)
    print(f'RMSE from validation data is : {val_rmse:.2f}')