In [1]:
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error
import pickle

In [2]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Basic Taxi Predictor")

<Experiment: artifact_location='/workspaces/MLOPSZoomCamp/03-exp-tracking/mlruns/1', creation_time=1733958842474, experiment_id='1', last_update_time=1733958842474, lifecycle_stage='active', name='Basic Taxi Predictor', tags={}>

In [33]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [34]:
df_train = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
df_test = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')

In [42]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_test['PU_DO'] = df_test['PULocationID'] + '_' + df_test['DOLocationID']

In [44]:
categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_test[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [45]:
target = 'duration'
y_train = df_train[target].values
y_val = df_test[target].values

In [48]:
with mlflow.start_run():

    mlflow.set_tag("developer","blake")

    mlflow.log_param("train data", "January 2021 Yellow")
    mlflow.log_param("test data", "February 2021 Yellow")

    alpha = 0.1 
    mlflow.log_param("alpha", alpha)


    lr = Lasso(alpha)
    lr.fit(X_train,y_train)

    y_pred = lr.predict(X_val)

    rmse = root_mean_squared_error(y_val,y_pred)
    mlflow.log_metric("rmse", rmse)

In [49]:
with open('Models/lin_reg.bin','wb') as f_out:
    pickle.dump((dv,lr),f_out)

In [50]:
import xgboost as xgb

from hyperopt import fmin,tpe,hp,STATUS_OK,Trials
from hyperopt.pyll import scope

In [51]:
train = xgb.DMatrix(X_train,label=y_train)
valid = xgb.DMatrix(X_val,label=y_val)

In [52]:
def objective(params):
    
    with mlflow.start_run():
        mlflow.set_tag("model","xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid,"validation")],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(y_val,y_pred)
        mlflow.log_metric("rmse",rmse)
    
    return {'loss':rmse, 'status':STATUS_OK}

In [53]:
search_space = {
    'max_depth' : scope.int(hp.quniform('max_depth',4,100,1)),
    'learning_rate' : hp.loguniform('learning_rate',-3,0),
    'reg_alpha' : hp.loguniform('reg_alpha',-5,-1),
    'reg_lambda' : hp.loguniform('reg_lambda',-6,-1),
    'min_child_weight' : hp.loguniform('min_child_weight',-1,3),
    'objective': 'reg:linear',
    'seed':42,
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]




[0]	validation-rmse:9.33511                           
[1]	validation-rmse:7.90173                           
[2]	validation-rmse:7.22960                           
[3]	validation-rmse:6.90346                           
[4]	validation-rmse:6.74207                           
[5]	validation-rmse:6.66070                           
[6]	validation-rmse:6.61521                           
[7]	validation-rmse:6.58839                           
[8]	validation-rmse:6.57005                           
[9]	validation-rmse:6.55917                           
[10]	validation-rmse:6.55285                          
[11]	validation-rmse:6.54588                          
[12]	validation-rmse:6.54073                          
[13]	validation-rmse:6.53847                          
[14]	validation-rmse:6.53347                          
[15]	validation-rmse:6.53038                          
[16]	validation-rmse:6.52771                          
[17]	validation-rmse:6.52465                          
[18]	valid




[0]	validation-rmse:11.28655                                                     
[1]	validation-rmse:10.50204                                                     
[2]	validation-rmse:9.84155                                                      
[3]	validation-rmse:9.28856                                                      
[4]	validation-rmse:8.82812                                                      
[5]	validation-rmse:8.44646                                                      
[6]	validation-rmse:8.13240                                                      
[7]	validation-rmse:7.87356                                                      
[8]	validation-rmse:7.66169                                                      
[9]	validation-rmse:7.48842                                                      
[10]	validation-rmse:7.34695                                                     
[11]	validation-rmse:7.23103                                                     
[12]	validation-

KeyboardInterrupt: 