In [1]:
import pandas as pd
import numpy as np

In [2]:
# NumPy: never summarise arrays, allow 200-character lines, and render floats
# with three decimals (a leading space is reserved for the sign).
np.set_printoptions(
    threshold=np.inf,
    linewidth=200,
    formatter={'float': lambda value: '{: 0.3f}'.format(value)},
)

# pandas: allow 200-character lines and up to 100 characters per column.
pd.options.display.width = 200
pd.options.display.max_colwidth = 100

In [3]:
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, RobustScaler

from sklearn.model_selection import train_test_split

In [4]:
from sklearn.metrics import (
    mean_squared_error,
    root_mean_squared_error
)

In [5]:
import xgboost as xgb

from hyperopt import (
    fmin,  # Function for minimizing/maximizing an objective function
    tpe,   # Tree-structured Parzen Estimator (TPE) algorithm for optimization
    hp,    # Defines search space for hyperparameters
    STATUS_OK,  # Constant indicating successful completion of an objective
    Trials  # Container for storing results of each trial
)


from hyperopt.pyll import scope # Handles scoping in hyperparameter definitions
from hyperopt.pyll.stochastic import sample

In [6]:
import mlflow

# Use a local SQLite file as the MLflow tracking backend. Note that the
# experiment's artifact_location (see the cell output) points at a separate
# mlruns/ directory, so artifacts are not stored inside the database.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Select the experiment by name; if it does not exist yet, it is created.
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='/workspaces/Course-MLOps/02-intro/mlruns/1', creation_time=1720013781052, experiment_id='1', last_update_time=1720013781052, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [7]:
# Load the trip records from the local parquet file into the working frame.
df = pd.read_parquet('./data/green_tripdata_2021.parquet')

# Per-column check: True means the column holds at least one missing value.
df.isna().any()

lpep_pickup_datetime     False
lpep_dropoff_datetime    False
RatecodeID               False
PULocationID             False
DOLocationID             False
trip_distance            False
VendorID                 False
trip_type                False
duration                 False
dtype: bool

In [8]:
# Make sure every categorical feature uses the pandas 'category' dtype, so that
# xgb.DMatrix(..., enable_categorical=True) can treat it as categorical.
categorical = ['trip_type', 'RatecodeID']
numerical = ['trip_distance']
label = ['duration']

# Only touch the categorical columns that are actually present in the frame.
categorical_columns = [col for col in categorical if col in df.columns]

for column in categorical_columns:
    if df[column].dtype == 'category':
        print(f"{column} is already of type category")
    else:
        print(f"{column} is not of type category")
        df[column] = df[column].astype('category')

# Show the resulting dtypes to confirm the conversion.
df.dtypes

trip_type is not of type str
RatecodeID is not of type str


lpep_pickup_datetime     datetime64[us]
lpep_dropoff_datetime    datetime64[us]
RatecodeID                     category
PULocationID                      int64
DOLocationID                      int64
trip_distance                   float64
VendorID                          int64
trip_type                      category
duration                        float64
dtype: object

In [9]:
# Model inputs: categorical columns first, then the numerical ones.
feature_columns = categorical + numerical
X = df[feature_columns]

# Target is selected with a list, so y stays a one-column DataFrame.
y = df[label]

In [10]:
# First cut: 60% of the rows for training, the remaining 40% held out.
X_train, X_split, y_train, y_split = train_test_split(
    X, y, test_size=0.4, random_state=42
)
# Second cut: the held-out part is divided 60/40 into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_split, y_split, test_size=0.4, random_state=42
)

# Report the size of each partition (feature shape, then number of labels).
for heading, features, gap, target in (
    ('Lengh train X and y: ', X_train, ' ', y_train),
    ('Lengh valid X and y: ', X_val, '  ', y_val),
    ('Lengh test  X and y: ', X_test, '  ', y_test),
):
    print(heading, features.shape, gap, len(target))


Lengh train X and y:  (22232, 3)   22232
Lengh valid X and y:  (8893, 3)    8893
Lengh test  X and y:  (5929, 3)    5929


In [14]:
# Feature-type codes accepted by DMatrix's `feature_types` argument:
#   "c"   -> categorical column
#   "q"   -> numeric column
#   "int" -> integer column
#   "i"   -> boolean column
# They could be passed explicitly, e.g. feature_types=["c", "c", "q"]; here the
# types are left to be taken from the DataFrame via enable_categorical=True.

# Build one DMatrix per partition, in the order train, validation, test.
train, valid, test = [
    xgb.DMatrix(features, label=target, enable_categorical=True)
    for features, target in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
]



In [12]:
# Convert the training DMatrix back to a pandas DataFrame to inspect how the
# features were stored. A dedicated name is used so the raw trips frame `df`
# is not overwritten and the earlier cells stay re-runnable.
train_preview = pd.DataFrame(
    train.get_data().toarray(),
    columns=[f'feature_{name}' for name in train.feature_names],
)
train_preview['label'] = train.get_label()

print(train_preview.head())

# Distinct values of RatecodeID as they are stored inside the DMatrix.
train_preview['feature_RatecodeID'].unique()


   feature_trip_type  feature_RatecodeID  feature_trip_distance      label
0                0.0                 0.0                   6.95  17.383333
1                0.0                 0.0                   1.40   6.666667
2                0.0                 0.0                   1.50   7.566667
3                0.0                 0.0                   1.58   7.166667
4                0.0                 0.0                   1.80  10.266666


array([ 0.000,  4.000,  1.000,  3.000,  2.000], dtype=float32)

In [22]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and log it to MLflow.

    Args:
        params: Hyperparameter dict for this trial. It is logged to MLflow
            and passed unchanged to ``xgb.train``.

    Returns:
        dict: ``'loss'`` holds the validation RMSE (the value hyperopt
        minimises) and ``'status'`` is ``STATUS_OK``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )

        rmse_val = root_mean_squared_error(y_val, booster.predict(valid))
        mlflow.log_metric("rmse_val", rmse_val)

        # The test score is logged for reporting only. It must not be the
        # optimisation target, otherwise the hyperparameters are tuned on the
        # held-out test set and the reported test error is no longer unbiased.
        rmse_test = root_mean_squared_error(y_test, booster.predict(test))
        mlflow.log_metric("rmse_test", rmse_test)

    return {'loss': rmse_val, 'status': STATUS_OK}

In [23]:
# Search space for the TPE optimiser. The loguniform bounds are exponents,
# e.g. learning_rate is drawn from [exp(-6), exp(0)].
search_space = dict(
    max_depth=scope.int(hp.quniform('max_depth', 4, 100, 1)),
    learning_rate=hp.loguniform('learning_rate', -6, 0),
    reg_alpha=hp.loguniform('reg_alpha', -5, -1),
    reg_lambda=hp.loguniform('reg_lambda', -6, -1),
    min_child_weight=hp.loguniform('min_child_weight', -1, 3),
    # Fixed (non-searched) settings passed through to XGBoost.
    objective='reg:squarederror',
    seed=42,
)

# Run 50 trials, minimising the loss returned by `objective`.
trial_history = Trials()
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trial_history,
)

[0]	validation-rmse:10.04235                          
[1]	validation-rmse:9.99460                           
[2]	validation-rmse:9.94724                           
[3]	validation-rmse:9.90026                           
[4]	validation-rmse:9.85365                           
[5]	validation-rmse:9.80743                           
[6]	validation-rmse:9.76158                           
[7]	validation-rmse:9.71611                           
[8]	validation-rmse:9.67101                           
[9]	validation-rmse:9.62628                           
[10]	validation-rmse:9.58191                          
[11]	validation-rmse:9.53791                          
[12]	validation-rmse:9.49428                          
[13]	validation-rmse:9.45100                          
[14]	validation-rmse:9.40808                          
[15]	validation-rmse:9.36553                          
[16]	validation-rmse:9.32332                          
[17]	validation-rmse:9.28147                          
[18]	valid

(0.0024787521766663585, 1.0)