In [1]:
import pandas as pd
import numpy as np

In [2]:
np.set_printoptions(threshold=np.inf, linewidth=200, formatter={'float': '{: 0.3f}'.format})

pd.set_option('display.width', 200)
pd.set_option('display.max_colwidth', 100)

In [3]:
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, RobustScaler

from sklearn.model_selection import train_test_split

In [4]:
from sklearn.metrics import (
    mean_squared_error,
    root_mean_squared_error
)

In [5]:
import xgboost as xgb

from hyperopt import (
    fmin,  # Function for minimizing/maximizing an objective function
    tpe,   # Tree-structured Parzen Estimator (TPE) algorithm for optimization
    hp,    # Defines search space for hyperparameters
    STATUS_OK,  # Constant indicating successful completion of an objective
    Trials  # Container for storing results of each trial
)


from hyperopt.pyll import scope # Handles scoping in hyperparameter definitions
from hyperopt.pyll.stochastic import sample

from functools import partial

In [6]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db") # sets the sqlite up for storing artifacts
mlflow.set_experiment("nyc-taxi-experiment") # this is the experiment. It will try to recognize if the experiment exist and, If not, it will create a new one. 

2024/07/25 15:08:16 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/bruno/Git/Course-MLOps/02-intro/mlruns/1', creation_time=1721916496215, experiment_id='1', last_update_time=1721916496215, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [7]:
print(f"XGBoost version: {xgb.__version__}")
print(f"MLflow version: {mlflow.__version__}")

XGBoost version: 2.0.3
MLflow version: 2.14.3


In [8]:
df = pd.read_parquet('./data/green_tripdata_2021.parquet')

df.isna().any()

lpep_pickup_datetime     False
lpep_dropoff_datetime    False
RatecodeID               False
PULocationID             False
DOLocationID             False
trip_distance            False
VendorID                 False
trip_type                False
duration                 False
dtype: bool

In [9]:
# Verifying if a categorical variable is a string.
# categorical = ['VendorID', 'trip_type']
categorical = ['trip_type', 'RatecodeID']
numerical = ['trip_distance']
label = ['duration']

categorical_columns = [col for col in categorical if col in df.columns]

for column in categorical_columns:
    if df[column].dtype == 'category':
        print(f"{column} is already of type str")
    else:
        print(f"{column} is not of type str")
        df[column] = df[column].astype('category')

df.dtypes

trip_type is not of type str
RatecodeID is not of type str


lpep_pickup_datetime     datetime64[us]
lpep_dropoff_datetime    datetime64[us]
RatecodeID                     category
PULocationID                      int64
DOLocationID                      int64
trip_distance                   float64
VendorID                          int64
trip_type                      category
duration                        float64
dtype: object

In [10]:
X = df[categorical + numerical]
y = df[label]

In [11]:
X_train, X_split, y_train, y_split = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_split, y_split, test_size=0.4, random_state=42)

print('Lengh train X and y: ', X_train.shape,' ', len(y_train))
print('Lengh valid X and y: ', X_val.shape,'  ', len(y_val))
print('Lengh test  X and y: ', X_test.shape,'  ', len(y_test))


Lengh train X and y:  (22232, 3)   22232
Lengh valid X and y:  (8893, 3)    8893
Lengh test  X and y:  (5929, 3)    5929


In [12]:
X_train.dtypes

trip_type        category
RatecodeID       category
trip_distance     float64
dtype: object

In [13]:
# ”c”, which represents categorical columns.
# ”q”, which represents numeric columns.
# ”int”, which represents integer columns.
# ”i”, which represents boolean columns.

# ft = ["c", "c", "q"]
# train = xgb.DMatrix(X_train, label=y_train, enable_categorical=True, feature_types = ft)

train = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
valid = xgb.DMatrix(X_val,   label=y_val,   enable_categorical=True)
test = xgb.DMatrix(X_test,   label=y_test,  enable_categorical=True)


In [14]:
# Convert DMatrix to Pandas DataFrame
train.get_data().toarray()
df = pd.DataFrame(train.get_data().toarray(), columns=[f'feature_{i}' for i in train.feature_names ])
df['label'] = train.get_label()

print(df.head())

df['feature_RatecodeID'].unique()


   feature_trip_type  feature_RatecodeID  feature_trip_distance      label
0                0.0                 0.0                   6.95  17.383333
1                0.0                 0.0                   1.40   6.666667
2                0.0                 0.0                   1.50   7.566667
3                0.0                 0.0                   1.58   7.166667
4                0.0                 0.0                   1.80  10.266666


array([ 0.000,  4.000,  1.000,  3.000,  2.000], dtype=float32)

### ***Linear regression***
----

In [15]:
def objective(params, name_set = "xgboost"):
    with mlflow.start_run():
        mlflow.set_tag("model", name_set)
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )

        y_pred_val = booster.predict(valid)
        rmse_val = root_mean_squared_error(y_val, y_pred_val)
        mlflow.log_metric("rmse_val", rmse_val)

        y_pred_test = booster.predict(test)
        rmse_test = root_mean_squared_error(y_test, y_pred_test)
        mlflow.log_metric("rmse_test", rmse_test)

    return {'loss': rmse_val, 'status': STATUS_OK}

In [17]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -6, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42
}

# best_result = fmin(
#     fn=objective,
#     space=search_space,
#     algo=tpe.suggest,
#     max_evals=50,
#     trials=Trials()
# )

In [20]:
# best_result

# {'learning_rate': 0.008611076311094068,
#  'max_depth': 4.0,
#  'min_child_weight': 18.488225034580513,
#  'reg_alpha': 0.29662061225495734,
#  'reg_lambda': 0.0371065942542032}

### ***Gamma Regression***
---

In [22]:
objective_gamma = partial(objective, name_set="xgboost-gamma")

search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -6, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:gamma',
    'seed': 42,
    'eval_metric': 'gamma-nloglik'
}

# best_result = fmin(
#     fn=objective_gamma,
#     space=search_space,
#     algo=tpe.suggest,
#     max_evals=50,
#     trials=Trials()
# )



{'learning_rate': 0.08055973081693549,
 'max_depth': 4.0,
 'min_child_weight': 15.76289220268423,
 'reg_alpha': 0.10079556317337575,
 'reg_lambda': 0.04834019493231889}

In [None]:
# best_result


# Parameter           Value
# eval_metric         gamma-nloglik
# learning_rate       0.08055973081693549
# max_depth           4
# min_child_weight    15.76289220268423
# objective           reg:gamma
# reg_alpha           0.10079556317337575
# reg_lambda          0.04834019493231889
# seed                42

In [25]:
params_final = {
 'learning_rate': 0.08055973081693549,
 'max_depth': int(4),
 'min_child_weight': 15.76289220268423,
 'reg_alpha': 0.10079556317337575,
 'reg_lambda': 0.04834019493231889,
 'objective': 'reg:gamma',
 'seed': 42,
 'eval_metric': 'gamma-nloglik'}


# Autologging is known to be compatible with the following package versions: 1.4.2 <= xgboost <= 2.0.3. 
# Autologging may not succeed when used with package versions outside of this range.

with mlflow.start_run():
    mlflow.xgboost.autolog()

    train = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    valid = xgb.DMatrix(X_val,   label=y_val,   enable_categorical=True)
    test = xgb.DMatrix(X_test,   label=y_test,  enable_categorical=True)

    booster = xgb.train(
        params=params_final,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

[0]	validation-gamma-nloglik:5.97515
[1]	validation-gamma-nloglik:5.72177
[2]	validation-gamma-nloglik:5.49132
[3]	validation-gamma-nloglik:5.28188
[4]	validation-gamma-nloglik:5.09180
[5]	validation-gamma-nloglik:4.91933
[6]	validation-gamma-nloglik:4.76303
[7]	validation-gamma-nloglik:4.62154
[8]	validation-gamma-nloglik:4.49360
[9]	validation-gamma-nloglik:4.37805
[10]	validation-gamma-nloglik:4.27379
[11]	validation-gamma-nloglik:4.17984
[12]	validation-gamma-nloglik:4.09527
[13]	validation-gamma-nloglik:4.01922
[14]	validation-gamma-nloglik:3.95094
[15]	validation-gamma-nloglik:3.88971
[16]	validation-gamma-nloglik:3.83487
[17]	validation-gamma-nloglik:3.78581
[18]	validation-gamma-nloglik:3.74201
[19]	validation-gamma-nloglik:3.70296
[20]	validation-gamma-nloglik:3.66819
[21]	validation-gamma-nloglik:3.63727
[22]	validation-gamma-nloglik:3.60982
[23]	validation-gamma-nloglik:3.58546
[24]	validation-gamma-nloglik:3.56389
[25]	validation-gamma-nloglik:3.54481
[26]	validation-gamma-

Stack trace:
  [bt] (0) /home/bruno/anaconda3/envs/MLOps/lib/libxgboost.so(+0x109124) [0x7c4ae8509124]
  [bt] (1) /home/bruno/anaconda3/envs/MLOps/lib/libxgboost.so(xgboost::RegTree::Save(dmlc::Stream*) const+0x4fe) [0x7c4ae8a68d4e]
  [bt] (2) /home/bruno/anaconda3/envs/MLOps/lib/libxgboost.so(xgboost::gbm::GBTreeModel::Save(dmlc::Stream*) const+0x238) [0x7c4ae8828128]
  [bt] (3) /home/bruno/anaconda3/envs/MLOps/lib/libxgboost.so(+0x442a6e) [0x7c4ae8842a6e]
  [bt] (4) /home/bruno/anaconda3/envs/MLOps/lib/libxgboost.so(XGBoosterSaveModel+0x482) [0x7c4ae85120a2]
  [bt] (5) /home/bruno/anaconda3/envs/MLOps/lib/python3.12/lib-dynload/../../libffi.so.8(+0xa052) [0x7c4b30623052]
  [bt] (6) /home/bruno/anaconda3/envs/MLOps/lib/python3.12/lib-dynload/../../libffi.so.8(+0x8925) [0x7c4b30621925]
  [bt] (7) /home/bruno/anaconda3/envs/MLOps/lib/python3.12/lib-dynload/../../libffi.so.8(ffi_call+0xde) [0x7c4b3062206e]
  [bt] (8) /home/bruno/anaconda3/envs/MLOps/lib/python3.12/lib-dynload/_ctypes.cpy