In [7]:
import pandas as pd
import numpy as np

In [None]:
# NumPy: never summarise large arrays, allow 200-character rows and render
# floats with three decimals.
np.set_printoptions(
    threshold=np.inf,
    linewidth=200,
    formatter={'float': '{: 0.3f}'.format},
)

# pandas: wider console output and longer cell contents before truncation.
for option_name, option_value in (('display.width', 200), ('display.max_colwidth', 100)):
    pd.set_option(option_name, option_value)

In [None]:
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, RobustScaler

from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import (
    mean_squared_error,
    root_mean_squared_error
)

In [None]:
import xgboost as xgb

from hyperopt import (
    fmin,  # Function for minimizing/maximizing an objective function
    tpe,   # Tree-structured Parzen Estimator (TPE) algorithm for optimization
    hp,    # Defines search space for hyperparameters
    STATUS_OK,  # Constant indicating successful completion of an objective
    Trials  # Container for storing results of each trial
)


from hyperopt.pyll import scope # Handles scoping in hyperparameter definitions
from hyperopt.pyll.stochastic import sample

from functools import partial

In [None]:
import mlflow

# Use a local SQLite database file as the MLflow tracking backend, i.e. the store for
# run metadata such as parameters, metrics and tags.
# NOTE(review): artifacts are presumably written to MLflow's default artifact location
# rather than into this database — confirm.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Select the experiment that subsequent runs are recorded under; MLflow creates it
# when it does not exist yet.
mlflow.set_experiment("nyc-taxi-experiment")

In [None]:
# Report the library versions in use; the autologging note in the final training
# cell depends on the XGBoost version.
for library_label, library in (("XGBoost", xgb), ("MLflow", mlflow)):
    print(f"{library_label} version: {library.__version__}")

In [None]:
# Read the trip data; the path is relative to the notebook's working directory.
trips_path = './data/green_tripdata_2021.parquet'
df = pd.read_parquet(trips_path)

# Per-column flag: True where the column holds at least one missing value.
df.isna().any()

In [None]:
# Column roles used for modelling.
# categorical = ['VendorID', 'trip_type']
categorical = ['trip_type', 'RatecodeID']
numerical = ['trip_distance']
label = ['duration']

# Only touch the categorical columns that are actually present in the frame.
categorical_columns = [col for col in categorical if col in df.columns]

# Make sure every categorical feature uses pandas' 'category' dtype, which is what
# xgb.DMatrix(..., enable_categorical=True) relies on further down.
# The messages name the dtype that is really being checked ('category', not str).
for column in categorical_columns:
    if df[column].dtype == 'category':
        print(f"{column} is already of type category")
    else:
        print(f"{column} is not of type category; converting")
        df[column] = df[column].astype('category')

df.dtypes

In [None]:
# Feature matrix: categorical columns first, then the numerical ones.
feature_columns = categorical + numerical
X = df[feature_columns]

# `label` is a list, so the target stays a one-column DataFrame.
y = df[label]

In [None]:
# Two-stage split: 60% of the rows go to training; the remaining 40% is divided
# 60/40 into validation (24% of all rows) and test (16% of all rows).
X_train, X_split, y_train, y_split = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_split, y_split, test_size=0.4, random_state=42)

# Every row must land in exactly one of the three subsets.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

print('Length train X and y: ', X_train.shape,' ', len(y_train))
print('Length valid X and y: ', X_val.shape,'  ', len(y_val))
print('Length test  X and y: ', X_test.shape,'  ', len(y_test))


In [None]:
# Show the feature dtypes of the training split before the DMatrix objects are built.
X_train.dtypes

In [None]:
# Valid entries for DMatrix `feature_types`, should they ever be passed explicitly:
#   "c"   -> categorical column
#   "q"   -> numeric column
#   "int" -> integer column
#   "i"   -> boolean column
# e.g. xgb.DMatrix(X_train, label=y_train, enable_categorical=True, feature_types=["c", "c", "q"])

# One DMatrix per split, each built with enable_categorical=True.
train, valid, test = (
    xgb.DMatrix(features, label=target, enable_categorical=True)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)


In [None]:
# Convert DMatrix to Pandas DataFrame.
# The training matrix is densified exactly once, directly into the frame.
# NOTE(review): this rebinds `df`, which held the loaded trip data until now; re-running
# an earlier cell that expects that frame requires reloading the parquet file first.
df = pd.DataFrame(train.get_data().toarray(), columns=[f'feature_{i}' for i in train.feature_names ])
df['label'] = train.get_label()

print(df.head())

# Distinct values stored in the DMatrix for the RatecodeID feature.
df['feature_RatecodeID'].unique()


### ***XGBoost regression (squared-error objective)***
----

In [None]:
def objective(params, name_set = "xgboost"):
    """Train one XGBoost model inside an MLflow run and report its validation RMSE.

    Reads the module-level DMatrix objects ``train``, ``valid`` and ``test`` and the
    label frames ``y_val`` and ``y_test``.

    Args:
        params: Booster parameters; passed to ``xgb.train`` and logged to MLflow.
        name_set: Value stored in the ``model`` tag of the MLflow run.

    Returns:
        dict: ``{'loss': <validation RMSE>, 'status': STATUS_OK}``, the result
        format consumed by hyperopt's ``fmin``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", name_set)
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )

        # xgb.train hands back the booster from the LAST boosting round, and the native
        # Booster.predict uses every tree by default. Restrict prediction to the rounds
        # up to the best validation score so the reported loss belongs to the model that
        # early stopping actually selected.
        best_rounds = (0, booster.best_iteration + 1)

        y_pred_val = booster.predict(valid, iteration_range=best_rounds)
        rmse_val = root_mean_squared_error(y_val, y_pred_val)
        mlflow.log_metric("rmse_val", rmse_val)

        # Test RMSE is only logged for reference; it never feeds back into the search.
        y_pred_test = booster.predict(test, iteration_range=best_rounds)
        rmse_test = root_mean_squared_error(y_test, y_pred_test)
        mlflow.log_metric("rmse_test", rmse_test)

    return {'loss': rmse_val, 'status': STATUS_OK}

In [None]:
# Hyperopt search space for the squared-error objective.
# hp.loguniform(label, low, high) draws exp(uniform(low, high)), so the bounds below
# are natural-log values (e.g. learning_rate ranges over [e^-6, 1]).
search_space = dict(
    max_depth=scope.int(hp.quniform('max_depth', 4, 100, 1)),  # cast the sampled depth to int
    learning_rate=hp.loguniform('learning_rate', -6, 0),
    reg_alpha=hp.loguniform('reg_alpha', -5, -1),
    reg_lambda=hp.loguniform('reg_lambda', -6, -1),
    min_child_weight=hp.loguniform('min_child_weight', -1, 3),
    objective='reg:squarederror',
    seed=42,
)

# The search itself is left disabled; uncomment to run 50 TPE trials.
# best_result = fmin(
#     fn=objective,
#     space=search_space,
#     algo=tpe.suggest,
#     max_evals=50,
#     trials=Trials()
# )

### ***Gamma Regression***
---

In [None]:
# Same objective function, but its MLflow runs are tagged as the gamma variant.
objective_gamma = partial(objective, name_set="xgboost-gamma")

# Tunable hyperparameters (bounds of hp.loguniform are natural-log values).
tuned_hyperparameters = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -6, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
}

# Fixed settings: gamma objective, evaluated with its negative log-likelihood.
fixed_settings = {
    'objective': 'reg:gamma',
    'seed': 42,
    'eval_metric': 'gamma-nloglik',
}

search_space = {**tuned_hyperparameters, **fixed_settings}

# The search itself is left disabled; uncomment to run 50 TPE trials.
# best_result = fmin(
#     fn=objective_gamma,
#     space=search_space,
#     algo=tpe.suggest,
#     max_evals=50,
#     trials=Trials()
# )



In [None]:
# Both hyperparameter searches above are commented out, so `best_result` only exists
# when one of them was run by hand in this kernel session. Look it up defensively so
# that Restart & Run All does not stop here with a NameError.
best_result = globals().get("best_result")
if best_result is None:
    print("No hyperopt search was run in this session; the final model uses the parameters recorded below.")
best_result

In [None]:
# MLflow
# Parameter           Value
# eval_metric         gamma-nloglik
# learning_rate       0.01009913729679454
# max_depth           4
# min_child_weight    10.112031917109253
# objective           reg:gamma
# reg_alpha           0.318404154187771
# reg_lambda          0.0050144697269259636
# seed                42

In [None]:
# Gamma-objective configuration, matching the MLflow parameter listing above.
params_final = dict(
    learning_rate=0.01009913729679454,
    max_depth=4,
    min_child_weight=10.112031917109253,
    reg_alpha=0.318404154187771,
    reg_lambda=0.0050144697269259636,
    objective='reg:gamma',
    seed=42,
    eval_metric='gamma-nloglik',
)

# MLflow states that autologging is compatible with 1.4.2 <= xgboost <= 2.0.3 and
# may not succeed outside that range.
mlflow.xgboost.autolog()

# Rebuild the three DMatrix objects after autologging has been switched on.
# NOTE(review): presumably so MLflow can observe the DMatrix construction — confirm.
train, valid, test = (
    xgb.DMatrix(features, label=target, enable_categorical=True)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Up to 1000 boosting rounds, stopping once the validation metric has not improved
# for 50 consecutive rounds.
booster = xgb.train(
    params=params_final,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, 'validation')],
    early_stopping_rounds=50,
)