In [None]:
import seaborn as sns
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')
import mlflow
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error as RMSE 

In [85]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval 
from hyperopt.early_stop import no_progress_loss

In [None]:
# MLflow configuration: local SQLite tracking store and the active experiment.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "NYC-Taxi-Regressor"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

In [87]:
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')

In [None]:
df

In [None]:
df.dtypes

In [90]:
df['duration'] = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']

In [None]:
df.head()

In [92]:
df.duration = df.duration.apply(lambda ld : ld.total_seconds()/60)

In [None]:
df.head()

In [None]:
df.duration.describe()

In [None]:
df.duration.describe(percentiles=[0.95,0.98,0.99])

#### **98% → 56.000000 minutes: the duration of most trips (98%) is within an hour**
#### We should focus on that range only.

In [96]:
df = df[(df.duration >= 1.0) & (df.duration <= 60.0)]

In [None]:
df

In [None]:
df.duration.mean()

In [None]:
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is the equivalent plot in the current seaborn API.
sns.histplot(df.duration, kde=True, stat="density")

In [None]:
df.isna().sum()

In [101]:
df.drop(columns=['store_and_fwd_flag','RatecodeID','passenger_count','ehail_fee','payment_type','trip_type','congestion_surcharge'],inplace=True)

In [None]:
df.isna().sum()

In [None]:
df.head()

In [104]:
df = df.reset_index()

In [105]:
xvars = ['PULocationID','DOLocationID','trip_distance','fare_amount','extra','mta_tax','tip_amount','tolls_amount','improvement_surcharge','total_amount']
yvars = ['duration']

In [106]:
Xtrain = df[xvars]
ytrain = df[yvars]

In [None]:
Xtrain

In [None]:
Xtrain.dtypes

In [None]:
Xtrain.head()

In [None]:
ytrain.head()

In [111]:
X_train, X_test, y_train, y_test = train_test_split(Xtrain, ytrain, test_size=0.2, random_state=42)

## CatBoost

In [None]:
from catboost import CatBoostRegressor

In [113]:
from hyperopt.pyll import scope

In [114]:
# Hyperopt search space for the CatBoost regressor: one continuous parameter,
# three integer parameters and one categorical choice.
search_space = {
    'learning_rate': hp.uniform('learning_rate', 0.1, 0.5),
    'iterations': hp.randint('iterations', 100, 1000),
    'l2_leaf_reg': hp.randint('l2_leaf_reg', 1, 10),
    'depth': hp.randint('depth', 4, 10),
    'bootstrap_type': hp.choice('bootstrap_type', ['Bayesian', 'Bernoulli']),
}

In [115]:
def objective(params):
    """Hyperopt objective: fit a CatBoost regressor and score it on the hold-out split.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``CatBoostRegressor`` (one sample drawn
        from ``search_space``).

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}`` where ``rmse`` is the root mean
        squared error of the predictions on ``X_test`` / ``y_test``.
    """
    model = CatBoostRegressor(**params,
                              loss_function='RMSE',
                              eval_metric='RMSE',
                              early_stopping_rounds=100,
                              random_seed=42)

    # NOTE(review): the same hold-out split drives early stopping and the
    # hyperparameter search, so the reported loss is presumably optimistic —
    # consider a separate validation split.
    model.fit(X=X_train, y=y_train, eval_set=(X_test, y_test), verbose=False)

    y_pred = model.predict(X_test)
    # `RMSE` is an import alias for sklearn's `mean_squared_error`, which returns
    # the *mean squared error*; take the square root to get the actual RMSE.
    rmse = float(np.sqrt(RMSE(y_test, y_pred)))

    return {'loss': rmse, 'status': STATUS_OK}

### 1. `max_evals=50` means that Hyperopt will test 50 different sets of hyperparameters (not all combinations).
### 2. Hyperopt uses the Tree of Parzen Estimators (TPE) algorithm (specified here with `algo=tpe.suggest`) to suggest the next set of hyperparameters based on past results, rather than exhaustively trying all combinations.

In [None]:
best_params = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

In [None]:
hyperparams = space_eval(search_space, best_params)


In [None]:
params = {'learning_rate' : hyperparams['learning_rate'],
          'iterations' : hyperparams['iterations'],
          'depth' : hyperparams['depth'],
          'loss_function' : 'RMSE',
          'l2_leaf_reg' : hyperparams['l2_leaf_reg'],
          'eval_metric' : 'RMSE',
          'early_stopping_rounds': 100,
          'bootstrap_type' : hyperparams['bootstrap_type']}

mlflow.xgboost.autolog()

model = CatBoostRegressor(**params, random_seed=42)
model.fit(X = X_train, y = y_train, eval_set=(X_test,y_test),verbose=250)


In [None]:
print('best RMSE', model.get_best_score()['validation']['RMSE'])

In [None]:
from sklearn.metrics import r2_score

In [None]:
logged_model = 
# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
pred = loaded_model.predict(X_test)

r2 = r2_score(y_test, pred)

print(f"R-squared: {r2}")

sns.distplot(y_test,kde=True,label='actual')
sns.distplot(pred,kde=True,label='pred')
plt.legend()