In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso # Linear Regression with regularization
from sklearn.linear_model import Ridge # regularised linear regression
from sklearn.metrics import mean_squared_error

import pickle
import xgboost
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials # Importing some methods from hyperopt
from hyperopt.pyll import scope


In [2]:
import mlflow

# Point MLflow at a local SQLite database file and select the experiment that
# the runs started later in this notebook are recorded under.
TRACKING_URI = 'sqlite:///mlflow.db'
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [3]:
# One-time environment setup: uncomment to install third-party packages used by
# this notebook (pyarrow is presumably the parquet engine behind
# pd.read_parquet -- confirm for your environment).
# !pip install mlflow
# !pip install hyperopt
# !pip install pyarrow

In [4]:
def read_data(filename):
    """Load one parquet file of taxi trips and prepare it for modelling.

    Adds a ``duration`` column (dropoff minus pickup, in minutes), keeps only
    trips lasting between 1 and 60 minutes inclusive, casts the pickup and
    dropoff location IDs to strings, and adds a combined ``PU_DO`` column of
    the form ``"<PULocationID>_<DOLocationID>"``.

    Parameters
    ----------
    filename : str or path-like
        Parquet file to read. It must contain the columns
        ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``, ``PULocationID``
        and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The filtered trips, including the new ``duration`` and ``PU_DO``
        columns.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion instead of a per-row lambda.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows so the column assignments
    # below write to an independent frame (no SettingWithCopyWarning).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    return df

In [5]:
# January 2021 is used for training and February 2021 for validation; both
# files go through the same read_data preprocessing.
TRAIN_PATH = '../data/green_tripdata_2021-01.parquet'
VALID_PATH = '../data/green_tripdata_2021-02.parquet'

df_train, df_val = map(read_data, (TRAIN_PATH, VALID_PATH))

In [6]:
# Number of rows in the training and validation frames.
df_train.shape[0], df_val.shape[0]

(73908, 61921)

In [7]:
# Column configuration: one combined pickup/dropoff categorical feature, one
# numerical feature, and the regression target.
categorical = ['PU_DO']
numerical = ['trip_distance']
target = 'duration'
feature_columns = categorical + numerical

# One {column: value} record per trip, the input format DictVectorizer takes.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# The vectorizer is fitted on the training records only; the validation
# records are transformed with that same fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_valid = dv.transform(val_dicts)

y_train = df_train[target].values
y_valid = df_val[target].values

In [8]:
def train_model(lr=None):
    """Fit a regressor on the training split and print train/validation error.

    Parameters
    ----------
    lr : estimator exposing ``fit`` and ``predict``, optional
        Model to train. When omitted, a fresh ``LinearRegression`` is created
        for this call, so no estimator instance is shared between calls.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``. The printed values are root mean squared errors; the ``mse``
    wording in the labels is kept so output matches earlier runs.
    """
    if lr is None:
        lr = LinearRegression()

    # For rapid and clean experimentation.
    lr.fit(X_train, y_train)

    # RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
    # newer scikit-learn releases no longer accept.
    y_pred = lr.predict(X_train)
    print('train mse', mean_squared_error(y_train, y_pred) ** 0.5)

    y_pred = lr.predict(X_valid)
    print('val mse', mean_squared_error(y_valid, y_pred) ** 0.5)
    

In [9]:
# Baseline: plain linear regression, no regularization.
train_model(LinearRegression())

train mse 4.640859708885847
val mse 7.479343826040432


In [10]:
# Regularized linear regression (Lasso) with alpha = 0.001.
train_model(Lasso(alpha = 0.001))

train mse 8.345478908061276
val mse 9.233436225720547


In [11]:
# Regularized linear regression (Ridge) with its default arguments.
train_model(Ridge())

train mse 10.712682923244786
val mse 11.3426039432658


**Experiment Tracking with MLflow**

In [12]:
# Train a Lasso model inside a single MLflow run, recording the data paths,
# the alpha hyperparameter, train/validation RMSE and the pickled model.
with mlflow.start_run():
    mlflow.set_tag('developer','Bhaskara')
    mlflow.log_param('train-data-path','../data/green_tripdata_2021-01.parquet')
    mlflow.log_param('valid-data-path','../data/green_tripdata_2021-02.parquet')
    a = 0.001
    mlflow.log_param('alpha',a)
    lr = Lasso(alpha = a)
    lr.fit(X_train,y_train)

    # RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
    # newer scikit-learn releases no longer accept.
    y_pred = lr.predict(X_train)
    rmse_train = mean_squared_error(y_train, y_pred) ** 0.5

    y_pred = lr.predict(X_valid)
    rmse_valid = mean_squared_error(y_valid, y_pred) ** 0.5

    mlflow.log_metric('rmse_train',rmse_train)
    mlflow.log_metric('rmse_valid',rmse_valid)

    # Write the (vectorizer, model) pair to disk BEFORE logging it: otherwise
    # log_artifact either fails because the file does not exist yet or uploads
    # a stale file left over from an earlier run.
    model_path = "../models/lin_reg.bin"
    with open(model_path, 'wb') as f_out:
        pickle.dump((dv, lr), f_out)

    # saving the model
    mlflow.log_artifact(local_path=model_path, artifact_path="models_pickle")

In [13]:
# Serialize the fitted vectorizer together with the current `lr` model so both
# can be restored from a single file.
vectorizer_and_model = (dv, lr)
with open('../models/lin_reg.bin', 'wb') as model_file:
    pickle.dump(vectorizer_and_model, model_file)

**Hyperparameter Optimization using Bayesian Methods**
<br>
Using the Hyperopt library to find the hyperparameters that result in the lowest validation error.

In [14]:
# Preparing data
train = xgboost.DMatrix(X_train, label = y_train)
valid = xgboost.DMatrix(X_valid, label = y_valid)

In [15]:
# defining objective function for hyperopt
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model","xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params = params,
            dtrain = train,
            num_boost_round = 1000,
            evals = [(valid,"validation")],
            early_stopping_rounds = 50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_valid, y_pred, squared=False)
        mlflow.log_metric("rmse",rmse)
        
    return {'loss':rmse,'status':STATUS_OK}

In [None]:
# (low, high) arguments handed to hp.loguniform for each hyperparameter that
# is sampled that way.
loguniform_bounds = {
    'learning_rate': (-3, 0),
    'reg_alpha': (-5, 1),
    'reg_lambda': (-6, 1),
    'min_child_weight': (-1, 3),
}

search_space = {
    # quniform draw between 4 and 100 with step 1, cast to int.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    **{
        name: hp.loguniform(name, low, high)
        for name, (low, high) in loguniform_bounds.items()
    },
    # Fixed settings that are not searched over.
    'objective': 'reg:linear',
    'seed': 42,
}

# Minimise `objective` over the search space with the TPE algorithm for 50
# evaluations.
trials = Trials()
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

  0%|                                                                           | 0/50 [00:00<?, ?trial/s, best loss=?]

<IPython.core.display.Javascript object>

[0]	validation-rmse:14.43683                                                                                           
[1]	validation-rmse:10.65666                                                                                           
[2]	validation-rmse:8.64480                                                                                            
[3]	validation-rmse:7.63133                                                                                            
[4]	validation-rmse:7.11061                                                                                            
[5]	validation-rmse:6.84934                                                                                            
[6]	validation-rmse:6.70913                                                                                            
[7]	validation-rmse:6.63826                                                                                            
[8]	validation-rmse:6.59225             

[66]	validation-rmse:6.44351                                                                                           
[67]	validation-rmse:6.44351                                                                                           
[68]	validation-rmse:6.44248                                                                                           
[69]	validation-rmse:6.44299                                                                                           
[70]	validation-rmse:6.44037                                                                                           
[71]	validation-rmse:6.44064                                                                                           
[72]	validation-rmse:6.43960                                                                                           
[73]	validation-rmse:6.43965                                                                                           
[74]	validation-rmse:6.43993            

In [None]:
# Fixed hyperparameters for a single training run (presumably the best trial
# from the hyperopt search -- confirm against the MLflow UI).
params = {
    'learning_rate':0.6792481346516283,
    'max_depth':45,
    'min_child_weight':1.1821354185273338,
    'objective':'reg:linear',
    'reg_alpha':0.22386751312752642,
    'reg_lambda':1.5946121644009208,
    'seed':42
}
with mlflow.start_run():

    # Enable MLflow's XGBoost autologging instead of logging by hand.
    mlflow.xgboost.autolog()
    # The library is imported as `xgboost` in this notebook; there is no
    # `xgb` alias, so the module must be referenced by its full name.
    best_booster = xgboost.train(
            params = params,
            dtrain = train,
            num_boost_round = 1000,
            evals = [(valid,"validation")],
            early_stopping_rounds = 50,
            verbose_eval = False
        )


In [None]:
# Retrain with the fixed hyperparameters and log everything by hand:
# parameters, validation RMSE, the pickled DictVectorizer and the model.
# NOTE(review): if the autolog cell above was run in the same kernel,
# autologging is presumably still enabled here and may duplicate what is
# logged manually -- confirm whether it should be disabled first.
with mlflow.start_run():
    best_params = {
    'learning_rate':0.6792481346516283,
    'max_depth':45,
    'min_child_weight':1.1821354185273338,
    'objective':'reg:linear',
    'reg_alpha':0.22386751312752642,
    'reg_lambda':1.5946121644009208,
    'seed':42
    }
    mlflow.log_params(best_params)

    # The library is imported as `xgboost` in this notebook; there is no
    # `xgb` alias, so the module must be referenced by its full name.
    best_booster = xgboost.train(
            params = best_params,
            dtrain = train,
            num_boost_round = 1000,
            evals = [(valid,"validation")],
            early_stopping_rounds = 50,
            verbose_eval = False
        )
    y_pred = best_booster.predict(valid)
    # RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
    # newer scikit-learn releases no longer accept.
    rmse = mean_squared_error(y_valid, y_pred) ** 0.5
    mlflow.log_metric("rmse",rmse)

    # The fitted vectorizer is stored next to the model because raw records
    # must be transformed with it before they can be scored.
    with open("../models/preprocessor.b","wb") as f_out:
        pickle.dump(dv,f_out)

    mlflow.log_artifact("../models/preprocessor.b",artifact_path = "preprocessor")
    mlflow.xgboost.log_model(best_booster,artifact_path = "models_mlflow")

In [None]:
# import mlflow
logged_model = 'runs:/a7a089966f6c4cf2a987d34e78a2b8b3/models_mlflow'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

loaded_model

In [None]:
# Load the same logged model through MLflow's XGBoost flavor instead of pyfunc.
xgboost_model = mlflow.xgboost.load_model('runs:/a7a089966f6c4cf2a987d34e78a2b8b3/models_mlflow')

In [None]:
# Display the reloaded model object.
xgboost_model

In [None]:
# Score the validation DMatrix with the reloaded model.
xgboost_model.predict(valid)