In [None]:
!python -V

In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error

In [None]:
# Point MLflow at the tracking server used by every run in this notebook.
# NOTE(review): the address is a hard-coded loopback URI - presumably a locally
# started MLflow server; confirm it is running before executing the cells below.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

In [None]:
# Select "nyc-taxi-exp" as the active experiment for the runs started later on.
mlflow.set_experiment("nyc-taxi-exp")

In [None]:
# Load the January 2021 green-taxi trip file (parquet) directly from its URL.
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')

In [None]:
df.head()

In [None]:
# Trip duration as a timedelta: drop-off timestamp minus pick-up timestamp.
df['duration'] = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']

In [None]:
df

In [None]:
# Pull out the first duration value to inspect it in the next cells.
td = df['duration'].iloc[0]

In [None]:
td

In [None]:
td.total_seconds()

In [None]:
# Trip duration in minutes.
# Recomputed from the two timestamp columns (rather than converting the existing
# `duration` column in place) so that re-running this cell is safe: the old form
# failed on a second run because `duration` already held floats. The vectorised
# `.dt.total_seconds()` also avoids a Python-level call per row.
df['duration'] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60

In [None]:
# Distribution of trip durations (minutes).
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# normalisation is its documented replacement.
sns.histplot(df.duration, kde=True, stat='density')

In [None]:
df.duration.describe(percentiles=[0.9,0.95,0.96,0.97,0.98,0.99,1.0])

In [None]:
((df.duration >=1) & (df.duration <=60)).mean()

In [None]:
# Keep only trips whose duration lies between 1 and 60 minutes (bounds included).
df_avgtrips = df[df['duration'].between(1, 60)]

In [None]:
df_avgtrips.dtypes

In [None]:
# Columns fed to the model: pick-up / drop-off location ids are handled as
# categorical features (cast to str a few cells below), trip distance as numeric.
categorical = ['PULocationID','DOLocationID']
numerical = ['trip_distance']

In [None]:
# Cast the location ids to strings so DictVectorizer one-hot encodes them.
# `astype` with a column->dtype mapping returns a new frame, so nothing is
# written into the filtered slice of `df` (which triggered pandas'
# SettingWithCopyWarning) and the cell can be re-run without side effects.
df_avgtrips = df_avgtrips.astype({col: str for col in categorical})

In [None]:
df_avgtrips.dtypes

In [None]:
df_avgtrips[categorical + numerical].iloc[:10].to_dict(orient='records')

In [None]:
# One dict per trip ({column: value}) - the input format DictVectorizer expects.
train_dicts = df_avgtrips.loc[:, categorical + numerical].to_dict('records')

In [None]:
# Fit the vectorizer on the training records and build the feature matrix in one step.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [None]:
X_train

In [None]:
dv.feature_names_

In [None]:
# Regression target: trip duration in minutes, as a plain NumPy array.
target = 'duration'
y_train = df_avgtrips[target].to_numpy()

In [None]:
y_train

In [None]:
# Baseline model: ordinary least squares on the vectorized training features.
lr = LinearRegression()
lr.fit(X_train,y_train)

In [None]:
# Predictions on the same data the model was fitted on (training fit, not a held-out score).
y_pred = lr.predict(X_train)

In [None]:
# Distribution of predicted durations; `histplot` replaces the deprecated `distplot`.
sns.histplot(y_pred, label='prediction', kde=True, stat='density')

In [None]:
# Distribution of actual durations; `histplot` replaces the deprecated `distplot`.
sns.histplot(y_train, label='actual', kde=True, stat='density')

In [None]:
# Overlay predicted and actual duration distributions on one axis.
# `histplot(kde=True, stat='density')` replaces the deprecated `distplot`.
sns.histplot(y_pred, label='prediction', kde=True, stat='density')
sns.histplot(y_train, label='actual', kde=True, stat='density')
plt.legend()

In [None]:
# Root mean squared error on the training data.
# The square root is taken explicitly because the `squared=False` keyword is
# deprecated in recent scikit-learn releases and no longer accepted by the newest ones.
mean_squared_error(y_train, y_pred) ** 0.5

In [None]:
def read_dataFrame(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Steps:
      1. read the parquet file at ``filename`` (path or URL),
      2. add a ``duration`` column: drop-off minus pick-up time, in minutes,
      3. keep only trips lasting between 1 and 60 minutes (inclusive),
      4. cast ``PULocationID`` / ``DOLocationID`` to ``str``.

    Args:
        filename: location of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        A new DataFrame with the filtered rows, the ``duration`` column and
        string-typed location ids.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    elapsed = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # .copy() makes the filtered frame independent of the frame that was read,
    # so the column assignment below does not write into a slice
    # (which raised SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [None]:
# January 2021 is the training month, February 2021 the validation month;
# both files go through the same preparation function.
df_train, df_val = map(read_dataFrame, (
    'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet',
    'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet',
))

In [None]:

df_val.dtypes

In [None]:
# Feature columns for the train/validation experiment (same lists as in the
# exploratory part above).
categorical = ['PULocationID','DOLocationID']
numerical = ['trip_distance']

# Fresh vectorizer: it is fitted on the training month only in the next cell.
dv = DictVectorizer()

In [None]:
# Turn both months into lists of {column: value} records.
train_dicts = df_train.loc[:, categorical + numerical].to_dict('records')
val_dicts = df_val.loc[:, categorical + numerical].to_dict('records')

# Fit the vectorizer on the training records only, then reuse it unchanged for
# the validation records so both matrices share the same feature columns.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)


In [None]:
# Targets (trip duration in minutes) for the training and validation months.
target = 'duration'
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [None]:
# Ridge regression fitted on the training month and scored on the validation month.
lr1 = Ridge(alpha=10)
lr1.fit(X_train,y_train)

y_pred = lr1.predict(X_val)

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` keyword is deprecated in recent scikit-learn releases and no
# longer accepted by the newest ones.
mean_squared_error(y_val, y_pred) ** 0.5


In [None]:
# Persist the fitted vectorizer together with the Ridge model as one pickle.
# Create the target directory first so the cell also works on a fresh checkout
# where ./models does not exist yet.
os.makedirs('./models', exist_ok=True)
with open('./models/lin_reg.bin','wb') as f:
    pickle.dump((dv,lr1), f)

In [None]:
# Track one Ridge training run in MLflow: tag, data-file parameters, alpha,
# validation RMSE and the pickled (vectorizer, model) pair.
with mlflow.start_run():
    mlflow.set_tag('developer', 'arungansi')

    # Record which monthly files the train / validation matrices came from.
    mlflow.log_param('train_data_path','green_tripdata_2021-01.parquet')
    mlflow.log_param('val_data_path','green_tripdata_2021-02.parquet')

    alpha = 10
    mlflow.log_param('alpha', alpha)
    lr1 = Ridge(alpha=alpha)
    lr1.fit(X_train, y_train)

    y_pred = lr1.predict(X_val)

    # Explicit square root: `squared=False` is deprecated in recent
    # scikit-learn releases and no longer accepted by the newest ones.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric('rmse', rmse)

    # Pickle the model trained in THIS run before logging it, so the artifact
    # always matches the logged parameters and metric instead of whatever an
    # earlier cell happened to leave on disk.
    os.makedirs('models', exist_ok=True)
    with open('models/lin_reg.bin', 'wb') as f:
        pickle.dump((dv, lr1), f)
    mlflow.log_artifact(local_path='models/lin_reg.bin', artifact_path='models_pickle')

In [None]:
import xgboost as xgb


In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
# Wrap the feature matrices and their targets in XGBoost's DMatrix containers.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [None]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and report its validation RMSE.

    Every call opens its own MLflow run, tags it with ``model=xgboost`` and logs
    the trial's ``params`` plus the resulting ``rmse`` metric.

    Uses the notebook-level ``train`` / ``valid`` DMatrix objects and ``y_val``.

    Args:
        params: dict of XGBoost parameters for this trial; passed unchanged to
            ``mlflow.log_params`` and ``xgb.train``.

    Returns:
        dict with ``'loss'`` (the validation RMSE) and ``'status'``
        (``STATUS_OK``), the result format handed back to ``fmin``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # Explicit square root: the `squared=False` keyword is deprecated in
        # recent scikit-learn releases and no longer accepted by the newest ones.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Hyperparameter search space sampled by hyperopt for each trial.
# The bounds given to hp.loguniform are in log space.
search_space = {
    # quniform yields floats; scope.int converts the sampled depth to an integer.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current
    # name for the same squared-error regression objective.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Run 50 TPE-guided trials; each trial calls `objective`, which logs its own MLflow run.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

The best parameters are those of the hyperparameter-search run with the lowest validation RMSE (each run trains for up to 1000 boosting rounds).
Once the tuning above has finished, copy the hyperparameters of that run and train the model again with them.

In [None]:
# Hyperparameters copied by hand from the best tuning run (lowest RMSE).
params = {
    'learning_rate': 0.07755880320609634,
    'max_depth': 10,
    'min_child_weight': 3.7126270987128547,
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current
    # name for the same squared-error regression objective.
    'objective': 'reg:squarederror',
    'reg_alpha': 0.028088122108329037,
    'reg_lambda': 0.013125534079775656,
    'seed': 42
}

In [None]:
# Enable MLflow's XGBoost autologging before the training call in the next cell.
mlflow.xgboost.autolog()

In [None]:
# Retrain with the selected hyperparameters: at most 1000 boosting rounds,
# stopping early after 50 rounds without improvement on the validation set.
booster = xgb.train(
    params,
    train,
    num_boost_round=1000,
    early_stopping_rounds=50,
    evals=[(valid, 'validation')],
)

In [None]:
# Switch XGBoost autologging off again.
# NOTE(review): presumably so the manually logged run below is not also autologged - confirm.
mlflow.xgboost.autolog(disable=True)

In [None]:
# Final, manually logged XGBoost run: parameters, validation RMSE, the fitted
# DictVectorizer (as a pickle artifact) and the booster (as an MLflow model).
with mlflow.start_run():
    params = {
        'learning_rate': 0.07755880320609634,
        'max_depth': 10,
        'min_child_weight': 3.7126270987128547,
        # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the
        # current name for the same squared-error regression objective.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.028088122108329037,
        'reg_lambda': 0.013125534079775656,
        'seed': 42
    }

    mlflow.log_params(params)

    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    # Explicit square root: the `squared=False` keyword is deprecated in
    # recent scikit-learn releases and no longer accepted by the newest ones.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric('rmse', rmse)

    # The vectorizer is needed to turn raw trip records into features at
    # prediction time, so it is stored next to the model. Create the
    # directory first so this works on a fresh checkout.
    os.makedirs('models', exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path='preprocessor')

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")
    

In [None]:
# URI of the model logged under the "models_mlflow" artifact path of one specific run.
# NOTE(review): the run id is hard-coded, so this only resolves against a tracking
# server that holds that run - replace it with the id of your own run.
logged_model = 'runs:/d031ab694f7f4a76988537cd5f7594ce/models_mlflow'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)



In [None]:
loaded_model

In [None]:
X_val

In [None]:
# Predict on a Pandas DataFrame.
#loaded_model.predict(pd.DataFrame(<data>))

In [45]:
# Second method: load the same logged model through the XGBoost flavor, which
# gives back the native xgboost Booster rather than the pyfunc wrapper.

xgboost_loaded_model = mlflow.xgboost.load_model(logged_model)

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]



In [46]:
xgboost_loaded_model

<xgboost.core.Booster at 0x799b627b3500>

In [47]:
# Predict with the reloaded booster; `valid` is the DMatrix built from X_val / y_val above.
y_pred = xgboost_loaded_model.predict(valid)

In [48]:
y_pred

array([15.088369 ,  6.6498303, 17.523094 , ..., 12.988363 ,  6.5416555,
        8.390461 ], dtype=float32)