In [1]:
%attachments

Training Cluster    ML Engine
------------------  -----------
pythonmldl          python


In [2]:
%%pythonmldl


print("Importing libraries")
import pandas as pd
import numpy as np
from scipy import stats
import math
import os
import datetime
import xgboost as xgb
import pickle

import matplotlib.pyplot as plt

# Start time 
print("Start time: ", datetime.datetime.now())

# Project repo path function
def ProjectRepo(path):
   ProjectRepo = os.popen('bdvcli --get cluster.project_repo').read().rstrip()
   return str(ProjectRepo + '/' + path)


print("Reading in data")
# Reading in dataset table 
dbName = "pqyellowtaxi"
df = pd.read_csv(ProjectRepo('demodata.csv'))

# Reading in latitude/longitude coordinate lookup table 
lookupDbName = "pqlookup"
dflook = pd.read_csv(ProjectRepo('lookup-ipyheader.csv'))
print("Done reading in data")


# merging dataset and lookup tables on latitudes/coordinates
df = pd.merge(df, dflook[[lookupDbName + '.location_i', lookupDbName + '.long', lookupDbName + '.lat']], how='left', left_on=dbName + '.pulocationid', right_on=lookupDbName + '.location_i')
df.rename(columns = {(lookupDbName + '.long'):(dbName + '.startstationlongitude')}, inplace = True)
df.rename(columns = {(lookupDbName + '.lat'):(dbName + '.startstationlatitude')}, inplace = True)
df = pd.merge(df, dflook[[lookupDbName + '.location_i', lookupDbName + '.long', lookupDbName + '.lat']], how='left', left_on=dbName + '.dolocationid', right_on=lookupDbName + '.location_i')
df.rename(columns = {(lookupDbName + '.long'):(dbName + '.endstationlongitude')}, inplace = True)
df.rename(columns = {(lookupDbName + '.lat'):(dbName + '.endstationlatitude')}, inplace = True)


def fullName(colName):
    return dbName + '.' + colName

# convert string to datetime
df[fullName('tpep_pickup_datetime')] = pd.to_datetime(df[fullName('tpep_pickup_datetime')])
df[fullName('tpep_dropoff_datetime')] = pd.to_datetime(df[fullName('tpep_dropoff_datetime')])
df[fullName('duration')] = (df[fullName("tpep_dropoff_datetime")] - df[fullName("tpep_pickup_datetime")]).dt.total_seconds()

# feature engineering
df[fullName("weekday")] = (df[fullName('tpep_pickup_datetime')].dt.dayofweek < 5).astype(float)
df[fullName("hour")] = df[fullName('tpep_pickup_datetime')].dt.hour
df[fullName("work")] = (df[fullName('weekday')] == 1) & (df[fullName("hour")] >= 8) & (df[fullName("hour")] < 18)
df[fullName("month")] = df[fullName('tpep_pickup_datetime')].dt.month
# convert month to a categorical feature using one-hot encoding
df = pd.get_dummies(df, columns=[fullName("month")])

# Filter dataset to rides under 3 hours and under 150 miles to remove outliers
df = df[df[fullName('duration')] > 20]
df = df[df[fullName('duration')] < 10800]
df = df[df[fullName('trip_distance')] > 0]
df = df[df[fullName('trip_distance')] < 150]

# drop null rows
df = df.dropna(how='any',axis=0)

# select columns to be used as features
cols = [fullName('work'), fullName('startstationlatitude'), fullName('startstationlongitude'), fullName('endstationlatitude'), fullName('endstationlongitude'), fullName('trip_distance'), fullName('weekday'), fullName('hour')]
cols.extend([fullName('month_' + str(x)) for x in range(1, 7)])
cols.append(fullName('duration'))
dataset = df[cols]


X = dataset.iloc[:, 0:(len(cols) - 1)].values
y = dataset.iloc[:, (len(cols) - 1)].values
X = X.copy()
y = y.copy()
del dataset
del df

print("Done cleaning data")


print("Training...")

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)



xgbr = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 1, subsample = 1, learning_rate = 0.15, booster = "gbtree", max_depth = 15, eta = 0.5, eval_metric = "rmse",) 
print("num train elements: " + str(len(X_train)))
print("Train start time: ", datetime.datetime.now())
xgbr.fit(X_train, y_train)
print("Train end time: ", datetime.datetime.now())
y_pred = xgbr.predict(X_test)
y_pred = y_pred.clip(min=0)


from sklearn import metrics
from sklearn.metrics import mean_squared_log_error
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Root Mean Squared Log Error:', np.sqrt(mean_squared_log_error( y_test, y_pred)))
print()


print("Saving model")
pickle.dump(xgbr, open( ProjectRepo('models/') + "XGB.pickle.dat", "wb"))


# from xgboost import plot_importance
# plot_importance(xgbr, max_num_features=10) # top 10 most important features
# plt.show()

# Finish time
print("End time: ", datetime.datetime.now())

History URL: http://bluedata-195.bdlocal:10001/history/47


In [4]:
%logs --url http://bluedata-195.bdlocal:10001/history/47

Job Status: Finished
Importing libraries
Start time:  2020-06-30 16:20:03.975878
Reading in data
Done reading in data
Done cleaning data
Training...
num train elements: 264325
Train start time:  2020-06-30 16:20:08.503259
Train end time:  2020-06-30 16:22:08.580873
Mean Absolute Error: 177.31845364978025
Mean Squared Error: 80833.71930032197
Root Mean Squared Error: 284.31271392662336
Root Mean Squared Log Error: 0.30597408418927413

Saving model
End time:  2020-06-30 16:22:12.264221




