Ref: https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-auto-train-models

In [None]:
#!pip install azureml-sdk[automl] azureml-opendatasets azureml-widgets 

In [2]:
#Download and prepare data
from azureml.opendatasets import NycTlcGreen
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [None]:
# Download NYC green-taxi trips one month at a time for 2015 and keep a
# 2,000-row sample from each month.
# This takes a long time to run; a copy of the data is written to disk in a later cell.

start = datetime.strptime("1/1/2015","%m/%d/%Y")
end = datetime.strptime("1/31/2015","%m/%d/%Y")

# Fixed seed so a fresh Restart & Run All draws the same rows every time.
SAMPLE_SEED = 223

# Collect the monthly samples in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every iteration.
monthly_samples = []
for sample_month in range(12):
    temp_df_green = NycTlcGreen(start + relativedelta(months=sample_month), end + relativedelta(months=sample_month)) \
        .to_pandas_dataframe()
    monthly_samples.append(temp_df_green.sample(2000, random_state=SAMPLE_SEED))

# dataframe holding the sampled taxi data (source row labels are kept, not renumbered)
green_taxi_df = pd.concat(monthly_samples)

In [24]:
# Preview the first 10 rows of the taxi data.
green_taxi_df.head(10)

Unnamed: 0,vendorID,passengerCount,tripDistance,pickupLongitude,pickupLatitude,dropoffLongitude,dropoffLatitude,totalAmount,month_num,day_of_month,day_of_week,hour_of_day
401038,1,2,3.0,-73.94,40.81,-73.95,40.78,16.8,1,19,0,13
867794,2,1,5.59,-73.95,40.79,-73.98,40.73,22.35,1,5,0,12
1342403,1,3,1.1,-73.96,40.72,-73.96,40.74,7.8,1,20,1,21
1408967,2,1,7.07,-73.95,40.79,-73.91,40.77,36.2,1,12,0,18
320663,2,2,2.99,-73.88,40.88,-73.86,40.9,15.3,1,15,3,7
407563,1,1,1.7,-73.94,40.67,-73.95,40.69,10.55,1,19,0,16
1506139,1,2,2.6,-73.96,40.81,-73.96,40.78,12.3,1,30,4,15
231789,2,1,5.84,-73.99,40.7,-73.98,40.65,24.36,1,23,4,21
777785,2,3,5.5,-73.91,40.77,-73.91,40.71,18.3,1,17,5,19
1106011,2,1,1.27,-73.96,40.81,-73.97,40.79,6.8,1,8,3,14


In [25]:
#create copy of the data in the workspace
import os
dataDir = "data"

# makedirs(exist_ok=True) replaces the exists()/mkdir() pair, so there is no
# check-then-create gap and an existing directory is not an error.
os.makedirs(dataDir, exist_ok=True)

# Build the output path from dataDir so the directory created above and the
# file location cannot drift apart. The file name itself is unchanged.
green_taxi_df.to_csv(os.path.join(dataDir, "greenTaxiDataForAutoML"), index=False)

In [8]:
#function to create various time-based features from the pickup datetime field
def build_time_features(vector):
    """Derive calendar features from a pickup timestamp.

    Args:
        vector: A row-like container whose first element is a datetime-like
            object exposing ``month``, ``day``, ``weekday()`` and ``hour``.
            May be a pandas Series (as passed by ``DataFrame.apply(axis=1)``)
            or a plain sequence such as a list or tuple.

    Returns:
        pd.Series with four values, in order: month number, day of month,
        day of week (as returned by ``weekday()``) and hour of day.
    """
    # Read the first element by position. On a label-indexed Series, plain
    # vector[0] relies on a positional fallback that pandas 2.x deprecates,
    # so use .iloc when it is available and fall back to [0] otherwise.
    pickup_datetime = vector.iloc[0] if hasattr(vector, "iloc") else vector[0]

    return pd.Series((
        pickup_datetime.month,
        pickup_datetime.day,
        pickup_datetime.weekday(),
        pickup_datetime.hour,
    ))

# Add the four calendar columns by running build_time_features on every row's pickup timestamp.
time_feature_columns = ["month_num", "day_of_month","day_of_week", "hour_of_day"]
pickup_timestamps = green_taxi_df[["lpepPickupDatetime"]]
green_taxi_df[time_feature_columns] = pickup_timestamps.apply(build_time_features, axis=1)
green_taxi_df.head(10)

Unnamed: 0,vendorID,lpepPickupDatetime,lpepDropoffDatetime,passengerCount,tripDistance,puLocationId,doLocationId,pickupLongitude,pickupLatitude,dropoffLongitude,...,improvementSurcharge,tipAmount,tollsAmount,ehailFee,totalAmount,tripType,month_num,day_of_month,day_of_week,hour_of_day
401038,1,2015-01-19 13:18:02,2015-01-19 13:38:54,2,3.0,,,-73.938889,40.805126,-73.951561,...,0.3,0.0,0.0,,16.8,1.0,1,19,0,13
867794,2,2015-01-05 12:34:09,2015-01-05 12:56:46,1,5.59,,,-73.954124,40.790211,-73.98278,...,0.3,0.05,0.0,,22.35,1.0,1,5,0,12
1342403,1,2015-01-20 21:43:32,2015-01-20 21:47:43,3,1.1,,,-73.961731,40.720181,-73.958405,...,0.3,1.0,0.0,,7.8,1.0,1,20,1,21
1408967,2,2015-01-12 18:01:04,2015-01-12 18:39:09,1,7.07,,,-73.952957,40.792088,-73.909431,...,0.3,5.9,0.0,,36.2,1.0,1,12,0,18
320663,2,2015-01-15 07:49:53,2015-01-15 08:08:23,2,2.99,,,-73.879814,40.879726,-73.864784,...,0.3,0.0,0.0,,15.3,1.0,1,15,3,7
407563,1,2015-01-19 16:07:36,2015-01-19 16:15:30,1,1.7,,,-73.942139,40.67075,-73.947067,...,0.3,1.75,0.0,,10.55,1.0,1,19,0,16
1506139,1,2015-01-30 15:22:01,2015-01-30 15:36:21,2,2.6,,,-73.964188,40.80793,-73.961655,...,0.3,0.0,0.0,,12.3,1.0,1,30,4,15
231789,2,2015-01-23 21:13:16,2015-01-23 21:29:13,1,5.84,,,-73.986206,40.703835,-73.97567,...,0.3,4.06,0.0,,24.36,1.0,1,23,4,21
777785,2,2015-01-17 19:47:13,2015-01-17 20:02:30,3,5.5,,,-73.91143,40.768318,-73.905823,...,0.3,0.0,0.0,,18.3,1.0,1,17,5,19
1106011,2,2015-01-08 14:25:36,2015-01-08 14:31:27,1,1.27,,,-73.961884,40.805408,-73.965378,...,0.3,0.0,0.0,,6.8,1.0,1,8,3,14


In [9]:
#Remove columns not needed for training or additional feature building.
columns_to_remove = ["lpepPickupDatetime", "lpepDropoffDatetime", "puLocationId", "doLocationId", "extra", "mtaTax",
                     "improvementSurcharge", "tollsAmount", "ehailFee", "tripType", "rateCodeID",
                     "storeAndFwdFlag", "paymentType", "fareAmount", "tipAmount"
                    ]
# Drop everything in one call and skip names that are already gone, so that
# re-running this cell does not raise KeyError the way repeated pop() calls would.
green_taxi_df = green_taxi_df.drop(
    columns=[col for col in columns_to_remove if col in green_taxi_df.columns]
)

green_taxi_df.head(5)

Unnamed: 0,vendorID,passengerCount,tripDistance,pickupLongitude,pickupLatitude,dropoffLongitude,dropoffLatitude,totalAmount,month_num,day_of_month,day_of_week,hour_of_day
401038,1,2,3.0,-73.938889,40.805126,-73.951561,40.777924,16.8,1,19,0,13
867794,2,1,5.59,-73.954124,40.790211,-73.98278,40.731506,22.35,1,5,0,12
1342403,1,3,1.1,-73.961731,40.720181,-73.958405,40.735397,7.8,1,20,1,21
1408967,2,1,7.07,-73.952957,40.792088,-73.909431,40.768005,36.2,1,12,0,18
320663,2,2,2.99,-73.879814,40.879726,-73.864784,40.902714,15.3,1,15,3,7


In [10]:
# Summary statistics for every remaining column; used to spot out-of-range values before filtering.
green_taxi_df.describe()

Unnamed: 0,vendorID,passengerCount,tripDistance,pickupLongitude,pickupLatitude,dropoffLongitude,dropoffLatitude,totalAmount,month_num,day_of_month,day_of_week,hour_of_day
count,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0
mean,1.787167,1.366458,2.912197,-73.814681,40.683375,-73.844868,40.698184,14.93383,6.5,15.061167,3.270042,13.569458
std,0.409319,1.038891,2.945117,2.978342,1.642374,2.569027,1.4168,12.644065,3.452124,8.457688,1.954425,6.795907
min,1.0,0.0,0.0,-74.18158,0.0,-74.274261,0.0,-52.8,1.0,1.0,0.0,0.0
25%,2.0,1.0,1.06,-73.958925,40.699966,-73.967684,40.700363,7.8,3.75,8.0,2.0,9.0
50%,2.0,1.0,1.94,-73.944977,40.746872,-73.944649,40.747898,11.3,6.5,15.0,3.0,15.0
75%,2.0,1.0,3.73,-73.916937,40.803333,-73.909924,40.792284,17.8,9.25,22.0,5.0,19.0
max,2.0,8.0,41.63,0.0,40.985497,0.0,41.059727,485.5,12.0,30.0,6.0,23.0


In [11]:
# Coordinate columns are only needed for the bounding-box filter below;
# they are dropped afterwards because they are not used for training.
columns_to_remove_for_training = ["pickupLongitude", "pickupLatitude", "dropoffLongitude", "dropoffLatitude"]

# Filter out anomalies: pickups outside the latitude/longitude bounds, trips
# shorter than 0.25 or 31 and longer, zero passengers and non-positive
# totalAmount. Then remove the coordinate columns.
final_df = (
    green_taxi_df
    .query("pickupLatitude>=40.53 and pickupLatitude<=40.88")
    .query("pickupLongitude>=-74.09 and pickupLongitude<=-73.72")
    .query("tripDistance>=0.25 and tripDistance<31")
    .query("passengerCount>0 and totalAmount>0")
    .drop(columns=columns_to_remove_for_training)
)

In [12]:
# Summary statistics for the cleaned dataset (after the anomaly filters and column removal).
final_df.describe()

Unnamed: 0,vendorID,passengerCount,tripDistance,totalAmount,month_num,day_of_month,day_of_week,hour_of_day
count,23200.0,23200.0,23200.0,23200.0,23200.0,23200.0,23200.0,23200.0
mean,1.789009,1.368966,2.973646,14.880554,6.512716,15.065776,3.277888,13.584138
std,0.408021,1.04169,2.894286,10.743206,3.452923,8.446987,1.954783,6.802757
min,1.0,1.0,0.25,0.01,1.0,1.0,0.0,0.0
25%,2.0,1.0,1.1,8.16,4.0,8.0,2.0,9.0
50%,2.0,1.0,2.0,11.75,7.0,15.0,3.0,15.0
75%,2.0,1.0,3.8,17.8,10.0,22.0,5.0,19.0
max,2.0,6.0,28.36,450.8,12.0,30.0,6.0,23.0


In [13]:
#Configure workspace
from azureml.core.workspace import Workspace
# Build the workspace handle from saved configuration.
# NOTE(review): presumably this reads a local workspace config file and may
# prompt for interactive sign-in, as the recorded output suggests — confirm.
ws = Workspace.from_config()

Performing interactive authentication. Please follow the instructions on the terminal.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code F5F334LJ8 to authenticate.
Interactive authentication successfully completed.


In [14]:
#data split
from sklearn.model_selection import train_test_split

# Select the label and the features without mutating final_df. Using pop()
# here would delete "totalAmount" from final_df in place, so a second run of
# this cell would fail with KeyError and x_df would alias the mutated frame.
y_df = final_df["totalAmount"]
x_df = final_df.drop(columns=["totalAmount"])

# 80/20 split; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=223)

In [15]:
# Experiment parameters and model settings for training, gathered in one
# mapping so they can be unpacked as keyword arguments with ** later on.
import logging

automl_settings = dict(
    iteration_timeout_minutes=2,
    experiment_timeout_minutes=20,
    enable_early_stopping=True,
    primary_metric='spearman_correlation',
    featurization='auto',
    verbosity=logging.INFO,
    n_cross_validations=5,
)

In [16]:
#autoML config
from azureml.train.automl import AutoMLConfig

# Training inputs are passed as plain arrays: the feature matrix as-is and
# the labels flattened to one dimension.
train_features = x_train.values
train_labels = y_train.values.flatten()

automl_config = AutoMLConfig(
    task='regression',
    debug_log='automated_ml_errors.log',
    X=train_features,
    y=train_labels,
    **automl_settings,
)



In [17]:
from azureml.core.experiment import Experiment
# Bind an experiment named "taxi-experiment" to the workspace and submit the AutoML configuration.
# NOTE(review): show_output=True appears to stream run status into the cell (see recorded log) — confirm.
experiment = Experiment(ws, "taxi-experiment")
local_run = experiment.submit(automl_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_bfe84294-2191-4ff1-9083-f96e11cd6f38
Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturizationCompleted. Completed featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS SUMMARY:
For more details, use API: run.get_guardrails()

TYPE:         Missing Values Imputation
STATUS:       PASSED
DESCRIPTION:  There were no missing values found in the training data.

TYPE:         High Cardinality Feature Detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.

**************************************************************

In [18]:
#Explore the results
from azureml.widgets import RunDetails
# Render the run-details widget for the submitted AutoML run.
RunDetails(local_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [20]:
#Retrieve the best model
best_run, fitted_model = local_run.get_output()

# Print the run and then the fitted pipeline, each preceded by a 100-character rule.
separator = '*'*100
for item in (best_run, fitted_model):
    print(separator)
    print(item)

****************************************************************************************************
Run(Experiment: taxi-experiment,
Id: AutoML_bfe84294-2191-4ff1-9083-f96e11cd6f38_1,
Type: None,
Status: Completed)
****************************************************************************************************
RegressionPipeline(pipeline=Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
        feature_sweeping_config=None, feature_sweeping_timeout=None,
        featurization_config=None, is_cross_validation=None,
        is_onnx_compatible=None, logger=None, observer=None, task=None)), ('MinM...timators=25, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
          stddev=None)


In [21]:
#Test the best model accuracy
# Predict on the held-out test features (passed as a plain array) and show the first 10 predictions.
y_predict = fitted_model.predict(x_test.values)
print(y_predict[:10])

[16.8604142  22.97201974 45.12272819  5.66122381  5.41692327 28.20365656
  5.71670597 17.06627799  6.32534225  9.83237479]


In [22]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Root-mean-squared error of the predictions against the held-out actuals.
y_actual = y_test.values.flatten().tolist()
mse = mean_squared_error(y_actual, y_predict)
rmse = sqrt(mse)
rmse

4.111740024632385

In [23]:
# Accumulate the total absolute error and the total of the actual values in one pass.
sum_errors = 0
sum_actuals = 0

for observed, predicted in zip(y_actual, y_predict):
    sum_errors += abs(observed - predicted)
    sum_actuals += observed

# Ratio of summed absolute errors to summed actuals.
# NOTE(review): this is an aggregate ratio, not the per-sample mean of
# |error| / actual — confirm that the "MAPE" label is what is intended.
mean_abs_percent_error = sum_errors / sum_actuals

print(
    "Model MAPE:",
    mean_abs_percent_error,
    "",
    "Model Accuracy:",
    1 - mean_abs_percent_error,
    sep="\n",
)

Model MAPE:
0.14008314077228862

Model Accuracy:
0.8599168592277113
