In [0]:
%matplotlib inline
!pip install -U tables
import pandas as pd
import time
import numpy as np
import sklearn
from sklearn import linear_model
import json
import matplotlib.pyplot as plt
import os
from tqdm.notebook import tqdm
import mxnet as mx
import pandarallel
from mxboard import *

Requirement already up-to-date: tables in /home/ubuntu/anaconda3/lib/python3.6/site-packages (3.6.1)


In [0]:
# Optional Google Colab setup (disabled): mount Google Drive.
#from google.colab import drive
#drive.mount('/content/gdrive')
# Random state handed to train_test_split in get_dataset().
seed = 42
# Directory that holds data.h5; '~' is expanded to the current user's home.
dataset_root = os.path.expanduser('~/TaxiData')

In [0]:
#num_samples = 1000000
#train_indices, test_indices = sklearn.model_selection.train_test_split(np.arange(num_samples))

def get_dataset():
    """Load the trip table from ``data.h5`` and split it into train/test frames.

    The file is read from ``dataset_root`` and split with the module-level
    ``seed`` as ``random_state``, so repeated calls give the same partition.

    Returns:
        tuple: ``(train, test)`` DataFrames. Both are explicit copies, so
        later in-place edits do not operate on slices of the loaded frame.
    """
    # Import the submodule explicitly: a bare `import sklearn` does not
    # guarantee that `sklearn.model_selection` is available as an attribute.
    from sklearn.model_selection import train_test_split

    dataset = pd.read_hdf(os.path.join(dataset_root, 'data.h5'))
    train, test = train_test_split(dataset, random_state=seed)
    return train.copy(), test.copy()

In [0]:
# Load data.h5 and split it into the train / test DataFrames used below.
train, test = get_dataset()

In [0]:
# Categorical feature name -> (embedding input_dim, embedding output_dim).
# Net creates one nn.Embedding(in_dim, out_dim) per entry, in this order.
# NOTE(review): an embedding with input_dim N accepts indices 0..N-1. If the
# raw IDs (e.g. PULocationID / DOLocationID) are 1-based and can reach N, the
# largest ID falls outside the table -- confirm against the data.
category_features = {
    'VendorID':(2, 2),
    'PULocationID': (265, 20),
    'DOLocationID': (265, 20),
    'payment_type': (5, 2),
    'week': (53, 10),
    'dayofweek': (7, 10),
    'quarterofday': (96, 10)
}
# Columns kept out of the dense ("conds") input: the raw timestamps and every
# categorical feature listed above.
exclude = ['tpep_pickup_datetime', 'tpep_dropoff_datetime'] + list(category_features.keys())
# NOTE(review): this is evaluated on the raw frame, before preprocess() adds
# 'direction', 'distance_*' and 'duration', so those engineered columns are not
# part of `columns` -- confirm that leaving them out of the model input is intended.
columns = [c for c in train.columns if c not in exclude]

In [0]:
# Trip counts per location ID in the training split. Every ID in 1..265 starts
# at zero so IDs that never occur in `train` still have an entry; observed
# counts then overwrite the zeros.
train_pickup_counts = {
    **dict.fromkeys(range(1, 266), 0),
    **train['PULocationID'].value_counts().to_dict(),
}
train_dropoff_counts = {
    **dict.fromkeys(range(1, 266), 0),
    **train['DOLocationID'].value_counts().to_dict(),
}

def get_pickup_dropoff_count(pu_map, do_map):
    """Build a per-row lookup of pickup and dropoff counts.

    Args:
        pu_map: mapping indexed by a row's 'PULocationID' value.
        do_map: mapping indexed by a row's 'DOLocationID' value.

    Returns:
        A one-argument function. Given a row-like object that supports
        ``row['PULocationID']`` and ``row['DOLocationID']``, it returns the
        tuple ``(pu_map[pickup_id], do_map[dropoff_id])``.
    """
    def lookup_counts(row):
        pickup_count = pu_map[row['PULocationID']]
        dropoff_count = do_map[row['DOLocationID']]
        return pickup_count, dropoff_count

    return lookup_counts

In [0]:
def get_date(df):
    """Derive calendar features and the trip duration from the timestamps, in place.

    Columns added to ``df``:
        week: zero-based ISO week of the pickup time (0..52), so it is a valid
            index into a 53-row embedding table.
        dayofweek: pickup day of week, 0 (Monday) .. 6 (Sunday).
        quarterofday: quarter-hour slot of the pickup time,
            ``hour * 4 + minute // 15`` (0..95).
        duration: dropoff minus pickup, in hours.

    The two timestamp columns are dropped afterwards. ``df`` is mutated and
    nothing is returned.

    Args:
        df: DataFrame with datetime columns 'tpep_pickup_datetime' and
            'tpep_dropoff_datetime'.
            NOTE(review): the ISO-week branch casts to int64 and therefore
            assumes pickup timestamps contain no NaT -- confirm.
    """
    pickup = df['tpep_pickup_datetime']
    dropoff = df['tpep_dropoff_datetime']
    pickup_dt = pickup.dt

    # `Series.dt.weekofyear` is deprecated and removed in recent pandas; use
    # isocalendar() when it exists and fall back on older installations.
    if hasattr(pickup_dt, 'isocalendar'):
        iso_week = pickup_dt.isocalendar().week.astype('int64')
    else:
        iso_week = pickup_dt.weekofyear
    # ISO weeks run 1..53; shift to 0..52.
    df['week'] = iso_week - 1

    df['dayofweek'] = pickup_dt.dayofweek

    # The original added `dt.quarter` (quarter of the *year*, 1..4), which is
    # unrelated to the time of day and produced values up to 96. The quarter
    # of the hour is minute // 15.
    df['quarterofday'] = pickup_dt.hour * 4 + pickup_dt.minute // 15

    df['duration'] = (dropoff - pickup).dt.total_seconds() / 3600
    df.drop(columns=['tpep_dropoff_datetime', 'tpep_pickup_datetime'], inplace=True)

In [0]:
def haversine_array(lat1, lng1, lat2, lng2):
    """Haversine distance in kilometres between point 1 and point 2.

    Args:
        lat1, lng1: latitude / longitude of the first point(s), in degrees.
        lat2, lng2: latitude / longitude of the second point(s), in degrees.
            Scalars or NumPy arrays; arrays are processed element-wise.

    Returns:
        The distance(s) in km, computed with an Earth radius of 6371 km.
    """
    earth_radius_km = 6371
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lng1, lat2, lng2))
    half_dlat = (phi2 - phi1) * 0.5
    half_dlng = (lam2 - lam1) * 0.5
    hav = np.sin(half_dlat) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlng) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    """Approximate "Manhattan" distance as the sum of two haversine legs.

    One leg changes only the longitude (lat1 held fixed) and the other changes
    only the latitude (lng1 held fixed). Inputs are in degrees; the result is
    in the same unit as ``haversine_array`` returns.
    """
    legs = (
        haversine_array(lat1, lng1, lat1, lng2),  # longitude-only leg
        haversine_array(lat1, lng1, lat2, lng1),  # latitude-only leg
    )
    return legs[0] + legs[1]

def bearing_array(lat1, lng1, lat2, lng2):
    """Bearing from point 1 towards point 2, in degrees.

    Args:
        lat1, lng1: latitude / longitude of the start point(s), in degrees.
        lat2, lng2: latitude / longitude of the end point(s), in degrees.
            Scalars or NumPy arrays; arrays are processed element-wise.

    Returns:
        ``degrees(arctan2(y, x))``, i.e. a value in [-180, 180] where 0 points
        towards increasing latitude and 90 towards increasing longitude.
    """
    # Only the longitude *difference* and the two latitudes enter the formula,
    # so the individual longitudes are not converted to radians.
    dlng = np.radians(lng2 - lng1)
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    y = np.sin(dlng) * np.cos(phi2)
    x = np.cos(phi1) * np.sin(phi2) - np.sin(phi1) * np.cos(phi2) * np.cos(dlng)
    return np.degrees(np.arctan2(y, x))

In [0]:
def preprocess(df):
    """Engineer features on ``df`` in place and extract model inputs and labels.

    Steps: add the calendar / duration columns via ``get_date``, add the
    direction and the two distance columns, drop rows whose duration exceeds
    8 hours, then convert everything to float32 NumPy arrays.

    Args:
        df: trip DataFrame; it is mutated (columns added, rows dropped).

    Returns:
        tuple: ``([conds, *cats], duration)`` where ``conds`` is the 2-D array
        of ``df[columns]``, ``cats`` holds one 1-D array per key of
        ``category_features`` (same order), and ``duration`` is the label
        array in hours.
    """
    get_date(df)

    # Read the coordinates once; all three geometric features share them.
    # NOTE(review): the X columns are passed in the `lat` slots and the Y
    # columns in the `lng` slots of the helpers -- confirm this matches the
    # meaning of PULocationX/Y and DOLocationX/Y.
    pu_x = df['PULocationX'].values
    pu_y = df['PULocationY'].values
    do_x = df['DOLocationX'].values
    do_y = df['DOLocationY'].values
    df['direction'] = bearing_array(pu_x, pu_y, do_x, do_y)
    df['distance_haversine'] = haversine_array(pu_x, pu_y, do_x, do_y)
    df['distance_dummy_manhattan'] = dummy_manhattan_distance(pu_x, pu_y, do_x, do_y)

    # drop > 8 hours
    too_long = df[df['duration'] > 8].index
    df.drop(too_long, inplace=True)
    #df['pickup_loc_count'], df['dropoff_loc_count'] = zip(*df.parallel_apply(get_pickup_dropoff_count(train_pickup_counts, train_dropoff_counts), axis=1))

    float_dtype = 'float32'
    labels = df['duration'].to_numpy(dtype=float_dtype)
    cat_arrays = [df[name].to_numpy(dtype=float_dtype) for name in category_features.keys()]
    # NOTE(review): `columns` is a module-level list built outside this
    # function; only those columns reach the dense input.
    dense = df[columns].to_numpy(dtype=float_dtype)
    return [dense, *cat_arrays], labels

In [0]:
# preprocess() mutates its argument (adds columns, drops rows with duration
# > 8 h), so `train` and `test` are changed by these calls.
train_data, train_labels = preprocess(train)
test_data, test_labels = preprocess(test)

In [0]:
# Largest trip duration (hours) left in the training frame after preprocess()
# removed rows above 8 hours.
y_range = train['duration'].max()
y_range

7.999722222222222

In [0]:
# The NumPy arrays returned by preprocess() are used from here on; drop the
# DataFrame names so the frames can be garbage-collected.
del train
del test

In [0]:
import mxnet as mx
import mxnet.gluon.nn as nn

In [0]:
class OneHotNet(nn.Block):
    """Baseline block: one ``Dense(1)`` layer followed by ReLU on the dense features.

    Despite the name, no one-hot encoding is performed: the categorical
    inputs are accepted only so the call signature matches ``Net`` and are
    ignored. (The one-hot path and the hidden Dense/BatchNorm stack were
    already disabled in the original code.)
    """

    def __init__(self, **kwargs):
        # Run `nn.Block`'s init method
        super(OneHotNet, self).__init__(**kwargs)
        self.dense = nn.Dense(1, use_bias=True)
        self.relu = nn.Activation('relu')

    def forward(self, conds, *cats):
        """Return ``relu(dense(conds))``.

        Args:
            conds: dense feature batch fed to the linear layer.
            *cats: categorical feature batches; unused. The original built a
                list of expanded copies of them that was never read, which is
                dropped here.
        """
        return self.relu(self.dense(conds))
    
class Net(nn.Block):
    """Regressor combining per-category embeddings with a dense-feature branch.

    Categorical branch: one ``nn.Embedding`` per entry of
    ``category_features``; the outputs are concatenated, then BatchNorm + ReLU.
    Dense branch: ``Dense(100)`` + BatchNorm + ReLU on ``conds``.
    Head: both branches are concatenated, then ``Dense(300)`` + BatchNorm +
    ReLU, and finally ``Dense(1)`` + ReLU.
    """

    def __init__(self, **kwargs):
        # Run `nn.Block`'s init method
        super(Net, self).__init__(**kwargs)

        # Embedding tables live in a plain list, so each one is registered
        # explicitly as a child under its feature name.
        # NOTE(review): index values fed to a table must be < in_dim --
        # confirm the ID ranges of the categorical columns.
        tables = []
        for feat_name, (in_dim, out_dim) in category_features.items():
            table = nn.Embedding(in_dim, out_dim)
            tables.append(table)
            self.register_child(table, feat_name)
        self.cat_embeddings = tables

        # Normalisation + activation applied to the concatenated embeddings.
        cat_head = nn.Sequential()
        cat_head.add(nn.BatchNorm(), nn.Activation('relu'))
        self.concat_bn_relu = cat_head

        # Branch for the dense ("conds") features.
        cond_branch = nn.Sequential()
        cond_branch.add(
            nn.Dense(100, use_bias=True),
            nn.BatchNorm(),
            nn.Activation('relu'),
        )
        self.cond_dense = cond_branch

        # Hidden layer on top of the joined branches (a second Dense(50)
        # stage was disabled in the original code).
        hidden = nn.Sequential()
        hidden.add(
            nn.Dense(300, use_bias=True),
            nn.BatchNorm(),
            nn.Activation('relu'),
        )
        self.blk = hidden

        self.dense = nn.Dense(1)
        self.relu = nn.Activation('relu')

    def forward(self, conds, *cats):
        """Run the network on one batch.

        Args:
            conds: dense feature batch.
            *cats: one index batch per categorical feature, in the order of
                ``category_features``.

        Returns:
            Output of the final ``Dense(1)`` layer after ReLU.
        """
        embedded = [table(cats[idx]) for idx, table in enumerate(self.cat_embeddings)]
        cond_feat = self.cond_dense(conds)
        cat_feat = self.concat_bn_relu(mx.nd.concat(*embedded))
        joined = mx.nd.concat(cond_feat, cat_feat)
        return self.relu(self.dense(self.blk(joined)))

In [0]:
#ds = PandasDataset(train)
#ds = ds.transform(preprocess)
# Each sample is (conds, cat_1, ..., cat_k, label): the feature arrays from
# preprocess() followed by the duration label as the last element.
train_dataset = mx.gluon.data.ArrayDataset(*train_data, train_labels)
test_dataset = mx.gluon.data.ArrayDataset(*test_data, test_labels)

In [0]:
# Mini-batch size shared by both loaders.
batch_size = 64
# NOTE(review): no `shuffle=True` is passed, so batches follow the dataset
# order in every epoch -- confirm this is intended for training.
train_loader = mx.gluon.data.DataLoader(train_dataset, batch_size=batch_size)
# last_batch='keep' so a smaller final batch is still evaluated.
test_loader = mx.gluon.data.DataLoader(test_dataset, batch_size=batch_size, last_batch='keep')

In [0]:
# Train `Net` with RMSProp on the L2 loss, logging per-batch metrics to
# MXBoard and saving a checkpoint after every epoch.

# Seed MXNet's RNG so parameter initialisation is reproducible.
mx.random.seed(seed)
net = Net()
net.initialize(init=mx.init.Xavier())
sw = SummaryWriter(logdir='./logs/today')
trainer = mx.gluon.Trainer(net.collect_params(), 'rmsprop', {'learning_rate': 0.001})
l2_loss = mx.gluon.loss.L2Loss()
huber_loss = mx.gluon.loss.HuberLoss(rho=0.7)  # alternative loss, currently unused
global_step = 0
try:
    for epoch in range(10):
        train_loss = 0.
        tic = time.time()
        # A gluon DataLoader restarts whenever it is iterated again; it has no
        # reset() method (that belongs to mx.io.DataIter), so none is called.
        num_batches = len(train_loader)
        for batch in tqdm(train_loader):
            features, label = batch[:-1], batch[-1]
            # forward + backward
            with mx.autograd.record():
                output = net(*features)
                loss = l2_loss(output, label)
                # loss = huber_loss(output, label)
            loss.backward()
            # update parameters; normalise by the actual batch size so a
            # smaller final batch is not under-weighted
            trainer.step(label.shape[0])
            # calculate training metrics
            cur_loss = loss.mean().asscalar()
            # `output` is (batch, 1) while `label` is (batch,); subtracting
            # them directly broadcasts to (batch, batch). Flatten first.
            residual = output.reshape((-1,)) - label
            mean_abs_error = residual.abs().mean().asscalar() * 3600  # hours -> seconds
            rmse = np.sqrt((residual ** 2).mean().asscalar())
            train_loss += cur_loss / num_batches
            sw.add_scalar('mean_abs_error', mean_abs_error, global_step)
            sw.add_scalar('loss', cur_loss, global_step)
            sw.add_scalar('RMSE', rmse, global_step)
            global_step += 1
        filename = "./epoch_today_{}.params".format(epoch)
        net.save_parameters(filename)
        print("Epoch %d: loss %.3f in %.1f sec" % (epoch, train_loss, time.time()-tic))
finally:
    # Flush and close the event file even when training is interrupted.
    sw.close()

HBox(children=(IntProgress(value=0, max=786761), HTML(value='')))

KeyboardInterrupt: 

In [0]:
# Save the current network weights (also useful after an interrupted run).
net.save_parameters('today.params')

In [0]:
from sklearn.metrics import mean_absolute_error
def fit_and_test(model, X, y, metric=mean_absolute_error, scale=3600):
    '''
    Fit the model and report its performance on the training set.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: feature matrix used both for fitting and for the evaluation.
        y: target values.
        metric: callable ``metric(y_true, y_pred)`` returning a number.
        scale: factor applied to the metric before printing. The default
            of 3600 converts a value in hours to seconds.

    Returns:
        The fitted ``model``. The model type and the scaled metric are printed.
    '''
    model.fit(X, y)
    y_pred = model.predict(X)
    print(type(model))
    print(metric(y, y_pred) * scale)
    return model
# Baseline: linear regression on the dense feature matrix only (train_data[0]
# is the `conds` array from preprocess); the printed score is measured on the
# training data itself.
linear_reg = fit_and_test(sklearn.linear_model.LinearRegression(), train_data[0], train_labels)

<class 'sklearn.linear_model.base.LinearRegression'>
434.6104234457016
