# Regression with Artificial Neural Network using Pytorch
# Abode Daniel
# TASK: Estimating the cost of a New York City Cab ride

This project considers the application of ANN to a regression problem of estimating taxi fare base on input features such as pickup datetime, pcikup location(latitude and longitude), dropoff location, and Number of Passengers

In [1]:
import torch
import torch.nn as nn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

Data can be downloaded from https://drive.google.com/file/d/1SsR0wi3i7UHEbzYB7oO-jnGj0KfCeC-x/view?usp=sharing

In [4]:
df = pd.read_csv('NYCTaxiFares.csv') #input file location
df.head()   

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


We want to create a new feature (travel distance) from the pickup and dropoff location

In [8]:
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine distance between 2 sets of GPS coordinates in df
    """
    r = 6371  # average radius of Earth in kilometers
       
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])
    
    delta_phi = np.radians(df[lat2]-df[lat1])
    delta_lambda = np.radians(df[long2]-df[long1])
     
    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = (r * c) # in kilometers

    return d

In [10]:
df['dist_km'] = haversine_distance(df,'pickup_latitude','pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321


Below we created new features Hour, time of the day (AM or PM) and Day of the Week

In [11]:
df['EDTdate'] = pd.to_datetime(df['pickup_datetime'].str[:19]) - pd.Timedelta(hours=4)
df['Hour'] = df['EDTdate'].dt.hour
df['AMorPM'] = np.where(df['Hour']<12,'am','pm')
df['Weekday'] = df['EDTdate'].dt.strftime("%a")
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM,Weekday
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56,4,am,Mon
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53,11,am,Sat
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26,7,am,Sat
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03,17,pm,Sun
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01,22,pm,Fri


Separating the dataset into categorical variables and continuous variable and label (y)

In [17]:
#Separating the dataset into categorical variables and continuous variable and label (y)
cat_cols = ['Hour', 'AMorPM', 'Weekday']
cont_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', 'passenger_count', 'dist_km']
y_col = ['fare_amount']  # this column contains the labels

In [20]:
for cat in cat_cols:
    df[cat] = df[cat].astype('category')

In [36]:
cats = torch.tensor(np.stack([df[col].cat.codes.values for col in cat_cols],1), dtype=torch.int64)
conts = torch.tensor(np.stack([df[cont].values for cont in cont_cols],1), dtype=torch.float)
y = torch.tensor(df[y_col].values, dtype=torch.float).reshape(-1,1)

We use Entity Embeddings, we embedded the categorical features

In [53]:
cat_szs = [max(df[col].cat.codes)+1 for col in cat_cols]
emb_szs = [(size, min(50, (size+1)//2)) for size in cat_szs]
emb_szs

[(24, 12), (2, 1), (7, 4)]

We created our model

In [83]:
class TabularModel(nn.Module):
    
    def __init__(self, emb_sz, n_cont, out_sz, layers, p= 0.5):
        
        super().__init__()
        
        self.embeds = nn.ModuleList([nn.Embedding(ni,nf)  for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        
        layerlist = []
        n_emb = sum([nf for ni,nf in emb_szs])
        n_in = n_emb + n_cont
        
        for i in layers:
            layerlist.append(nn.Linear(n_in, i))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in = i
            
        layerlist.append(nn.Linear(layers[-1], out_sz))
        self.layers = nn.Sequential(*layerlist)
    
    def forward(self, x_cat, x_cont):
        embeddings = []
        
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
        
        x = torch.cat(embeddings,1)
        x = self.emb_drop(x)
        
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        
        return x
        

In [84]:
torch.manual_seed(33)
model = TabularModel(emb_szs, conts.shape[1], 1, [200,100], p=0.4)

In [85]:
criterion = nn.MSELoss()  # we'll convert this to RMSE later
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Split Data into training and test portions

In [86]:
all_data_size = 120000
test_size = int(all_data_size * 1/6)
train_size = all_data_size - test_size

cat_test = cats[all_data_size-test_size:all_data_size]
con_test = conts[all_data_size-test_size:all_data_size]
y_test = y[all_data_size-test_size:all_data_size]


Training Process

In [87]:
import time
start_time = time.time()

epochs = 300
losses = []
batch_size = 25000
j = 0
while(j <= 3):
    cat_train = cats[batch_size*j:batch_size*(j+1)]
    con_train = conts[batch_size*j:batch_size*(j+1)]
    y_train = y[batch_size*j:batch_size*(j+1)]
    j = j + 1
 

    for i in range(epochs):
        i += 1
        y_pred = model(cat_train, con_train)
        loss = torch.sqrt(criterion(y_pred, y_train))
        losses.append(loss)
    
        if i%25 == 1:
            print(f'epoch: {i:3} loss: {loss.item():10.8f}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
print(f'epoch: {i:3}  loss: {loss.item():10.8f}') # print the last line
print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

epoch:   1 loss: 12.41077995
epoch:  26 loss: 11.45947266
epoch:  51 loss: 10.41737366
epoch:  76 loss: 9.49142075
epoch: 101 loss: 8.69645405
epoch: 126 loss: 7.78818989
epoch: 151 loss: 6.68477440
epoch: 176 loss: 5.45501709
epoch: 201 loss: 4.34321737
epoch: 226 loss: 3.67650723
epoch: 251 loss: 3.49558926
epoch: 276 loss: 3.42485332
epoch:   1 loss: 3.43353057
epoch:  26 loss: 3.35797477
epoch:  51 loss: 3.30640268
epoch:  76 loss: 3.31405306
epoch: 101 loss: 3.29401064
epoch: 126 loss: 3.29130268
epoch: 151 loss: 3.27930069
epoch: 176 loss: 3.23669934
epoch: 201 loss: 3.21774626
epoch: 226 loss: 3.21386027
epoch: 251 loss: 3.19866371
epoch: 276 loss: 3.19297338
epoch:   1 loss: 3.31897354
epoch:  26 loss: 3.28159809
epoch:  51 loss: 3.26169443
epoch:  76 loss: 3.27153015
epoch: 101 loss: 3.22311211
epoch: 126 loss: 3.23247552
epoch: 151 loss: 3.24099636
epoch: 176 loss: 3.22485375
epoch: 201 loss: 3.23189998
epoch: 226 loss: 3.21711206
epoch: 251 loss: 3.19877720
epoch: 276 loss: 

Validating with Test data

In [88]:
with torch.no_grad():
    y_val = model(cat_test, con_test)
    loss = torch.sqrt(criterion(y_val, y_test))
print(f'RMSE: {loss:.8f}')

print(f'{"PREDICTED":>12} {"ACTUAL":>8} {"DIFF":>8}')
for i in range(50):
    diff = np.abs(y_val[i].item()-y_test[i].item())
    print(f'{i+1:2}. {y_val[i].item():8.4f} {y_test[i].item():8.4f} {diff:8.4f}')

RMSE: 3.25461245
   PREDICTED   ACTUAL     DIFF
 1.  14.4417  15.3000   0.8583
 2.  12.4889  11.3000   1.1889
 3.   5.0818   3.3000   1.7818
 4.   5.8665   6.5000   0.6335
 5.   8.6589  11.4700   2.8111
 6.   4.8507   4.1000   0.7507
 7.   5.2547   6.1000   0.8453
 8.  11.5635  10.1000   1.4635
 9.   5.7535   7.3000   1.5465
10.   4.2073   4.5000   0.2927
11.   6.3661   7.3000   0.9339
12.   9.2370   9.7000   0.4630
13.   9.8984  10.1000   0.2016
14.   8.6947   9.3000   0.6053
15.  16.2221  16.1000   0.1221
16.   6.8800   4.9000   1.9800
17.   8.5090   8.9000   0.3910
18.  12.8557   9.3000   3.5557
19.   8.6944  13.3000   4.6056
20.   8.4827   6.5000   1.9827
21.   5.0843   8.1000   3.0157
22.   4.7828   4.5000   0.2828
23.   5.4880   4.5000   0.9880
24.   8.4054  11.3000   2.8946
25.  10.4138   8.9000   1.5138
26.  10.7654  11.3000   0.5346
27.  10.1396  10.5000   0.3604
28.   4.2825   4.5000   0.2175
29.   4.2771   3.3000   0.9771
30.   9.1060  10.5000   1.3940
31.   5.7289   3.7000 

In [73]:
torch.save(model.state_dict(), 'TaxiFareRegrModel.pt')