### 1. Import all the necessary libraries

In [79]:
import torch
import torchvision
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.optim as optim
%matplotlib inline

### 2. Loading the Dataset

In [80]:
df_tr = pd.read_csv("train.csv")
df_tr.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


### 3. Get Computed Time from POLYLINE

Our goal is to predict the travel-time of the taxi, which can be derived from the POLYLINE length.

Recall:

```
The travel time of the trip (the prediction target of this project) is defined as the (number of points-1) x 15 seconds. 
For example, a trip with 101 data points in POLYLINE has a length of (101-1) * 15 = 1500 seconds. Some trips have missing data points in POLYLINE, indicated by MISSING_DATA column, and it is part of the challenge how you utilize this knowledge.
```


In [81]:
# Over every single 
def polyline_to_trip_duration(polyline):
  return max(polyline.count("[") - 1, 0) * 15

# This code creates a new column, "LEN", in our dataframe. The value is
# the (polyline_length - 1) * 15
df_tr["LEN"] = df_tr["POLYLINE"].apply(polyline_to_trip_duration)

### 4. Test and Extract the features: (Original Call + HR + WK + MON + TAXI_ID)

In [82]:
# Verify our guesses of the patterns of TAXI_ID such that all the IDs are in the form of 
# 20000xxx by substracting all the numbers by 20000000 and check if they are between the 
# range [0,1000).
def TAXI_ID_pattern_checker(x):
    # Test if the only last 3 digits of the TRIP_ID exhibit a pattern
    for idx in range(len(x)):
        if (x[idx]-20000000) < 0 or (x[idx]-20000000) >= 1000:
            return False
    return True

if TAXI_ID_pattern_checker(df_tr["TAXI_ID"]):
    print("Pattern is found!")

# Note that the only last three digits of the TAXI_ID are nonzero.
def parse_TAXI_ID(x):
    return (x % pow(10,3)) 

df_tr["Unique_TAXI_ID"] = df_tr["TAXI_ID"].apply(parse_TAXI_ID)

Pattern is found!


In [83]:
from datetime import datetime
def parse_time(x):
  # We are using python's builtin datetime library
  # https://docs.python.org/3/library/datetime.html#datetime.date.fromtimestamp

  # Each x is essentially a 1 row, 1 column pandas Series
  dt = datetime.fromtimestamp(x["TIMESTAMP"])
  return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

# Because we are assigning multiple values at a time, we need to "expand" our computed (year, month, day, hour, weekday) tuples on 
# the column axis, or axis 1
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html
df_tr[["YR", "MON", "DAY", "HR", "WK"]] = df_tr[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")

### 5. Data Encoding

In [84]:
outlier_threshold = 3

type_A = df_tr[df_tr["CALL_TYPE"]=="A"]

mean, std = type_A["LEN"].mean(), type_A["LEN"].std()
# "Choose all data, where the trip length is less than 3 standard deviations away from the mean"
# This is to remove outliers. Otherwise, our plots would look very squished (since there are some
# VERRRRRY long taxi trips in the dataset)
df_trimmed_A = type_A[type_A["LEN"] < mean + outlier_threshold * std]

# Shuffle the data set to avoid clustering and improve the randomness to our data set
df_trimmed_A = df_trimmed_A.sample(frac = 1)
# Split our data set into two parts: 80% training set and 20% test set.
train_set_A = df_trimmed_A[0:int(len(df_trimmed_A) * 0.8)][["Unique_TAXI_ID","ORIGIN_CALL","MON","WK","HR","LEN"]]
val_set_A = df_trimmed_A[int(len(df_trimmed_A)*0.8):][["Unique_TAXI_ID","ORIGIN_CALL","MON","WK","HR","LEN"]]

In [85]:
def id_to_binary_vector(x):
    vector_len = 24
    result = []
    
    for idx in range(len(x)):
        feature_vector = np.zeros(vector_len)
        feature_id = int(x.iloc[idx])
        length = len(list('{0:0b}'.format(feature_id)))
        
        if feature_id != 0:
            for digit in list('{0:0b}'.format(feature_id)):
                feature_vector[vector_len-length] = float(digit)
                length = length - 1
                
        result.append(feature_vector)
    
    return result

def id_to_one_hot_vector(x):
    result = []
    
    for idx in range(len(x)):
        vector = [0]*24
        vector[x.iloc[idx]] = 1
        result.append(vector)
    
    return result

In [86]:
train_data_A_MON = np.float32(id_to_one_hot_vector(train_set_A["MON"]))
val_data_A_MON = np.float32(id_to_one_hot_vector(val_set_A["MON"]))

train_data_A_WK = np.float32(id_to_one_hot_vector(train_set_A["WK"]))
val_data_A_WK = np.float32(id_to_one_hot_vector(val_set_A["WK"]))

train_data_A_HR = np.float32(id_to_one_hot_vector(train_set_A["HR"]))
val_data_A_HR = np.float32(id_to_one_hot_vector(val_set_A["HR"]))

train_data_A_TAXI_ID = np.float32(id_to_binary_vector(train_set_A["Unique_TAXI_ID"]))
val_data_A_TAXI_ID = np.float32(id_to_binary_vector(val_set_A["Unique_TAXI_ID"]))

train_data_A_ORIGIN_CALL = np.float32(id_to_binary_vector(train_set_A["ORIGIN_CALL"]))
val_data_A_ORIGIN_CALL = np.float32(id_to_binary_vector(val_set_A["ORIGIN_CALL"]))

In [87]:
result_train_set_A = []
result_val_set_A = []

for idx in range(len(train_set_A["MON"])):
    result_train_set_A.append(np.column_stack((train_data_A_MON[idx],
                                               train_data_A_WK[idx],
                                               train_data_A_HR[idx],
                                               train_data_A_TAXI_ID[idx],
                                               train_data_A_ORIGIN_CALL[idx])))

for idx in range(len(val_set_A["MON"])):
    result_val_set_A.append(np.column_stack((val_data_A_MON[idx],
                                               val_data_A_WK[idx],
                                               val_data_A_HR[idx],
                                               val_data_A_TAXI_ID[idx],
                                               val_data_A_ORIGIN_CALL[idx])))

In [88]:
# for idx in range(len(train_set_A["MON"])):
#     result_train_set_A.append(np.column_stack((train_data_A_MON[idx],
#                                                train_data_A_WK[idx],
#                                                train_data_A_HR[idx])))

# for idx in range(len(val_set_A["MON"])):
#     result_val_set_A.append(np.column_stack((val_data_A_MON[idx],
#                                                val_data_A_WK[idx],
#                                                val_data_A_HR[idx])))

In [89]:
train_label_A = train_set_A["LEN"].tolist()
train_label_max = max(train_label_A)
train_label_min = min(train_label_A)
train_label_A_normalize = [(val - train_label_min) / (train_label_max - train_label_min) for val in train_label_A]

val_label_A = val_set_A["LEN"].tolist()
val_label_max = max(val_label_A)
val_label_min = min(val_label_A)
val_label_A_normalize = [(val -  val_label_min) / (val_label_max - val_label_min) for val in val_label_A]

In [90]:
train_bound = (50 - train_label_min) / (train_label_max - train_label_min)

In [91]:
train_bound

0.02178649237472767

In [92]:
train_data_A = torch.Tensor(result_train_set_A)
val_data_A = torch.Tensor(result_val_set_A)

In [93]:
# Transform the data into tuples (data, label) so that 
# Dataloader in PyTorch can make use of them and we 
# can traverse each dataset with data + labels
train_tuple_A = list(zip(train_data_A, train_label_A_normalize))
val_tuple_A = list(zip(val_data_A, val_label_A_normalize))

### 6. Conversion of Dataset to Dataloader

In [94]:
# The Dataset retrieves our dataset’s features and labels one sample at a time. While training a model, 
# we typically want to pass samples in “minibatches”, reshuffle the data at every epoch to reduce model overfitting, 
# and use Python’s multiprocessing to speed up data retrieval.
#
#
# Typical Ways to iterate through the dataset: 
# iterator = iter(train_loader) sample = next(iterator)

train_loader_A = torch.utils.data.DataLoader(train_tuple_A, batch_size=32,
                                      shuffle=True, num_workers=2)

val_loader_A = torch.utils.data.DataLoader(val_tuple_A, batch_size=32,
                                      shuffle=True, num_workers=2)

In [95]:
# Device will determine whether to run the training on GPU or CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

### 7. Building up a neural network

In [98]:
import torch.nn as nn
import torch.nn.functional as F

class MLP_Classifier(torch.nn.Module):
    def __init__(self):
        super(MLP_Classifier, self).__init__() #Refers to the fact that this is a subclass of nn.Module and is inheriting all methods
        """
        the __init__() method that defines the layers and other components
        """ 
        self.model = torch.nn.Sequential( #an ordered container of modules
            nn.Linear(24*5, 120),
            nn.ReLU(),
            nn.Linear(120, 32),
            nn.Dropout(p = 0.2),
            nn.ReLU(),
            nn.Linear(32, 1)
        )   
        
    def forward(self, x): #You never have to call model.forward(x)
        """
        the forward function is where computatioin gets done
        """
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        
        out = self.model(x)    

        return out

In [99]:
model = MLP_Classifier()

In [100]:
model

MLP_Classifier(
  (model): Sequential(
    (0): Linear(in_features=120, out_features=120, bias=True)
    (1): ReLU()
    (2): Linear(in_features=120, out_features=32, bias=True)
    (3): Dropout(p=0.2, inplace=False)
    (4): ReLU()
    (5): Linear(in_features=32, out_features=1, bias=True)
  )
)

### 8. Define loss function and optimizer

In [101]:
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

### 9. Test and Train the network

In [102]:
def train_epoch(train_loader, model, optimizer, loss_function):
    losses = []
    # get a batch of training data from the train_loader (DataLoader obj)
    for i, data in enumerate(train_loader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
                    
#         inputs = inputs.to(device)
#         labels = labels.to(device)
        
        # make predictions for this batch
        output = model(inputs)
        
        # Compute the loss
        loss = loss_function(output, labels.float())
        
        # Backpropagation
        # zero out the gradients so that it will not accumulate through each iteration
        optimizer.zero_grad()
        
        # Compute the gradents with the backward call (backprop)
        loss.backward()
        
        # Update weight using gradient descent 
        optimizer.step()
        
        losses.append(loss.item())
        
    return np.mean(losses)

def eval_epoch(valid_loader, model, loss_function,bound):
    total = 0
    correct = 0
    preds = []
    trues = []
    
    with torch.no_grad(): 
        for i, (x, t) in enumerate(valid_loader):
            
            # Compute prediction
            y = model(x)
            
            prediction = y.data
            
            total += t.shape[0]
            for idx in range(len(t)):
                if abs(prediction[idx] - t[idx]) < bound:
                    correct += 1
            preds.append(y.data.numpy())
            trues.append(t.data.numpy())
            
    return correct/total*100., np.concatenate(preds), np.concatenate(trues)

In [None]:
from tqdm import tqdm

total_epochs = 20
train_accs, valid_accs = [], []
max_acc = 0
for epoch in tqdm(range(total_epochs)):
    
    model.train() # gradient tracking is on
    
    train_loss = train_epoch(train_loader_A, model, optimizer, criterion)
    train_loss = train_loss * ((train_label_max - train_label_min)**2) # Normalzie the training loss back

    model.eval() # we don't need gradients on to do reporting
    
    train_acc, train_preds, train_trues = eval_epoch(train_loader_A, model, criterion, train_bound)
    valid_acc, valid_preds, valid_trues = eval_epoch(val_loader_A, model, criterion, train_bound)

    train_accs.append(train_acc)
    valid_accs.append(valid_acc)
 
    print(f"Epoch: {epoch+1}, Train Loss: {train_loss:>0.4f}, Train Accuracy: {train_acc:>0.2f}%, Validation Accuracy: {valid_acc:>0.2f}% \n")

  5%|▌         | 1/20 [01:57<37:19, 117.87s/it]

Epoch: 1, Train Loss: 135690.2868, Train Accuracy: 10.97%, Validation Accuracy: 10.85% 



In [None]:
for i, data in enumerate(train_loader_A):
        # get the inputs; data is a list of [inputs, labels]
    inputs, labels = data
                    
#         inputs = inputs.to(device)
#         labels = labels.to(device)
        
        # make predictions for this batch
    output = model(inputs)
    print(output)

In [None]:
output_normalize = [(val * (train_label_max - train_label_min) + train_label_min) for val in output]
output_normalize