In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from datetime import datetime
import torch.nn.functional as F
import time
import matplotlib.pyplot as plt


#
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
data = pd.read_csv('train.csv')

In [5]:
#only 10 rows with missing data just dropping
data = data[data['MISSING_DATA'] == False]

In [6]:
#maybe keep missing? just split into different dataset

In [7]:
def polyline_to_trip_duration(polyline):
    return max(polyline.count("[") - 2, 0) * 15 #subtracting 2 because one is for the opening bracket?

# This code creates a new column, "LEN", in our dataframe. The value is
# the (polyline_length - 1) * 15, where polyline_length = count("[") - 1
data["LEN"] = data["POLYLINE"].apply(polyline_to_trip_duration)

**Making time columns**

In [3]:
def parse_time(x):
    dt = datetime.fromtimestamp(x["TIMESTAMP"])
    return dt.year, dt.month, dt.day, dt.hour, dt.weekday()

In [9]:
data[["YR", "MON", "DAY", "HR", "WK"]] = (
    data[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
)

KeyboardInterrupt: 

**Data engineering**

In [None]:
#filling nan's with -1 to avoid errors
data['ORIGIN_STAND'].fillna(-1, inplace = True)
data['ORIGIN_CALL'].fillna(-1, inplace = True)

In [None]:
type_map = {'A': 1, 'B': 2, 'C': 3}
data['CALL_TYPE'] = data['CALL_TYPE'].map(type_map).astype(int) 
data['DAY_TYPE'] = data["DAY_TYPE"].map(type_map).astype(int) 

In [None]:
#interactions between (day, hour), (hour, stand)
#new features: 
#   TOD: [morning, afternoon, evening, late night]
#   Time from noon (abs or not)

In [None]:
#making interaction features
data['STAND_HR'] = data['ORIGIN_STAND'].astype(str) + '_' + data['HR'].astype(str)
data['STAND_DAY'] = data['DAY'].astype(str) + '_' + data['ORIGIN_STAND'].astype(str)
data['DAY_HR'] = data['DAY'].astype(str) + '_' + data['HR'].astype(str)

In [None]:
data['TOD'] = pd.cut(data['HR'], bins=[0, 12, 16, 20, 24], labels=[1, 2, 3, 4], right=False)
data['FROM_NOON'] = data['HR'] - 12 #note that this column is continuous

In [None]:
embed_cols = (['YR', 'MON', 'DAY', 'HR', 'WK',
               'ORIGIN_STAND', 'CALL_TYPE', 'TAXI_ID',
              'STAND_HR', 'STAND_DAY', 'DAY_HR', "TOD"])

for column in embed_cols:
    data[column] = data[column].astype('category').cat.codes

#save this data as csv for later
data.to_csv('engineered_data.csv', index = False)

## Splitting

In [4]:
data = pd.read_csv('engineered_data.csv')

In [5]:
#since the data is very wide, we'll do a 90-10 split to give the model more data
train, valid = train_test_split(data, train_size = 0.90)

In [6]:
del data  #clearing memory since this dataframe is now redundant

In [7]:
embed_cols = (['YR', 'MON', 'DAY', 'HR', 'WK',
               'ORIGIN_STAND', 'CALL_TYPE', 'TAXI_ID',
              'STAND_HR', 'STAND_DAY', 'DAY_HR', "TOD"])

In [8]:
input_size = train[embed_cols].nunique().sum()

**Preparing input for model**

In [9]:
X_train = train[["YR", "MON", "DAY", "HR", "WK", 
                 'ORIGIN_STAND', 'CALL_TYPE', 
                 'TAXI_ID', 'STAND_HR', 'STAND_DAY', 
                 'DAY_HR', "TOD"]]
Y_train = train["LEN"]

In [10]:
X_valid = valid[["YR", "MON", "DAY", "HR", "WK", 'ORIGIN_STAND', 
                 'CALL_TYPE', 'TAXI_ID', 'STAND_HR',
                 'STAND_DAY', 'DAY_HR', "TOD"]] 
Y_valid = valid["LEN"]

In [11]:
X_train = torch.Tensor(X_train.to_numpy())

X_valid = torch.Tensor(X_valid.to_numpy())

In [12]:
batch_size = 128
dataset = torch.utils.data.TensorDataset(X_train, torch.from_numpy(Y_train.values))
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [14]:
del train

NameError: name 'train' is not defined

In [15]:
del valid

## Model

In [16]:
class DeepModel(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size):
        super(DeepModel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        # Create the embedding layer
        self.embedding = nn.Embedding(input_size, embedding_dim)
        
        self.encoder = nn.Sequential(
            nn.Linear(embedding_dim, hidden_size),
            nn.ReLU(),
            
            nn.Linear(hidden_size, hidden_size//2),
            nn.ReLU(),
            
            nn.Linear(hidden_size//2, hidden_size//4),
            nn.ReLU(),
            nn.Dropout(0.3),
           
            nn.Linear(hidden_size//4, 1),
        )
    
    def forward(self, data):
        embedded_data = self.embedding(data)
        embedded_data = embedded_data.mean(dim=1)  # Aggregate embeddings (e.g., using mean)
        output = self.encoder(embedded_data)
        
        return output.squeeze()

# Hyperparameter tuning

In [18]:
hiddens = [64, 128, 256, 512]
embedding_sizes = [64, 128, 256, 512, 1024]
lrs = [5e-1, 1e-2, 1e-3, 1e-4, 1e-5]
batches = [32, 64, 128, 256]

In [21]:
for batch in batches:

    dataset = torch.utils.data.TensorDataset(X_train, torch.from_numpy(Y_train.values))
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch, shuffle=True)
    
    for hidden_size in hiddens:
        for embed_size in embedding_sizes:
            for learning_rate in lrs:

                print("-------------------------------")
                print(f'batch size: {batch}, hidden_size: {hidden_size}, embed: {embed_size}, lr: {learning_rate}')

                model = DeepModel(input_size, embed_size,
                                   hidden_size).to(device)

                loss_fn = nn.MSELoss()
                optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)



                for epoch in range(3):
                    start_time = time.time()

                    running_loss = 0
                    n = 0




                    # Loop over batches in an epoch using DataLoader
                    for batch_idx, (x_batch, y_batch) in enumerate(train_loader):

                        x_batch = x_batch.to(device).long()
                        y_batch = y_batch.to(device)

                        y_batch_pred = model(x_batch)


                        loss = loss_fn(y_batch_pred, y_batch.float())
                        running_loss += loss.item()
                        n += 1

                        # backwards steps
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                    print(f'Time of epoch {epoch + 1}: {time.time() - start_time} seconds')

                    print(f"Epoch {epoch + 1} training loss: {np.sqrt(running_loss/n)}")


            

-------------------------------
batch size: 32, hidden_size: 64, embed: 64, lr: 0.5
Time of epoch 1: 163.46613121032715 seconds
Epoch 1 training loss: 688.5153388285004
Time of epoch 2: 166.93063712120056 seconds
Epoch 2 training loss: 684.9085215116395
Time of epoch 3: 168.03568387031555 seconds
Epoch 3 training loss: 684.9100443872925
-------------------------------
batch size: 32, hidden_size: 64, embed: 64, lr: 0.01
Time of epoch 1: 163.581481218338 seconds
Epoch 1 training loss: 688.6728305730365
Time of epoch 2: 161.67039704322815 seconds
Epoch 2 training loss: 674.649149009022
Time of epoch 3: 158.57069945335388 seconds
Epoch 3 training loss: 669.3583506677844
-------------------------------
batch size: 32, hidden_size: 64, embed: 64, lr: 0.001
Time of epoch 1: 157.11408686637878 seconds
Epoch 1 training loss: 696.4648921587177
Time of epoch 2: 156.9175627231598 seconds
Epoch 2 training loss: 687.9038745503938
Time of epoch 3: 162.60087656974792 seconds
Epoch 3 training loss: 68

# Model training

In [30]:
hidden_size = 100 
embedding_size = 100 #hyperparam


model = DeepModel(input_size, embedding_size,
                   hidden_size).to(device)

loss_fn = nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3) #can try 1e-4, similar results

RuntimeError: CUDA error: device-side assert triggered

In [17]:
losses = []
valid_losses = []
for epoch in range(5):
    start_time = time.time()
    
    running_loss = 0
    n = 0
    
    print(f"-------------------------------")
    

    # Loop over batches in an epoch using DataLoader
    for batch_idx, (x_batch, y_batch) in enumerate(train_loader):
        
        x_batch = x_batch.to(device).long()
        y_batch = y_batch.to(device)
        
        y_batch_pred = model(x_batch)


        loss = loss_fn(y_batch_pred, y_batch.float())
        running_loss += loss.item()
        n += 1

        # backwards steps
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Time of epoch {epoch + 1}: {time.time() - start_time} seconds')
        
    print(f"Epoch {epoch + 1} training loss: {np.sqrt(running_loss/n)}")
    
    
    losses.append(np.sqrt(running_loss/n))

-------------------------------
Time of epoch 1: 74.2941210269928 seconds
Epoch 1 training loss: 681.3599767631298
-------------------------------
Time of epoch 2: 72.87142610549927 seconds
Epoch 2 training loss: 671.6922517087262
-------------------------------
Time of epoch 3: 75.93565511703491 seconds
Epoch 3 training loss: 666.0149666913167
-------------------------------
Time of epoch 4: 73.429612159729 seconds
Epoch 4 training loss: 662.8932767960531
-------------------------------
Time of epoch 5: 72.13198018074036 seconds
Epoch 5 training loss: 660.631775611726
-------------------------------
Time of epoch 6: 71.080570936203 seconds
Epoch 6 training loss: 658.1861592989904
-------------------------------
Time of epoch 7: 70.14974236488342 seconds
Epoch 7 training loss: 656.1145735554683
-------------------------------
Time of epoch 8: 71.7743752002716 seconds
Epoch 8 training loss: 654.0231699792648
-------------------------------
Time of epoch 9: 68.14510536193848 seconds
Epoc

## Validation

In [18]:
val_preds = model(X_valid.long().to(device))
np.sqrt(
    F.mse_loss(val_preds.cpu(), 
               torch.Tensor(Y_valid.values)).detach().numpy()
)

694.7105

In [19]:
del val_preds

# Prediction file

In [27]:
test_data = pd.read_csv('test_public.csv')

In [28]:
test_data[["YR", "MON", "DAY", "HR", "WK"]] = (
    test_data[["TIMESTAMP"]].apply(parse_time, axis=1, result_type="expand")
)

#making interaction features
test_data['STAND_HR'] = test_data['ORIGIN_STAND'].astype(str) + '_' + test_data['HR'].astype(str)
test_data['STAND_DAY'] = test_data['DAY'].astype(str) + '_' + test_data['ORIGIN_STAND'].astype(str)
test_data['DAY_HR'] = test_data['DAY'].astype(str) + '_' + test_data['HR'].astype(str)

#time cols
test_data['TOD'] = pd.cut(test_data['HR'], bins=[0, 12, 16, 20, 24], labels=[1, 2, 3, 4], right=False)
test_data['FROM_NOON'] = test_data['HR'] - 12 #note that this column is continuous

embed_cols = (['YR', 'MON', 'DAY', 'HR', 'WK',
               'ORIGIN_STAND', 'CALL_TYPE', 'TAXI_ID',
              'STAND_HR', 'STAND_DAY', 'DAY_HR', "TOD"])

for column in embed_cols:
    test_data[column] = test_data[column].astype('category').cat.codes

In [24]:
X_test = test_data[["YR", "MON", "DAY", "HR", "WK", 
                 'ORIGIN_STAND', 'CALL_TYPE', 
                 'TAXI_ID', 'STAND_HR', 'STAND_DAY', 
                 'DAY_HR', "TOD"]]

X_test = torch.Tensor(X_test.to_numpy()).to(device)

In [29]:
test_data

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,YR,MON,DAY,HR,WK,STAND_HR,STAND_DAY,DAY_HR,TOD,FROM_NOON
0,T1,1,,10,190,1408039037,A,False,0,0,2,11,2,9,4,1,2,5
1,T2,1,,33,37,1408038611,A,False,0,0,2,11,2,44,15,1,2,5
2,T3,1,,10,129,1408038568,A,False,0,0,2,11,2,9,4,1,2,5
3,T4,1,,30,171,1408039090,A,False,0,0,2,11,2,41,13,1,2,5
4,T5,1,,13,217,1408039177,A,False,0,0,2,11,2,15,5,1,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,T323,0,70885.0,-1,148,1419171485,A,False,0,4,3,8,4,60,32,7,1,2
316,T324,1,,30,8,1419170802,A,False,0,4,3,8,4,40,31,7,1,2
317,T325,2,,-1,72,1419172121,A,False,0,4,3,8,4,60,32,7,1,2
318,T326,0,76232.0,-1,229,1419171980,A,False,0,4,3,8,4,60,32,7,1,2


In [25]:
preds = model(X_test.long()).cpu().detach()

/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [87,0,0], thread: [84,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [87,0,0], thread: [85,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [87,0,0], thread: [86,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [87,0,0], thread: [87,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [87,0,0], thread: [88,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [87,0,0], thread: [89,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexS

RuntimeError: CUDA error: device-side assert triggered

/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [140,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [140,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [140,0,0], thread: [98,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [140,0,0], thread: [99,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [140,0,0], thread: [100,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [140,0,0], thread: [101,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702

In [None]:
final_preds = pd.DataFrame({'TRIP_ID': test_data['TRIP_ID'], 'TRAVEL_TIME': preds})

final_preds.to_csv('test_preds.csv', index = False)

