In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import time
from datetime import datetime

from torch.utils.data import DataLoader, TensorDataset

import math
from tqdm import tqdm
import copy

In [2]:
df_tr = pd.read_csv("train.csv")


# Over every single 
def polyline_to_trip_duration(polyline):
  return max(polyline.count("[") - 2, 0) * 15

df_tr["LEN"] = df_tr["POLYLINE"].apply(polyline_to_trip_duration)

gps_location_df = pd.read_csv('metaData_taxistandsID_name_GPSlocation.csv')

# Create a dictionary for each ORIGIN_STAND, where key is the stand id and value is a dic (Latitude, Longitude)
location_dict = gps_location_df.set_index('ID')[['Latitude', 'Longitude']].T.apply(tuple).to_dict()

df_tr['Latitude'] = df_tr['ORIGIN_STAND'].map(lambda x: location_dict.get(x, {}).get('Latitude', np.nan))
df_tr['Longitude'] = df_tr['ORIGIN_STAND'].map(lambda x: location_dict.get(x, {}).get('Longitude', np.nan))

In [3]:
df_test = pd.read_csv("test_public.csv")

taxi_test_ids = []

for id in df_test["TAXI_ID"]:
    if id not in taxi_test_ids:
        taxi_test_ids.append(id)
        
taxi_test_ids_dic = {}
for ele in taxi_test_ids:
    taxi_test_ids_dic[ele] = ele

In [4]:
df = df_tr.sort_values('TIMESTAMP')

In [5]:
df = df[int(len(df)*6/7):]

In [6]:
df['ORIGIN_CALL'].fillna(0, inplace=True)
df['ORIGIN_STAND'].fillna(0, inplace=True)
df['Latitude'].fillna(0, inplace=True)
df['Longitude'].fillna(0, inplace=True)
df = df[df['MISSING_DATA'] != True]
df['TAXI_ID'] = df['TAXI_ID'].map(lambda x: taxi_test_ids_dic.get(x, 0))

# assuming your dataframe is named df
categorical_features = ['CALL_TYPE', 'TAXI_ID']
spatial_features = ['Latitude', 'Longitude']
temporal_feature = 'TIMESTAMP'

# Preprocessing categorical features
label_encoders = {}
n_cats = {}

for cat in categorical_features:
    label_encoders[cat] = LabelEncoder()
    df[cat] = label_encoders[cat].fit_transform(df[cat])
    n_cats[cat] = len(label_encoders[cat].classes_)

df[temporal_feature] = MinMaxScaler().fit_transform(df[temporal_feature].values.reshape(-1,1))

df[spatial_features] = MinMaxScaler().fit_transform(df[spatial_features])


embedding1 = nn.Embedding(df['CALL_TYPE'].nunique(), 50)
embedding2 = nn.Embedding(df['TAXI_ID'].nunique(), 50)

class CombinedRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, n_cats):
        super(CombinedRNN, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Define embeddings for each categorical feature
        self.embeddings = nn.ModuleList([nn.Embedding(n_cats[cat], 50) for cat in categorical_features])
        self.fc_spatial = nn.Linear(2, 64)

        # Define LSTM layer
        self.rnn = nn.LSTM(input_size + len(categorical_features) * 50 + 64, hidden_size, num_layers, batch_first=True)

        # Define output layer
        self.fc1 = nn.Linear(hidden_size, 64)
        self.fc2 = nn.Linear(64, output_size)

    def forward(self, x, cat_features, spatial_features):
        # Embedding categorical features
        embeds = [emb(cat_features[:, :, i]) for i, emb in enumerate(self.embeddings)]
        embeds = torch.cat(embeds, -1)

        # Process spatial features
        spatial_features = self.fc_spatial(spatial_features)

        x = x.unsqueeze(2)
        x_combined = torch.cat((x, embeds, spatial_features), dim=2)

        # Initializing hidden state for first input with zeros
        h0 = torch.zeros(self.num_layers, x_combined.size(0), self.hidden_size).to(x_combined.device)
        c0 = torch.zeros(self.num_layers, x_combined.size(0), self.hidden_size).to(x_combined.device)

        # Forward propagate the LSTM
        out, _ = self.rnn(x_combined, (h0, c0))

        # Decode the hidden state of the last time step
        out = self.fc1(out[:, -1, :])
        out = self.fc2(out)
        
        return out

# Hyperparameters
input_size = 1  # as we only have 'TIMESTAMP' as input
hidden_size = 32
output_size = 1  # as regression predicts a single value
num_layers = 2

model = CombinedRNN(input_size, hidden_size, output_size, num_layers, n_cats)





In [7]:
# Prepare your data
temporal_tensor = torch.tensor(df[temporal_feature].values).float()
categorical_tensor = torch.tensor(df[categorical_features].values).long()
spatial_tensor = torch.tensor(df[spatial_features].values).float()

outputs = torch.tensor(df['LEN'].values).float()

# You need to adjust dimensions according to your actual batch size
temporal_tensor = temporal_tensor.unsqueeze(0)
categorical_tensor = categorical_tensor.unsqueeze(0)
spatial_tensor = spatial_tensor.unsqueeze(0)
outputs = outputs.unsqueeze(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

temporal_tensor = temporal_tensor.float().to(device)
categorical_tensor = categorical_tensor.long().to(device)
spatial_tensor = spatial_tensor.float().to(device)

outputs = outputs.to(device)

model = CombinedRNN(input_size, hidden_size, output_size, num_layers, n_cats, embedding_dim).to(device)

# Define your loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# Train the model
n_epochs = 5
for epoch in range(n_epochs):
    # Forward pass
    preds = model(temporal_tensor, categorical_tensor, spatial_tensor)
    print("Shape of Model Output: ", preds.shape)
    print("Shape of Target Tensor: ", outputs.shape)

    # Compute loss
    loss = criterion(preds, outputs)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {loss.item()}')


NameError: name 'embedding_dim' is not defined