In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import os, os.path 
import numpy 
import pickle
from glob import glob
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable 
import pandas as pd
from scipy import signal
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import random
#import seaborn as sns
"""
    number of trajectories in each city
    # austin --  train: 43041 test: 6325 
    # miami -- train: 55029 test:7971
    # pittsburgh -- train: 43544 test: 6361
    # dearborn -- train: 24465 test: 3671
    # washington-dc -- train: 25744 test: 3829
    # palo-alto -- train:  11993 test:1686

    trajectories sampled at 10HZ rate, input 5 seconds, output 6 seconds
    
"""

'\n    number of trajectories in each city\n    # austin --  train: 43041 test: 6325 \n    # miami -- train: 55029 test:7971\n    # pittsburgh -- train: 43544 test: 6361\n    # dearborn -- train: 24465 test: 3671\n    # washington-dc -- train: 25744 test: 3829\n    # palo-alto -- train:  11993 test:1686\n\n    trajectories sampled at 10HZ rate, input 5 seconds, output 6 seconds\n    \n'

In [2]:
# clean memory
import gc

gc.collect()

torch.cuda.empty_cache()

In [3]:
print(torch.cuda.get_device_name())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

GeForce RTX 2060 SUPER


In [4]:
from glob import glob
import pickle
import numpy as np

### Change to requried path to access data locally, too big too push all data into github
#ROOT_PATH = "C:/Users/Administrator/cse151b-spring2022/argo2/"
ROOT_PATH = "D:/School/cse151B/argo2/"

cities = ["austin", "miami", "pittsburgh", "dearborn", "washington-dc", "palo-alto"]
splits = ["train", "test"]

def get_city_trajectories(city="palo-alto", split="train", normalized=False):
    f_in = ROOT_PATH + split + "/" + city + "_inputs"
    inputs = pickle.load(open(f_in, "rb"))
    inputs = np.asarray(inputs)
    
    outputs = None
    
    if split=="train":
        f_out = ROOT_PATH + split + "/" + city + "_outputs"
        outputs = pickle.load(open(f_out, "rb"))
        outputs = np.asarray(outputs)

        return torch.from_numpy(inputs).float(), torch.from_numpy(outputs).long()

    if split=="test":
    
        return torch.from_numpy(inputs).float(), torch.from_numpy(np.array([]))

    

class ArgoverseDataset(Dataset):
    """Dataset class for Argoverse"""
    def __init__(self, city: str, split:str, transform=None, device='cpu'):
        super(ArgoverseDataset, self).__init__()
        self.transform = transform
        self.split = split
        self.inputs, self.outputs = get_city_trajectories(city=city, split=split, normalized=False)
        self.inputs = self.inputs.to(device)
        self.outputs = self.outputs.to(device)
    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):

        if self.split == 'train':
            data = (self.inputs[idx], self.outputs[idx])
        if self.split == 'test':
            data = self.inputs[idx]
            
        if self.transform:
            data = self.transform(data)

        return data

# intialize a dataset
city = 'palo-alto' 
split = 'train'
#train_dataset  = ArgoverseDataset(city = city, split = split, device=device)

### Data Preprocessing

In [5]:
# wrapper function to select proportion of a cities random examples w/o replacement from each city and put all data into one list
# purpose: whole dataset is too big and might be redundant
def randomCitySampler(prop,scalingFactor):
    cities = ["austin", "miami", "pittsburgh", "dearborn", "washington-dc","palo-alto"]

    samples = []
    for c in cities:
        # get city data
        temp_dataset = ArgoverseDataset(city = c, split = "train", device=device)

        numProp = int(len(temp_dataset) * prop)

        # get N number of random indicies
        ind = random.sample(range(0, len(temp_dataset)), numProp)
        #print(ind)
        # push all data indicies into samples list
        for i in ind: 
            x = temp_dataset[i][0]/scalingFactor
            y = temp_dataset[i][1]/scalingFactor
            samples.append((x,y))
            
    return samples


In [6]:
### constants for generating Dataset
proportionOfEntireData = 0.5
seqLen = 40
stepSize = 3
batch_sz = 64  # batch size 
scaling_factor = 5000

In [7]:
# create train dataset, with proportion to actual amount data
sampleTest = randomCitySampler(proportionOfEntireData,scaling_factor)
len(sampleTest)

101906

In [8]:
# generate sequences of length seqLength and specific step size
def sequenceGenerator(data, seqLen=40, stepSize=5):
    newData = []
    for d in data:
        # concat X and Y together
        temp = torch.cat([d[0],d[1]])
        # make X of length SeqLen and Y is next x,y coordinate pair
        for i in range(0,len(temp)-seqLen, stepSize):
            x = temp[i:i + seqLen]
            #flatX = Variable(torch.tensor([item for sublist in x for item in sublist])).to(device)
            flatX = torch.flatten(Variable(torch.tensor(x)).to(device))
            y = temp[i+seqLen]
            newData.append((flatX,y))
            
        
    return newData

In [9]:
# generate sequences
train_seq_data = sequenceGenerator(sampleTest,seqLen, stepSize)
len(train_seq_data)

  flatX = torch.flatten(Variable(torch.tensor(x)).to(device))


2445744

In [10]:
# should be a vector of size (80,2) for each example
train_seq_data[0], len(train_seq_data[0][0])

((tensor([ 0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969,
           0.0707, -0.2968,  0.0707, -0.2968,  0.0707, -0.2968,  0.0707, -0.2968,
           0.0707, -0.2968,  0.0707, -0.2968,  0.0707, -0.2968,  0.0707, -0.2968,
           0.0707, -0.2968,  0.0707, -0.2968,  0.0707, -0.2968,  0.0707, -0.2968,
           0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969,
           0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969,
           0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969,
           0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969,
           0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969,
           0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969,  0.0707, -0.2969],
         device='cuda:0'),
  tensor([ 0.0707, -0.2969], device='cuda:0')),
 80)

In [11]:
# create loader
train_loader = DataLoader(train_seq_data,batch_size=batch_sz)

In [12]:
# check shape is correct
train_features, train_labels = next(iter(train_loader))
len(train_features[0]), train_labels[0]
# shape is correct

(80, tensor([ 0.0707, -0.2969], device='cuda:0'))

### LSTM

In [13]:
# model parameters
#num_epochs = 60
learning_rate = 0.0001

input_size = seqLen*2 #number of features
hidden_size = 150 #number of features in hidden state
num_layers = 2 #number of stacked lstm layers

output_size = 2 #number of output classes 


In [14]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_layer_size, output_size,num_layers):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size

        self.lstm = nn.LSTM(input_size, hidden_layer_size,num_layers)

        self.linear = nn.Linear(hidden_layer_size, output_size)

        self.hidden_cell = (torch.zeros(num_layers,1,self.hidden_layer_size).to(device),
                            torch.zeros(num_layers,1,self.hidden_layer_size).to(device))

    def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq) ,1, -1), self.hidden_cell)
        predictions = self.linear(lstm_out.view(len(input_seq), -1))
        
        return predictions

In [15]:
lstm = LSTM(input_size, hidden_size,output_size,num_layers)
lstm = lstm.to(device)
lstm

LSTM(
  (lstm): LSTM(80, 150, num_layers=2)
  (linear): Linear(in_features=150, out_features=2, bias=True)
)

In [16]:
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

### Training

In [17]:
epochs = 150

for i in range(epochs):
    for seq, labels in train_loader:
        lstm.train()
        seq = seq.to(device)
        labels = labels.to(device)

        

        optimizer.zero_grad()
        lstm.hidden_cell = (torch.zeros(num_layers, 1, lstm.hidden_layer_size).to(device),
                        torch.zeros(num_layers, 1, lstm.hidden_layer_size).to(device))
        
        
        y_pred = lstm(seq)
        
        #print(seq.shape)
        #print(y_pred.shape, labels.shape)
        #break

        loss = loss_function(y_pred, labels)
        loss.backward()
        optimizer.step()

    if i%10 == 0:
        print(f'epoch: {i:3} loss: {loss.item():10.8f}')

    if i%30 == 0:
        modelPath = "lstm5Epoch{0}.pt".format(i)
        torch.save(lstm.state_dict(), modelPath)

print(f'epoch: {i:3} loss: {loss.item():10.10f}')

epoch:   0 loss: 0.00000284
epoch:  10 loss: 0.00000062
epoch:  20 loss: 0.00000033
epoch:  30 loss: 0.00000030
epoch:  40 loss: 0.00000033
epoch:  50 loss: 0.00000029
epoch:  60 loss: 0.00000020


KeyboardInterrupt: 

before normailizing:
epoch:   0 loss: 1702641.00000000 \n
epoch:  10 loss: 1387110.12500000\n
epoch:  20 loss: 1210237.00000000\n
epoch:  30 loss: 1043616.93750000\n
epoch:  40 loss: 843062.68750000\n
epoch:  50 loss: 642747.50000000\n
epoch:  60 loss: 454254.09375000\n
epoch:  70 loss: 307120.00000000\n
epoch:  80 loss: 195371.10937500\n
epoch:  90 loss: 145173.21875000\n
epoch: 100 loss: 63596.96093750\n
epoch: 110 loss: 32955.25000000\n
epoch: 120 loss: 30699.15625000\n
epoch: 130 loss: 54998.30468750\n
epoch: 140 loss: 54867.91796875\n
epoch: 149 loss: 41005.5078125000

### Predicting

In [18]:
# need to cycle for each test set and use LSTM to predict 
# add results for each city to dataframe 
# concatinate all dataframes

def validation(model):
    cities = ["austin", "miami", "pittsburgh", "dearborn", "washington-dc","palo-alto"]

    # all the data frames
    allDF = []
    with torch.no_grad():
        for c in cities:
            
            test_dataset = ArgoverseDataset(city=c, split='test', device=device)
            test_loader = DataLoader(test_dataset,batch_size=128)

            cityPredictions = []
            for t in test_loader.dataset:
                model.eval()
                flat = torch.flatten(t)
                currentPred = []

                for i in range(60):
                    #print(flat)
                    #print(len(flat)-seqLen*2)

                    pred = torch.flatten(model(flat[len(flat)-seqLen*2:].view(1,seqLen*2)))

                    #print(flat[len(flat)-seqLen*2:].view(1,80))
                    #print(pred)

                    currentPred.append(pred)

                    #print(torch.flatten(pred).shape)
                    #print(flat.shape)
                    
                    flat = torch.cat((flat,pred),0)
                    #print(flat)

                #print(len(flat[100:]))
                cityPredictions.append(flat[100:].detach().to('cpu').numpy()*scaling_factor)
        
            df = pd.DataFrame(cityPredictions)
            df.columns = ['v' + str(i) for i in (range(120))]
            df['ID'] = [str(i) + '_' + c for i in (range(len(test_loader.dataset)))]
            allDF.append(df)
            
    return allDF


In [19]:
tempValDF = validation(lstm)


In [20]:
len(tempValDF)

6

In [21]:
lstmPredFinal = pd.concat(tempValDF)
lstmPredFinal.ID

0             0_austin
1             1_austin
2             2_austin
3             3_austin
4             4_austin
             ...      
1681    1681_palo-alto
1682    1682_palo-alto
1683    1683_palo-alto
1684    1684_palo-alto
1685    1685_palo-alto
Name: ID, Length: 29843, dtype: object

In [22]:
lstmPredFinal.to_csv("lstm5early.csv", index=False)

In [23]:
torch.save(lstm.state_dict(), "lstm5finalearly.pt")

In [None]:
lstmPredFinal.head()

Unnamed: 0,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v111,v112,v113,v114,v115,v116,v117,v118,v119,ID
0,2966.430664,-9985.373047,697.852295,-10905.495117,690.802124,-10871.316406,-76.476784,-10835.010742,-160.79567,-10610.087891,...,1372.910889,1810.574585,1546.528076,1830.382812,1617.615112,1960.792969,1653.945923,1995.821899,1651.79895,0_austin
1,-14110.257812,1248.141846,-13425.822266,1427.882446,-14444.098633,1370.556519,-14100.422852,1362.783325,-14071.736328,1510.558472,...,2224.956543,2962.765625,2215.879639,2998.319824,2205.225098,3001.184814,2196.748047,2946.293457,2248.370117,1_austin
2,6244.087891,-9125.59082,3489.904785,-10084.924805,4511.722168,-10015.958984,4397.600098,-9890.113281,3239.341309,-9783.15332,...,602.991272,-703.00116,714.663147,-719.726807,774.714661,-622.440613,821.556763,-489.944244,834.670654,2_austin
3,1807.790161,12150.986328,4077.75708,11414.631836,1885.708008,11301.711914,1412.823364,11339.787109,2354.402832,11625.394531,...,-2141.852295,-2590.605957,-2162.712891,-2595.548828,-2197.846191,-2502.833496,-2195.032227,-2491.998047,-2228.847412,3_austin
4,11730.674805,-6021.035156,12871.799805,-7124.366211,12060.113281,-7679.387207,12143.30957,-7257.94873,11919.484375,-7445.813477,...,-484.069061,-2284.864746,-499.667755,-2202.729004,-450.200439,-2116.192627,-398.594299,-2026.723267,-368.494751,4_austin


In [None]:
# Test MSE: 4022490.07270