In [1]:
import os
import torch
import torch.utils.data
import pickle
import numpy as np
import random

In [24]:
class CustomDataPreprocessorForCNN():
    '''
    Turns per-dataset trajectory CSV files into pickled lists of
    (input, output) tensor pairs for training, validating and testing a CNN.

    Each dataset directory is expected to contain 'world_pos_normalized.csv',
    read column-wise: row 0 holds frame IDs, row 1 pedestrian IDs, and rows 2
    and 3 the x and y positions.
    '''
    def __init__(self, input_seq_length=5, pred_seq_length=5, datasets=[i for i in range(37)], test_data_sets = [2], dev_fraction = 0.1, forcePreProcess=False):
        '''
        Initializer function for the CustomDataPreprocessorForCNN class
        params:
        input_seq_length : input sequence length to be considered
        pred_seq_length : output sequence length to be predicted
        datasets : The indices (into self.data_dirs) of the datasets to use
        test_data_sets : The indices of the test sets from datasets
        dev_fraction : fraction of the shuffled training pairs held out as the validation set
        forcePreProcess : Flag to forcefully preprocess the data again from csv files
        raises:
        ValueError : if an index in test_data_sets is not present in datasets
        '''
        # List of data directories where raw data resides
        self.data_dirs = ['./data/train/processed/biwi/biwi_hotel', './data/train/processed/crowds/arxiepiskopi1',
                          './data/train/processed/crowds/crowds_zara02', './data/train/processed/crowds/crowds_zara03',
                          './data/train/processed/crowds/students001', './data/train/processed/crowds/students003',
                          './data/train/processed/stanford/bookstore_0',
                          './data/train/processed/stanford/bookstore_1', './data/train/processed/stanford/bookstore_2',
                          './data/train/processed/stanford/bookstore_3', './data/train/processed/stanford/coupa_3',
                          './data/train/processed/stanford/deathCircle_0', './data/train/processed/stanford/deathCircle_1',
                          './data/train/processed/stanford/deathCircle_2', './data/train/processed/stanford/deathCircle_3',
                          './data/train/processed/stanford/deathCircle_4', './data/train/processed/stanford/gates_0',
                          './data/train/processed/stanford/gates_1', './data/train/processed/stanford/gates_3',
                          './data/train/processed/stanford/gates_4', './data/train/processed/stanford/gates_5',
                          './data/train/processed/stanford/gates_6', './data/train/processed/stanford/gates_7',
                          './data/train/processed/stanford/gates_8', './data/train/processed/stanford/hyang_4',
                          './data/train/processed/stanford/hyang_5', './data/train/processed/stanford/hyang_6',
                          './data/train/processed/stanford/hyang_7', './data/train/processed/stanford/hyang_9',
                          './data/train/processed/stanford/nexus_0', './data/train/processed/stanford/nexus_1',
                          './data/train/processed/stanford/nexus_2', './data/train/processed/stanford/nexus_3',
                          './data/train/processed/stanford/nexus_4', './data/train/processed/stanford/nexus_7',
                          './data/train/processed/stanford/nexus_8', './data/train/processed/stanford/nexus_9']

        # Work on a copy: removing from `datasets` itself would mutate the
        # caller's list and, worse, the shared default argument, so that a
        # second default construction would fail with ValueError.
        train_datasets = list(datasets)
        for dataset in test_data_sets:
            train_datasets.remove(dataset)
        self.train_data_dirs = [self.data_dirs[x] for x in train_datasets]
        self.test_data_dirs = [self.data_dirs[x] for x in test_data_sets]

        # Number of datasets
        self.numDatasets = len(self.data_dirs)

        # Data directory where the pre-processed pickle file resides
        self.data_dir = './data/train/processed'

        # Store the arguments
        self.input_seq_length = input_seq_length
        self.pred_seq_length = pred_seq_length

        # Validation arguments
        self.dev_fraction = dev_fraction

        # Define the path in which the process data would be stored
        self.processed_train_data_file = os.path.join(self.data_dir, "trajectories_cnn_train.cpkl")
        self.processed_dev_data_file = os.path.join(self.data_dir, "trajectories_cnn_dev.cpkl")
        self.processed_test_data_file = os.path.join(self.data_dir, "trajectories_cnn_test.cpkl")

        # If the file doesn't exist or forcePreProcess is true
        if not(os.path.exists(self.processed_train_data_file)) or not(os.path.exists(self.processed_dev_data_file)) or forcePreProcess:
            print("------ Creating pre-processed training & dev data for CNN ------")
            self.preprocess(self.train_data_dirs, self.processed_train_data_file, self.dev_fraction, self.processed_dev_data_file)
        if not(os.path.exists(self.processed_test_data_file)) or forcePreProcess:
            print("------ Creating pre-processed test data for CNN ------")
            self.preprocess(self.test_data_dirs, self.processed_test_data_file)

    def preprocess(self, data_dirs, data_file, dev_fraction = 0., data_file_2 = None):
        '''
        Extract (input, output) tensor pairs from the given dataset directories
        and pickle them.
        params:
        data_dirs : directories that each contain a 'world_pos_normalized.csv'
        data_file : path of the pickle file receiving the (training) pairs
        dev_fraction : fraction of the shuffled pairs written to data_file_2 instead;
                       0 writes every pair to data_file
        data_file_2 : path of the pickle file receiving the held-out pairs;
                      required when dev_fraction is non-zero
        raises:
        ValueError : if dev_fraction is non-zero but data_file_2 is not given
        '''
        # Fail before the expensive pass over the data rather than at save time.
        if dev_fraction != 0. and data_file_2 is None:
            raise ValueError("data_file_2 must be provided when dev_fraction is non-zero")

        # A sample is accepted only if the same pedestrians appear in this many consecutive frames.
        window_length = self.input_seq_length + self.pred_seq_length
        processed_input_output_pairs = []

        for directory in data_dirs:
            print('------ Processing dataset ' + str(directory) + ' ------')
            # define path of the csv file of the current dataset
            file_path = os.path.join(directory, 'world_pos_normalized.csv')

            # Load the data from the csv file
            data = np.genfromtxt(file_path, delimiter=',')

            # Frame IDs of the frames in the current dataset
            frameList = np.unique(data[0, :]).tolist()

            # For this dataset check which pedestrians exist in each frame.
            pedsInFrameList = []
            pedsPosInFrameList = []
            for frame in frameList:
                # Columns of the csv that belong to this frame.
                pedsInFrame = data[:, data[0, :] == frame]
                pedsList = pedsInFrame[1, :].tolist()
                pedsInFrameList.append(pedsList)
                # Position information for each pedestrian, flattened as [x0, y0, x1, y1, ...].
                pedsPos = []
                for ped in pedsList:
                    # Extract x and y positions (first matching column wins)
                    ped_columns = pedsInFrame[1, :] == ped
                    current_x = pedsInFrame[2, ped_columns][0]
                    current_y = pedsInFrame[3, ped_columns][0]
                    pedsPos.extend([current_x, current_y])
                pedsPosInFrameList.append(pedsPos)

            # Go over the frames in this data again to extract data.
            ind = 0
            while ind < len(frameList) - (window_length - 1):
                # List of pedestrians in this frame.
                pedsList = pedsInFrameList[ind]
                # Check that exactly the same pedestrians (in the same order) exist in
                # all window_length frames starting here; stops at the first mismatch.
                peds_contained = all(pedsInFrameList[ind + ii] == pedsList for ii in range(window_length))
                if peds_contained:
                    print(str(int(self.input_seq_length + self.pred_seq_length)) + ' frames starting from Frame ' + str(int(frameList[ind])) +  ' contain pedestrians ' + str(pedsList))
                    # Initialize numpy arrays for input-output pair
                    data_input = np.zeros((2*len(pedsList), self.input_seq_length))
                    data_output = np.zeros((2*len(pedsList), self.pred_seq_length))
                    for ii in range(self.input_seq_length):
                        data_input[:, ii] = np.array(pedsPosInFrameList[ind + ii])
                    # The output sequence starts at the LAST input frame, so its first
                    # column repeats the final column of the input.
                    for jj in range(self.pred_seq_length):
                        data_output[:, jj] = np.array(pedsPosInFrameList[ind + (self.input_seq_length - 1) + jj])
                    processed_pair = (torch.from_numpy(data_input), torch.from_numpy(data_output))
                    processed_input_output_pairs.append(processed_pair)

                    # Jump past the frames just consumed.
                    ind += window_length - 1
                else:
                    ind += 1

        # Shuffle data (fixed seed for reproducibility), possibly divide into train and dev sets.
        random.seed(1)
        random.shuffle(processed_input_output_pairs)
        if dev_fraction != 0.:
            dev_size = int(len(processed_input_output_pairs)*dev_fraction)
            processed_dev_set = processed_input_output_pairs[:dev_size]
            processed_train_set = processed_input_output_pairs[dev_size:]
            # Save processed data; `with` guarantees the files are closed even if dumping fails.
            with open(data_file, 'wb') as f:
                pickle.dump(processed_train_set, f, protocol=2)
            with open(data_file_2, 'wb') as f2:
                pickle.dump(processed_dev_set, f2, protocol=2)
        else:
            # Save processed data.
            with open(data_file, 'wb') as f:
                pickle.dump(processed_input_output_pairs, f, protocol=2)

In [25]:
# Build the preprocessor. forcePreProcess=True regenerates the train/dev/test
# pickle files from the csv data even if they already exist on disk.
processed = CustomDataPreprocessorForCNN(forcePreProcess=True)

------ Creating pre-processed training & dev data for CNN ------
------ Processing dataset ./data/train/processed/biwi/biwi_hotel ------
10 frames starting from Frame 0 contain pedestrians [5.0, 6.0, 8.0]
10 frames starting from Frame 90 contain pedestrians [5.0, 6.0, 8.0]
10 frames starting from Frame 590 contain pedestrians [24.0, 25.0, 28.0]
10 frames starting from Frame 1000 contain pedestrians [38.0]
10 frames starting from Frame 1200 contain pedestrians [46.0, 47.0, 39.0]
10 frames starting from Frame 1400 contain pedestrians [40.0]
10 frames starting from Frame 1490 contain pedestrians [40.0]
10 frames starting from Frame 1600 contain pedestrians [41.0]
10 frames starting from Frame 1690 contain pedestrians [41.0]
10 frames starting from Frame 1800 contain pedestrians [42.0]
10 frames starting from Frame 1890 contain pedestrians [42.0]
10 frames starting from Frame 2770 contain pedestrians [71.0, 72.0]
10 frames starting from Frame 2860 contain pedestrians [71.0, 72.0]
10 frames

10 frames starting from Frame 36 contain pedestrians [59.0, 238.0, 240.0, 245.0, 252.0, 254.0, 456.0, 505.0, 590.0, 389.0]
10 frames starting from Frame 312 contain pedestrians [146.0, 60.0, 136.0, 239.0, 241.0, 246.0, 253.0, 255.0, 457.0, 506.0, 591.0, 156.0, 390.0, 53.0]
10 frames starting from Frame 552 contain pedestrians [147.0, 61.0, 137.0, 458.0, 507.0, 592.0, 36.0, 48.0, 157.0, 62.0, 391.0, 54.0]
10 frames starting from Frame 792 contain pedestrians [148.0, 138.0, 459.0, 508.0, 593.0, 37.0, 49.0, 158.0, 63.0, 392.0, 55.0]
10 frames starting from Frame 1032 contain pedestrians [149.0, 139.0, 460.0, 509.0, 594.0, 38.0, 50.0, 159.0, 64.0, 56.0]
10 frames starting from Frame 1272 contain pedestrians [150.0, 140.0, 461.0, 510.0, 595.0, 39.0, 51.0, 160.0, 323.0, 65.0, 197.0, 306.0, 57.0]
10 frames starting from Frame 1512 contain pedestrians [151.0, 141.0, 462.0, 511.0, 596.0, 40.0, 52.0, 161.0, 324.0, 66.0, 198.0, 34.0, 42.0, 44.0, 46.0, 292.0, 307.0, 58.0]
10 frames starting from F

10 frames starting from Frame 0 contain pedestrians [35.0, 37.0, 147.0, 212.0]
10 frames starting from Frame 108 contain pedestrians [35.0, 37.0, 147.0, 212.0]
10 frames starting from Frame 240 contain pedestrians [36.0, 38.0, 148.0, 213.0]
10 frames starting from Frame 348 contain pedestrians [36.0, 38.0, 148.0, 213.0]
10 frames starting from Frame 480 contain pedestrians [149.0, 214.0]
10 frames starting from Frame 588 contain pedestrians [149.0, 214.0]
10 frames starting from Frame 732 contain pedestrians [150.0, 215.0, 127.0]
10 frames starting from Frame 840 contain pedestrians [150.0, 215.0, 127.0]
10 frames starting from Frame 1068 contain pedestrians [151.0, 216.0, 128.0, 356.0, 190.0, 301.0, 145.0, 348.0, 370.0, 375.0, 282.0, 365.0]
10 frames starting from Frame 1308 contain pedestrians [152.0, 217.0, 144.0, 330.0, 357.0, 191.0, 302.0, 146.0, 349.0, 371.0, 376.0, 283.0, 366.0]
10 frames starting from Frame 1548 contain pedestrians [153.0, 218.0, 331.0, 358.0, 362.0, 303.0, 299

10 frames starting from Frame 84 contain pedestrians [237.0, 241.0, 244.0, 252.0, 270.0, 230.0, 233.0, 257.0]
10 frames starting from Frame 324 contain pedestrians [238.0, 242.0, 245.0, 253.0, 271.0, 231.0, 234.0, 258.0]
10 frames starting from Frame 1236 contain pedestrians [278.0, 275.0, 263.0, 15.0]
10 frames starting from Frame 1788 contain pedestrians [17.0, 316.0, 1.0, 323.0, 196.0, 199.0]
10 frames starting from Frame 2028 contain pedestrians [18.0, 317.0, 308.0, 311.0, 2.0, 320.0, 324.0, 328.0, 197.0, 200.0]
10 frames starting from Frame 2280 contain pedestrians [19.0, 318.0, 309.0, 312.0, 3.0, 321.0, 325.0, 329.0, 8.0]
10 frames starting from Frame 2520 contain pedestrians [20.0, 319.0, 310.0, 313.0, 4.0, 326.0, 330.0, 9.0]
10 frames starting from Frame 2760 contain pedestrians [21.0, 314.0, 5.0, 327.0, 331.0, 10.0]
10 frames starting from Frame 3000 contain pedestrians [48.0, 6.0, 332.0, 337.0, 11.0, 44.0]
10 frames starting from Frame 3240 contain pedestrians [49.0, 7.0, 333

In [26]:
# Open the three pickled splits for binary reading.
# NOTE(review): these handles are not closed in this cell.
train_file = open(file=processed.processed_train_data_file, mode='rb')
dev_file = open(file=processed.processed_dev_data_file, mode='rb')
test_file = open(file=processed.processed_test_data_file, mode='rb')

In [27]:
# Display the path of the pickled training split.
processed.processed_train_data_file

'./data/train/processed/trajectories_cnn_train.cpkl'

In [28]:
# Unpickle each split and close its handle as soon as it has been read, so
# the files opened earlier are not left open for the rest of the session.
# Only unpickle files produced by this notebook: pickle can run arbitrary code.
with train_file:
    train = pickle.load(train_file)
with dev_file:
    dev = pickle.load(dev_file)
with test_file:
    test = pickle.load(test_file)

In [29]:
# Number of (input, output) pairs in the training split.
len(train)

354

In [30]:
# Number of (input, output) pairs in the dev (validation) split.
len(dev)

39

In [31]:
# Number of (input, output) pairs in the test split.
len(test)

9

In [32]:
class CustomDatasetForCNN(torch.utils.data.Dataset):
    '''
    Dataset over a pickled sequence of samples stored at file_path.

    The pickle is read lazily on first access and then kept in memory, so
    item lookups and len() do not re-read the file on every call.
    '''
    def __init__(self, file_path):
        '''
        params:
        file_path : path of the pickle file holding the samples
        '''
        self.file_path = file_path
        # Populated on first use so that constructing the dataset does no I/O.
        self._data = None

    def _load(self):
        '''Return the unpickled samples, reading the file only the first time.'''
        if self._data is None:
            # Only use with trusted files: unpickling can execute arbitrary code.
            with open(self.file_path, 'rb') as file:
                self._data = pickle.load(file)
        return self._data

    def __getitem__(self, index):
        '''Return the sample stored at position index.'''
        return self._load()[index]

    def __len__(self):
        '''Return the number of samples in the pickle file.'''
        return len(self._load())
        

In [33]:
# Wrap the pickled training split in a Dataset.
train_set = CustomDatasetForCNN(processed.processed_train_data_file)

In [46]:
# Shuffled loader that yields one sample per batch.
# NOTE(review): batch_size=1 is presumably required because samples have
# 2 * (number of pedestrians) rows, which differs between samples — confirm.
train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=1, shuffle=True)

In [47]:
# Fetch one sample by index and unpack it into its input and output parts.
x, y = train_set[99]

In [48]:
# Display the input part of the sample fetched above.
x

tensor([[-0.0187, -0.0206, -0.0225, -0.0187, -0.0187],
        [-0.4355, -0.4486, -0.4599, -0.4754, -0.4909]], dtype=torch.float64)

In [49]:
# Pull a single batch from the loader to inspect what it yields.
next(iter(train_loader))

[tensor([[[0.4123, 0.4122, 0.4120, 0.4118, 0.4117],
          [0.2028, 0.2033, 0.2039, 0.2044, 0.2050],
          [0.4116, 0.4101, 0.4034, 0.3963, 0.3892],
          [0.0277, 0.0277, 0.0295, 0.0295, 0.0295]]], dtype=torch.float64),
 tensor([[[0.4117, 0.4116, 0.4114, 0.4112, 0.4111],
          [0.2050, 0.2055, 0.2060, 0.2066, 0.2071],
          [0.3892, 0.3870, 0.3847, 0.3825, 0.3787],
          [0.0295, 0.0249, 0.0211, 0.0179, 0.0146]]], dtype=torch.float64)]