# Predict points with scikit-learn's RandomForestRegressor

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Notebook-wide plotting defaults: use the 'ggplot' style sheet and set the
# default figure size to (15, 7) (matplotlib interprets this as inches).
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 7)

In [2]:
from sklearn.ensemble import RandomForestRegressor

In [4]:
# array1: [sX1; sY1; sX2; sY2; ...]
# array2: [eX1; eY1; eX2; eY2; ...]
# output: [dist((sX1, sY1), (eX1, eY1)), dist((sX1, sY1), (eX1, eY1)),
#          dist((sX2, sY2), (eX2, eY2)), dist((sX2, sY2), (eX2, eY2)), ...]

def distance_for_each_point(array1, array2):
    """Return the per-point Euclidean distance between two flat point arrays.

    Both inputs are 1-D sequences of interleaved coordinates
    ``[x1, y1, x2, y2, ...]``.  For every point ``k`` the distance between
    ``(array1[2k], array1[2k+1])`` and ``(array2[2k], array2[2k+1])`` is
    computed and emitted twice, so the result lines up element-by-element
    with the interleaved inputs: ``[d1, d1, d2, d2, ...]``.

    Parameters
    ----------
    array1, array2 : array-like
        Flat coordinate sequences of equal length (NumPy arrays, lists, ...).

    Returns
    -------
    numpy.ndarray
        1-D float array with every distance repeated twice.

    Raises
    ------
    AssertionError
        If the two inputs have different lengths.
    """
    if len(array1) != len(array2):
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        error_template = "Array lengths should be equal. len(array1): {len1}, len(array2): {len2}"
        raise AssertionError(error_template.format(len1=len(array1), len2=len(array2)))

    # Convert to float before subtracting: this accepts plain lists and keeps
    # unsigned integer inputs from wrapping around on negative differences.
    diff = np.asarray(array1, dtype=float) - np.asarray(array2, dtype=float)

    if diff.size % 2:
        # A trailing unpaired coordinate is treated as a point whose missing
        # coordinate difference is zero (same result as the norm of the
        # single remaining element).
        diff = np.append(diff, 0.0)

    # One row per point -> one norm per row.
    distance = np.linalg.norm(diff.reshape(-1, 2), axis=1)

    return np.repeat(distance, 2)

# Width to which a pedestrian index is zero-padded in its file name.
ZERO_PADDING = 6

def get_filename(index):
    """Build the relative path of the data file for pedestrian ``index``.

    The index is converted to a string and left-padded with zeros up to
    ZERO_PADDING characters, e.g. ``1 -> "src/Csv/000001.txt"``.
    """
    padded_index = str(index).zfill(ZERO_PADDING)
    return "src/Csv/" + padded_index + ".txt"

# Upper bound (exclusive) on the pedestrian indices accepted below.
NUMBER_OF_PEDESTRIANS = 12273

def download_pedestrian(index):
    """Read the CSV file of pedestrian ``index`` into a DataFrame.

    The file path comes from ``get_filename(index)`` and the first CSV
    column is used as the DataFrame index.

    Raises AssertionError when ``index`` is outside
    ``[0, NUMBER_OF_PEDESTRIANS)``.

    NOTE(review): the processing loop in this notebook mentions the range
    1..NUMBER_OF_PEDESTRIANS inclusive, whose last value would fail this
    check -- confirm whether pedestrian ids are 0- or 1-based.
    """
    index_in_range = 0 <= index < NUMBER_OF_PEDESTRIANS
    assert index_in_range, (
        "Pedestrian number should be between 0 and {max}; given number: {id}"
        .format(max=NUMBER_OF_PEDESTRIANS-1, id=index)
    )
    return pd.read_csv(get_filename(index), index_col=0)

In [5]:
# Per-pedestrian metadata table; the first CSV column becomes the row index.
# in each row: {index; category; frames_number; label}
# The 'category' value is compared against 'train' further down to decide
# whether a pedestrian goes to the train or the test set.
labels = pd.read_csv('src/pedestrian_labels_and_test_and_train_separation.csv', index_col=0)

In [9]:
# Build the train / test tables out of sliding windows over each trajectory.
# Every window contributes two columns (X, Y) and WINDOW_SIZE rows, so in
# each row: {X; Y; X; Y; ...}
train_data = pd.DataFrame()
test_data = pd.DataFrame()

# Number of consecutive frames in one window.
WINDOW_SIZE = 10

train_windows = []
test_windows = []

# Full run: range(1, NUMBER_OF_PEDESTRIANS + 1); currently limited to the
# first pedestrian only.
for index in range(1, 2):
    # in each row: {index (time); X; Y}
    all_data = download_pedestrian(index).reset_index()[['X', 'Y']]

    # pd.concat(axis=1) aligns frames on their row index, so every window is
    # re-indexed to 0..WINDOW_SIZE-1. Without this, window i keeps labels
    # i..i+WINDOW_SIZE-1 and the combined table is padded with NaN instead
    # of stacking the windows side by side.
    # Trajectories shorter than WINDOW_SIZE produce no windows.
    windows = [
        all_data.iloc[start:start + WINDOW_SIZE].reset_index(drop=True)
        for start in range(len(all_data) - WINDOW_SIZE + 1)
    ]

    if labels['category'][index] == 'train':
        train_windows.extend(windows)
    else:
        test_windows.extend(windows)

# Concatenate once at the end instead of growing a DataFrame inside the loop
# (each incremental concat copies everything accumulated so far).
# pd.concat raises on an empty list, hence the guards.
if train_windows:
    train_data = pd.concat(train_windows, axis=1)
if test_windows:
    test_data = pd.concat(test_windows, axis=1)

In [7]:
train_data

Unnamed: 0,X,Y,X.1,Y.1,X.2,Y.2,X.3,Y.3
0,591,116,597,148,758,193,695,253
1,591,127,604,161,716,190,701,286
2,590,139,609,173,681,184,729,293
3,586,150,616,189,649,183,750,311
4,583,155,631,200,632,174,776,326
5,581,169,654,214,624,157,804,342
6,574,178,677,225,618,146,824,365
7,572,186,699,238,616,132,849,383
8,567,198,725,250,612,121,874,402
9,566,210,747,259,610,102,901,418


In [8]:
test_data

Unnamed: 0,X,Y,X.1,Y.1
0,608,268,544,232
1,622,272,523,231
2,618,257,501,229
3,609,233,490,216
4,607,219,494,202
5,609,208,488,193
6,610,189,486,180
7,611,176,481,169
8,610,166,484,157
9,606,154,487,144
