# Split pedestrians into test and train sets according to their start point in a 5x5 grid

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Frame dimensions (presumably in pixels); the grid cells used for labelling
# are derived from these two values.
FRAME_WIDTH, FRAME_HEIGHT = 1920, 1080

In [36]:
# Annotation files are named after the zero-padded pedestrian number,
# e.g. "000010.txt", so the number must be left-padded with zeros
# when it is converted to a file name.
ZERO_PADDING = 6
NUMBER_OF_PEDESTRIANS = 12684

def get_filename(index):
    """Return the path of the annotation file for pedestrian number `index`.

    The number is converted to a string and left-padded with zeros up to
    ZERO_PADDING characters, e.g. 10 -> "src/Csv/000010.txt".
    """
    padded_number = str(index).zfill(ZERO_PADDING)
    return "src/Csv/{}.txt".format(padded_number)

def download_pedestrian(index):
    """Read the annotation table of pedestrian number `index`.

    The table is read with pandas from the file returned by `get_filename`,
    using the first CSV column as the DataFrame index.

    Raises AssertionError when `index` is outside 1..NUMBER_OF_PEDESTRIANS.
    """
    is_valid_number = 1 <= index <= NUMBER_OF_PEDESTRIANS
    assert is_valid_number, (
        "Pedestrian number should be between 1 and {max}; given number: {id}"
        .format(max=NUMBER_OF_PEDESTRIANS, id=index)
    )
    return pd.read_csv(get_filename(index), index_col=0)

We will give every pedestrian a label that depends on the grid cell containing the pedestrian's start point.

In [47]:
# The frame is divided into a GRID_SIZE x GRID_SIZE grid of equal cells.
GRID_SIZE = 5

# Width and height of a single grid cell.
w = FRAME_WIDTH / GRID_SIZE
h = FRAME_HEIGHT / GRID_SIZE


def _grid_label(x, y):
    """Return the label (1..GRID_SIZE**2, row by row) of the cell containing (x, y).

    Column and row are clamped to the grid, so a point lying exactly on the
    right/bottom frame edge (or outside the frame) is assigned to the nearest
    border cell instead of producing a label that collides with another row.
    """
    column = min(max(int(x / w), 0), GRID_SIZE - 1)
    row = min(max(int(y / h), 0), GRID_SIZE - 1)
    return row * GRID_SIZE + column + 1


# labels[k] is the label of pedestrian number k + 1.
labels = []

for index in range(1, NUMBER_OF_PEDESTRIANS + 1):
    # First row of the annotation table is the pedestrian's start point;
    # .iloc[0] yields scalar coordinates (int() on a one-element Series is
    # deprecated in pandas).
    start_coordinate = download_pedestrian(index).iloc[0]
    labels.append(_grid_label(start_coordinate['X'], start_coordinate['Y']))

In [61]:
# sklearn.cross_validation was removed in scikit-learn 0.20; the splitter now
# lives in sklearn.model_selection and exposes a split(X, y) method.
from sklearn.model_selection import StratifiedShuffleSplit

# Fixed seed so that re-running the notebook reproduces exactly the same split.
SPLIT_SEED = 0

labels = np.array(labels)
sss = StratifiedShuffleSplit(n_splits=1, test_size=1/6, random_state=SPLIT_SEED)

# Only the labels matter for stratification, so a dummy feature array is passed.
# The returned arrays hold 0-based positions in `labels`
# (position k corresponds to pedestrian number k + 1).
train_indeces, test_indeces = next(sss.split(np.zeros(len(labels)), labels))

Let's save the label, the number of frames and the category (test or train) of every pedestrian.

In [71]:
# One entry per pedestrian: the number of rows in its annotation table.
number_of_frames = [
    len(download_pedestrian(pedestrian_number))
    for pedestrian_number in range(1, NUMBER_OF_PEDESTRIANS + 1)
]

In [64]:
# `train_indeces` holds 0-based positions in `labels`, whereas pedestrian
# numbers start at 1: pedestrian `index` sits at position `index - 1`.
# A set gives O(1) membership tests instead of scanning the array every time.
train_positions = set(int(position) for position in train_indeces)

category = [
    'train' if index - 1 in train_positions else 'test'
    for index in range(1, NUMBER_OF_PEDESTRIANS + 1)
]

In [72]:
# All three columns share the same index: pedestrian numbers 1..NUMBER_OF_PEDESTRIANS.
pedestrian_numbers = range(1, NUMBER_OF_PEDESTRIANS + 1)

label_column = pd.Series(labels, index=pedestrian_numbers)
category_column = pd.Series(category, index=pedestrian_numbers)
frames_column = pd.Series(number_of_frames, index=pedestrian_numbers)

data = pd.DataFrame({'label': label_column,
                     'category': category_column,
                     'frames_number': frames_column})

In [73]:
# Write the per-pedestrian table (label, category, frames_number) to disk.
output_path = 'src/pedestrian_labels_and_test_and_train_separation.csv'
data.to_csv(output_path)

In [70]:
# Show the sizes of the train and test parts as a (train, test) tuple.
tuple(map(len, (train_indeces, test_indeces)))

(10569, 2115)