In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from time import localtime, strftime

%matplotlib inline

# Apply the 'ggplot' style sheet to all figures created in this notebook.
plt.style.use('ggplot')
# Default figure size as (width, height); set after style.use(), which may
# itself modify rcParams.
plt.rcParams['figure.figsize'] = (15, 7)

In [5]:
# Hard-coded sample counts for the test and train data sets
# (presumably matching the CSV files read later in the notebook -- TODO confirm).

# Divisor used by distance() when averaging the per-point distances.
TEST_SAMPLE_NUMBER = 57092
# Not referenced by any code visible in this part of the notebook.
TRAIN_SAMPLE_NUMBER = 285998

def distance_for_each_point(array1, array2):
    """Euclidean distance between matching 2-D points of two coordinate arrays.

    Both inputs store points as interleaved coordinates:

        array1: [sX1, sY1, sX2, sY2, ...]
        array2: [eX1, eY1, eX2, eY2, ...]

    Every distance is written twice, so a 1-D output lines up with the
    interleaved input layout:

        [d1, d1, d2, d2, ...]   where   d_k = dist((sXk, sYk), (eXk, eYk))

    When ``array1`` has more than one dimension, the same computation is
    applied recursively to each pair ``array1[i]``, ``array2[i]`` along the
    first axis and the per-row results are stacked.

    Parameters
    ----------
    array1 : numpy.ndarray
        Start coordinates; must expose ``ndim`` and support slicing.
    array2 : array-like
        End coordinates, indexed exactly like ``array1``.

    Returns
    -------
    numpy.ndarray
        Array of duplicated point distances as described above.

    Raises
    ------
    AssertionError
        If ``len(array1) != len(array2)``.
    """
    # Named placeholders are required here: with bare "{}" and keyword-only
    # arguments str.format raises IndexError, which would mask the real
    # size-mismatch error.
    assert len(array1) == len(array2), (
        "Arrays' sizes have to be equal (array1: {len1}, array2: {len2})".format(
            len1=len(array1), len2=len(array2)))

    array_length = len(array1)

    # np.linalg.norm treats 1-D and 2-D inputs differently, so the norm is
    # always taken on a single 1-D (x, y) difference vector.
    if array1.ndim == 1:
        # array1[i:i+2] is the point that starts at flat index i.
        point_distances = np.array([np.linalg.norm(array1[i:i+2] - array2[i:i+2])
                                    for i in range(0, array_length, 2)])
        # Emit each distance twice: once for the x slot, once for the y slot.
        result = np.repeat(point_distances, 2)
    else:
        result = np.array([distance_for_each_point(array1[i], array2[i])
                           for i in range(array_length)])

    return result

def distance(test_results, predicted_results):
    """Score a prediction against the real results.

    ``test_results`` is converted with ``np.array``; ``predicted_results`` is
    passed through unchanged. The per-point distances returned by
    ``distance_for_each_point`` are summed, then divided by
    ``TEST_SAMPLE_NUMBER`` and by 2 (every point distance appears twice in
    that output).
    """
    per_point = distance_for_each_point(np.array(test_results), predicted_results)
    total = per_point.sum()
    return total / TEST_SAMPLE_NUMBER / 2

def to_model(df):
    """Convert ``df`` to a NumPy array and transpose it.

    The notebook keeps data as [n_features * n_samples], while the estimators
    usually expect [n_samples * n_features]; this helper switches between the
    two layouts.
    """
    values = np.array(df)
    return values.transpose()

In [6]:
def compare_results(function, test_results, parameter_name, list_of_values, **other_parameters):
    """Run ``function`` once per value of one parameter and chart the scores.

    For every ``value`` in ``list_of_values`` the function is called with
    ``other_parameters`` plus ``parameter_name=value``. Each prediction is
    stored with ``np.save`` and scored with ``distance(test_results, prediction)``.
    The scores are printed and drawn as a bar chart that is saved as a PNG.

    Parameters
    ----------
    function : callable
        Called with keyword arguments only; must return the predicted
        results. Its ``__name__`` is used in file names and in the plot title.
    test_results
        Real results that the predictions are compared with.
    parameter_name : str
        Name of the keyword argument that is varied between runs.
    list_of_values : sequence
        Values to try for ``parameter_name``; must support ``len()``.
    **other_parameters
        Remaining keyword arguments forwarded to ``function`` on every call
        (always some test_data and usually train_data and train_results).

    Returns
    -------
    None

    Notes
    -----
    Files are written below ``src/Logs/`` and ``src/Plots/``; this function
    does not create those directories. When ``list_of_values`` is empty
    nothing is plotted.
    """
    function_name = function.__name__
    # Read the date once so both file names agree even if a run crosses midnight.
    date = strftime("%Y%m%d", localtime())
    log_file_name = "src/Logs/{date}_predicted_coordinates_{function_name}_{parameter_name}_".format(
        date=date, function_name=function_name, parameter_name=parameter_name)
    plot_file_name = "src/Plots/{date}_{function_name}_difference_by_{parameter_name}.png".format(
        date=date, function_name=function_name, parameter_name=parameter_name)

    number = len(list_of_values)

    # One score per tested value, in the order of list_of_values.
    result = []

    for i, value in enumerate(list_of_values):
        print("{cur}/{num}: {time}".format(cur=i+1, num=number,
                                           time=strftime("%Y-%m-%d %H:%M:%S", localtime())))
        other_parameters[parameter_name] = value
        predicted_results = function(**other_parameters)
        # np.save appends ".npy" to the file name.
        np.save(log_file_name + str(value), predicted_results)
        result.append(distance(test_results, predicted_results))

    print("done! {time}".format(time=strftime("%Y-%m-%d %H:%M:%S", localtime())))
    print("Results: {}".format(result))

    if number == 0:
        # Nothing to draw, and 2 / number below would raise ZeroDivisionError.
        return

    width = 2 / number
    positions = np.arange(number)

    # A fresh figure per call keeps repeated calls from drawing on top of
    # each other.
    fig, ax = plt.subplots()
    # The ticks below sit at x + width/2, which is the bar centre only for
    # left-edge-aligned bars; request that alignment explicitly because the
    # default became 'center' in Matplotlib 2.0.
    ax.bar(positions, result, width, color='g', align='edge')

    ax.set_ylabel('Average difference')
    ax.set_xlabel(parameter_name)
    ax.set_title("Difference between real points and predicted by {parameter_name} in {function_name}".format(
        function_name=function_name, parameter_name=parameter_name))
    ax.set_xticks(positions + width/2)
    ax.set_xticklabels(list_of_values)
    fig.savefig(plot_file_name)

In [3]:
# Read the point tables from .csv files; the first CSV column becomes the index.
test_data = pd.read_csv('src/test_data_points.csv', index_col=0)
train_data = pd.read_csv('src/train_data_points.csv', index_col=0)

# Split the ten frames into input ([:5]) and output ([5:10]); the output is
# what we want to predict from the input. The right-hand side is evaluated
# before assignment, so both slices are taken from the full table.
test_data, test_results = test_data[:5], test_data[5:10]
train_data, train_results = train_data[:5], train_data[5:10]

In [7]:
from sklearn.neighbors import KNeighborsRegressor

# n_neighbors by default is 5;         to compare: range(5, 55, 5)
# weights     by default is 'uniform'; to compare: ['uniform', 'distance']
# algorithm   by default is 'auto';    to compare: ['auto', 'ball_tree', 'kd_tree', 'brute']
# note: 'brute' causes a memory error

def kneighbors_regressor(train_data, train_results, test_data, **kwargs):
    """Fit a KNeighborsRegressor on the train set and predict for the test set.

    All three data arguments are passed through ``to_model`` before they reach
    the estimator, and the prediction is transposed back before it is
    returned. ``kwargs`` are forwarded unchanged to the ``KNeighborsRegressor``
    constructor (e.g. n_neighbors, weights, algorithm).
    """
    model = KNeighborsRegressor(**kwargs)
    features, targets = to_model(train_data), to_model(train_results)
    model.fit(features, targets)
    predictions = model.predict(to_model(test_data))
    return predictions.T

There is no difference in the results between the ball_tree and kd_tree algorithms, but kd_tree works faster. There is also no difference between 'uniform' and 'distance' (the weights parameter).
The optimal n_neighbors is 25.

Note: I deleted some of the comparison cells. You can find them in previous versions of this notebook on GitHub.