In [1]:
import pyspark

sc = pyspark.sql.SparkSession.Builder().getOrCreate()

## Levanto los archivos

In [2]:
import json
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold 
from sklearn.model_selection import GridSearchCV
import math

In [20]:
points_recep = sc.read.json('datos/points-recep-by-angle.jsonlines').rdd

In [23]:
non_empty_points_recp = points_recep.filter(lambda x: len(x['recep_0']+x['recep_1']+x['recep_2']+x['recep_3']) > 0)


### Me quedo con una sola emision por punto

In [24]:
unique_points = non_empty_points_recp.groupBy(lambda x: x['Punto']).map(lambda x: list(x[1])[0])

### Genero los atributos y etiquetas que me interesan

In [25]:
def generate_attrs(row):
    #f = lambda l: len(l)
    f = lambda l: float(np.mean(l) if len(l) > 0 else 0)
    data = {}
    for antenna in range(4):
        for angle in range(4):
            data['recep_{}_{}'.format(antenna, angle)] = f(row['recep_{}'.format(antenna)][angle])

    return {'data': data, 'x': row['x'], 'y': row['y'], 'point': row['Punto']}
    
points = unique_points.map(generate_attrs)

### Regresion por eje

In [27]:
points_target_x = pd.DataFrame(points.map(lambda x: x['x']).collect())
points_target_y = pd.DataFrame(points.map(lambda x: x['y']).collect())
points_data = pd.DataFrame(points.map(lambda x: x['data']).collect())


def eval_knn_regressor(data, target):
    clf = KNeighborsRegressor(n_neighbors=5)
    scores = cross_val_score(clf, data, target, cv=5, scoring='neg_mean_absolute_error')
    print('cross val mae: {}'.format(scores))
    
print('prediccion eje x')
eval_knn_regressor(points_data, points_target_x)
print('prediccion eje y')
eval_knn_regressor(points_data, points_target_y)

prediccion eje x
cross val mae: [-343.0432433  -411.10479831 -373.41056791 -370.17302842 -385.60676834]
prediccion eje y
cross val mae: [-369.00659368 -420.37132144 -400.29844907 -383.10322065 -408.42480613]


### Clasificacion de puntos

In [28]:
points_target = pd.DataFrame(points.map(lambda x: x['point']).collect())
points_data = pd.DataFrame(points.map(lambda x: x['data']).collect())



In [29]:
dict_coordenadas = points_recep.map(lambda x: (x['Punto'],(x['x'], x['y']))).collectAsMap()
global dict_coordenadas

def distance(p1, p2):
    return math.sqrt((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)

def calculate_mae_distance(predictions, predictions_probas, real):
    sum_error = 0
    count = 0
    for i in range(len(predictions)):
        #print(predictions_probas[i].max())
        pred_position = dict_coordenadas[predictions[i]]
        real_position = dict_coordenadas[real[i]]
        sum_error += distance(pred_position, real_position)
        count += 1
    return sum_error/count
    

def get_classifier_error(clf, X_train, y_train, X_eval, y_eval):
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_eval)
    predictions_probas = clf.predict_proba(X_eval)
    mae = calculate_mae_distance(predictions, predictions_probas, y_eval)
    return mae
    
    
points_data_np = np.array(points_data)
points_target_np = np.array(points_target).ravel()
    


def evaluate_knn_classifier(points_data_np, points_target_np, **kwargs):
    print(kwargs)
    clf = KNeighborsClassifier(**kwargs)    
    kf = KFold(n_splits=5, shuffle=True)
    mae_list = []
    mae_list_train = []
    for train_index, test_index in kf.split(points_data_np):
        X_train, X_test = points_data_np[train_index], points_data_np[test_index]
        y_train, y_test = points_target_np[train_index], points_target_np[test_index]

        mae = get_classifier_error(clf, X_train, y_train, X_test, y_test)
        mae_train = get_classifier_error(clf, X_train, y_train, X_train, y_train)
        mae_list.append(mae)
        mae_list_train.append(mae_train)
    print('mean absolute error on test set {}'.format(np.mean(mae_list)))
    print('mean absolute error on train set {}'.format(np.mean(mae_list_train)))


params = {'n_neighbors': 5, 'weights': 'distance'}
evaluate_knn_classifier(points_data_np, points_target_np, **params)


{'n_neighbors': 5, 'weights': 'distance'}
mean absolute error on test set 798.637688681372
mean absolute error on train set 494.4564438095096


### Regression grid search

In [None]:


knn_params = {
    "n_neighbors" : np.arange(1, 15),
    "weights" : ["uniform", "distance"],
    "p" : [1, 2]
}
estimator = KNeighborsRegressor()

clf = GridSearchCV(estimator, knn_params, cv=5, scoring='neg_mean_absolute_error', n_jobs=3, error_score=0.0)
clf.fit(points_data, points_target_x)


In [None]:
from IPython.display import display, HTML

def show_grid_results(clf, n):
    df = pd.DataFrame(clf.cv_results_["params"])
    df["mean_score_validation"] = clf.cv_results_["mean_test_score"]
    df["mean_score_training"] = clf.cv_results_["mean_train_score"]
    display(HTML("<h3> {}: top {} combinaciones  </h3>".format(clf.estimator.__class__.__name__, n)))
    display(df.sort_values(by="mean_score_validation", ascending=False).head(n))
    
    
show_grid_results(clf,5)

### Classification grid search

In [None]:
from sklearn.model_selection import ParameterGrid
        
knn_params = {
    "n_neighbors" : np.arange(1, 15),
    "weights" : ["uniform", "distance"],
    "p" : [1, 2]
}
 
for params in ParameterGrid(knn_params):
    evaluate_knn_classifier(points_data_np, points_target_np, **params)


### Error analysis