In [1]:
import pyspark

sc = pyspark.sql.SparkSession.Builder().getOrCreate()

## Levanto los archivos

In [2]:
import json
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold 
from sklearn.model_selection import GridSearchCV
import math

In [23]:
points_recep = sc.read.json('datos/points-recep.jsonlines').rdd

In [16]:
non_empty_points_recp = points_recep.filter(lambda x: len(x['recep_0']+x['recep_1']+x['recep_2']+x['recep_3']) > 0)


### Me quedo con una sola emision por punto

In [17]:
unique_points = non_empty_points_recp.groupBy(lambda x: x['Punto']).map(lambda x: list(x[1])[0])

### Me quedo solo con puntos que tengan data de mas de una antena

In [6]:
#unique_points = unique_points.filter(lambda x: int(len(x['recep_0']) > 0) + int(len(x['recep_1']) > 0) + int(len(x['recep_2']) > 0) + int(len(x['recep_3']) > 0) > 1)



### Regresion por eje

In [7]:


def generate_attrs(row):
    #f = lambda l: len(l)
    f = lambda l: float(np.mean(l) if len(l) > 0 else 0)
    data = {
        'recep_0': f(row['recep_0']),
        'recep_1': f(row['recep_1']),
        'recep_2': f(row['recep_2']),
        'recep_3': f(row['recep_3'])
    }
    target_x = row['x']
    target_y = row['y']
    return {'data': data, 'target_x': target_x, 'target_y': target_y}
    
points = unique_points.map(generate_attrs)

In [8]:

points_target_x = pd.DataFrame(points.map(lambda x: x['target_x']).collect())
points_target_y = pd.DataFrame(points.map(lambda x: x['target_y']).collect())
points_data = pd.DataFrame(points.map(lambda x: x['data']).collect())

In [9]:


def eval_knn(data, target):
    clf = KNeighborsRegressor(n_neighbors=5)
    scores = cross_val_score(clf, data, target, cv=5, scoring='neg_mean_absolute_error')
    print('cross val mae: {}'.format(scores))
    
print('prediccion eje x')
eval_knn(points_data, points_target_x)
print('prediccion eje y')
eval_knn(points_data, points_target_y)

prediccion eje x
cross val mae: [-236.4579316  -249.44888494 -215.68336331 -292.37782229 -259.43425472]
prediccion eje y
cross val mae: [-210.84089784 -223.67253699 -254.15687582 -247.96231532 -219.54144877]


### Clasificacion de puntos

In [10]:
def generate_attrs(row):
    #f = lambda l: len(l)
    f = lambda l: float(np.mean(l) if len(l) > 0 else 0)
    data = {
        'recep_0': f(row['recep_0']),
        'recep_1': f(row['recep_1']),
        'recep_2': f(row['recep_2']),
        'recep_3': f(row['recep_3'])
    }
    target = row['x']
    return {'data': data, 'target': row['Punto']}
    
points = unique_points.map(generate_attrs)


import pandas as pd

points_target = pd.DataFrame(points.map(lambda x: x['target']).collect())
points_data = pd.DataFrame(points.map(lambda x: x['data']).collect())



In [11]:
dict_coordenadas = points_recep.map(lambda x: (x['Punto'],(x['x'], x['y']))).collectAsMap()
global dict_coordenadas

def distance(p1, p2):
    return math.sqrt((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)

def calculate_mae_distance(predictions, predictions_probas, real):
    sum_error = 0
    count = 0
    for i in range(len(predictions)):
        #print(predictions_probas[i].max())
        pred_position = dict_coordenadas[predictions[i]]
        real_position = dict_coordenadas[real[i]]
        sum_error += distance(pred_position, real_position)
        count += 1
    return sum_error/count
    

points_data_np = np.array(points_data)
points_target_np = np.array(points_target).ravel()
    


def evaluate_knn_classifier(points_data_np, points_target_np, **kwargs):
    print(kwargs)
    clf = KNeighborsClassifier(**kwargs)    
    kf = KFold(n_splits=5, shuffle=True)
    mae_list = []
    for train_index, test_index in kf.split(points_data_np):
        X_train, X_test = points_data_np[train_index], points_data_np[test_index]
        y_train, y_test = points_target_np[train_index], points_target_np[test_index]

        clf.fit(X_train, y_train)

        predictions = clf.predict(X_test)
        predictions_probas = clf.predict_proba(X_test)
        mae = calculate_mae_distance(predictions, predictions_probas, y_test)
        mae_list.append(mae)
    print('mean absolute error {}'.format(np.mean(mae_list)))


params = {'n_neighbors': 5, 'weights': 'distance'}
evaluate_knn_classifier(points_data_np, points_target_np, **params)


{'n_neighbors': 5, 'weights': 'distance'}
mean absolute error 454.49625733944833


### Regression grid search

In [21]:


knn_params = {
    "n_neighbors" : np.arange(1, 15),
    "weights" : ["uniform", "distance"],
    "p" : [1, 2]
}
estimator = KNeighborsRegressor()

clf = GridSearchCV(estimator, knn_params, cv=5, scoring='neg_mean_absolute_error', n_jobs=3, error_score=0.0)
clf.fit(points_data, points_target_x)




GridSearchCV(cv=5, error_score=0.0,
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]), 'weights': ['uniform', 'distance'], 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=0)

In [22]:
from IPython.display import display, HTML

def show_grid_results(clf, n):
    df = pd.DataFrame(clf.cv_results_["params"])
    df["mean_score_validation"] = clf.cv_results_["mean_test_score"]
    df["mean_score_training"] = clf.cv_results_["mean_train_score"]
    display(HTML("<h3> {}: top {} combinaciones  </h3>".format(clf.estimator.__class__.__name__, n)))
    display(df.sort_values(by="mean_score_validation", ascending=False).head(n))
    
    
show_grid_results(clf,5)



Unnamed: 0,n_neighbors,p,weights,mean_score_validation,mean_score_training
51,13,2,distance,-241.868745,-7.232952
55,14,2,distance,-242.064248,-7.232952
47,12,2,distance,-242.850636,-7.232952
41,11,1,distance,-243.087763,-7.232952
53,14,1,distance,-243.505413,-7.232952


### Classification grid search

In [14]:
from sklearn.model_selection import ParameterGrid
        
knn_params = {
    "n_neighbors" : np.arange(1, 15),
    "weights" : ["uniform", "distance"],
    "p" : [1, 2]
}
 
for params in ParameterGrid(knn_params):
    evaluate_knn_classifier(points_data_np, points_target_np, **params)


{'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
mean absolute error 469.1748955461172
{'n_neighbors': 1, 'p': 1, 'weights': 'distance'}
mean absolute error 464.9469435573568
{'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
mean absolute error 467.65001034718716
{'n_neighbors': 1, 'p': 2, 'weights': 'distance'}
mean absolute error 469.0954742076589
{'n_neighbors': 2, 'p': 1, 'weights': 'uniform'}
mean absolute error 457.6011793555714
{'n_neighbors': 2, 'p': 1, 'weights': 'distance'}
mean absolute error 459.0267286278163
{'n_neighbors': 2, 'p': 2, 'weights': 'uniform'}
mean absolute error 459.056081896696
{'n_neighbors': 2, 'p': 2, 'weights': 'distance'}
mean absolute error 453.7457755700266
{'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
mean absolute error 468.71456378187804
{'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
mean absolute error 447.0545886313622
{'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}
mean absolute error 469.9810419117909
{'n_neighbors': 3, 'p': 2, 'weights':

### Error analysis