In [4]:
import pyspark

sc = pyspark.sql.SparkSession.Builder().getOrCreate()

## Levanto los archivos

In [11]:
import json
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold 
from sklearn.model_selection import GridSearchCV
from IPython.display import display, HTML
import math


In [6]:
points_recep = sc.read.json('datos/train-test.jsonlines').rdd

In [7]:
non_empty_points_recp = points_recep.filter(lambda x: len(x['recep_0']+x['recep_1']+x['recep_2']+x['recep_3']) > 0)


### Me quedo con una sola emision por punto

In [8]:
unique_points = non_empty_points_recp.groupBy(lambda x: x['Punto']).map(lambda x: list(x[1])[0])

### Genero los atributos y etiquetas que me interesan

In [9]:
def generate_attrs(row):
    #f = lambda l: len(l)
    f = lambda l: float(np.mean(l) if len(l) > 0 else 0)
    data = {
        'recep_0': f(row['recep_0']),
        'recep_1': f(row['recep_1']),
        'recep_2': f(row['recep_2']),
        'recep_3': f(row['recep_3'])
    }
    return {'data': data, 'x': row['x'], 'y': row['y'], 'point': row['Punto']}
    
points = unique_points.map(generate_attrs)

### Regresion por eje

In [24]:
points_target_x = pd.DataFrame(points.map(lambda x: x['x']).collect())
points_target_y = pd.DataFrame(points.map(lambda x: x['y']).collect())
points_data = pd.DataFrame(points.map(lambda x: x['data']).collect())


def eval_knn_regressor(data, target):
    clf = KNeighborsRegressor(n_neighbors=5)
    scores = cross_val_score(clf, data, target, cv=5, scoring='neg_mean_absolute_error')
    print('cross val mae: {}'.format(scores))
    
print('prediccion eje x')
eval_knn_regressor(points_data, points_target_x)
print('prediccion eje y')
eval_knn_regressor(points_data, points_target_y)

prediccion eje x
cross val mae: [-282.49933514 -203.36573732 -226.10210657 -299.4146864  -268.78797938]
prediccion eje y
cross val mae: [-185.84508947 -206.90823054 -227.27933485 -252.83656086 -211.10895522]


### Clasificacion de puntos

In [18]:
points_target = pd.DataFrame(points.map(lambda x: x['point']).collect())
points_data = pd.DataFrame(points.map(lambda x: x['data']).collect())



In [35]:
dict_coordenadas = points_recep.map(lambda x: (x['Punto'],(x['x'], x['y']))).collectAsMap()
global dict_coordenadas

def distance(p1, p2):
    return math.sqrt((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)

def calculate_mae_distance(predictions, predictions_probas, real):
    sum_error = 0
    count = 0
    for i in range(len(predictions)):
        #print(predictions_probas[i].max())
        pred_position = dict_coordenadas[predictions[i]]
        real_position = dict_coordenadas[real[i]]
        sum_error += distance(pred_position, real_position)
        count += 1
    return sum_error/count
    

def get_classifier_error(clf, X_train, y_train, X_eval, y_eval):
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_eval)
    predictions_probas = clf.predict_proba(X_eval)
    mae = calculate_mae_distance(predictions, predictions_probas, y_eval)
    return mae
    
    
points_data_np = np.array(points_data)
points_target_np = np.array(points_target).ravel()
    


def evaluate_knn_classifier(points_data_np, points_target_np, **kwargs):
    print(kwargs)
    clf = KNeighborsClassifier(**kwargs)    
    kf = KFold(n_splits=5, shuffle=True)
    mae_list = []
    mae_list_train = []
    for train_index, test_index in kf.split(points_data_np):
        X_train, X_test = points_data_np[train_index], points_data_np[test_index]
        y_train, y_test = points_target_np[train_index], points_target_np[test_index]

        mae = get_classifier_error(clf, X_train, y_train, X_test, y_test)
        mae_train = get_classifier_error(clf, X_train, y_train, X_train, y_train)
        mae_list.append(mae)
        mae_list_train.append(mae_train)
    print('mean absolute error on test set {}'.format(np.mean(mae_list)))
    print('mean absolute error on train set {}'.format(np.mean(mae_list_train)))


params = {'n_neighbors': 5, 'weights': 'distance'}
evaluate_knn_classifier(points_data_np, points_target_np, **params)


{'n_neighbors': 5, 'weights': 'distance'}
mean absolute error on test set 470.2904059746582
mean absolute error on train set 11.867562177610825


### Regression grid search

In [26]:


knn_params = {
    "n_neighbors" : np.arange(1, 15),
    "weights" : ["uniform", "distance"],
    "p" : [1, 2]
}
estimator = KNeighborsRegressor()

clf = GridSearchCV(estimator, knn_params, cv=5, scoring='neg_mean_absolute_error', n_jobs=3, error_score=0.0)
clf.fit(points_data, points_target_x)




GridSearchCV(cv=5, error_score=0.0,
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]), 'weights': ['uniform', 'distance'], 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=0)

In [28]:
from IPython.display import display, HTML

def show_grid_results(clf, n):
    df = pd.DataFrame(clf.cv_results_["params"])
    df["mean_score_validation"] = clf.cv_results_["mean_test_score"]
    df["mean_score_training"] = clf.cv_results_["mean_train_score"]
    display(HTML("<h3> {}: top {} combinaciones  </h3>".format(clf.estimator.__class__.__name__, n)))
    display(df.sort_values(by="mean_score_validation", ascending=False).head(n))
    
    
show_grid_results(clf,5)



Unnamed: 0,n_neighbors,p,weights,mean_score_validation,mean_score_training
53,14,1,distance,-246.266494,-8.333558
55,14,2,distance,-246.513799,-8.333558
37,10,1,distance,-247.393545,-8.333558
29,8,1,distance,-247.692049,-8.333558
47,12,2,distance,-247.734477,-8.333558


### Classification grid search

In [36]:
from sklearn.model_selection import ParameterGrid
        
knn_params = {
    "n_neighbors" : np.arange(1, 15),
    "weights" : ["uniform", "distance"],
    "p" : [1, 2]
}
 
for params in ParameterGrid(knn_params):
    evaluate_knn_classifier(points_data_np, points_target_np, **params)


{'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
mean absolute error on test set 456.78793798906344
mean absolute error on train set 13.48071630496833
{'n_neighbors': 1, 'p': 1, 'weights': 'distance'}
mean absolute error on test set 444.27562045801585
mean absolute error on train set 12.347690193117758
{'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
mean absolute error on test set 434.2511077758212
mean absolute error on train set 11.307017328426728
{'n_neighbors': 1, 'p': 2, 'weights': 'distance'}
mean absolute error on test set 461.9593234391652
mean absolute error on train set 11.307549281596113
{'n_neighbors': 2, 'p': 1, 'weights': 'uniform'}
mean absolute error on test set 439.48706211481823
mean absolute error on train set 228.4892461792718
{'n_neighbors': 2, 'p': 1, 'weights': 'distance'}
mean absolute error on test set 450.02121322553774
mean absolute error on train set 12.338444064464941
{'n_neighbors': 2, 'p': 2, 'weights': 'uniform'}
mean absolute error on test set 453.8688

mean absolute error on test set 576.2774410094894
mean absolute error on train set 548.3481722245365
{'n_neighbors': 14, 'p': 2, 'weights': 'distance'}
mean absolute error on test set 452.25368452494615
mean absolute error on train set 12.893376399104275


### Error analysis

In [50]:
points_pd = pd.DataFrame(unique_points.collect())
'''
## Classification
clf = KNeighborsClassifier(n_neighbors=5)
X = points_data
y = points_target
info_adder = lambda X: X.set_index('real').join(points_pd.set_index(3))
'''

## Regression
clf = KNeighborsRegressor(n_neighbors=5)
X = points_data
y = points_target_x
info_adder = lambda X: X.set_index('real').join(points_pd.set_index(11))

def error_analysis(clf, X, y, info_adder):
    X_train, X_test, y_train, y_test = train_test_split(points_data, y, test_size=0.20, random_state=23)
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_test)
    X_test['real'] = y_test
    X_test['prediction'] = predictions
    X_test = info_adder(X_test)
    display(X_test)


In [51]:
error_analysis(clf, X, y, info_adder)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,recep_0,recep_1,recep_2,recep_3,prediction,0,1,2,3,4,5,6,7,8,9,10,12
real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
464493.077244,0.0,37.333333,0.0,0.0,464789.42904,21/02/2018,11:16:10,11:14:10,515,18,[],"[32, 39, 43, 32, 34, 33, 37, 39, 36, 36, 36, 3...",[],[],2018-02-21 11:16:10,2018-02-21 11:14:10,6109965.0
463874.493683,29.75,43.909091,0.0,0.0,464458.581423,23/02/2018,18:55:40,18:53:40,351,18,"[30, 29, 31, 30, 33, 25, 23, 36, 31, 28, 36, 3...","[46, 46, 46, 45, 48, 45, 43, 39, 42, 42, 41]",[],[],2018-02-23 18:55:40,2018-02-23 18:53:40,6110407.0
463444.800206,47.666667,0.0,0.0,0.0,463634.548202,20/02/2018,12:15:15,12:13:15,234,18,"[53, 54, 50, 52, 51, 50, 52, 46, 46, 44, 44, 3...",[],[],[],2018-02-20 12:15:15,2018-02-20 12:13:15,6110730.0
464474.808126,11.0,55.315789,0.0,0.0,464382.191583,21/02/2018,12:44:50,12:42:50,470,18,[11],"[40, 42, 39, 42, 43, 43, 37, 77, 76, 76, 76, 5...",[],[],2018-02-21 12:44:50,2018-02-21 12:42:50,6110187.0
465224.903211,0.0,37.571429,0.0,100.304348,465115.790143,22/02/2018,09:07:45,09:05:45,434,18,[],"[40, 37, 39, 34, 32, 36, 36, 35, 41, 42, 39, 4...",[],"[52, 60, 75, 63, 53, 55, 62, 53, 58, 114, 123,...",2018-02-22 09:07:45,2018-02-22 09:05:45,6110876.0
463765.226389,32.625,0.0,0.0,0.0,463511.419156,23/02/2018,16:22:15,16:20:15,81,18,"[35, 30, 23, 37, 33, 30, 33, 40]",[],[],[],2018-02-23 16:22:15,2018-02-23 16:20:15,6111742.0
465017.806596,0.0,0.0,0.0,127.826087,465251.827736,23/02/2018,12:32:25,12:30:25,342,18,[],[],[],"[108, 95, 102, 101, 97, 111, 133, 130, 135, 13...",2018-02-23 12:32:25,2018-02-23 12:30:25,6111217.0
464401.99683,37.428571,33.0,102.285714,115.666667,464398.396935,20/02/2018,11:33:20,11:31:20,290,18,"[39, 35, 35, 31, 36, 36, 34, 42, 41, 38, 30, 3...",[33],"[53, 95, 113, 118, 117, 118, 115, 79, 85, 105,...","[55, 111, 102, 103, 109, 117, 111, 29, 41, 51,...",2018-02-20 11:33:20,2018-02-20 11:31:20,6111077.0
463737.270911,37.5,30.0,0.0,0.0,464250.077869,22/02/2018,10:49:40,10:47:40,327,18,"[34, 43, 36, 38, 38, 37, 32, 39, 35, 37, 39, 3...",[30],[],[],2018-02-22 10:49:40,2018-02-22 10:47:40,6110440.0
463944.489898,0.0,30.0,0.0,0.0,464902.243914,20/02/2018,18:16:05,18:14:05,419,18,[],"[27, 31, 32]",[],[],2018-02-20 18:16:05,2018-02-20 18:14:05,6110099.0
