In [1]:
# Select matplotlib's interactive "notebook" backend for this kernel.
# NOTE(review): no plotting call appears in the visible cells - confirm this magic is still needed.
%matplotlib notebook

In [2]:
import pyspark

# Obtain the active SparkSession, creating one if none exists yet.
# NOTE(review): despite the conventional meaning of the name `sc`, this object is a
# SparkSession (it is used below through `sc.read`), not a SparkContext.
sc = pyspark.sql.SparkSession.Builder().getOrCreate()

## Levanto los archivos

In [23]:
import json
import random
import numpy as np
import pandas as pd
import math
from scipy.spatial.distance import cdist, euclidean


## Utils

In [4]:
def distance(p1, p2):
    """Return the Euclidean distance between two 2-D points.

    Only the first two components of each argument are read, so any indexable
    of length >= 2 (tuple, list, array) is accepted.
    """
    delta_x = p1[0] - p2[0]
    delta_y = p1[1] - p2[1]
    squared_norm = delta_x**2 + delta_y**2
    return math.sqrt(squared_norm)


In [5]:
# Read the JSON-lines dataset (path is relative to the notebook's working directory)
# and switch from the DataFrame to the RDD API, so each record is a Row object.
points_recep = sc.read.json('datos/train-test-by-emission.jsonlines/').rdd

In [6]:
def group_emisions(data, n_emissions=24):
    """Regroup per-receiver reading lists into per-emission reading lists.

    Parameters
    ----------
    data : object exposing ``asDict()`` (e.g. a Spark ``Row``)
        Must contain the fields ``recep_0`` .. ``recep_3``, each an indexable
        holding at least ``n_emissions`` readings.
    n_emissions : int, optional
        Number of emissions to extract from every receiver list. Defaults to
        24, the value that used to be hard-coded here.

    Returns
    -------
    dict
        The record's fields without ``recep_0`` .. ``recep_3`` plus a new
        ``'emissions'`` key: a list of ``n_emissions`` entries, where entry
        ``k`` is ``[recep_0[k], recep_1[k], recep_2[k], recep_3[k]]``.

    Raises
    ------
    KeyError
        If one of the ``recep_*`` fields is missing.
    IndexError
        If a receiver list holds fewer than ``n_emissions`` readings.
    """
    data_dict = data.asDict()

    # Pop in receiver order so every emission keeps the [0, 1, 2, 3] layout.
    receiver_keys = ('recep_0', 'recep_1', 'recep_2', 'recep_3')
    per_receiver = [data_dict.pop(key) for key in receiver_keys]

    data_dict['emissions'] = [
        [readings[i] for readings in per_receiver]
        for i in range(n_emissions)
    ]
    return data_dict


In [7]:
# Lazily turn every record into a dict whose 'emissions' entry lists, per emission,
# the readings of the four receivers (see group_emisions).
points_emisions = points_recep.map(group_emisions)


In [12]:
def expand_rows_with_emissions(row):
    """Explode one record into one record per emission.

    Parameters
    ----------
    row : dict
        Must contain an ``'emissions'`` key holding an iterable of readings.

    Returns
    -------
    list of dict
        One shallow copy of ``row`` per emission, with the ``'emissions'`` key
        dropped and a ``'recep'`` key set to that emission's readings. An empty
        ``'emissions'`` yields an empty list.

    Raises
    ------
    KeyError
        If ``row`` has no ``'emissions'`` key.
    """
    emissions = row['emissions']
    # Build the shared fields from a filtered copy instead of popping from `row`:
    # the input record is left untouched, so calling this again on the same
    # record gives the same result.
    shared_fields = {key: value for key, value in row.items() if key != 'emissions'}
    return [dict(shared_fields, recep=e) for e in emissions]
        
# Flatten to one record per emission (each input record contributes one output
# record per entry of its 'emissions' list).
all_emissions = points_emisions.flatMap(expand_rows_with_emissions)

### Genero los atributos y etiquetas que me interesan

In [13]:
def generate_attrs(row):
    """Build the feature/label record for a single emission.

    Reads the first four entries of ``row['recep']`` as the features
    ``antenna_0`` .. ``antenna_3`` and copies ``row['x']``, ``row['y']`` and
    ``row['Punto']`` as the labels ``x``, ``y`` and ``point``.
    """
    readings = row['recep']
    antenna_names = ('antenna_0', 'antenna_1', 'antenna_2', 'antenna_3')
    # Index explicitly so a too-short reading list still raises IndexError.
    features = {name: readings[idx] for idx, name in enumerate(antenna_names)}

    result = {'data': features}
    result['x'] = row['x']
    result['y'] = row['y']
    result['point'] = row['Punto']
    return result
    
# Replace each per-emission record by its {'data', 'x', 'y', 'point'} form.
# NOTE(review): this rebinds `all_emissions` to a transformation of itself, so the
# cell is not idempotent - running it twice maps generate_attrs over records that
# no longer have a 'recep' key, which would raise KeyError once an action runs.
all_emissions = all_emissions.map(generate_attrs)

### Saco las emisiones sin ninguna recepción

In [14]:
# Keep only the emissions whose four antenna readings add up to more than zero.
# NOTE(review): this assumes readings are non-negative numbers with 0 meaning
# "not received" - TODO confirm; negative or None readings would break this test.
all_emissions = all_emissions.filter(lambda x: sum(x['data'].values())>0)

In [24]:
# Group emissions that have exactly the same antenna readings. The dict is
# stringified to serve as the grouping key (dicts are not hashable); each
# resulting element is the list of records sharing one set of readings.
same_signals_points = all_emissions.groupBy(lambda x: str(x['data'])).map(lambda x: list(x[1]))

In [25]:
def geometric_median(X, eps=1e-5):
    """Approximate the geometric median of a set of points.

    The geometric median is the point minimising the sum of Euclidean
    distances to all rows of ``X``. It is found by a Weiszfeld-style
    iteratively re-weighted average, with a correction step for iterates that
    coincide with one or more data points (this appears to follow the
    Vardi & Zhang modification - NOTE(review): confirm against the reference).

    Parameters
    ----------
    X : array-like of shape (n_points, n_dims)
        Input points. Any array-like is accepted (nested lists included); it
        is converted to a float ndarray internally.
    eps : float, optional
        Convergence threshold: iteration stops once two successive estimates
        are closer than ``eps``.

    Returns
    -------
    numpy.ndarray of shape (n_dims,)
        The estimated geometric median.
    """
    # Boolean-mask indexing below needs an ndarray; coercing here lets callers
    # pass plain lists instead of failing with a TypeError.
    X = np.asarray(X, dtype=float)

    # Start from the centroid.
    y = np.mean(X, 0)

    while True:
        # Distances from the current estimate to every point, shape (n_points, 1).
        D = cdist(X, [y])
        # Points that coincide with the estimate are excluded to avoid 1/0.
        nonzeros = (D != 0)[:, 0]

        Dinv = 1 / D[nonzeros]
        Dinvs = np.sum(Dinv)
        W = Dinv / Dinvs
        # Inverse-distance weighted average of the non-coincident points.
        T = np.sum(W * X[nonzeros], 0)

        num_zeros = len(X) - np.sum(nonzeros)
        if num_zeros == 0:
            # Plain Weiszfeld step.
            y1 = T
        elif num_zeros == len(X):
            # Every point equals the estimate: it is the median.
            return y
        else:
            # The estimate sits on data point(s): blend T and y so the update
            # does not get stuck on that point.
            R = (T - y) * Dinvs
            r = np.linalg.norm(R)
            rinv = 0 if r == 0 else num_zeros/r
            y1 = max(0, 1-rinv)*T + min(1, rinv)*y

        if euclidean(y, y1) < eps:
            return y1

        y = y1

In [26]:
def calc_min_err(emissions):
    """Return the smallest total distance from one location to a group's positions.

    Two kinds of candidate location are evaluated and the best total is kept:

    * each emission's own position (the distance to itself is skipped);
    * the geometric median of all positions, as returned by
      ``geometric_median``.

    Parameters
    ----------
    emissions : sequence of dict
        Non-empty sequence of records with numeric ``'x'`` and ``'y'`` entries.

    Returns
    -------
    number
        The minimum, over the candidates above, of the summed Euclidean
        distance from the candidate to every emission's position.
    """
    # Extract the coordinates once instead of rebuilding tuples in the inner loop.
    coords = [(e['x'], e['y']) for e in emissions]

    # Start from +inf rather than a finite sentinel: a finite cap (previously
    # 1000000) would silently become the result for any group whose distance
    # sums all exceed it.
    min_dist_sum = float('inf')

    # Candidates 1..n: each observed position.
    for i, p1 in enumerate(coords):
        distances_sum = sum(
            distance(p1, p2) for j, p2 in enumerate(coords) if i != j
        )
        min_dist_sum = min(min_dist_sum, distances_sum)

    # Last candidate: the geometric median of the positions.
    pos = np.array([[x, y] for x, y in coords])
    best_point = geometric_median(pos)
    distances_sum = sum(distance(p1, best_point) for p1 in coords)

    return min(min_dist_sum, distances_sum)



In [27]:
# Add up, over every group of identical readings, the smallest total distance
# returned by calc_min_err.
sum_min_dist = same_signals_points.map(calc_min_err).sum()
# Number of emissions that survived the filtering step.
# NOTE(review): both lines trigger a Spark action and no .cache() is visible in
# these cells, so the upstream transformations are presumably recomputed each time.
total_emmisions = all_emissions.count()

# Average minimum distance per emission; presumably a lower bound on the mean
# error of any predictor that maps identical readings to a single position.
# Raises ZeroDivisionError if no emission is left after filtering.
min_mae = sum_min_dist/float(total_emmisions)
min_mae

226.8865060208359