In [1]:
%matplotlib notebook

In [2]:
import pyspark
# Reuse the active Spark session, or start one if none exists.
# NOTE(review): despite the SparkContext-style name, `sc` is bound to a SparkSession here.
sc = pyspark.sql.SparkSession.Builder().getOrCreate()

In [3]:
import json
import numpy as np
import pandas as pd
import seaborn as sn

import matplotlib.pyplot as plt
from IPython.display import display, HTML
import math
import time
import random
from numpy.random import choice

import matplotlib.animation as animation
from datetime import datetime, timedelta
import pyspark.sql.functions as F

from utils.emissions_normalizer import EmissionsNormalizer
from utils.knn_predictor import KnnPredictor
from utils.smoother import Smoother
from utils.random_predictor import RandomPredictor

import utm

In [4]:
# Absolute antenna coordinates as (x, y) pairs.
# NOTE(review): the magnitudes look like UTM metres -- confirm.
ANTENNAS_POSITION = [
    (464259.981343845, 6110331.85100085),
    (463512.015195402, 6111004.324434620),
    (464443.295130103, 6111377.26171875),
    (464629.562194595, 6111105.34734669),
]
# Antenna labels; presumably index-aligned with ANTENNAS_POSITION -- TODO confirm.
ANTENNAS_NAMES = ["RC1", "RC2", "D1", "D2"]
# Local origin subtracted from the absolute coordinates used in this notebook.
X_0 = 462385.503783397
Y_0 = 6109042.35153865

# Re-express every antenna relative to (X_0, Y_0). Iterating over the list itself
# (instead of a hard-coded range(4)) keeps this correct if antennas are added or
# removed, and does not leak a loop variable into the notebook namespace.
ANTENNAS_POSITION = [(x - X_0, y - Y_0) for x, y in ANTENNAS_POSITION]


In [5]:
# Bird identifiers split by sex (going by the constant names).
# NOTE(review): presumably these are `tag_id` values of the receptions -- confirm.
FEMALES = [
    11, 15, 17, 20, 24,
    26, 28, 30, 33, 34,
]
MALES = [
    10, 14, 16, 21, 22, 23,
    25, 27, 29, 31, 32,
]

## Entreno el modelo que vamos a usar para predecir

In [6]:
# Read the JSON-lines dataset and expose it as an RDD; it is the input handed to the
# normalizer in the next cell.
points_recep = sc.read.json('datos/train-test-by-emission.jsonlines/').rdd

In [7]:
# Run the project's EmissionsNormalizer over the raw receptions, then ask it for the
# (features, target) pair that the predictors below are fitted on.
normalizer = EmissionsNormalizer()
data = normalizer.normalize(points_recep)
regre_data, regre_target = normalizer.get_regression_dataframes(data)

In [8]:
# Fit the project's KnnPredictor (presumably k-nearest-neighbours -- TODO confirm) on the
# regression features/targets; this is the model evaluated in the rest of the notebook.
predictor = KnnPredictor()
predictor.fit(regre_data, regre_target)

In [9]:
# Fit a RandomPredictor on the same data.
# NOTE(review): `random_predictor` is not referenced again in the visible part of this notebook.
random_predictor = RandomPredictor()
random_predictor.fit(regre_data, regre_target)

In [10]:
# Load the validation tracks; header=True takes the column names from the first CSV line.
# No schema is given, so values arrive as strings (see the sample record printed further down).
validation_track = sc.read.csv('datos/validation_birds_tracks.csv', header=True)

In [11]:
def add_utm_coordinates(row):
    """Return ``row`` as a dict extended with projected ``x`` and ``y`` coordinates.

    ``row`` must provide ``asDict()`` and key lookup, and carry ``Latitud`` /
    ``Longitud`` values that ``float()`` can parse. The first two values returned
    by ``utm.from_latlon`` are stored under ``x`` and ``y``; the other two are
    discarded. The input row itself is not modified.
    """
    enriched = row.asDict()
    easting, northing, _zone_number, _zone_letter = utm.from_latlon(
        float(row['Latitud']),
        float(row['Longitud']),
    )
    enriched.update(x=easting, y=northing)
    return enriched

def add_timestamp(row):
    """Add a ``timestamp`` entry to ``row`` (mutated in place) and return it.

    ``row['Fecha']`` is parsed as month/day/year and re-emitted as ``YYYY-MM-DD``,
    then joined with ``row['Hora']`` by a single space.
    """
    parsed_date = datetime.strptime(row['Fecha'], '%m/%d/%Y')
    iso_date = parsed_date.strftime('%Y-%m-%d')
    row['timestamp'] = '{} {}'.format(iso_date, row['Hora'])
    return row

In [12]:
# Switch to the RDD API and attach x / y to every record.
# NOTE: this rebinds `validation_track` from the DataFrame loaded above to an RDD of dicts,
# so re-running this cell on its own (without reloading the CSV) will not work.
validation_track = validation_track.rdd.map(add_utm_coordinates)

In [13]:
# Attach the combined 'timestamp' string (built from 'Fecha' and 'Hora') to every record.
validation_track = validation_track.map(add_timestamp)

In [14]:
# Peek at one enriched record to check the added x / y / timestamp fields.
validation_track.take(1)

[{'ID': '30',
  'Fecha': '1/19/2018',
  'Hora': '10:34:00',
  'Latitud': '-35.14944',
  'Longitud': '-57.39189',
  'Tipo': None,
  'x': 464304.4317412587,
  'y': 6110314.122618829,
  'timestamp': '2018-01-19 10:34:00'}]

In [15]:
# Bring every validation record to the driver as a plain Python list; add_validation
# scans this list for each reception.
validations_list = validation_track.collect()

In [16]:
# Load the bird receptions (JSON lines) and expose them as an RDD.
birds_recep = sc.read.json('datos/all-birds-data.jsonlines/').rdd

In [17]:
# Largest allowed gap, in seconds, between a reception and the validation point it is matched to.
MAX_TIME_DIFF_SEC = 30


def add_validation(recep, validations_list):
    """Attach the closest-in-time validation point of the same tag to a reception.

    Args:
        recep: record providing ``asDict()`` and key lookup, with a ``tag_id`` and
            a ``timestamp`` formatted as ``%Y-%m-%d %H:%M:%S``.
        validations_list: iterable of mappings with ``ID`` (convertible with
            ``int``), ``timestamp`` (same format), ``x`` and ``y``.

    Returns:
        A dict copy of ``recep`` extended with ``time_diff`` (reception time minus
        validation time, in seconds, signed) and the validation's ``x`` / ``y``;
        ``None`` when no validation of the same tag lies within
        ``MAX_TIME_DIFF_SEC`` seconds.

    When several validations of the same tag fall inside the window, the one with
    the smallest absolute time difference is used; on ties the earliest one in
    ``validations_list`` wins. Previously the first in-window entry was returned
    even if a later entry was closer in time.
    """
    recep_timestamp = datetime.strptime(recep['timestamp'], '%Y-%m-%d %H:%M:%S')
    recep_tag_id = recep['tag_id']

    best_val = None
    best_time_diff = None
    for val in validations_list:
        if int(val['ID']) != recep_tag_id:
            continue
        val_timestamp = datetime.strptime(val['timestamp'], '%Y-%m-%d %H:%M:%S')
        time_diff = (recep_timestamp - val_timestamp).total_seconds()
        if abs(time_diff) > MAX_TIME_DIFF_SEC:
            continue
        # Strict '<' keeps the earliest candidate when two are equally close.
        if best_time_diff is None or abs(time_diff) < abs(best_time_diff):
            best_val, best_time_diff = val, time_diff

    if best_val is None:
        return None

    recep_dict = recep.asDict()
    recep_dict['time_diff'] = best_time_diff
    recep_dict['x'] = best_val['x']
    recep_dict['y'] = best_val['y']
    return recep_dict
            



In [18]:
# Pair every reception with a validation point and drop the ones that found no match
# (add_validation returns None for those).
birds_recep_with_val = (
    birds_recep
    .map(lambda recep: add_validation(recep, validations_list))
    .filter(lambda matched: matched is not None)
)

In [19]:
# Collect the matched receptions to the driver and build a pandas DataFrame from the dicts.
birds_data = pd.DataFrame(birds_recep_with_val.collect())

In [20]:
# Move the reference coordinates to the local origin (X_0, Y_0) and add the unsigned
# time gap. NOTE: not idempotent -- re-running this cell shifts x / y a second time.
birds_data = birds_data.assign(
    x=birds_data['x'] - X_0,
    y=birds_data['y'] - Y_0,
    time_diff_abs=birds_data['time_diff'].abs(),
)

In [21]:
def get_min_time_diff(data):
    """Return the row of ``data`` with the smallest ``time_diff_abs``.

    Args:
        data: DataFrame with a numeric ``time_diff_abs`` column.

    Returns:
        The selected row as a Series. On ties the first such row (in row order)
        is returned. Null ``time_diff_abs`` values are ignored.

    Raises:
        ValueError: if ``data`` has no row with a non-null ``time_diff_abs``.

    The previous implementation compared against a hard-coded sentinel of 1000 and
    failed with ``UnboundLocalError`` when no row was below it.
    """
    # Positional index so the lookup is independent of the frame's own index labels.
    diffs = data['time_diff_abs'].reset_index(drop=True)
    if diffs.notna().sum() == 0:
        raise ValueError('get_min_time_diff needs at least one row with a non-null time_diff_abs')
    return data.iloc[diffs.idxmin()]


In [22]:
# Keep a single reception per (tag_id, x, y) validation point: the one selected by
# get_min_time_diff within each group. reset_index(drop=True) discards the group keys
# that apply() puts in the index.
birds_data = birds_data.groupby(['tag_id', 'x', 'y']).apply(get_min_time_diff).reset_index(drop=True)

In [23]:
# Predict a position for each matched reception from its four recep_* columns.
predictions = predictor.predict(birds_data[['recep_0', 'recep_1', 'recep_2', 'recep_3']])

In [24]:
# Column 0 of the prediction array is stored as predicted_x, column 1 as predicted_y.
birds_data['predicted_x'] = predictions[:,0]
birds_data['predicted_y'] = predictions[:,1]

In [25]:
def get_abs_error(row, suffix=''):
    """Euclidean distance between a row's reference point and one of its predictions.

    The reference point is ``(row['x'], row['y'])``; the prediction is read from
    ``predicted_x<suffix>`` / ``predicted_y<suffix>``, so ``suffix`` selects which
    pair of prediction columns is scored.
    """
    delta_x = row['x'] - row['predicted_x{}'.format(suffix)]
    delta_y = row['y'] - row['predicted_y{}'.format(suffix)]
    return math.sqrt(delta_x ** 2 + delta_y ** 2)

In [26]:
# Row-wise distance between the reference (x, y) and the raw (predicted_x, predicted_y).
birds_data['abs_error'] = birds_data.apply(get_abs_error, axis=1)

In [28]:
# Number of rows left after keeping one reception per (tag_id, x, y) group.
len(birds_data)

16

In [27]:
# Inspect the best-synchronised matches first (smallest |time_diff|); at most 50 rows.
birds_data.sort_values('time_diff_abs', ascending=True).head(n=50)

Unnamed: 0,recep_0,recep_1,recep_2,recep_3,tag_id,time_diff,timestamp,x,y,time_diff_abs,predicted_x,predicted_y,abs_error
4,0,0,34,0,21,0.0,2018-01-21 04:57:07,2242.602626,2358.764582,0.0,1830.391759,1571.521578,888.633415
10,0,47,0,0,32,0.0,2018-01-19 11:28:00,2097.198918,1570.796323,0.0,1896.741952,1308.144824,330.407029
12,31,0,0,88,32,0.0,2018-01-21 05:03:07,2395.186978,2242.912116,0.0,2501.365834,1808.472383,447.226823
3,0,46,0,0,21,-1.0,2018-01-19 11:27:59,2095.38148,1569.680188,1.0,2127.95586,1658.594341,94.693278
8,0,87,0,0,31,1.0,2018-01-19 10:34:01,1918.927958,1271.77108,1.0,1994.152324,1349.697652,108.310922
14,0,58,0,0,33,-1.0,2018-01-19 11:27:59,2097.198918,1570.796323,1.0,1986.346772,1198.996168,387.973651
0,0,0,86,0,17,-2.0,2018-01-21 04:50:17,2154.785181,2451.578715,2.0,1742.565456,2202.175821,481.795501
1,0,55,0,0,20,2.0,2018-01-19 11:28:02,2095.38148,1569.680188,2.0,1979.622716,1171.553595,414.614128
2,0,0,125,0,20,2.0,2018-01-21 04:57:09,2242.602626,2358.764582,2.0,1725.037577,2051.563528,601.868813
5,43,0,0,0,24,-2.0,2018-01-21 05:13:23,2572.30728,2141.569133,2.0,1160.032611,2099.089602,1412.913391


In [29]:
# Summary of the raw (unsmoothed) prediction error over the matched validation points.
print('mean absolute error: {}'.format(birds_data['abs_error'].mean()))
print('median absolute error: {}'.format(birds_data['abs_error'].median()))

mean absolute error: 487.65998244690326
median absolute error: 425.8284615121814


### Pruebo con las smooth predictions

In [169]:
# Distinct calendar days (the part of 'timestamp' before the first whitespace)
# covered by the validation tracks.
validation_days = {entry['timestamp'].split()[0] for entry in validations_list}

In [174]:
# Distinct bird ids, as ints, that appear in the validation tracks.
validation_birds = {int(entry['ID']) for entry in validations_list}

In [273]:
# Checkpoint the receptions of validated birds on validation days, ordered by timestamp.
# mode('overwrite') lets the cell be re-run: with the default save mode the write fails
# as soon as the checkpoint path already exists from a previous run.
(
    birds_recep
    .filter(lambda x: x['timestamp'].split()[0] in validation_days and x['tag_id'] in validation_birds)
    .sortBy(lambda x: x['timestamp'])
    .toDF()
    .write
    .mode('overwrite')
    .parquet('tmp/checkpoint-validation-predictors.parquet')
)

In [274]:
# Read the checkpoint back as a pandas DataFrame.
birds_recep_pd = pd.read_parquet('tmp/checkpoint-validation-predictors.parquet')

In [275]:
# Predict a position for every checkpointed reception.
all_predictions = predictor.predict(birds_recep_pd[['recep_0', 'recep_1', 'recep_2', 'recep_3']])
# NOTE: `all_predictions` is rebound here -- from the bare predictions to a 2-D array whose
# first columns are the predictions, followed by every column of birds_recep_pd.
all_predictions = pd.concat([pd.DataFrame(all_predictions), birds_recep_pd], axis=1).values

In [276]:
# Smooth the predictions with the project's Smoother.
# NOTE(review): the meaning of step_size=0 is not visible from this notebook -- confirm.
smoother = Smoother(step_size=0)
smooth_predictions = smoother.smooth_predictions(all_predictions)

In [277]:
# Label the smoother output: two predicted coordinates followed by the reception columns.
# Assumes the smoother keeps the column layout of the array it was given -- TODO confirm.
smooth_predictions_pd = pd.DataFrame(smooth_predictions, columns=['predicted_x', 'predicted_y'] + birds_recep_pd.columns.values.tolist())

In [278]:
# Left-join the smoothed predictions onto the validated receptions by (timestamp, tag_id).
# Columns present on both sides keep their name for the left frame and get a '_smooth'
# suffix for the right one.
smooth_predictions_validation = birds_data.merge(
    smooth_predictions_pd,
    how='left',
    on=['timestamp', 'tag_id'],
    suffixes=['', '_smooth'],
)

In [279]:
# Keep only the rows that found a smoothed prediction in the left join.
# .copy() makes the result an independent frame, so assigning new columns to it later
# does not trigger pandas' SettingWithCopyWarning.
smooth_predictions_validation = smooth_predictions_validation[
    smooth_predictions_validation['predicted_x_smooth'].notna()
].copy()

In [280]:
# Score the smoothed predictions. NOTE: this overwrites the 'abs_error' column carried over
# from birds_data (the raw-prediction error) with the error of the '_smooth' columns.
smooth_predictions_validation['abs_error'] = (
    smooth_predictions_validation
    .apply(lambda x: get_abs_error(x, suffix='_smooth'), axis=1)
)

In [281]:
# Inspect the best-synchronised matches first (smallest |time_diff|); at most 50 rows.
smooth_predictions_validation.sort_values('time_diff_abs', ascending=True).head(n=50)

Unnamed: 0,recep_0,recep_1,recep_2,recep_3,tag_id,time_diff,timestamp,x,y,predicted_x,predicted_y,abs_error,time_diff_abs,predicted_x_smooth,predicted_y_smooth,recep_0_smooth,recep_1_smooth,recep_2_smooth,recep_3_smooth
14,31,0,0,88,32,0.0,2018-01-21 05:03:07,2395.186978,2242.912116,2501.365834,1808.472383,433.270276,0.0,2582.8,1852.37,,,,
12,0,47,0,0,32,0.0,2018-01-19 11:28:00,2097.198918,1570.796323,1896.741952,1308.144824,318.675685,0.0,1989.38,1270.92,,,,
6,0,0,34,0,21,0.0,2018-01-21 04:57:07,2242.602626,2358.764582,1830.391759,1571.521578,449.967328,0.0,1864.84,2114.3,,,,
10,0,87,0,0,31,1.0,2018-01-19 10:34:01,1918.927958,1271.77108,1994.152324,1349.697652,153.721951,1.0,2029.5,1378.56,,,,
16,0,58,0,0,33,-1.0,2018-01-19 11:27:59,2097.198918,1570.796323,1986.346772,1198.996168,416.19577,1.0,1979.62,1171.55,,,,
5,0,46,0,0,21,-1.0,2018-01-19 11:27:59,2095.38148,1569.680188,2127.95586,1658.594341,433.127903,1.0,2161.26,1141.59,,,,
0,0,0,86,0,17,-2.0,2018-01-21 04:50:17,2154.785181,2451.578715,1742.565456,2202.175821,654.454492,2.0,1760.4,1929.3,,,,
8,0,0,161,0,27,2.0,2018-01-21 04:49:37,2150.21727,2454.887892,1997.860435,2351.173469,279.295161,2.0,1961.11,2249.35,,,,
3,0,0,125,0,20,2.0,2018-01-21 04:57:09,2242.602626,2358.764582,1725.037577,2051.563528,601.868813,2.0,1725.04,2051.56,,,,
2,0,55,0,0,20,2.0,2018-01-19 11:28:02,2095.38148,1569.680188,1979.622716,1171.553595,411.997285,2.0,1989.38,1171.55,,,,


In [285]:
# Summary of the smoothed-prediction error, for comparison with the raw figures printed earlier.
print('mean absolute error: {}'.format(smooth_predictions_validation['abs_error'].mean()))
print('median absolute error: {}'.format(smooth_predictions_validation['abs_error'].median()))

mean absolute error: 436.5598298413461
median absolute error: 414.09652721201667
