### Grid search para encontrar los mejores hiperparametros del smooth.
Los mejores resultados van a ser aquellos que minimicen la cantidad de predicciones que caen dentro de la grilla cuando los pajaros se supone que estan durmiendo

In [1]:
%matplotlib notebook

In [2]:
import pyspark
sc = pyspark.sql.SparkSession.Builder().getOrCreate()

In [3]:
import json
import numpy as np
import pandas as pd
import seaborn as sn

import matplotlib.pyplot as plt
from IPython.display import display, HTML
import math
import time
import matplotlib.animation as animation
from datetime import datetime, timedelta
import pyspark.sql.functions as F

from utils.emissions_normalizer import EmissionsNormalizer
from utils.knn_predictor import KnnPredictor
from utils.smoother import Smoother
from utils.random_predictor import RandomPredictor

from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

In [4]:
ANTENNAS_POSITION = [(464259.981343845,6110331.85100085),(463512.015195402,6111004.324434620),(464443.295130103,6111377.26171875),(464629.562194595,6111105.34734669)]
ANTENNAS_NAMES = ["RC1", "RC2", "D1", "D2"] 
X_0 = 462385.503783397
Y_0 = 6109042.35153865

for i in range(4):
    ANTENNAS_POSITION[i] = (ANTENNAS_POSITION[i][0]-X_0, ANTENNAS_POSITION[i][1]-Y_0)

GRID = Polygon(ANTENNAS_POSITION)

In [5]:
FEMALES = [11, 15, 17, 20, 24, 26, 28, 30, 33, 34]
MALES = [10, 14, 16, 21, 22, 23, 25, 27, 29, 31, 32]

## Entreno el modelo que vamos a usar para predecir

In [6]:
points_recep = sc.read.json('datos/train-test-by-emission.jsonlines/').rdd

In [7]:
normalizer = EmissionsNormalizer()
data = normalizer.normalize(points_recep)
regre_data, regre_target = normalizer.get_regression_dataframes(data)

In [8]:
predictor = KnnPredictor()
predictor.fit(regre_data, regre_target)

## Levanto el archivo con las emisiones de los pajaros

In [None]:
birds_recep = sc.read.json('datos/all-birds-data.jsonlines/').rdd
birds_recep = birds_recep.sortBy(lambda x: x['timestamp'])

### Me quedo solo con pajaros etiquetados

In [None]:
birds_recep = birds_recep.filter(lambda x: x['tag_id'] in MALES+FEMALES)

### Obtengo cual es el dia inicial en el que todos los pajaros tienen chip

In [None]:
start_date = '2018-01-10' ## Fecha que me pasó Romina

### Obtengo cual es el dia final en el que todos los pajaros tienen chip

In [None]:
end_date = '2018-01-26'

### Filtro a partir de las fechas obtenida

In [None]:
birds_recep = birds_recep.filter(lambda x: x['timestamp'] >= start_date and  x['timestamp'] < end_date)

In [None]:
birds_recep.count()

### Filtro en solo los horarios que nos interesa analizar y hago un checkpoint en disco

In [None]:
START_TIME = '04:00:00'
END_TIME = '21:00:00'
birds_data_complete_df = birds_recep.filter(lambda x: not (START_TIME <= x['timestamp'].split(' ')[1] < END_TIME)).toDF()
birds_data_complete_df.write.parquet('tmp/checkpoint-sgs.parquet')



### Levanto el chekpoint en pandas

In [9]:
birds_data_complete = pd.read_parquet('tmp/checkpoint-sgs.parquet')
birds_data = birds_data_complete.drop(columns=['tag_id','timestamp'])

### Obtengo las predicciones

In [10]:
predictions = predictor.predict(birds_data)
predictions = pd.concat([pd.DataFrame(predictions), pd.DataFrame(birds_data_complete)], axis=1).values

In [32]:
aux = pd.DataFrame(predictions)

In [41]:
aux[7].apply(lambda x: (x.split(' ')[1]))

0         00:00:00
1         00:00:00
2         00:00:02
3         00:00:02
4         00:00:03
5         00:00:04
6         00:00:05
7         00:00:07
8         00:00:07
9         00:00:08
10        00:00:10
11        00:00:10
12        00:00:12
13        00:00:12
14        00:00:14
15        00:00:15
16        00:00:17
17        00:00:17
18        00:00:18
19        00:00:19
20        00:00:20
21        00:00:20
22        00:00:22
23        00:00:23
24        00:00:25
25        00:00:25
26        00:00:27
27        00:00:30
28        00:00:32
29        00:00:32
            ...   
416426    23:59:34
416427    23:59:35
416428    23:59:36
416429    23:59:37
416430    23:59:38
416431    23:59:38
416432    23:59:39
416433    23:59:40
416434    23:59:41
416435    23:59:42
416436    23:59:44
416437    23:59:44
416438    23:59:44
416439    23:59:46
416440    23:59:46
416441    23:59:47
416442    23:59:49
416443    23:59:49
416444    23:59:49
416445    23:59:50
416446    23:59:51
416447    23

In [43]:
def get_night_number(row):
    night_number = int(row.split(' ')[0].split('-')[2])-10
    if int(row.split(' ')[1].split(':')[0]) > 19:
        night_number +=1
    return str(night_number)
    
aux['night_number'] = aux[7].apply(get_night_number)



In [62]:
aux = aux[aux[7] >= '2018-01-10 18:00:00']

In [77]:
def set_in_grid(prediction):
    point = Point(prediction[0], prediction[1])
    return 1 if GRID.contains(point) else 0

aux['in_grid'] = aux.apply(set_in_grid, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [125]:
res = aux.groupby([6, 'night_number']).mean()

In [126]:
res

Unnamed: 0_level_0,Unnamed: 1_level_0,in_grid,has_good_predictions
6,night_number,Unnamed: 2_level_1,Unnamed: 3_level_1
10,3,1.000000,False
14,1,0.347826,False
14,10,0.371354,False
14,13,0.394643,False
14,14,0.467121,False
14,15,0.213448,False
14,16,0.415622,False
14,4,0.297189,False
14,5,0.377476,False
14,6,0.271495,False


In [94]:
good_res = res[res.in_grid < 0.1]

In [97]:
good_res_list = good_res.index.values

In [104]:
good_res_list

array([(15, '6'), (17, '16'), (20, '16'), (20, '9'), (21, '10'),
       (23, '4'), (23, '6'), (23, '8'), (24, '10'), (24, '3'), (24, '6'),
       (26, '2'), (28, '13'), (28, '14'), (29, '12'), (29, '2'),
       (29, '3'), (31, '5'), (32, '11'), (32, '12'), (32, '13'),
       (32, '16'), (32, '5'), (32, '6'), (32, '8'), (32, '9'), (33, '10'),
       (33, '11'), (33, '12'), (33, '4'), (33, '5'), (33, '8'),
       (34, '12')], dtype=object)

In [106]:
def has_good_predictions(row):
    return any([True for r in good_res_list if r[0] == row[6] and r[1] == row['night_number']])
    
aux['has_good_predictions'] = aux.apply(has_good_predictions, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [107]:
finallyl_pred = aux[aux.has_good_predictions]

In [114]:
finallyl_pred = finallyl_pred.drop(columns=['night_number', 'in_grid', 'has_good_predictions'])

In [122]:
finallyl_pred = finallyl_pred.values

### Grid smoother

In [123]:
WINDOWS_SIZE_PARAMS = [30, 60, 90, 120, 180, 240, 300]
STEP_SIZE_PARAMS = [10, 30, 60, 90, 120]
USE_MEDIAN_PARAMS = [True, False]

res = []
for windows_size in WINDOWS_SIZE_PARAMS:
    for step_size in STEP_SIZE_PARAMS:
        for use_median in USE_MEDIAN_PARAMS:
            if step_size > windows_size:
                continue
            smoother = Smoother(windows_size=windows_size, step_size=step_size, use_median=use_median)
            smooth_predictions = smoother.smooth_predictions(finallyl_pred)
            
            count_in_grid = 0
            for i in range(len(smooth_predictions)):
                prediction = smooth_predictions[i]
                point = Point(prediction[0], prediction[1])
                if GRID.contains(point):
                    count_in_grid += 1

            rate_in_grid = count_in_grid/float(len(smooth_predictions))
            res.append({
                'windows_size': windows_size,
                'step_size': step_size,
                'use_median': use_median,
                'rate': rate_in_grid,
            })
            print(res[-1])
            

{'windows_size': 30, 'step_size': 10, 'use_median': True, 'rate': 0.028219252302690987}
{'windows_size': 30, 'step_size': 10, 'use_median': False, 'rate': 0.029980133646378904}
{'windows_size': 30, 'step_size': 30, 'use_median': True, 'rate': 0.027902341803687097}
{'windows_size': 30, 'step_size': 30, 'use_median': False, 'rate': 0.02979571499750872}
{'windows_size': 60, 'step_size': 10, 'use_median': True, 'rate': 0.027496839443742097}
{'windows_size': 60, 'step_size': 10, 'use_median': False, 'rate': 0.029077117572692796}
{'windows_size': 60, 'step_size': 30, 'use_median': True, 'rate': 0.027105132037867463}
{'windows_size': 60, 'step_size': 30, 'use_median': False, 'rate': 0.02929745889387145}
{'windows_size': 60, 'step_size': 60, 'use_median': True, 'rate': 0.026619343389529725}
{'windows_size': 60, 'step_size': 60, 'use_median': False, 'rate': 0.027684117125110912}
{'windows_size': 90, 'step_size': 10, 'use_median': True, 'rate': 0.027000180603214737}
{'windows_size': 90, 'step_si

array([1962.476874132086, 1697.4160601037581, 48, 48, 0, 0, 24,
       '2018-01-10 00:00:05'], dtype=object)

In [20]:
smoother = Smoother(windows_size=60, step_size=10, use_median=True)
smooth_predictions = smoother.smooth_predictions(predictions)


    

TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'

In [21]:

count = {}
total = {}
for i in range(len(smooth_predictions)):
    prediction = smooth_predictions[i]
    bird_id = prediction[6]
    point = Point(prediction[0], prediction[1])
    if GRID.contains(point):
        count[bird_id] = count.get(bird_id, 0)+1
    total[bird_id] = total.get(bird_id, 0)+1


KeyError: 11

In [27]:
res = []
for b in FEMALES+MALES:
    res.append({
        'bird': b,
        'total': total.get(b,0),
        'in_grid': count.get(b,0),
        'rate': count.get(b,0) / total.get(b,1)
    })
res_pd = pd.DataFrame(res)

In [28]:
res_pd

Unnamed: 0,bird,in_grid,rate,total
0,11,0,0.0,0
1,15,4,0.25,16
2,17,11799,0.75912,15543
3,20,15050,0.611615,24607
4,24,13360,0.477791,27962
5,26,2,0.125,16
6,28,3696,0.578766,6386
7,30,0,0.0,0
8,33,2752,0.204594,13451
9,34,2652,0.651757,4069


### Grid filtro por smooth

In [None]:
WINDOW_THRESHOLD_PARAMS = [50, 100, 200, 300, 400, 500, 600, 800, 1000, 1500]
USE_MEDIAN_PARAMS = [True, False]
