In [1]:
%matplotlib notebook

In [2]:
import pyspark
sc = pyspark.sql.SparkSession.Builder().getOrCreate()

In [3]:
import json
import numpy as np
import pandas as pd
import seaborn as sn

import matplotlib.pyplot as plt
from IPython.display import display, HTML
import math
import time
import matplotlib.animation as animation
from datetime import datetime, timedelta
import pyspark.sql.functions as F

from utils.emissions_normalizer import EmissionsNormalizer
from utils.knn_predictor import KnnPredictor
from utils.smoother import Smoother
from utils.random_predictor import RandomPredictor

from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

In [4]:
ANTENNAS_POSITION = [(464259.981343845,6110331.85100085),(463512.015195402,6111004.324434620),(464443.295130103,6111377.26171875),(464629.562194595,6111105.34734669)]
ANTENNAS_NAMES = ["RC1", "RC2", "D1", "D2"] 
X_0 = 462385.503783397
Y_0 = 6109042.35153865

for i in range(4):
    ANTENNAS_POSITION[i] = (ANTENNAS_POSITION[i][0]-X_0, ANTENNAS_POSITION[i][1]-Y_0)

GRID = Polygon(ANTENNAS_POSITION)

In [5]:

FEMALES = [11, 17, 20, 24, 26, 28, 30, 33, 34]
MALES = [10, 14, 21, 22, 23, 25, 27, 29, 31, 32]

## Entreno el modelo que vamos a usar para predecir

In [6]:
points_recep = sc.read.json('datos/train-test-by-emission.jsonlines/').rdd

In [7]:
normalizer = EmissionsNormalizer()
data = normalizer.normalize(points_recep)
regre_data, regre_target = normalizer.get_regression_dataframes(data)

In [8]:
print(regre_target.head())
print(regre_data.head())

             0            1
0  2854.782012  2192.894689
1  2854.782012  2192.894689
2  2854.782012  2192.894689
3  2854.782012  2192.894689
4  2854.782012  2192.894689
   antenna_0  antenna_1  antenna_2  antenna_3
0          0          0          0         29
1          0          0          0         70
2          0          0          0         28
3          0          0          0         58
4          0          0          0         81


In [9]:
predictor = KnnPredictor()
predictor.fit(regre_data, regre_target)

## Levanto el archivo con las emisiones de los pajaros

In [10]:
birds_recep = sc.read.json('datos/all-birds-data.jsonlines/').rdd
birds_recep = birds_recep.sortBy(lambda x: x['timestamp'])

KeyboardInterrupt: 

### Me quedo solo con pajaros etiquetados

In [11]:
birds_recep = birds_recep.filter(lambda x: x['tag_id'] in MALES+FEMALES)

### Obtengo cual es el dia inicial en el que todos los pajaros tienen chip

In [12]:
#start_date = '2018-01-03'
start_date = '2018-01-10' ## Fecha que me pasó Romina

### Obtengo cual es el dia final en el que todos los pajaros tienen chip

In [13]:
end_date = '2018-01-26'
#end_date = '2018-02-08'

### Filtro a partir de las fechas obtenida

In [14]:
birds_recep = birds_recep.filter(lambda x: x['timestamp'] >= start_date and  x['timestamp'] < end_date)

In [15]:
birds_recep.count()

1069222

### Filtro en solo los horarios que nos interesa analizar y hago un checkpoint en disco

In [16]:
START_TIME = '07:00:00'
END_TIME = '19:00:00'
birds_data_complete_df = birds_recep.filter(lambda x: START_TIME <= x['timestamp'].split(' ')[1] < END_TIME).toDF()
birds_data_complete_df.write.parquet('tmp/checkpoint-cog-7-7.parquet')



### Levanto el chekpoint en pandas

In [10]:
birds_data_complete = pd.read_parquet('tmp/checkpoint-cog-7-7.parquet')
birds_data = birds_data_complete.drop(columns=['tag_id','timestamp'])

### Obtengo las predicciones

In [11]:
predictions = predictor.predict(birds_data)
predictions = pd.concat([pd.DataFrame(predictions), pd.DataFrame(birds_data_complete)], axis=1).values

### Imprimo matriz de confusion

In [12]:
def show_confusion_matrices(matches_per_female):
    matrix_res = [[]] * len(FEMALES)
    for i in range(len(FEMALES)):
        matrix_res[i] = []
        for m in MALES:
            matrix_res[i].append(matches_per_female.get(FEMALES[i]).get(m, 0))
         
    plt.figure(figsize=(10,15))
    
    plt.subplot(3, 1, 1)
    plt.title('Birds matches')
    matrix_res_pd = pd.DataFrame(matrix_res)
    matrix_res_pd.columns = MALES
    matrix_res_pd.rename(index={i:FEMALES[i] for i in range(len(FEMALES))}, inplace=True)
    yield matrix_res_pd
    ax = sn.heatmap(matrix_res_pd, annot=True, fmt='g', xticklabels=True, yticklabels=True, vmin=0)
    ax.set(xlabel='Males', ylabel='Females')



    ### Esto muestra, para cada hembra, que porcentaje del tiempo que los otros machos estuvieron en la grilla 
    ### con cualquier hembra estuvieron juntos
    plt.subplot(3, 1, 2)
    plt.title('Male time distribution')
    matrix_res_pd = pd.DataFrame(matrix_res)
    matrix_res_pd.columns = MALES
    matrix_res_pd.rename(index={i:FEMALES[i] for i in range(len(FEMALES))}, inplace=True)
    matrix_res_pd = (matrix_res_pd/matrix_res_pd.sum()*100).round(2)
    matrix_res_pd = matrix_res_pd.fillna(0)
    ax = sn.heatmap(matrix_res_pd, annot=True, fmt='g', xticklabels=True, yticklabels=True, vmin=0, vmax=100)
    ax.set(xlabel='Males', ylabel='Females')


    ### Igual, pero con las hembras
    plt.subplot(3, 1, 3)
    plt.title('Female time distribution')
    matrix_res_pd = pd.DataFrame(matrix_res)
    matrix_res_pd.columns = MALES
    matrix_res_pd.rename(index={i:FEMALES[i] for i in range(len(FEMALES))}, inplace=True)
    matrix_res_pd = (matrix_res_pd.div(matrix_res_pd.sum(axis=1), axis=0)*100).round(2)
    matrix_res_pd = matrix_res_pd.fillna(0)
    ax = sn.heatmap(matrix_res_pd, annot=True, fmt='g', xticklabels=True, yticklabels=True, vmin=0, vmax=100)
    ax.set(xlabel='Males', ylabel='Females')



## Analizo coocurrencia en regiones uniformes

### Calculo en que region cayo la prediccion

In [13]:
X_REGION_SIZE = 300
Y_REGION_SIZE = 300


def set_prediction_in_regions_classification(predictions):
    grid_predictions = []
    for i in range(len(predictions)):
        prediction = predictions[i]
        point = Point(prediction[0], prediction[1])

        region = '{}-{}'.format(int(prediction[0]/X_REGION_SIZE), int(prediction[1]/Y_REGION_SIZE))

        grid_pred = np.insert(prediction, 8,region, axis=0)
        grid_predictions.append(grid_pred)
    return pd.DataFrame(grid_predictions, columns=['x', 'y', 'recep_0', 'recep_1', 'recep_2', 'recep_3', 'tag', 'time', 'region'])


### Obtengo el tiempo que cada posible pareja estuvo junta

In [14]:
def get_bird_timestamp_in_regions(bird_data, delta_time_in_sec):
    """
    delta_time_in_sec es la cantidad de segundos que decimos que el pajaro se encuentra en un mismo punto dada una prediccion
    Por default, el valor es 5, ya que las emisiones son cada 5 segundos.
    """
    timestamp_in_grid = []
    for row in zip(bird_data.time, bird_data.region):
        real_time = datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S')
        for i in range(-delta_time_in_sec, delta_time_in_sec+1):
            res=real_time+timedelta(seconds=i)
            timestamp_in_grid.append((row[1], datetime.strftime(res, '%Y-%m-%d %H:%M:%S')))

    timestamp_in_grid = list(set(timestamp_in_grid))
    return pd.DataFrame(timestamp_in_grid, columns=['region','time'])


def get_matches_per_bird_in_regions(grid_predictions, delta_time):
    male_birds_in_grid = grid_predictions[(grid_predictions.tag.isin(MALES)) & (grid_predictions.region.notnull())]

    matches_per_bird_near_antenna = {}
    for female_bird in FEMALES:
        female_bird_in_grid = grid_predictions[(grid_predictions['tag'] == female_bird) & (grid_predictions.region.notnull())]
        timestamp_in_grid_pd = get_bird_timestamp_in_regions(female_bird_in_grid, delta_time)

        male_in_grid_with_female = male_birds_in_grid.join(timestamp_in_grid_pd.set_index(['region','time']), on=['region','time'], how='inner')

        female_matches = {}
        for index, row in male_in_grid_with_female.iterrows():
            tag = row['tag']
            current_matches = female_matches.get(tag, 0)
            female_matches[tag] = current_matches+1

        matches_per_bird_near_antenna[female_bird] = female_matches
    return matches_per_bird_near_antenna



In [None]:
#### new version
def get_bird_timestamp_in_regions(bird_data, delta_time_in_sec):
    """
    delta_time_in_sec es la cantidad de segundos que decimos que el pajaro se encuentra en un mismo punto dada una prediccion
    Por default, el valor es 5, ya que las emisiones son cada 5 segundos.
    """
    timestamp_in_grid = []
    for row in zip(bird_data.time, bird_data.region):
        real_time = datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S')
        for i in range(-delta_time_in_sec, delta_time_in_sec+1):
            res=real_time+timedelta(seconds=i)
            timestamp_in_grid.append((row[1], datetime.strftime(res, '%Y-%m-%d %H:%M:%S')))

    timestamp_in_grid = list(set(timestamp_in_grid))
    return pd.DataFrame(timestamp_in_grid, columns=['region','time'])


def get_matches_per_bird_in_regions(grid_predictions, delta_time):
    male_birds_in_grid = grid_predictions[(grid_predictions.tag.isin(MALES)) & (grid_predictions.region.notnull())]

    matches_per_bird_near_antenna = {}
    for female_bird in FEMALES:
        female_bird_in_grid = grid_predictions[(grid_predictions['tag'] == female_bird) & (grid_predictions.region.notnull())]
        timestamp_in_grid_pd = get_bird_timestamp_in_regions(female_bird_in_grid, delta_time)

        male_in_grid_with_female = male_birds_in_grid.join(timestamp_in_grid_pd.set_index(['region','time']), on=['region','time'], how='inner')

        female_matches = {}
        for index, row in male_in_grid_with_female.iterrows():
            tag = row['tag']
            current_matches = female_matches.get(tag, 0)
            female_matches[tag] = current_matches+1

        matches_per_bird_near_antenna[female_bird] = female_matches
    return matches_per_bird_near_antenna



In [15]:
def evaluate_in_regions_coocurrence(predictions, delta_time=5):
    predictions = set_prediction_in_regions_classification(predictions)
    matches_per_bird = get_matches_per_bird_in_regions(predictions, delta_time)
    show_confusion_matrices(matches_per_bird)

In [16]:
predictions = set_prediction_in_regions_classification(predictions)
predictions['hour'] = predictions.time.str.split(' ').str[1].str.split(':').str[0]

In [98]:
## new version
predictions['day'] = predictions.time.str.split(' ').str[0].str.split('-').str[2].astype(int)
predictions['hour'] = predictions.time.str.split(' ').str[1].str.split(':').str[0].astype(int)
predictions['minute'] = predictions.time.str.split(' ').str[1].str.split(':').str[1].astype(int)
predictions['second'] = predictions.time.str.split(' ').str[1].str.split(':').str[2].astype(int).div(20).astype(int)
predictions.head()

Unnamed: 0,x,y,recep_0,recep_1,recep_2,recep_3,tag,time,region,hour,minute,second,day
0,1452.50967,1809.458557,0,0,33,0,28,2018-01-10 07:00:01,4-6,7,0,0,10
1,1523.935798,1922.755656,30,0,0,0,24,2018-01-10 07:00:03,5-6,7,0,0,10
2,1760.400649,1879.44027,0,0,64,0,28,2018-01-10 07:00:06,5-6,7,0,0,10
3,1842.038246,2096.418386,0,0,41,0,28,2018-01-10 07:00:11,6-6,7,0,0,10
4,1712.292779,1920.263219,33,0,0,0,24,2018-01-10 07:00:13,5-6,7,0,0,10


In [99]:
males = predictions[predictions.tag.isin(MALES)][['tag','region','day','hour', 'minute', 'second']]
females = predictions[predictions.tag.isin(FEMALES)][['tag','region','day','hour', 'minute', 'second']]

In [100]:
coocurrence = pd.merge(females, males, on=['region', 'day','hour','minute', 'second'])


In [101]:
coocurrence_u = coocurrence.groupby(['tag_x', 'tag_y','day','hour','minute', 'second']).apply(lambda x: x.iloc[0]).reset_index(drop=True)

In [102]:
coocurrence_dict = coocurrence_u.groupby(['tag_x', 'tag_y']).size().to_dict()

In [103]:
matrix_res = [[]] * len(FEMALES)
for i in range(len(FEMALES)):
    matrix_res[i] = []
    for m in MALES:
        matrix_res[i].append(coocurrence_dict.get((FEMALES[i],m),0))


In [104]:
matrix_res_pd = pd.DataFrame(matrix_res)
matrix_res_pd.columns = MALES
matrix_res_pd.rename(index={i:FEMALES[i] for i in range(len(FEMALES))}, inplace=True)

In [106]:
matrix_res_pd

Unnamed: 0,10,14,21,22,23,25,27,29,31,32
11,464,23,107,2,16,2,206,173,59,82
17,81,664,577,420,80,201,342,442,943,660
20,190,679,4067,335,203,98,672,616,869,1507
24,149,355,637,239,938,24,582,572,854,1168
26,9,231,146,199,22,637,40,89,63,58
28,71,373,249,147,37,21,92,120,238,95
30,72,568,322,293,100,134,214,195,1701,317
33,99,176,785,110,147,29,678,813,582,6397
34,136,244,393,121,65,47,674,3349,153,680


In [None]:
#### end new

In [107]:
#### start new v2 full

In [109]:
f_example = predictions[predictions.tag == 23]

In [142]:
window = 20
delta = int(window/2)
emisions_by_second = []
for row in f_example[['tag', 'time','region']].iterrows():
    row = row[1]
    real_time = datetime.strptime(row[1], '%Y-%m-%d %H:%M:%S')
    for i in range(-delta, delta+1):
        if i == 0:
            emisions_by_second.append((row[0],row[1],row[2], 'real'))
            continue
        res=real_time+timedelta(seconds=i)
        emisions_by_second.append((row[0],datetime.strftime(res, '%Y-%m-%d %H:%M:%S'),row[2], 'fake'))

In [148]:
emisions_by_second_pd = pd.DataFrame(emisions_by_second, columns=['tag', 'time','region', 'type'])

In [152]:
emisions_by_second_pd = emisions_by_second_pd.drop_duplicates()

In [159]:
emisions_by_second_unique_pd = emisions_by_second_pd.groupby(['tag', 'time']).apply(lambda x: x.sort_values(by='type', ascending=False).iloc[0]).reset_index(drop=True)

In [167]:
len(emisions_by_second_pd), len(emisions_by_second_unique_pd)

(90535, 43644)

In [168]:
emisions_by_second_unique_pd.head()

Unnamed: 0,tag,time,region,type
0,23,2018-01-10 07:22:18,5-7,fake
1,23,2018-01-10 07:22:19,5-7,fake
2,23,2018-01-10 07:22:20,5-7,fake
3,23,2018-01-10 07:22:21,5-7,fake
4,23,2018-01-10 07:22:22,5-7,fake


In [131]:
predictions.head()

Unnamed: 0,x,y,recep_0,recep_1,recep_2,recep_3,tag,time,region,hour,minute,second,day
0,1452.50967,1809.458557,0,0,33,0,28,2018-01-10 07:00:01,4-6,7,0,0,10
1,1523.935798,1922.755656,30,0,0,0,24,2018-01-10 07:00:03,5-6,7,0,0,10
2,1760.400649,1879.44027,0,0,64,0,28,2018-01-10 07:00:06,5-6,7,0,0,10
3,1842.038246,2096.418386,0,0,41,0,28,2018-01-10 07:00:11,6-6,7,0,0,10
4,1712.292779,1920.263219,33,0,0,0,24,2018-01-10 07:00:13,5-6,7,0,0,10


Index(['x', 'y', 'recep_0', 'recep_1', 'recep_2', 'recep_3', 'tag', 'time',
       'region', 'hour', 'minute', 'second', 'day'],
      dtype='object')

In [None]:
#### end new v2 full

In [49]:
for time_range in range(4):
    print(7+time_range*3,7+time_range*3+2)
    predictions_in_time_range = predictions[predictions.hour.astype(int).isin(range(7+time_range*3,7+time_range*3+2))]
    matches_per_bird = get_matches_per_bird_in_regions(predictions_in_time_range, 5)
    matrix_res = [[]] * len(FEMALES)
    for i in range(len(FEMALES)):
        matrix_res[i] = []
        for m in MALES:
            matrix_res[i].append(matches_per_bird.get(FEMALES[i]).get(m, 0))
    matrix_res_pd = pd.DataFrame(matrix_res)
    matrix_res_pd.columns = MALES
    matrix_res_pd.rename(index={i:FEMALES[i] for i in range(len(FEMALES))}, inplace=True)
    matrix_res_pd.to_csv(f'matrix_coocurrence_by_hour-{7+time_range*3}-{7+time_range*3+2}.csv')
    display(matrix_res_pd)


7 9


Unnamed: 0,10,14,21,22,23,25,27,29,31,32
11,1,0,0,0,0,0,0,0,0,0
17,0,177,113,223,5,8,114,71,29,69
20,47,192,965,132,154,7,153,131,46,334
24,2,128,191,59,455,10,132,147,13,212
26,2,17,7,13,0,54,3,3,4,4
28,0,57,100,15,6,5,28,51,9,33
30,0,167,63,76,16,3,50,91,84,26
33,40,52,160,29,76,3,236,156,15,1594
34,3,148,115,50,4,2,181,1100,18,134


10 12


Unnamed: 0,10,14,21,22,23,25,27,29,31,32
11,75,1,0,0,2,0,7,5,6,25
17,0,169,110,104,19,73,165,209,403,224
20,26,125,1022,84,18,30,146,95,308,316
24,26,49,79,47,172,0,162,79,319,256
26,0,64,23,54,23,235,7,27,11,28
28,0,0,0,0,0,0,0,0,0,0
30,2,98,105,83,11,49,60,39,618,152
33,0,19,151,59,3,14,111,143,274,1806
34,6,12,92,6,9,0,257,1071,19,184


13 15


Unnamed: 0,10,14,21,22,23,25,27,29,31,32
11,235,0,0,0,0,0,262,93,1,27
17,7,30,100,10,15,9,35,1,608,225
20,1,189,697,4,69,2,108,44,296,252
24,4,80,128,42,262,0,209,17,429,275
26,0,8,4,9,0,20,0,0,8,5
28,0,63,19,1,7,0,30,10,59,8
30,0,120,77,15,39,2,62,3,1119,127
33,20,32,83,1,13,3,185,219,234,1555
34,33,2,23,0,16,0,193,601,41,182


16 18


Unnamed: 0,10,14,21,22,23,25,27,29,31,32
11,225,3,44,0,0,1,56,72,17,33
17,22,89,143,50,3,12,130,109,58,110
20,68,46,1550,36,4,21,175,318,100,310
24,23,18,58,19,81,2,157,112,62,151
26,10,16,55,48,0,172,2,31,19,11
28,12,72,45,44,4,5,4,12,100,23
30,65,77,62,88,7,49,29,9,278,18
33,44,34,323,16,23,5,64,251,117,1826
34,61,26,186,29,4,9,201,1479,16,194


In [20]:
print('All times')
predictions_in_time_range = predictions
matches_per_bird = get_matches_per_bird_in_regions(predictions_in_time_range, 5)
matrix_res = [[]] * len(FEMALES)
for i in range(len(FEMALES)):
    matrix_res[i] = []
    for m in MALES:
        matrix_res[i].append(matches_per_bird.get(FEMALES[i]).get(m, 0))
matrix_res_pd = pd.DataFrame(matrix_res)
matrix_res_pd.columns = MALES
matrix_res_pd.rename(index={i:FEMALES[i] for i in range(len(FEMALES))}, inplace=True)
matrix_res_pd.to_csv(f'matrix_coocurrence_by_hour-7-18.csv')
display(matrix_res_pd)


All times


Unnamed: 0,10,14,21,22,23,25,27,29,31,32
11,804,18,153,0,26,4,374,242,69,115
17,99,698,783,501,88,214,564,590,1555,774
20,295,759,6525,355,302,100,855,848,1223,1901
24,228,384,840,265,1420,20,936,687,1396,1519
26,14,219,171,227,32,823,29,114,68,58
28,99,459,316,160,41,21,85,121,294,102
30,92,697,397,366,131,166,262,241,3367,501
33,123,202,1105,129,206,28,872,1213,924,10168
34,184,291,530,128,106,56,1053,6152,172,951
