In [1]:
%matplotlib notebook

In [2]:
import pyspark
# Get (or lazily create) the Spark session used to read the training data.
# NOTE(review): despite the name `sc`, this object is a SparkSession, not a SparkContext.
sc = pyspark.sql.SparkSession.Builder().getOrCreate()

In [18]:
import json
import numpy as np
import pandas as pd
import seaborn as sn

import matplotlib.pyplot as plt
from IPython.display import display, HTML
import math
import time
import random
from numpy.random import choice

import matplotlib.animation as animation
from datetime import datetime, timedelta
import pyspark.sql.functions as F

from utils.emissions_normalizer import EmissionsNormalizer
from utils.knn_predictor import KnnPredictor
from utils.smoother import Smoother
from utils.random_predictor import RandomPredictor

from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

In [19]:
# Origin that is subtracted from every antenna coordinate.
X_0 = 462385.503783397
Y_0 = 6109042.35153865

# NOTE(review): names presumably follow the same order as ANTENNAS_POSITION -- confirm.
ANTENNAS_NAMES = ["RC1", "RC2", "D1", "D2"]

# Antenna coordinates expressed relative to (X_0, Y_0).
ANTENNAS_POSITION = [
    (x - X_0, y - Y_0)
    for x, y in (
        (464259.981343845, 6110331.85100085),
        (463512.015195402, 6111004.324434620),
        (464443.295130103, 6111377.26171875),
        (464629.562194595, 6111105.34734669),
    )
]

# Polygon whose vertices are the shifted antenna positions, in list order.
GRID = Polygon(ANTENNAS_POSITION)

In [4]:
# Tag ids of the female birds.
FEMALES = [11, 15, 17, 20, 24, 26, 28, 30, 33, 34]
# Tag ids of the male birds.
MALES = [10, 14, 16, 21, 22, 23, 25, 27, 29, 31, 32]

## Entreno el modelo que vamos a usar para predecir

In [5]:
# Read the JSON-lines training data through Spark and keep it as an RDD.
points_recep = sc.read.json('datos/train-test-by-emission.jsonlines/').rdd

In [6]:
# Normalize the emissions and derive the two frames passed to the predictors' fit().
# NOTE(review): presumably `regre_data` holds the features and `regre_target` the known positions -- confirm.
normalizer = EmissionsNormalizer()
data = normalizer.normalize(points_recep)
regre_data, regre_target = normalizer.get_regression_dataframes(data)

In [7]:
# Fit the KNN predictor that is later applied to the birds' emissions.
predictor = KnnPredictor()
predictor.fit(regre_data, regre_target)

In [8]:
# Fit a RandomPredictor on the same training frames.
# NOTE(review): `random_predictor` is not referenced again in the visible part of the notebook.
random_predictor = RandomPredictor()
random_predictor.fit(regre_data, regre_target)

## Levanto el archivo con las emisiones de los pájaros

### Levanto el checkpoint en pandas

In [9]:
birds_data_complete = pd.read_parquet('tmp/checkpoint-cog.parquet')

# Drop birds 15 and 16, which have almost no emissions.
# The original bare `reset_index()` call discarded its result, so it had no
# effect; it is omitted and the filtered frame keeps its original index.
SPARSE_TAGS = [15, 16]
birds_data_complete = birds_data_complete[~birds_data_complete.tag_id.isin(SPARSE_TAGS)]

# Rebuild the tag lists instead of calling list.remove(), so that re-running
# this cell does not raise ValueError once the tags are already gone.
FEMALES = [tag for tag in FEMALES if tag not in SPARSE_TAGS]
MALES = [tag for tag in MALES if tag not in SPARSE_TAGS]

# Predictor input: everything except the bird id and the emission timestamp.
birds_data = birds_data_complete.drop(columns=['tag_id', 'timestamp'])

### Obtengo las predicciones

In [10]:
# Predict from the emissions, then put the predicted columns in front of the
# original bird columns (index reset so both frames align row by row) and keep
# the result as a plain array.
predicted_frame = pd.DataFrame(predictor.predict(birds_data))
birds_frame = birds_data_complete.reset_index(drop=True)
predictions = pd.concat([predicted_frame, birds_frame], axis=1).values

### Calculo si la predicción cayó dentro de la grilla

In [11]:
def set_prediction_grid_classification(predictions):
    """
    Tag every prediction row with the region it falls in.

    The first two values of each row are used as the point tested against
    GRID: rows whose point GRID contains get 'in_grid', the others 'out_grid'.
    The label is inserted at position 8 of the row.

    predictions: sequence of rows (a 2-D array in this notebook).
    Returns a DataFrame with columns
    x, y, recep_0..recep_3, tag, time, region.
    """
    column_names = ['x', 'y', 'recep_0', 'recep_1', 'recep_2', 'recep_3', 'tag', 'time', 'region']
    labelled_rows = []
    for row in predictions:
        is_inside = GRID.contains(Point(row[0], row[1]))
        region = 'in_grid' if is_inside else 'out_grid'
        labelled_rows.append(np.insert(row, 8, region, axis=0))
    return pd.DataFrame(labelled_rows, columns=column_names)


### Obtengo el tiempo en que cada posible pareja está junta

In [12]:
def get_bird_timestamp_in_grid(bird_data, delta_time_in_sec=5):
    """
    Expand each timestamp of a bird into the window of seconds around it.

    bird_data: DataFrame with a 'time' column of strings formatted as
        '%Y-%m-%d %H:%M:%S'.
    delta_time_in_sec: number of seconds, before and after each timestamp,
        during which the bird is considered to stay at the same point.
        Defaults to 5 because emissions happen every 5 seconds.

    Returns a DataFrame with a single 'time' column holding every distinct
    second covered by any window, formatted like the input and sorted in
    ascending order so the result is deterministic.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    # The offsets are the same for every row, so build them once.
    offsets = [timedelta(seconds=sec) for sec in range(-delta_time_in_sec, delta_time_in_sec + 1)]

    covered_seconds = set()
    for raw_time in bird_data['time']:
        moment = datetime.strptime(raw_time, time_format)
        covered_seconds.update((moment + offset).strftime(time_format) for offset in offsets)

    return pd.DataFrame(sorted(covered_seconds), columns=['time'])


def get_grid_matches_per_bird(grid_predictions, delta_time):
    """
    Count, for every female, how many in-grid male predictions coincide in time
    with her own in-grid predictions.

    grid_predictions: DataFrame with at least the 'tag', 'time' and 'region'
        columns.
    delta_time: seconds around each female timestamp that still count as a
        coincidence (forwarded to get_bird_timestamp_in_grid).

    Returns a dict {female_tag: {male_tag: count}} with one entry per tag in
    FEMALES; males with no coincidence are absent from the inner dict.
    """
    # Compute the 'in_grid' subset once instead of once per female.
    in_grid = grid_predictions[grid_predictions['region'] == 'in_grid']
    male_birds_in_grid = in_grid[in_grid.tag.isin(MALES)]

    matches_per_bird = {}
    for female_bird in FEMALES:
        female_bird_in_grid = in_grid[in_grid['tag'] == female_bird]
        timestamp_in_grid_pd = get_bird_timestamp_in_grid(female_bird_in_grid, delta_time)

        # Keep only the male rows whose timestamp falls inside a female window.
        male_in_grid_with_female = male_birds_in_grid.join(
            timestamp_in_grid_pd.set_index('time'), on='time', how='inner'
        )

        # Vectorized count of rows per male tag (replaces the iterrows loop).
        matches_per_bird[female_bird] = male_in_grid_with_female['tag'].value_counts().to_dict()
    return matches_per_bird



In [13]:
def get_best_partners(coocurrence_matrix):
    """
    Locate the largest entry of the co-occurrence matrix.

    coocurrence_matrix: 2-D list or array, rows are females and columns males.
    Returns (row_index, column_index) of the maximum; on ties the first row
    reaching the maximum wins, and then the first column within that row.
    """
    matrix = np.asarray(coocurrence_matrix)
    row_peaks = matrix.max(axis=1)
    female_position = row_peaks.argmax()
    male_position = matrix[female_position].argmax()
    return female_position, male_position

In [14]:
def get_random_coocurrence_matrix(random_coocurrences):
    """
    Accumulate random (male, female) pairs into a count matrix.

    random_coocurrences: iterable of pairs whose first element is a male tag
        and whose second element is a female tag.
    Returns a float array of shape (len(FEMALES), len(MALES)) where cell
    [female position][male position] holds the number of times that pair occurs.
    """
    pair_counts = np.zeros((len(FEMALES), len(MALES)))
    for pair in random_coocurrences:
        male_col = MALES.index(int(pair[0]))
        female_row = FEMALES.index(int(pair[1]))
        pair_counts[female_row][male_col] += 1
    return pair_counts


In [15]:
def get_birds_distribution(predictions):
    """
    Compute, for males and for females, the share of prediction rows per tag.

    predictions: DataFrame with a 'tag' column.
    Returns (males_distribution, females_distribution): two dicts mapping each
    tag to its row count divided by the total number of rows of its group.
    """
    def _tag_shares(group_tags):
        # Fraction of the group's rows that belongs to each tag.
        group_rows = predictions[predictions.tag.isin(group_tags)]
        rows_per_tag = group_rows.groupby('tag').size()
        return (rows_per_tag / len(group_rows)).to_dict()

    males_distribution = _tag_shares(MALES)
    females_distribution = _tag_shares(FEMALES)
    return males_distribution, females_distribution
    

In [16]:
def get_random_coocurrence_list(males_distribution, females_distribution, size):
    """
    Draw random (male, female) pairs following the given tag distributions.

    males_distribution / females_distribution: dicts mapping a tag to its
        probability; each dict's values are passed as `p` to numpy's `choice`.
    size: number of pairs to draw. This argument is now honoured; previously
        the function ignored it and read the global SAMPLE_SIZE instead.

    Returns a list of `size` tuples (male_tag, female_tag); males and females
    are sampled independently, with replacement.
    """
    def _draw(distribution):
        # Sample `size` tags with replacement, weighted by their probabilities.
        return choice(list(distribution.keys()), size, replace=True, p=list(distribution.values()))

    random_males = _draw(males_distribution)
    random_females = _draw(females_distribution)
    return list(zip(random_males, random_females))
    

### Calculo la matriz de co-ocurrencia real

In [20]:
# Label every prediction as in/out of the grid, then count male/female coincidences
# using a 5-second window around each female timestamp.
# NOTE(review): `predictions` is rebound here from the input rows to the returned DataFrame,
# so re-running this cell alone feeds the DataFrame back into the classifier -- re-run the
# cell that builds `predictions` first.
predictions = set_prediction_grid_classification(predictions)
matches_per_bird = get_grid_matches_per_bird(predictions, 5)


In [21]:
## Backup of the variables that the pairing loop is going to modify.

# Take the backup only once per kernel session. The assignments used to be
# commented out, which made this cell fail with NameError on a fresh kernel.
if 'FEMALES_BCK' not in globals():
    FEMALES_BCK = FEMALES.copy()
if 'MALES_BCK' not in globals():
    MALES_BCK = MALES.copy()

# Restore the working lists, so the pairing loop can be re-run from scratch.
FEMALES = FEMALES_BCK.copy()
MALES = MALES_BCK.copy()

In [22]:
# Observed co-occurrence counts: one row per female (FEMALES order), one column
# per male (MALES order); pairs that never matched count as 0.
real_coocurrence_matrix = [
    [matches_per_bird.get(female_tag).get(male_tag, 0) for male_tag in MALES]
    for female_tag in FEMALES
]

In [23]:
# Number of random co-occurrence matrices generated for every candidate pair.
TOTAL_ITER = 3000
# The pairing stops at the first candidate whose estimated p-value exceeds this.
PVALUE_THRESHOLD = 0.001

females_count = len(FEMALES)
# Every round removes one female and one male, so at most min(#females, #males)
# rounds are possible; going further would leave a matrix without columns and
# make get_best_partners fail.
max_rounds = min(females_count, len(MALES))

for _ in range(max_rounds):
    # Candidate pair: the largest entry of the remaining real matrix.
    best_female_index, best_male_index = get_best_partners(real_coocurrence_matrix)
    real_best = real_coocurrence_matrix[best_female_index][best_male_index]
    total_coocurrency = np.sum(real_coocurrence_matrix)
    print('best female and male', FEMALES[best_female_index], MALES[best_male_index])
    print('total_coocurrency', total_coocurrency)

    # Distributions are recomputed each round because FEMALES / MALES shrink.
    males_distribution, females_distribution = get_birds_distribution(predictions)

    # Each random matrix holds as many pairs as the real one. SAMPLE_SIZE stays a
    # module-level name and is also passed explicitly as the `size` argument.
    SAMPLE_SIZE = total_coocurrency

    count_random_greater = 0
    max_random = 0
    # The inner loop uses its own variable instead of shadowing the outer one.
    for _iteration in range(TOTAL_ITER):
        random_coocurrences = get_random_coocurrence_list(males_distribution, females_distribution, SAMPLE_SIZE)
        coocurrence_matrix_random = get_random_coocurrence_matrix(random_coocurrences)
        random_best = coocurrence_matrix_random[best_female_index][best_male_index]
        # NOTE(review): the original author left open whether this should be > or >=.
        if random_best >= real_best:
            count_random_greater += 1
        max_random = max(max_random, random_best)
    print('max coocur values random vs real: ', max_random, real_best)

    # Fraction of random matrices in which the pair co-occurs at least as often as observed.
    pvalue = count_random_greater / TOTAL_ITER
    print('pvalue', pvalue)
    if pvalue > PVALUE_THRESHOLD:
        break

    # Accept the pair: remove both birds from the tag lists ...
    FEMALES.remove(FEMALES[best_female_index])
    MALES.remove(MALES[best_male_index])

    # ... and remove their row and column from the real matrix.
    real_coocurrence_matrix_pd = pd.DataFrame(real_coocurrence_matrix)
    real_coocurrence_matrix = (
        real_coocurrence_matrix_pd
        .drop(best_female_index)
        .drop(best_male_index, axis=1)
        .values
        .tolist()
    )

best female and male 20 21
total_coocurrency 38679
max coocur values random vs real:  1377.0 4181
pvalue 0.0
best female and male 33 32
total_coocurrency 23806
max coocur values random vs real:  1923.0 2220
pvalue 0.0
best female and male 30 31
total_coocurrency 18159
max coocur values random vs real:  838.0 2207
pvalue 0.0
best female and male 24 23
total_coocurrency 11101
max coocur values random vs real:  377.0 1439
pvalue 0.0
best female and male 34 29
total_coocurrency 6924
max coocur values random vs real:  621.0 1212
pvalue 0.0
best female and male 28 14
total_coocurrency 4502
max coocur values random vs real:  284.0 667
pvalue 0.0
best female and male 26 25
total_coocurrency 2494
max coocur values random vs real:  89.0 628
pvalue 0.0
best female and male 11 10
total_coocurrency 1341
max coocur values random vs real:  34.0 559
pvalue 0.0
best female and male 17 22
total_coocurrency 638
max coocur values random vs real:  262.0 432
pvalue 0.0
