In [1]:
import pyspark

# NOTE: despite its name, `sc` is a SparkSession (not a SparkContext); the rest of
# the notebook uses it through `sc.read`.
# Use the documented `SparkSession.builder` entry point instead of instantiating
# the nested Builder class by hand.
sc = pyspark.sql.SparkSession.builder.getOrCreate()

## Levanto los archivos

In [2]:
# Folder (relative to the notebook) that holds every input CSV.
DIR = 'datos/'

# Point coordinates and calibration emissions.
FILE_COORDENADAS = 'DistancesCoordenadasUTM.csv'
FILE_CALIBRACION = 'DatosCalibracion.csv'

# One CSV per receiver; the list order defines the receiver index used later on.
FILE_RECEPTORES = [
    'DatosRC1.csv',
    'DatosRC2.csv',
    'DatosD1.csv',
    'DatosD2.csv',
]

def rdd_from_file(filename):
    """Load a CSV file through the notebook's Spark session and return its RDD.

    The first line of the file is used as the header and column types are
    inferred by Spark.
    """
    dataframe = sc.read.csv(filename, header=True, inferSchema=True)
    return dataframe.rdd

# One RDD for the coordinates, one for the calibration data and a list with one
# RDD per receiver file (same order as FILE_RECEPTORES).
coord_rdd = rdd_from_file(DIR + FILE_COORDENADAS)
cal_rdd = rdd_from_file(DIR + FILE_CALIBRACION)
recep_rdd = list(map(lambda name: rdd_from_file(DIR + name), FILE_RECEPTORES))

## Utils

In [3]:
def apply_all(list_rdd, f):
    """Apply ``f`` to every element of ``list_rdd`` and return the results as a list."""
    return [f(rdd) for rdd in list_rdd]

## Filtro los datos de calibración vacíos

In [4]:
def _has_fecha(row):
    """True when the calibration row has a date other than the 'NA' placeholder."""
    return row['Fecha'] != 'NA'


cal_rdd = cal_rdd.filter(_has_fecha)

## Filtro datos de receptores inválidos (hay inválidos porque se repite el header)

In [5]:
def _has_date(row):
    """True when the reception row has a non-null 'Date' value."""
    return row['Date'] is not None


recep_rdd = apply_all(recep_rdd, lambda recep: recep.filter(_has_date))


## Fix fechas ambiguas

In [6]:
import re
from datetime import datetime, timedelta

# Matches dates whose first two components both start with a non-zero digit
# (e.g. '3/14/2017'); those are the ones treated as month/day/year below.
# No `global` statement is needed: a module-level name is already global, and
# `global x` after assigning `x` in the same cell is a SyntaxError on Python >= 3.6.
pattern = re.compile('[1-9][0-9]*/[1-9][0-9]*/.*')


def fix_date_format(row, field):
    """Return the row as a dict with ``row[field]`` normalised to day/month/year.

    Values matching ``pattern`` are parsed as ``%m/%d/%Y`` and rewritten as
    ``%d/%m/%Y`` (zero padded). Any other value, including ``None``, is left
    untouched.

    NOTE(review): the heuristic cannot tell apart a day/month date written
    without leading zeros (e.g. '10/11/2017'); confirm it fits the input files.

    Args:
        row: a Spark ``Row`` (anything exposing ``asDict()``) or a plain dict.
        field: name of the column that holds the date string.

    Returns:
        A new dict with the row's fields; the input row is not modified.

    Raises:
        ValueError: if a matching value is not a valid month/day/year date.
    """
    # Accept plain dicts as well, mirroring add_timestamp; always work on a copy.
    row_dict = dict(row) if isinstance(row, dict) else row.asDict()
    date = row[field]
    format_from = '%m/%d/%Y'
    format_to = '%d/%m/%Y'
    # A null date cannot be matched against the regex; keep it as is.
    if date is not None and pattern.match(date):
        row_dict[field] = datetime.strptime(date, format_from).strftime(format_to)
    return row_dict

def _fix_fecha(row):
    """Normalise the 'Fecha' column of a calibration row."""
    return fix_date_format(row, 'Fecha')


def _fix_date(row):
    """Normalise the 'Date' column of a reception row."""
    return fix_date_format(row, 'Date')


cal_rdd = cal_rdd.map(_fix_fecha)
recep_rdd = apply_all(recep_rdd, lambda recep: recep.map(_fix_date))


## Agrego timestamp

In [7]:
def add_timestamp(row, date_field, time_field, suffix = ''):
    """Add a ``'timestamp' + suffix`` datetime built from a date and a time column.

    The date is expected as day/month/year and the time as hour:minute:second.
    A dict row is updated in place and returned; any other row is first
    converted with ``asDict()``.
    """
    if isinstance(row, dict):
        result = row
    else:
        result = row.asDict()
    stamp_text = ' '.join((row[date_field], row[time_field]))
    result['timestamp' + suffix] = datetime.strptime(stamp_text, '%d/%m/%Y %H:%M:%S')
    return result
    
def _stamp_inicio(row):
    """Add 'timestamp_inicio' from the 'Fecha' and 'Inicio' columns."""
    return add_timestamp(row, 'Fecha', 'Inicio', '_inicio')


def _stamp_fin(row):
    """Add 'timestamp_fin' from the 'Fecha' and 'Fin' columns."""
    return add_timestamp(row, 'Fecha', 'Fin', '_fin')


def _stamp_recepcion(row):
    """Add 'timestamp' from the 'Date' and 'Time' columns."""
    return add_timestamp(row, 'Date', 'Time')


cal_rdd = cal_rdd.map(_stamp_inicio).map(_stamp_fin)
recep_rdd = apply_all(recep_rdd, lambda recep: recep.map(_stamp_recepcion))



## Saco recepciones de pájaros

In [8]:
# Distinct tag ids that appear in the calibration data, collected on the driver.
cal_tags = cal_rdd.map(lambda x: int(x['Tag'])).distinct().collect()


def _is_calibration_tag(row):
    """True when the reception comes from one of the calibration tags."""
    return row['Tag ID'] in cal_tags


recep_rdd = apply_all(recep_rdd, lambda recep: recep.filter(_is_calibration_tag))

## Agrego las recepciones de las antenas por cada período de emisión

In [64]:
# Materialise every receiver's receptions on the driver (one list per receiver,
# same order as recep_rdd) so they can be scanned as plain Python lists.
# A module-level name is already global, so no `global` statement is needed;
# declaring it after the assignment is a SyntaxError on Python >= 3.6.
recibidos_by = [rdd.collect() for rdd in recep_rdd]


# Deprecated variant of add_recep, kept only as an inert string literal: it is
# never executed and is superseded by the add_recep defined below.
'''
### Deprecado
def add_recep(calibr):
    for i in range(len(recibidos_by)):
        recepciones = []
        for angle in range(4):
            recepciones_angle = (
                list(map(lambda x: x['Power'], filter(lambda x: int(calibr['Tag']) == x['Tag ID'] and calibr['timestamp_inicio']+timedelta(seconds=angle*30) <= x['timestamp'] and x['timestamp'] < calibr['timestamp_inicio']+timedelta(seconds=(angle+1)*30), recibidos_by[i])))
            )
            while len(recepciones_angle) < 6:
                recepciones_angle.append(0)
            recepciones.extend(recepciones_angle)
                
        calibr['recep_{}'.format(i)] = recepciones
    return calibr
'''

def is_equals_with_delta(t1, t2):
    """Return True when the two datetimes differ by at most one second."""
    return abs((t1-t2).total_seconds()) <= 1


def add_recep(calibr, recepciones_por_antena=None):
    """Attach, for every antenna, the 24 reception powers of one emission.

    The emission window starts at ``calibr['timestamp_inicio']`` and is split
    into 24 slots, one every 5 seconds. For antenna ``i`` the key
    ``'recep_<i>'`` receives a 24-element list holding, per slot, the
    ``'Power'`` of the reception whose timestamp is within one second of the
    slot time, or ``0`` when there is none.

    Only receptions of the emission's tag with
    ``timestamp_inicio <= timestamp < timestamp_fin`` are considered.

    Args:
        calibr: dict with 'Tag', 'timestamp_inicio' and 'timestamp_fin'.
            It is updated in place.
        recepciones_por_antena: optional list with one list of reception rows
            per antenna (each row exposing 'Tag ID', 'Power' and 'timestamp').
            Defaults to the notebook-level ``recibidos_by``.

    Returns:
        The same ``calibr`` dict, with the ``'recep_<i>'`` keys added.
    """
    if recepciones_por_antena is None:
        recepciones_por_antena = recibidos_by

    tag = int(calibr['Tag'])
    inicio = calibr['timestamp_inicio']
    fin = calibr['timestamp_fin']

    for i, recibidos in enumerate(recepciones_por_antena):
        # Sort by time so the slot walk below does not depend on the order in
        # which the rows were collected (the sort is stable for equal times).
        recepciones_by_time = sorted(
            ((x['Power'], x['timestamp']) for x in recibidos
             if tag == x['Tag ID'] and inicio <= x['timestamp'] < fin),
            key=lambda recepcion: recepcion[1],
        )
        recepciones = []

        t = inicio
        r = 0
        for _ in range(24):
            # Drop receptions that are already behind this slot and matched no
            # slot at all; otherwise a single stray reception would block the
            # cursor and every later reception would be reported as 0.
            while (r < len(recepciones_by_time)
                   and recepciones_by_time[r][1] < t
                   and not is_equals_with_delta(t, recepciones_by_time[r][1])):
                r += 1

            if r < len(recepciones_by_time) and is_equals_with_delta(t, recepciones_by_time[r][1]):
                recepciones.append(recepciones_by_time[r][0])
                r += 1
            else:
                recepciones.append(0)
            t = t+timedelta(seconds=5)

        calibr['recep_{}'.format(i)] = recepciones
    return calibr


# Add the per-antenna reception lists ('recep_<i>') to every calibration row.
cal_rdd = cal_rdd.map(add_recep)


## Agrego posición de los puntos

In [65]:
# Reuse the shared loader and path constants instead of repeating the path and
# the read options used for the coordinates file.
coordenadas_UTM = rdd_from_file(DIR + FILE_COORDENADAS)

# Driver-side lookup table: point id ('Punto') -> (X, Y).
# A module-level name is already global, so no `global` statement is needed;
# declaring it after the assignment is a SyntaxError on Python >= 3.6.
dict_coordenadas = coordenadas_UTM.map(lambda x: (x['Punto'],(x['X'], x['Y']))).collectAsMap()

def add_coord(row):
    """Store the coordinates of the row's point under the 'x' and 'y' keys.

    The point is looked up by ``row['Punto']`` in the notebook-level
    ``dict_coordenadas``; the row is updated in place and returned.
    """
    punto = row['Punto']
    punto_xy = dict_coordenadas[punto]
    row['x'] = punto_xy[0]
    row['y'] = punto_xy[1]
    return row

# Add the 'x' / 'y' coordinates of its point to every calibration row.
cal_rdd = cal_rdd.map(add_coord)

In [66]:
import json

def my_converter(o):
    """``json.dumps`` ``default`` hook that renders datetimes as their ``str()`` form.

    Any other object yields ``None``.
    """
    return str(o) if isinstance(o, datetime) else None
    

def _to_json_line(record):
    """Serialise one calibration row as a JSON string (datetimes via my_converter)."""
    return json.dumps(record, default=my_converter)


# Write one JSON document per line.
cal_rdd.map(_to_json_line).saveAsTextFile('datos/points-recep-2.jsonlines')

### Cantidad de emisiones que tienen recepción tanto al comienzo (_x_) como al final (_x_ + 2 min), es decir, que podrían tener 25 recepciones

In [77]:
def add_recep_from_antenna(row, antenna=1, recepciones_por_antena=None):
    """Return the 'Time' of every reception of one antenna inside an emission.

    A reception is kept when its 'Tag ID' equals ``int(row['Tag'])`` and its
    timestamp lies in ``[timestamp_inicio, timestamp_fin]`` (both ends
    inclusive, unlike add_recep, which excludes the end).

    Args:
        row: dict with 'Tag', 'timestamp_inicio' and 'timestamp_fin'.
        antenna: index of the antenna to inspect. Defaults to 1, the value
            that used to be hard-coded.
        recepciones_por_antena: optional list with one list of reception rows
            per antenna. Defaults to the notebook-level ``recibidos_by``.

    Returns:
        List of 'Time' values, in the order the receptions are stored.
    """
    if recepciones_por_antena is None:
        recepciones_por_antena = recibidos_by

    tag = int(row['Tag'])
    inicio = row['timestamp_inicio']
    fin = row['timestamp_fin']
    return [
        x['Time']
        for x in recepciones_por_antena[antenna]
        if tag == x['Tag ID'] and inicio <= x['timestamp'] <= fin
    ]

def _spans_two_minutes(times):
    """True when there are at least two times and the last is exactly 2 minutes after the first."""
    if len(times) <= 1:
        return False
    first = datetime.strptime(times[0], '%H:%M:%S')
    last = datetime.strptime(times[-1], '%H:%M:%S')
    return first + timedelta(minutes=2) == last


cal_rdd.map(add_recep_from_antenna).filter(_spans_two_minutes).count()



6