In [1]:
import pyspark

# Reuse the active SparkSession, or start a new one, through the documented
# `SparkSession.builder` entry point instead of instantiating `Builder()` directly.
# NOTE: despite its name, `sc` is a SparkSession (not a SparkContext); the name
# is kept because `rdd_from_file` reads CSV files through `sc.read`.
sc = pyspark.sql.SparkSession.builder.getOrCreate()

## Levanto los archivos

In [2]:
# Directory prefix that is prepended to every input file name.
DIR = "datos/"
FILE_COORDENADAS = "DistancesCoordenadasUTM.csv"
FILE_CALIBRACION = "DatosCalibracion.csv"
# Receiver data files; the per-receiver lists built later follow this order.
FILE_RECEPTORES = [
    "DatosRC1.csv",
    "DatosRC2.csv",
    "DatosD1.csv",
    "DatosD2.csv",
]

def rdd_from_file(filename):
    """Read the CSV file at `filename` and return its rows as an RDD.

    The file is read with ``header=True`` (first line holds the column names)
    and ``inferSchema=True`` (Spark infers the column types).
    """
    dataframe = sc.read.csv(filename, inferSchema=True, header=True)
    return dataframe.rdd

# Load every input CSV as an RDD. `recep_rdd` is a list holding one RDD per
# entry of FILE_RECEPTORES, in the same order.
coord_rdd = rdd_from_file(DIR + FILE_COORDENADAS)
cal_rdd = rdd_from_file(DIR + FILE_CALIBRACION)
receptor_paths = [DIR + receptor_file for receptor_file in FILE_RECEPTORES]
recep_rdd = list(map(rdd_from_file, receptor_paths))

## Utils

In [3]:
def apply_all(list_rdd, f):
    """Apply `f` to each element of `list_rdd` and return the results as a list.

    The results are returned in the same order as the inputs.
    """
    return [f(rdd) for rdd in list_rdd]

## Filtro los datos de calibración vacíos

In [4]:
# Keep only the calibration rows whose 'Fecha' is not the literal string 'NA'.
cal_rdd = cal_rdd.filter(lambda calibration: calibration['Fecha'] != 'NA')

## Filtro datos de receptores inválidos (hay inválidos porque se repite el header)

In [5]:
# Drop the rows without a 'Date' value from every receiver RDD in the list.
recep_rdd = apply_all(
    recep_rdd,
    lambda receptions: receptions.filter(lambda reception: reception['Date'] is not None),
)


## Fix fechas ambiguas

In [6]:
import re
from datetime import datetime

# Matches strings that start with two slash-separated numbers written without a
# leading zero (e.g. '3/14/2018'); such dates are treated as month/day/year.
# No `global` statement is needed: a module-level assignment already creates a
# global name, and `global` placed after the assignment is rejected by
# CPython 3.6+ when the code is compiled as a single module.
# NOTE(review): a day/month/year date whose day and month are both >= 10
# (e.g. '25/12/2018') also matches and would be parsed as month/day -- confirm
# that the input data never contains such values.
pattern = re.compile('[1-9][0-9]*/[1-9][0-9]*/.*')


def fix_date_format(row, field):
    """Return `row` as a dict with the date in `field` rewritten as day/month/year.

    Dates matching `pattern` are parsed as ``%m/%d/%Y`` and re-emitted as
    ``%d/%m/%Y``; any other value is left untouched.

    Args:
        row: a plain dict or an object exposing ``asDict()`` (e.g. a Spark Row).
            Dicts are accepted for consistency with `add_timestamp`, and are
            copied so the caller's object is never modified.
        field: name of the key holding the date string.

    Returns:
        A new dict with the same keys as `row`.

    Raises:
        ValueError: if the value matches `pattern` but is not a valid
            month/day/year date.
    """
    row_dict = dict(row) if isinstance(row, dict) else row.asDict()
    date = row_dict[field]
    format_from = '%m/%d/%Y'
    format_to = '%d/%m/%Y'
    # A missing date is left as-is instead of raising inside `pattern.match`.
    if date is not None and pattern.match(date):
        row_dict[field] = datetime.strptime(date, format_from).strftime(format_to)
    return row_dict

# Normalise the date column of the calibration rows ('Fecha') and of every
# receiver RDD ('Date').
cal_rdd = cal_rdd.map(lambda calibration: fix_date_format(calibration, 'Fecha'))
recep_rdd = apply_all(
    recep_rdd,
    lambda receptions: receptions.map(lambda reception: fix_date_format(reception, 'Date')),
)


## Agrego timestamp

In [7]:
def add_timestamp(row, date_field, time_field, suffix=''):
    """Return `row` as a dict with an extra ``'timestamp' + suffix`` datetime entry.

    The timestamp is parsed from ``row[date_field] + ' ' + row[time_field]``
    using the format ``'%d/%m/%Y %H:%M:%S'``.

    Args:
        row: a plain dict or an object exposing ``asDict()`` (e.g. a Spark Row).
            A dict is copied first, so the caller's object is never mutated.
        date_field: key holding the day/month/year date string.
        time_field: key holding the hour:minute:second time string.
        suffix: text appended to the ``'timestamp'`` key name.

    Returns:
        A new dict with every original key plus the timestamp key.

    Raises:
        ValueError: if the combined date and time do not match the format.
    """
    row_dict = dict(row) if isinstance(row, dict) else row.asDict()
    combined = row_dict[date_field] + ' ' + row_dict[time_field]
    row_dict['timestamp' + suffix] = datetime.strptime(combined, '%d/%m/%Y %H:%M:%S')
    return row_dict
    
# Calibration rows get a start ('timestamp_inicio') and an end ('timestamp_fin')
# timestamp; receiver rows get a single 'timestamp'.
cal_rdd = (
    cal_rdd
    .map(lambda calibration: add_timestamp(calibration, 'Fecha', 'Inicio', '_inicio'))
    .map(lambda calibration: add_timestamp(calibration, 'Fecha', 'Fin', '_fin'))
)
recep_rdd = apply_all(
    recep_rdd,
    lambda receptions: receptions.map(lambda reception: add_timestamp(reception, 'Date', 'Time')),
)



## Saco recepciones de pájaros

In [8]:
# Distinct tag ids (as ints) that appear in the calibration rows.
cal_tags = cal_rdd.map(lambda calibration: int(calibration['Tag'])).distinct().collect()
# Membership is tested once per reception row, so use a set (constant-time
# lookup) instead of scanning the list; `cal_tags` itself stays a list.
cal_tag_set = set(cal_tags)
# Keep only the receptions whose 'Tag ID' is one of the calibration tags.
recep_rdd = apply_all(
    recep_rdd,
    lambda receptions: receptions.filter(lambda reception: reception['Tag ID'] in cal_tag_set),
)

In [9]:
# Collect every receiver RDD into a local list of rows: one list per receiver,
# in the same order as `recep_rdd`. `add_recep` reads this module-level name.
# No `global` statement is needed (a module-level assignment is already global,
# and `global` after the assignment is rejected by CPython 3.6+ when the code
# is compiled as a single module).
recibidos_by = [receptions.collect() for receptions in recep_rdd]
def add_recep(calibr, receptions_by_receiver=None):
    """Return a copy of `calibr` extended with the powers received by each receiver.

    For the receiver at index ``i`` a key ``'recep_<i>'`` is added, holding the
    list of 'Power' values of the receptions whose 'Tag ID' equals
    ``int(calibr['Tag'])`` and whose 'timestamp' lies within
    ``[calibr['timestamp_inicio'], calibr['timestamp_fin']]`` (both ends
    inclusive), in the order in which they appear in that receiver's list.

    Args:
        calibr: dict with the keys 'Tag', 'timestamp_inicio' and 'timestamp_fin'.
            It is not modified.
        receptions_by_receiver: optional list with one list of reception rows
            per receiver. Defaults to the module-level `recibidos_by`.

    Returns:
        A new dict with every key of `calibr` plus one 'recep_<i>' key per receiver.
    """
    if receptions_by_receiver is None:
        receptions_by_receiver = recibidos_by
    # These values do not change per reception, so compute them once.
    tag = int(calibr['Tag'])
    start = calibr['timestamp_inicio']
    end = calibr['timestamp_fin']
    result = dict(calibr)
    for i, receptions in enumerate(receptions_by_receiver):
        result['recep_{}'.format(i)] = [
            reception['Power']
            for reception in receptions
            if tag == reception['Tag ID'] and start <= reception['timestamp'] <= end
        ]
    return result

# Attach the per-receiver reception lists ('recep_<i>' keys) to every calibration row.
cal_rdd = cal_rdd.map(add_recep)


In [None]:
import json

def my_converter(o):
    """``default`` hook for ``json.dumps``: serialise datetimes as strings.

    Args:
        o: an object that ``json`` could not serialise on its own.

    Returns:
        ``str(o)`` when `o` is a ``datetime``.

    Raises:
        TypeError: for any other type. Falling through and returning ``None``
            would make ``json.dumps`` silently write ``null`` in place of the
            unsupported value.
    """
    if isinstance(o, datetime):
        return str(o)
    raise TypeError(
        'Object of type {} is not JSON serializable'.format(type(o).__name__)
    )
    

# Serialise every calibration row to a JSON string (datetimes go through
# `my_converter`) and write the strings out as a text file.
json_lines = cal_rdd.map(lambda calibration: json.dumps(calibration, default=my_converter))
json_lines.saveAsTextFile('datos/points-recep.jsonlines')