In [1]:
import pyspark
from datetime import datetime, timedelta


In [2]:
# NOTE: despite the conventional `sc` name, this is the object returned by
# SparkSession's builder (it is used below through `sc.read.csv`).
sc = pyspark.sql.SparkSession.Builder().getOrCreate()


## Levanto los archivos

In [None]:
# Directory prefix prepended to every receiver file name when loading.
DIR = "datos/20-01-18/"

# Path of the calibration CSV.
FILE_CALIBRACION = "datos/DatosCalibracion.csv"
# Receiver CSV file names; each one is loaded from DIR.
FILE_RECEPTORES = ["rec1.csv", "rec2.csv", "rd1.csv", "rd2.csv"]

def rdd_from_file(filename):
    """Load a CSV file through the notebook-level session `sc` and return its rows as an RDD.

    The file is read with a header row and with schema inference enabled.
    """
    dataframe = sc.read.csv(filename, header=True, inferSchema=True)
    return dataframe.rdd


# Calibration rows.
cal_rdd = rdd_from_file(FILE_CALIBRACION)
# One RDD per receiver file, in the same order as FILE_RECEPTORES.
recep_rdd = [rdd_from_file(DIR+name) for name in FILE_RECEPTORES]

## Utils

In [None]:
def apply_all(list_rdd, f):
    """Apply `f` to every element of `list_rdd` and return the results as a list."""
    return [f(rdd) for rdd in list_rdd]

def remove_no_receptions(l):
    """Return a lazy `filter` iterator over the elements of `l` that are greater than zero."""
    def is_positive(value):
        return value > 0

    return filter(is_positive, l)

## Filtro los datos de calibración vacíos

In [None]:
# Keep only the calibration rows whose 'Fecha' value is not the literal string 'NA'.
cal_rdd = cal_rdd.filter(lambda x: x['Fecha'] != 'NA')

## Agrego timestamp

In [None]:
def add_timestamp(row, date_field, time_field, suffix = ''):
    """Attach a parsed datetime to a record and return the record as a dict.

    Args:
        row: A dict, or an object exposing `asDict()` and item access.
        date_field: Key of the date string, expected as month/day/2-digit-year.
        time_field: Key of the time string, expected as hours:minutes:seconds.
        suffix: Text appended to the output key name 'timestamp'.

    Returns:
        The dict holding the record plus the new 'timestamp' + suffix entry.
        When `row` is already a dict it is updated in place and returned as is.

    Raises:
        ValueError: If the combined date/time text does not match the format.
    """
    if isinstance(row, dict):
        record = row
    else:
        record = row.asDict()

    date_text = row[date_field]
    time_text = row[time_field]
    parsed = datetime.strptime(date_text + ' ' + time_text, '%m/%d/%y %H:%M:%S')

    output_key = 'timestamp' + suffix
    record[output_key] = parsed
    return record
    
# Add a 'timestamp' entry (built from the 'Date' and 'Time' fields) to every row of
# every receiver RDD; after this step each row is a dict.
recep_rdd = apply_all(recep_rdd, lambda recep: recep.map(lambda x: add_timestamp(x, 'Date', 'Time')))



## Saco recepciones que no son de pájaros

In [None]:
# Distinct tag ids that appear in the calibration data, collected to the driver.
cal_tags = cal_rdd.map(lambda x: int(x['Tag'])).distinct().collect()
# Tag 999 is excluded too — presumably a test/noise tag; TODO confirm.
cal_tags.append(999)
# For each receiver, keep only receptions whose 'Tag ID' is not an excluded tag.
birds_recep_rdd = apply_all(recep_rdd, lambda recep: recep.filter(lambda x: x['Tag ID'] not in cal_tags))

## Junto todas las antenas

In [None]:
# Union every per-receiver RDD into a single one. Folding over the whole list
# (instead of indexing positions 0..3) keeps this cell correct when the number
# of receiver files changes: no receiver is silently left out.
merged_birds_recep = birds_recep_rdd[0]
for antenna_rdd in birds_recep_rdd[1:]:
    merged_birds_recep = merged_birds_recep.union(antenna_rdd)


## Ordeno por fecha de manera creciente

In [None]:
# Sort chronologically using the datetime itself as the key. The timestamps are
# naive, so converting them with `.timestamp()` would interpret them in the
# kernel's local timezone and could reorder readings around DST transitions,
# disagreeing with the naive subtraction done later in `time_diff`.
merged_birds_recep = merged_birds_recep.sortBy(lambda x: x['timestamp'])

In [121]:
def time_diff(time1, time2):
    """Return the absolute distance between two datetimes, in seconds (float)."""
    delta = time1 - time2
    seconds = delta.total_seconds()
    return abs(seconds)


In [140]:
def group_antennas_recep(receptions):
    """Cluster consecutive receptions that are close in time.

    Walks `receptions` in the given order. A reception whose 'timestamp' is at
    most 2 seconds away from the previous reception's timestamp is attached to
    the current group, unless that group already holds a reception with the
    same 'Antenna', in which case it is discarded. Any other reception opens a
    new group.

    Args:
        receptions: Iterable of mappings with 'timestamp' and 'Antenna' keys.

    Returns:
        A list of groups, each group being a list of receptions.
    """
    groups = []
    previous_timestamp = datetime.min
    for reception in receptions:
        current_timestamp = reception['timestamp']
        if time_diff(current_timestamp, previous_timestamp) > 2:
            # Too far from the previous reading: start a fresh group.
            groups.append([reception])
        else:
            current_group = groups[-1]
            antenna_already_present = any(
                member['Antenna'] == reception['Antenna'] for member in current_group
            )
            if not antenna_already_present:
                current_group.append(reception)
        # The reference time advances even when the reception was discarded.
        previous_timestamp = current_timestamp
    return groups



In [175]:
def format_grouped_emissions(group_emissions, tag_id):
    """Flatten one group of receptions into a single record with one power per receiver.

    Args:
        group_emissions: Non-empty list of mappings with 'Antenna', 'Power'
            and 'timestamp' keys.
        tag_id: Tag identifier stored in the resulting record.

    Returns:
        A dict with 'tag_id', 'timestamp' (taken from the first reception of
        the group) and 'recep_0'..'recep_3', holding the power read by
        'rec1', 'rec2', 'rd1' and 'rd2' respectively. A receiver with no
        reception in the group keeps the value 0. Receptions with any other
        antenna label are ignored.

    Raises:
        IndexError: If `group_emissions` is empty.
    """
    # The second receiver used to be matched against 'rect', which does not
    # correspond to any receiver name ('rec1', 'rec2', 'rd1', 'rd2'), so its
    # power was always reported as 0. 'rect' is still accepted as an alias so
    # data labelled that way keeps mapping to the same slot.
    slot_by_antenna = {
        'rec1': 'recep_0',
        'rec2': 'recep_1',
        'rect': 'recep_1',
        'rd1': 'recep_2',
        'rd2': 'recep_3',
    }
    powers = {'recep_0': 0, 'recep_1': 0, 'recep_2': 0, 'recep_3': 0}
    for e in group_emissions:
        slot = slot_by_antenna.get(e['Antenna'])
        if slot is not None:
            powers[slot] = e['Power']
    return {
        'tag_id': tag_id,
        'timestamp': group_emissions[0]['timestamp'],
        'recep_0': powers['recep_0'],
        'recep_1': powers['recep_1'],
        'recep_2': powers['recep_2'],
        'recep_3': powers['recep_3'],
    }
    
    
    
def format_antennas_recep(group_emissions, tag_id):
    """Format every reception group of one tag and return the records as a list.

    `group_emissions` is a sequence of groups; each one is handed to
    `format_grouped_emissions` together with `tag_id`.
    """
    return [format_grouped_emissions(group, tag_id) for group in group_emissions]
    
    

In [176]:
# Distinct tag ids found in the merged receptions, collected to the driver.
birds_tags = merged_birds_recep.map(lambda x: x['Tag ID']).distinct().collect()

# For every tag: pull its receptions to the driver, group the ones that are
# close in time, and append one formatted record per group to the result list.
all_receptions_by_birds = []
for tag in birds_tags:
    bird_reception = merged_birds_recep.filter(lambda x: x['Tag ID'] == tag).collect()
    grouped_emissions = group_antennas_recep(bird_reception)
    all_receptions_by_birds += format_antennas_recep(grouped_emissions, tag)
    

In [177]:
# Turn the driver-side list of records back into an RDD using the active SparkContext.
all_receptions_rdd = pyspark.SparkContext.getOrCreate().parallelize(all_receptions_by_birds)

In [178]:
import json

def my_converter(o):
    """`default` hook for `json.dumps` that renders datetimes as text.

    Args:
        o: A value the JSON encoder could not serialize by itself.

    Returns:
        `str(o)` when `o` is a datetime.

    Raises:
        TypeError: For any other type. Returning None here (as happens when a
            function falls off its end) would make `json.dumps` write `null`
            and silently lose the value.
    """
    if isinstance(o, datetime):
        return str(o)
    raise TypeError('Object of type %s is not JSON serializable' % type(o).__name__)
    

# Serialize every record as one JSON string (datetimes go through my_converter)
# and write the strings out as text under 'datos/day-birds.jsonlines'.
all_receptions_rdd.map(lambda x: json.dumps(x, default=my_converter)).saveAsTextFile('datos/day-birds.jsonlines')
