In [1]:
import pyspark
from datetime import datetime, timedelta


In [2]:
# Create (or reuse) the Spark session that every later cell relies on.
# NOTE(review): despite its name, `sc` is a SparkSession here, not a SparkContext.
sc = (
    pyspark.sql.SparkSession
    .builder
    # NOTE(review): heartbeat interval and network timeout are both raised to
    # roughly 10 hours (36000 s / 36001 s), presumably so that long-running
    # tasks are not timed out; confirm these values are still wanted.
    .config("spark.executor.heartbeatInterval","36000s")
    .config("spark.network.timeout","36001s")
    # Memory requested for each executor.
    .config('spark.executor.memory', '8g')
    
    .getOrCreate()
)

In [None]:
# Disabled alternative: a session built with default settings. The snippet is
# wrapped in a string literal, so it is never executed.
'''
sc = (
    pyspark.sql.SparkSession.Builder()
    .getOrCreate()
)
'''

## Levanto los archivos

In [None]:
# CSV file holding the calibration data.
FILE_CALIBRACION = "datos/DatosCalibracion.csv"

# Path template for one antenna's reception file; both placeholders are filled
# with the antenna name when the files are loaded.
FILE_PATH = '../Tracking/{}/2018/{}.txt'
# Names of the antennas whose reception files are loaded.
ANTENNAS_NAMES = ["RC1", "RC2", "RC-D1", "RC-D2"] 

def rdd_from_file(filename):
    """Load a CSV file through the Spark session and return its rows as an RDD.

    The first line of the file is treated as the header and column types are
    inferred by Spark (``inferSchema=True``).

    Args:
        filename: path of the CSV file to read.

    Returns:
        The ``.rdd`` of the DataFrame produced by ``sc.read.csv``.
    """
    dataframe = sc.read.csv(filename, header=True, inferSchema=True)
    return dataframe.rdd


# Calibration data as an RDD.
cal_rdd = rdd_from_file(FILE_CALIBRACION)
# One RDD per antenna, in the same order as ANTENNAS_NAMES.
# NOTE(review): despite the name, `recep_rdd` is a Python list of RDDs.
recep_rdd = [rdd_from_file(FILE_PATH.format(name, name)) for name in ANTENNAS_NAMES]

## Utils

In [12]:
import json
import os  

def apply_all(list_rdd, f):
    """Apply ``f`` to each element of ``list_rdd`` and collect the results.

    Args:
        list_rdd: iterable of items (in this notebook, a list of RDDs).
        f: one-argument callable applied to every item.

    Returns:
        A list with ``f(item)`` for every item, in the original order.
    """
    return [f(item) for item in list_rdd]

def remove_no_receptions(l):
    """Lazily keep only the strictly positive entries of ``l``.

    Args:
        l: iterable of numeric values.

    Returns:
        A lazy iterator over the values of ``l`` that are greater than 0.
    """
    return (value for value in l if value > 0)


def my_converter(o):
    """``default`` hook for ``json.dumps`` that serialises ``datetime`` values.

    Args:
        o: an object the JSON encoder could not serialise on its own.

    Returns:
        ``str(o)`` for a ``datetime`` (e.g. ``'2018-01-31 13:05:00'``).

    Raises:
        TypeError: for any other type. Without this, the hook would return
            ``None`` and the encoder would silently write ``null`` in place
            of the unsupported value.
    """
    if isinstance(o, datetime):
        return str(o)
    raise TypeError(
        'Object of type {} is not JSON serializable'.format(type(o).__name__)
    )

## Agrego timestamp

In [None]:
def add_timestamp(row, date_field, time_field, suffix = ''):
    """Attach a parsed ``datetime`` built from a row's date and time fields.

    Args:
        row: a ``dict`` or an object exposing ``asDict()`` and item access.
        date_field: name of the field holding the date as ``MM/DD/YY`` text.
        time_field: name of the field holding the time as ``HH:MM:SS`` text.
        suffix: appended to ``'timestamp'`` to form the key that is written.

    Returns:
        The row as a dict with the extra ``'timestamp' + suffix`` entry. A
        ``dict`` input is updated in place and returned as is.
    """
    if isinstance(row, dict):
        record = row
    else:
        record = row.asDict()
    stamp_text = row[date_field] + ' ' + row[time_field]
    record['timestamp' + suffix] = datetime.strptime(stamp_text, '%m/%d/%y %H:%M:%S')
    return record
    
# Add a parsed 'timestamp' (built from the 'Date' and 'Time' columns) to every
# reception of every antenna. add_timestamp returns dicts, so after this cell
# the elements of each RDD are plain dicts.
recep_rdd = apply_all(recep_rdd, lambda recep: recep.map(lambda x: add_timestamp(x, 'Date', 'Time'))) 



## Filtro los datos de calibración vacíos

In [None]:
# Drop calibration rows whose 'Fecha' field is the literal string 'NA'.
cal_rdd = cal_rdd.filter(lambda x: x['Fecha'] != 'NA')

## Saco recepciones que no son de pájaros

In [None]:
# Tags that appear in the calibration data do not belong to birds.
cal_tags = cal_rdd.map(lambda row: int(row['Tag'])).distinct().collect()
# NOTE(review): tag 999 is excluded as well; its meaning is not documented
# here (presumably a test/marker tag), so confirm.
cal_tags.append(999)
# The membership test runs once per reception, so use a hashed lookup instead
# of scanning the list for every row.
excluded_tags = frozenset(cal_tags)
# Keep, for every antenna, only the receptions whose tag is not excluded.
birds_recep_rdd = apply_all(
    recep_rdd,
    lambda recep: recep.filter(lambda rec: rec['Tag ID'] not in excluded_tags),
)

## Junto todas las antenas

In [None]:
# Concatenate the per-antenna RDDs into a single RDD. SparkContext.union takes
# the whole list, so this keeps working if the number of antennas changes
# (the previous chained form was hard-wired to exactly four).
merged_birds_recep = sc.sparkContext.union(birds_recep_rdd)


## Ordeno por fecha de manera creciente

In [None]:
# Sort all receptions chronologically (ascending), using the POSIX timestamp
# of each record's 'timestamp' as the sort key.
# NOTE(review): datetime.timestamp() on a naive datetime interprets it in the
# machine's local timezone; the parsed values appear to be naive, so confirm
# this is acceptable or sort on the datetime itself.
merged_birds_recep = merged_birds_recep.sortBy(lambda x: x['timestamp'].timestamp())

### Checkpoint

In [52]:
# Location of the JSON-lines checkpoint of the merged, sorted receptions.
TEMP_FILE = 'tmp/checkpoint-nobd.jsonlines'
# Step that writes the checkpoint; currently disabled (commented out):
#merged_birds_recep.map(lambda x: json.dumps(x, default=my_converter)).saveAsTextFile(TEMP_FILE)

In [53]:
# Reload the checkpoint: read the JSON data at TEMP_FILE and expose it as an RDD.
restored = sc.read.json(TEMP_FILE).rdd

In [54]:
def cast_timestamp(x):
    """Convert a restored row into a dict whose 'timestamp' is a ``datetime``.

    Args:
        x: object exposing ``asDict()`` whose 'timestamp' entry is text in
            ``YYYY-MM-DD HH:MM:SS`` form.

    Returns:
        The dict from ``x.asDict()`` with 'timestamp' replaced by the parsed
        ``datetime``.
    """
    fields = x.asDict()
    parsed = datetime.strptime(fields['timestamp'], '%Y-%m-%d %H:%M:%S')
    fields['timestamp'] = parsed
    return fields

In [55]:
# Turn every restored row into a dict with a parsed datetime 'timestamp'.
restored = restored.map(cast_timestamp)

In [56]:
# From here on the pipeline works on the data restored from the checkpoint.
# NOTE(review): this rebinds merged_birds_recep, replacing whatever the
# earlier cells assigned to that name.
merged_birds_recep = restored

In [57]:
def time_diff(time1, time2):
    """Return the absolute gap between two datetimes, in seconds.

    Args:
        time1: first ``datetime``.
        time2: second ``datetime``.

    Returns:
        Non-negative number of seconds separating the two instants,
        regardless of argument order.
    """
    delta = time1 - time2
    return abs(delta.total_seconds())


In [58]:
def group_antennas_recep(receptions, max_gap_seconds=2):
    """Group time-adjacent receptions into per-emission groups.

    Receptions are walked in the order given. A reception joins the current
    group when it is at most ``max_gap_seconds`` away from the *previous*
    reception; otherwise it starts a new group. Within a group only the first
    reception of each antenna is kept: a later one from an antenna already
    present is dropped, but it still becomes the reference for the next gap
    comparison.

    Args:
        receptions: iterable of mappings with a 'timestamp' (``datetime``)
            and an 'Antenna' entry. NOTE(review): the logic assumes
            chronological order; confirm that callers provide it.
        max_gap_seconds: largest gap, in seconds, between consecutive
            receptions of the same group. Defaults to 2.

    Returns:
        A list of groups, each a non-empty list of receptions.
    """
    groups = []
    # None marks "no reception seen yet", so the first reception always opens
    # a group without relying on a sentinel date.
    previous_timestamp = None
    for recep in receptions:
        timestamp = recep['timestamp']
        # Same computation as time_diff(), inlined to keep the function
        # self-contained.
        same_emission = (
            previous_timestamp is not None
            and abs((timestamp - previous_timestamp).total_seconds()) <= max_gap_seconds
        )
        if same_emission:
            current_group = groups[-1]
            if all(e['Antenna'] != recep['Antenna'] for e in current_group):
                current_group.append(recep)
        else:
            groups.append([recep])
        previous_timestamp = timestamp
    return groups



In [59]:
def format_grouped_emissions(group_emissions, tag_id):
    """Flatten one group of receptions into a single record.

    Args:
        group_emissions: non-empty sequence of mappings with an 'Antenna'
            entry, a 'Power' entry (read only for the known antennas) and,
            on the first element, a 'timestamp' entry.
        tag_id: tag identifier stored in the record.

    Returns:
        A dict with 'tag_id', the 'timestamp' of the first reception and
        'recep_0'..'recep_3' holding the power seen by RC1, RC2, RC-D1 and
        RC-D2 respectively. An antenna absent from the group is reported as
        0; if an antenna appears more than once, its last power wins.

    Raises:
        IndexError: if ``group_emissions`` is empty.
    """
    slot_by_antenna = {
        'RC1': 'recep_0',
        'RC2': 'recep_1',
        'RC-D1': 'recep_2',
        'RC-D2': 'recep_3',
    }
    powers = dict.fromkeys(slot_by_antenna.values(), 0)
    for reception in group_emissions:
        slot = slot_by_antenna.get(reception['Antenna'])
        if slot is not None:
            powers[slot] = reception['Power']

    record = {
        'tag_id': tag_id,
        'timestamp': group_emissions[0]['timestamp'],
    }
    record.update(powers)
    return record
    
    
    
def format_antennas_recep(group_emissions, tag_id):
    """Format every group of receptions into a flat record.

    Args:
        group_emissions: iterable of reception groups.
        tag_id: tag identifier passed on to every record.

    Returns:
        A list with one ``format_grouped_emissions`` result per group, in
        the original order.
    """
    return [format_grouped_emissions(group, tag_id) for group in group_emissions]
    
    

In [60]:
# Tag ids of the birds, hard-coded; the commented-out line below is the query
# that would compute the distinct tags from the data.
#birds_tags = merged_birds_recep.map(lambda x: x['Tag ID']).distinct().collect()
birds_tags = [11, 15, 17, 20, 24, 26, 28, 30, 33, 34, 10, 14, 16, 21, 22, 23, 25, 27, 29, 31, 32]
# Date boundaries (MM/DD/YY text), hard-coded; the commented-out line below
# would collect the distinct dates from the data instead.
# NOTE(review): '' and '~' presumably act as lowest/highest sentinels for the
# string range comparisons in the disabled loop further down; confirm.
#emissions_dates = merged_birds_recep.map(lambda x: x['Date']).distinct().collect()
emissions_dates = ['', '01/01/18', '01/06/18', '01/11/18', '01/16/18', '01/21/18', '01/26/18', '01/31/18', '02/05/18', '02/10/18', '02/18/18', '~']

# Disabled earlier version (inside a string literal, never executed): filters
# the merged RDD once per (bird, exact date) pair and groups the receptions
# on the driver.
'''
all_receptions_by_birds = []
for bird in birds_tags:
    for date in emissions_dates:
        print(bird, date)
        bird_reception = merged_birds_recep.filter(lambda x: x['Tag ID'] == bird and x['Date'] == date).toLocalIterator()#.collect()
        grouped_emissions = group_antennas_recep(bird_reception)
        all_receptions_by_birds.extend(format_antennas_recep(grouped_emissions, bird))
'''    

# Disabled earlier version (inside a string literal, never executed): for each
# bird, walks consecutive date ranges taken from emissions_dates, groups the
# receptions on the driver and writes one output per bird, skipping birds
# whose output path already exists.
# Being the last expression of the cell, this string is echoed as the cell's
# output.
'''
for bird in birds_tags:
    if os.path.exists('tmp/birds-data-{}.jsonlines'.format(bird)):
        continue
    all_receptions_by_birds = []
    for i in range(1,len(emissions_dates)):
        print(bird, emissions_dates[i-1], emissions_dates[i])
        bird_reception = merged_birds_recep.filter(lambda x: x['Tag ID'] == bird and x['Date'] >= emissions_dates[i-1] and x['Date'] < emissions_dates[i]).toLocalIterator()#.collect()
        grouped_emissions = group_antennas_recep(bird_reception)
        all_receptions_by_birds.extend(format_antennas_recep(grouped_emissions, bird))
    all_receptions_rdd = pyspark.SparkContext.getOrCreate().parallelize(all_receptions_by_birds)
    all_receptions_rdd.map(lambda x: json.dumps(x, default=my_converter)).saveAsTextFile('tmp/birds-data-{}.jsonlines'.format(bird))
'''


"\nfor bird in birds_tags:\n    if os.path.exists('tmp/birds-data-{}.jsonlines'.format(bird)):\n        continue\n    all_receptions_by_birds = []\n    for i in range(1,len(emissions_dates)):\n        print(bird, emissions_dates[i-1], emissions_dates[i])\n        bird_reception = merged_birds_recep.filter(lambda x: x['Tag ID'] == bird and x['Date'] >= emissions_dates[i-1] and x['Date'] < emissions_dates[i]).toLocalIterator()#.collect()\n        grouped_emissions = group_antennas_recep(bird_reception)\n        all_receptions_by_birds.extend(format_antennas_recep(grouped_emissions, bird))\n    all_receptions_rdd = pyspark.SparkContext.getOrCreate().parallelize(all_receptions_by_birds)\n    all_receptions_rdd.map(lambda x: json.dumps(x, default=my_converter)).saveAsTextFile('tmp/birds-data-{}.jsonlines'.format(bird))\n"

In [61]:
# Group the receptions by a key made of the 'Date' text followed by the tag id,
# i.e. one group per (date, tag) combination.
grouped_birds_date = merged_birds_recep.groupBy(lambda x: x['Date']+str(x['Tag ID']))

In [89]:
def magia(x):
    """Turn one group of receptions into a list of formatted emission records.

    Args:
        x: iterable with the receptions of a single group (all sharing the
            same 'Tag ID'), each carrying a 'timestamp' entry.

    Returns:
        The list produced by ``format_antennas_recep`` for the group, or an
        empty list when the group has no receptions.
    """
    # Spark's groupBy gives no ordering guarantee inside a group, while
    # group_antennas_recep compares each reception with the previous one, so
    # restore chronological order first (sorted() is stable for ties).
    receptions = sorted(x, key=lambda recep: recep['timestamp'])
    grouped_emissions = group_antennas_recep(receptions)
    if not grouped_emissions:
        return []
    return format_antennas_recep(grouped_emissions, grouped_emissions[0][0]['Tag ID'])

# Process every group independently; each element of `res` is the list of
# records that magia produced for one group.
res = grouped_birds_date.values().map(magia)

In [90]:
# Flatten the per-group lists into a single RDD of records.
res = res.flatMap(lambda x: x)

In [91]:
# Serialise each record as one JSON document per line (datetimes go through
# my_converter) and write the result under 'datos/all-birds-data.jsonlines'.
# NOTE(review): saveAsTextFile presumably fails if the target path already
# exists; confirm before re-running this cell.
res.map(lambda x: json.dumps(x, default=my_converter)).saveAsTextFile('datos/all-birds-data.jsonlines')

In [92]:
# Total number of formatted records.
# NOTE(review): `res` is not persisted anywhere in this notebook, so this
# action presumably recomputes the whole pipeline; consider caching.
res.count()

2309123

In [94]:
# Sanity check: count the distinct records, compared by their string
# representation. A value equal to res.count() means there are no duplicates.
res.map(lambda x: str(x)).distinct().count()

2309123