# comparar performance

In [None]:
import pandas as pd
import numpy as np
import yaml

from sqlalchemy import create_engine
import psycopg2

import geopandas as gpd
from h3 import h3
from shapely.geometry import LineString, Point

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def imputar_detino_por_tarjeta(tramos_tarjeta,tolerancia_hex = 21):
    """Impute a destination H3 cell for every trip leg of a single card.

    For each leg, the candidate destinations are the stops of the leg's line
    (after translating the line through ``equivalencias`` when it has an
    entry). The destination is the candidate stop closest, in H3 grid
    distance, to the origin of the card's next leg; for the last leg the
    origin of the first leg is used instead. If no candidate lies strictly
    within ``tolerancia_hex`` cells, the destination is set to ``None``.

    Relies on module-level names defined elsewhere in the notebook:
    ``equivalencias`` (mapping of line ids), ``paradas`` (DataFrame with a
    ``linea`` column and an ``h3_res_<resolucion>`` column), ``resolucion``
    and the ``h3`` module.

    Parameters
    ----------
    tramos_tarjeta : pandas.DataFrame
        Legs of one card, in travel order, with at least the columns
        ``linea``, ``h3_o`` and ``tarjeta``. It is modified in place: the
        ``h3_d`` column receives the imputed destinations.
    tolerancia_hex : int or float, default 21
        Maximum accepted grid distance (exclusive) between the next origin
        and the chosen stop.

    Returns
    -------
    pandas.DataFrame
        The same ``tramos_tarjeta`` object. If an error occurs part-way, the
        card and the error are printed and the frame is returned with
        whatever legs were already processed (best effort).
    """
    try:
        # Loop invariants: stop column name and number of legs.
        columna_h3 = 'h3_res_%i'%resolucion
        n_tramos = len(tramos_tarjeta)

        for i in range(n_tramos):
            # Current leg and the line it was travelled on.
            tramo = tramos_tarjeta.iloc[i]
            # Translate the transaction line to its cartography line, if any.
            linea_carto = equivalencias.get(tramo.linea, tramo.linea)

            # Candidate destination stops for that line.
            paradas_destino_posibles = paradas.loc[paradas.linea == linea_carto, columna_h3]

            # Origin of the next leg; the last leg wraps around to the first
            # leg of the day.
            siguiente = i + 1 if i < n_tramos - 1 else 0
            O_2 = tramos_tarjeta.iloc[siguiente]['h3_o']

            # Grid distance from the next origin to every candidate stop.
            distancias_a_paradas = paradas_destino_posibles.map(
                lambda h : h3.h3_distance(
                    h3_address_origin = O_2,
                    h3_address_h3 = h))

            # NOTE(review): the H3 v3 C library documents a negative return
            # value when the distance cannot be computed; such values must not
            # be mistaken for "closest stop". Confirm against the installed
            # h3-py version.
            distancias_validas = distancias_a_paradas[distancias_a_paradas >= 0]

            # Accept the nearest stop only if it is within tolerance.
            if not distancias_validas.empty and distancias_validas.min() < tolerancia_hex:
                parada_destino = paradas_destino_posibles.loc[distancias_validas.idxmin()]
            else:
                parada_destino = None

            # Store the destination on the leg's row.
            tramos_tarjeta.loc[tramo.name,'h3_d'] = parada_destino

    except Exception as error:
        # Best effort: report the failing card and the cause, keep going with
        # the remaining cards. KeyboardInterrupt/SystemExit are not swallowed.
        print('tarjeta:',tramos_tarjeta.tarjeta.unique(),'error:',repr(error))

    return tramos_tarjeta

def crear_Punto(row):
    """Build a shapely ``Point`` from a row's destination coordinates.

    Parameters
    ----------
    row : object
        Anything exposing ``lat_d`` and ``lon_d`` attributes (e.g. a
        DataFrame row passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    shapely.geometry.Point or None
        ``Point(lon_d, lat_d)``, or ``None`` when ``lat_d`` is missing.
    """
    # pd.isnull covers both None and NaN; pandas stores missing floats as NaN,
    # which an `== None` comparison would not detect.
    if pd.isnull(row.lat_d):
        return None
    return Point(row.lon_d,row.lat_d)
        

In [None]:
import os

# Database connection settings. Each value can be overridden through an
# environment variable so credentials do not have to live in the notebook;
# the defaults are the original local development values.
DB_USERNAME = os.environ.get('SUBE_DB_USERNAME', 'sube_user')
DB_PASSWORD = os.environ.get('SUBE_DB_PASSWORD', 'sube_pass')
DB_HOST = os.environ.get('SUBE_DB_HOST', 'localhost')
DB_PORT = os.environ.get('SUBE_DB_PORT', '5432')
DB_NAME = os.environ.get('SUBE_DB_NAME', 'sube')
DB_SCHEMA = os.environ.get('SUBE_DB_SCHEMA', 'public')

In [None]:
# Open a raw psycopg2 connection to the database using the DB_* settings.
conn = psycopg2.connect(
    host=DB_HOST,
    port=DB_PORT,
    database=DB_NAME,
    user=DB_USERNAME,
    password=DB_PASSWORD,
)

In [None]:
# SQLAlchemy engine pointing at the same database as `conn`.
engine = create_engine(
    'postgresql://{}:{}@{}:{}/{}'.format(
        DB_USERNAME, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME
    )
)

In [None]:
# Load the line-equivalence mapping (transaction line -> cartography line).
# NOTE(review): yaml.load with FullLoader can build more than plain data
# types; if this file only holds a simple mapping, yaml.safe_load would be
# the safer choice. Left unchanged pending confirmation of the file content.
with open('../recorridos_equivalencias.yaml') as file:
    # An empty YAML document loads as None; fall back to an empty mapping so
    # later lookups on `equivalencias` do not fail.
    equivalencias = yaml.load(file, Loader=yaml.FullLoader) or {}

In [None]:
# Maximum number of cards fetched and processed in one batch.
cantidad_de_tarjetas = 30_000

In [None]:
# Fetch every leg of up to `cantidad_de_tarjetas` cards for which ALL rows in
# tramos_linea_b still have a NULL h3_d (mask is true when the row count of
# the card equals its count of NULL destinations). Rows come back ordered by
# card and leg id.
q = """
select *
from tramos_linea_b t 
where tarjeta in (
    with mask_table as (
        select tarjeta,count(tarjeta) = SUM(CASE when h3_d  IS NULL THEN 1 else 0 END) as mask
        from tramos_linea_b tlb 
        group by tarjeta
    )
    select tarjeta
    from mask_table
    where mask = true
    limit %i
)
order by tarjeta, tramo_id;
"""%cantidad_de_tarjetas
tramos = pd.read_sql(q, conn)
# Convert the 'delta' column to pandas timedeltas.
tramos['delta'] = pd.to_timedelta(tramos['delta'])
tramos.head()

In [None]:
# Number of legs loaded for this batch.
tramos.shape[0]

In [None]:
# H3 resolution used for the stop cells, and the search tolerance in metres.
resolucion = 11
tolerancia_metros = 1000
# Twice the hexagon edge length (in metres) is taken as the spacing between
# neighbouring hexagons at this resolution.
distancia_entre_hex = 2 * h3.edge_length(resolution=resolucion, unit='m')
# Tolerance expressed as a whole number of hexagons (rounded up).
tolerancia_hex = np.ceil(tolerancia_metros / distancia_entre_hex)

In [None]:
# Load the stops table: line, coordinates and the H3 cell column that matches
# the chosen `resolucion`.
q = """
select p."linea", p."latitude",p."longitude",p."h3_res_%i"
from paradas p
"""%resolucion
paradas = pd.read_sql(q, conn)
paradas.head()

In [None]:
# Impute destinations card by card (timed with the IPython %time magic); each
# group of legs sharing a 'tarjeta' is passed to imputar_detino_por_tarjeta.
%time destinos = tramos.groupby(['tarjeta']).apply(imputar_detino_por_tarjeta,tolerancia_hex=tolerancia_hex)
destinos

In [None]:
#tarjetas_sin_destinos = destinos.groupby('tarjeta').agg(lambda x: x.isnull().sum()==len(x))
#tarjetas_sin_destinos[tarjetas_sin_destinos.h3_d == True].head()

In [None]:
# 5k tarjetas 18k trx tardo: 3 min
# 10k tarjetas 38k trx tardo: 6min
# 20k tarjetas 78k trx tardo: 13min
# 40k tarjetas 160k trx tardo: 29min
# 80k tarjetas 320k trx tardo: 56min
# 50k tarjetas 218k trx tardo: 40min

In [None]:
# Percentage of legs that ended up without an imputed destination.
destinos['h3_d'].isna().sum() / destinos.shape[0] * 100

In [None]:
# Per line: percentage of legs without destination, next to the number of
# legs of that line; show the 20 worst lines.
sin_destino_por_linea = destinos.groupby('linea').apply(
    lambda grupo: grupo['h3_d'].isna().sum() / len(grupo) * 100
)
sin_destino_por_linea = pd.DataFrame(
    {
        'prop_sin_destino': sin_destino_por_linea,
        'trx': destinos['linea'].value_counts(),
    }
)
sin_destino_por_linea.sort_values(by=['prop_sin_destino', 'trx'], ascending=False).head(20)

In [None]:
# Inspect all legs of one randomly sampled card.
destinos[destinos['tarjeta'] == destinos['tarjeta'].sample(n=1).iloc[0]]

In [None]:
# Keep only the key columns and the imputed destination for the upload.
destinos = destinos.reindex(['tarjeta', 'tramo_id', 'h3_d'], axis='columns')
destinos

In [None]:
# Upload the imputed destinations to a staging table. `if_exists='replace'`
# makes the cell re-runnable: a 'destinos' table left behind by an interrupted
# run (the table is only dropped after the UPDATE succeeds) would otherwise
# make to_sql raise.
destinos.to_sql('destinos', engine, schema=DB_SCHEMA, index=False,
                method='multi', if_exists='replace')

In [None]:
# SQL that copies h3_d from the 'destinos' staging table into tramos_linea_b
# (matching on tarjeta and tramo_id) and then drops the staging table.
update_query = """
UPDATE tramos_linea_b 
SET h3_d = d.h3_d
FROM destinos d
WHERE tramos_linea_b.tarjeta = d.tarjeta
and tramos_linea_b.tramo_id = d.tramo_id;


DROP TABLE IF EXISTS destinos; 
"""

In [None]:
# Run the update inside a transaction. `with conn` commits on success and
# rolls back if the statement raises, so the connection is not left in an
# aborted transaction for the following queries; `with conn.cursor()` closes
# the cursor in every case. The connection itself stays open.
with conn:
    with conn.cursor() as cur:
        cur.execute(update_query)

In [None]:
# Count the cards for which every row in tramos_linea_b still has a NULL h3_d,
# i.e. the cards still selectable by the batch query.
q="""
with mask_table as (
    select tarjeta,count(tarjeta) = SUM(CASE when h3_d  IS NULL THEN 1 else 0 END) as mask
    from tramos_linea_b tlb 
    group by tarjeta
)
select count(*)
from mask_table
where mask = true ;
"""
quedan = pd.read_sql(q, conn)
# The query returns a single cell; unwrap it to a scalar.
quedan = quedan.iloc[0,0]

In [None]:
# Report how many cards are still pending.
print('Quedan %s tarjetas' % (quedan,))

In [None]:
# post 80k 1 tanda tarjetas quedan 106811
# post 80k 2 tanda tarjetas quedan 27609
