# comparar performance

In [1]:
import pandas as pd
import numpy as np
import yaml

from sqlalchemy import create_engine
import psycopg2

import geopandas as gpd
from h3 import h3
from shapely.geometry import LineString, Point

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def imputar_detino_por_tarjeta(tramos_tarjeta,tolerancia_hex = 21):
    try:
        for i in range(len(tramos_tarjeta)):
            # tomo un tramo
            tramo = tramos_tarjeta.iloc[i]
            # tomo la linea del tramo
            linea_trx = tramo.linea
            if linea_trx in equivalencias.keys():
                linea_carto = equivalencias[linea_trx]
            else:
                linea_carto = linea_trx

            # y sus posibles paradas de la cartografia
            paradas_destino_posibles = paradas.loc[paradas.LINEA == linea_carto,'h3_res_%i'%resolucion]

            # tomo las coordenadas del siguiente origen
            if i < (len(tramos_tarjeta)-1):
                #cuando hay un O_2
                O_2 = tramos_tarjeta.iloc[i+1]['h3_o']
            else:
                #si no tomar el primero del dia
                O_2 = tramos_tarjeta.iloc[0]['h3_o']
                
            #calculo la distancias        
            distancias_a_paradas = paradas_destino_posibles.map(
                lambda h : h3.h3_distance(
                    h3_address_origin = O_2,
                    h3_address_h3 = h))
            
            #evaluo tolerancia
            if any(distancias_a_paradas < tolerancia_hex):
                parada_destino = paradas_destino_posibles.loc[distancias_a_paradas.idxmin()]
            else:
                parada_destino = None 
                
            #asigno el destino
            tramos_tarjeta.loc[tramo.name,'h3_d'] = parada_destino
            
    except: 
        print('tarjeta:',tramos_tarjeta.tarjeta.unique())
        
    return tramos_tarjeta

def crear_Punto(row):
    if row.lat_d == None:
        return None
    else:
        return Point(row.lon_d,row.lat_d)
        

In [3]:
DB_USERNAME = 'sube_user'
DB_PASSWORD = 'sube_pass'
DB_HOST = 'localhost'
DB_PORT = '5432'
DB_NAME = 'sube'
DB_SCHEMA = 'public'

In [4]:
# Conectar a la db
conn = psycopg2.connect(user = DB_USERNAME,
                                      password = DB_PASSWORD,
                                      host = DB_HOST,
                                      port = DB_PORT,
                                      database = DB_NAME)

In [5]:
engine = create_engine('postgresql://{}:{}@{}:{}/{}'
    .format(DB_USERNAME, DB_PASSWORD, DB_HOST,
            DB_PORT, DB_NAME))

In [6]:
with open('../recorridos_equivalencias.yaml') as file:
    equivalencias = yaml.load(file, Loader=yaml.FullLoader)

In [7]:
cantidad_de_tarjetas = 30000

In [8]:
q = """
select *
from tramos_linea_b t 
where tarjeta in (
    with mask_table as (
        select tarjeta,count(tarjeta) = SUM(CASE when h3_d  IS NULL THEN 1 else 0 END) as mask
        from tramos_linea_b tlb 
        group by tarjeta
    )
    select tarjeta
    from mask_table
    where mask = true
    limit %i
)
order by tarjeta, tramo_id;
"""%cantidad_de_tarjetas
tramos = pd.read_sql(q, conn)
tramos['delta'] = pd.to_timedelta(tramos['delta'])
tramos.head()

Unnamed: 0,tarjeta,tramo_id,linea,fecha,lat_o,lon_o,delta,h3_o,lat_d,lon_d,h3_d
0,5636,0,LINEA 314,2018-10-10 08:03:29,-34.53047,-58.572,03:05:01,8bc2e3143cc1fff,,,
1,5636,1,LINEA B,2018-10-10 11:08:30,-34.574319,-58.486385,NaT,8bc2e31094b1fff,,,
2,14131,0,LINEA 107,2018-10-10 09:33:47,-34.58605,-58.50179,11:15:50,8bc2e310d2b5fff,,,
3,14131,1,LINEA B,2018-10-10 20:49:37,-34.60408,-58.411763,NaT,8bc2e311c55efff,,,
4,16149,0,SOFSE - Mitre,2018-10-10 08:49:56,-34.421285,-58.57928,10:52:27,8bc2e3040ad5fff,,,


In [9]:
len(tramos)

104109

In [10]:
resolucion = 11
tolerancia_metros = 1000
distancia_entre_hex = h3.edge_length(resolution=resolucion, unit='m') * 2 
tolerancia_hex = np.ceil(tolerancia_metros / distancia_entre_hex)

In [11]:
q = """
select p."LINEA", p."LATITUDE",p."LONGITUDE",p."h3_res_%i"
from paradas p
"""%resolucion
paradas = pd.read_sql(q, conn)
paradas.head()

Unnamed: 0,LINEA,LATITUDE,LONGITUDE,h3_res_11
0,LINEA 501,-34.887373,-58.384068,8bc2e39945adfff
1,LINEA 501,-34.88767,-58.387713,8bc2e3994cdbfff
2,LINEA 501,-34.887542,-58.390125,8bc2e39941b3fff
3,LINEA 501,-34.890546,-58.390244,8bc2e3994c76fff
4,LINEA 501,-34.892161,-58.39197,8bc2e3994d45fff


In [12]:
%time destinos = tramos.groupby(['tarjeta']).apply(imputar_detino_por_tarjeta,tolerancia_hex=tolerancia_hex)
destinos

CPU times: user 19min 1s, sys: 824 ms, total: 19min 2s
Wall time: 19min 3s


Unnamed: 0,tarjeta,tramo_id,linea,fecha,lat_o,lon_o,delta,h3_o,lat_d,lon_d,h3_d
0,5636,0,LINEA 314,2018-10-10 08:03:29,-34.530470,-58.572000,03:05:01,8bc2e3143cc1fff,,,
1,5636,1,LINEA B,2018-10-10 11:08:30,-34.574319,-58.486385,NaT,8bc2e31094b1fff,,,
2,14131,0,LINEA 107,2018-10-10 09:33:47,-34.586050,-58.501790,11:15:50,8bc2e310d2b5fff,,,
3,14131,1,LINEA B,2018-10-10 20:49:37,-34.604080,-58.411763,NaT,8bc2e311c55efff,,,
4,16149,0,SOFSE - Mitre,2018-10-10 08:49:56,-34.421285,-58.579280,10:52:27,8bc2e3040ad5fff,,,
...,...,...,...,...,...,...,...,...,...,...,...
104104,11297502,4,LINEA H,2018-10-10 15:37:49,-34.586044,-58.396803,00:29:38,8bc2e311a882fff,,,8bc2e310ad46fff
104105,11297502,5,METROVIAS S.A. (URQUIZA),2018-10-10 16:07:27,-34.585117,-58.455304,07:36:39,8bc2e310ac44fff,,,8bc2e3ab5975fff
104106,11297502,6,LINEA 440,2018-10-10 23:44:06,-34.537320,-58.706870,NaT,8bc2e3aa6c9cfff,,,8bc2e3ab5975fff
104107,11301040,0,LINEA B,2018-10-10 10:24:24,-34.603165,-58.420962,02:32:48,8bc2e311c075fff,,,8bc2e31aca6efff


In [13]:
#tarjetas_sin_destinos = destinos.groupby('tarjeta').agg(lambda x: x.isnull().sum()==len(x))
#tarjetas_sin_destinos[tarjetas_sin_destinos.h3_d == True].head()

In [14]:
# 5k tarjetas 18k trx tardo: 3 min
# 10k tarjetas 38k trx tardo: 6min
# 20k tarjetas 78k trx tardo: 13min
# 40k tarjetas 160k trx tardo: 29min
# 80k tarjetas 320k trx tardo: 56min
# 50k tarjetas 218 trx tardo: 40min

In [15]:
destinos.h3_d.isnull().sum()/len(destinos)*100

13.09108722588825

In [16]:
sin_destino_por_linea = destinos.groupby('linea').apply(lambda dt: dt.h3_d.isnull().sum()/len(dt)*100)
sin_destino_por_linea = pd.DataFrame({'prop_sin_destino':sin_destino_por_linea,'trx':destinos.linea.value_counts()})
sin_destino_por_linea.sort_values(by=['prop_sin_destino','trx'],ascending=False).head(20)

Unnamed: 0,prop_sin_destino,trx
LINEA 237A,100.0,299
LINEA 501G,100.0,174
LINEA 501 A,100.0,136
LINEA 501C,100.0,122
LINEA 061,100.0,99
LINEA 504A,100.0,63
LINEA 518,100.0,58
LINEA 443A,100.0,50
LINEA 506 AMBA ALMIRANTE BROWN,100.0,49
LINEA 32A,100.0,38


In [17]:
destinos.loc[destinos.tarjeta == destinos.tarjeta.sample(1).iloc[0],:]

Unnamed: 0,tarjeta,tramo_id,linea,fecha,lat_o,lon_o,delta,h3_o,lat_d,lon_d,h3_d
17057,10076659,0,METROVIAS S.A. (URQUIZA),2018-10-10 11:46:58,-34.590027,-58.572089,00:30:32,8bc2e3175a62fff,,,8bc2e310ad41fff
17058,10076659,1,LINEA B,2018-10-10 12:17:30,-34.587198,-58.455029,08:24:15,8bc2e310ad46fff,,,8bc2e311d98dfff
17059,10076659,2,LINEA B,2018-10-10 20:41:45,-34.598967,-58.439771,00:10:17,8bc2e311d98dfff,,,8bc2e310ad46fff
17060,10076659,3,METROVIAS S.A. (URQUIZA),2018-10-10 20:52:02,-34.585117,-58.455304,00:36:03,8bc2e310ac44fff,,,8bc2e3175a75fff
17061,10076659,4,LINEA 328,2018-10-10 21:28:05,-34.58955,-58.57268,NaT,8bc2e3175a63fff,,,8bc2e3175a54fff


In [18]:
destinos = destinos.reindex(columns = ['tarjeta','tramo_id','h3_d'])
destinos

Unnamed: 0,tarjeta,tramo_id,h3_d
0,5636,0,
1,5636,1,
2,14131,0,
3,14131,1,
4,16149,0,
...,...,...,...
104104,11297502,4,8bc2e310ad46fff
104105,11297502,5,8bc2e3ab5975fff
104106,11297502,6,8bc2e3ab5975fff
104107,11301040,0,8bc2e31aca6efff


In [19]:
destinos.to_sql('destinos', engine, schema=DB_SCHEMA,index=False,method='multi')

In [20]:
update_query = """
UPDATE tramos_linea_b 
SET h3_d = d.h3_d
FROM destinos d
WHERE tramos_linea_b.tarjeta = d.tarjeta
and tramos_linea_b.tramo_id = d.tramo_id;


DROP TABLE IF EXISTS destinos; 
"""

In [21]:
cur = conn.cursor()
cur.execute(update_query)
cur.close()
conn.commit()

In [22]:
q="""
with mask_table as (
    select tarjeta,count(tarjeta) = SUM(CASE when h3_d  IS NULL THEN 1 else 0 END) as mask
    from tramos_linea_b tlb 
    group by tarjeta
)
select count(*)
from mask_table
where mask = true ;
"""
quedan = pd.read_sql(q, conn)
quedan = quedan.iloc[0,0]

In [23]:
print('Quedan %s tarjetas'%quedan)

Quedan 935 tarjetas


In [24]:
# post 80k 1 tanda tarjetas quedan 106811
# post 80k 2 tanda tarjetas quedan 27609
