In [1]:
import os

import folium
import numpy as np
import pandas as pd
import psycopg2
import yaml
from h3 import h3
from sklearn.cluster import DBSCAN
from sqlalchemy import create_engine


In [2]:
# Database connection settings.
# Each value can be overridden with a SUBE_DB_* environment variable so that
# real credentials never need to be written into (or committed with) the
# notebook. The fallbacks are the original local-development values, so the
# notebook behaves exactly as before when no variable is set.
DB_USERNAME = os.environ.get('SUBE_DB_USERNAME', 'sube_user')
DB_PASSWORD = os.environ.get('SUBE_DB_PASSWORD', 'sube_pass')
DB_HOST = os.environ.get('SUBE_DB_HOST', 'localhost')
DB_PORT = os.environ.get('SUBE_DB_PORT', '5432')  # kept as a string, as before
DB_NAME = os.environ.get('SUBE_DB_NAME', 'sube')
DB_SCHEMA = os.environ.get('SUBE_DB_SCHEMA', 'public')

In [3]:
# Open a raw psycopg2 (DB-API) connection to the database described by the
# DB_* settings.
conn = psycopg2.connect(
    user=DB_USERNAME,
    password=DB_PASSWORD,
    host=DB_HOST,
    port=DB_PORT,
    database=DB_NAME,
)

In [4]:
# SQLAlchemy engine for the same database, built from a postgresql:// URL
# assembled out of the DB_* settings.
engine = create_engine(
    'postgresql://{}:{}@{}:{}/{}'.format(
        DB_USERNAME, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME
    )
)

In [5]:
# Maximum number of cards (tarjetas) to sample; this value is interpolated
# into the LIMIT clause of the query built below.
cantidad_de_tarjetas = 80000

In [6]:
# Query: every trip of a sample of cards from viajes_linea_b, ordered by card
# and trip id.
# The inner CTE builds a per-card boolean `mask` that is true when the number
# of rows for the card equals the number of its rows with a non-null h3_d,
# i.e. when every trip of that card has a destination cell. Up to
# `cantidad_de_tarjetas` such cards are kept (the value is interpolated with
# %i, which formats it as an integer).
# NOTE(review): the inner LIMIT has no ORDER BY, so the set of sampled cards is
# presumably not guaranteed to be the same between runs — confirm whether a
# stable sample is needed.
q = """
select *
from viajes_linea_b t 
where tarjeta in (
    with mask_table as (
        select tarjeta,count(tarjeta) = SUM(CASE when h3_d  IS NULL THEN 0 else 1 END) as mask
        from viajes_linea_b tlb 
        group by tarjeta
    )
    select tarjeta
    from mask_table
    where mask = true
    limit %i
)
order by tarjeta, viaje_id;
"""%cantidad_de_tarjetas

In [7]:
# Run the query over the raw psycopg2 connection and load the result into a
# DataFrame.
# NOTE(review): pandas documents SQLAlchemy connectables (and sqlite3) as the
# supported `con` types; an SQLAlchemy `engine` is created earlier in this
# notebook but is not used here — confirm which one was intended.
viajes = pd.read_sql(q, conn)

In [8]:
# Quick size check of the loaded trips: (rows, columns).
viajes.shape

(176442, 11)

In [9]:
# Destination coordinates from the destination H3 cell.
# h3_to_geo is evaluated once per row and the resulting pair is reused for both
# columns (element 0 -> lat_d, element 1 -> lon_d), instead of converting every
# cell twice.
centros_d = viajes.h3_d.map(h3.h3_to_geo)
viajes['lat_d'] = centros_d.map(lambda centro: centro[0])
viajes['lon_d'] = centros_d.map(lambda centro: centro[1])

In [10]:
# Coordinate matrices for clustering, one row per trip:
# X_o holds the origin (lat_o, lon_o) pairs, X_d the destination
# (lat_d, lon_d) pairs.
X_o, X_d = (
    viajes.reindex(columns=columnas).to_numpy()
    for columnas in (['lat_o', 'lon_o'], ['lat_d', 'lon_d'])
)

In [11]:
# Density-based clustering of the trip origins. eps is expressed in the same
# units as X_o (raw latitude/longitude values); a point needs 1000 neighbours
# within eps to seed a cluster.
clusters_o = DBSCAN(
    eps=0.002,
    min_samples=1000,
).fit(X_o)
# Destination clustering is currently disabled:
#clusters_d = DBSCAN(eps=0.014, min_samples=50).fit(X_d)


In [12]:
# Attach the origin cluster label of every trip. labels_ has one entry per row
# of X_o, which was built from `viajes` in row order.
viajes['k_o'] = clusters_o.labels_
# Destination labels are disabled together with the destination clustering.
#viajes['k_d'] = clusters_d.labels_

In [13]:
# Per-cluster summary of the trip origins: centroid (mean latitude and mean
# longitude) and number of trips `n`.
# Named aggregation produces the final flat column names directly, instead of
# aggregating every column with both 'mean' and 'count', flattening the
# resulting MultiIndex, renaming the columns by position and dropping the
# redundant count. `n` is still the count of lon_o, as before.
k_o = (
    viajes
    .groupby('k_o')
    .agg(
        lat_o=('lat_o', 'mean'),
        lon_o=('lon_o', 'mean'),
        n=('lon_o', 'count'),
    )
    .reset_index()
)

In [None]:
# Disabled destination-side counterpart of the origin cluster summary. It is
# kept as a bare string literal, so none of it is executed.
'''
k_d = viajes.reindex(columns = ['lat_d','lon_d','k_d']).groupby('k_d').agg(['mean','count'])
k_d = k_d.reset_index()
k_d.columns = k_d.columns.droplevel(1)
k_d.columns = ['k_d','lat_d','n2','lon_d','n']
k_d=k_d.drop('n2',axis=1)
'''

In [None]:
#k_d 

In [14]:
# Display the origin cluster summary (one row per label, including -1).
k_o

Unnamed: 0,k_o,lat_o,lon_o,n
0,-1,-34.613125,-58.459705,68227
1,0,-34.57431,-58.486419,9902
2,1,-34.604116,-58.411781,4696
3,2,-34.604158,-58.389918,14224
4,3,-34.603751,-58.380743,5069
5,4,-34.60305,-58.369912,6667
6,5,-34.58675,-58.454938,5041
7,6,-34.602149,-58.431314,6965
8,7,-34.604627,-58.399467,4662
9,8,-34.59901,-58.439722,7828


In [None]:
# Export the trips (with the columns added above) to a CSV file in the current
# working directory, without the DataFrame index.
# NOTE(review): the file name looks like a throwaway placeholder — confirm
# whether anything reads it before renaming.
viajes.to_csv('caca.csv',index=False)

In [None]:
# Bounding box of the trip origins, padded by 0.06 on every side.
# Series.min()/.max() are vectorised and skip NaN, unlike the builtin
# min()/max(), which iterate over the Series in Python.
# NOTE(review): the previous version passed lat_o (and lon_o) twice to each
# min()/max() call. Only origins are drawn below, so only origin coordinates
# are used here; if destinations were meant to be included, fold lat_d/lon_d
# into these bounds.
miny = viajes.lat_o.min() - 0.06
maxy = viajes.lat_o.max() + 0.06
minx = viajes.lon_o.min() - 0.06
maxx = viajes.lon_o.max() + 0.06

# Map centre: midpoint of the bounding box.
x_centro, y_centro = minx + ((maxx - minx) / 2), miny + ((maxy - miny) / 2)

m = folium.Map(
    location=[y_centro, x_centro],
    zoom_start=12,
    tiles='cartodbpositron',
)

# One circle per row of the origin summary, with radius sqrt(n).
# itertuples() reads each row once and keeps k_o as an integer, so the popup
# reads 'Origen 3' instead of 'Origen 3.0' (row-wise .loc lookups upcast the
# mixed-dtype row to float).
# NOTE(review): the row labelled -1 is DBSCAN's noise label rather than a real
# cluster; it is still drawn here, as before — confirm whether it should be.
for cluster in k_o.itertuples(index=False):
    folium.Circle(
        radius=float(np.sqrt(cluster.n)),
        location=[cluster.lat_o, cluster.lon_o],
        popup='Origen ' + str(cluster.k_o),
        color='blue',
        fill=True,
    ).add_to(m)

# Disabled destination layer, kept as an inert string literal.
'''    
for i in k_d.index:
    folium.Circle(
            radius=k_d.loc[i].n,
            location=[k_d.loc[i].lat_d,k_d.lon_d.loc[i]],
            popup='Destino '+str(k_d.loc[i].k_d),
            color='red',
            fill=True,
        ).add_to(m)

'''
m