In [1]:
import numpy as np
import pandas as pd

## Load Data

In [2]:
# Load the sample frame from the HDF file and preview its first five rows.
df = pd.read_hdf("data/sample_data.hdf")
df.head(5)

Unnamed: 0,trip_id,lat,lon,imo,timestamp_position,sog,name,dist_orig,dist_dest,port_id_orig,port_id_dest,cum_dist_orig,cum_dist_dest,time_to_arrive,time_from_departure,time_port_to_port
0,1234567890,33.644173,-21.122902,1212428,2017-01-01 19:28:16,-69.9,DUMMY,457569.057001,3016.951519,285,316,542072.385284,22961.778699,0.145833,11.202083,11.347917
1,1234567890,33.711784,-21.122818,1212428,2017-01-01 21:15:36,0.0,DUMMY,464362.593692,4540.163553,285,316,549586.456525,9177.323661,0.071528,11.276389,11.347917
2,1234567890,33.85944,-22.946632,1212428,2017-01-01 16:57:17,-69.8,DUMMY,485558.703834,25914.821769,285,316,512531.912831,55064.576092,0.250694,11.097222,11.347917
3,1234567890,33.936378,-22.906212,1212428,2016-06-05 15:57:49,5.3,DUMMY,494737.796234,34899.458219,285,316,499450.236029,59167.090988,1.292361,10.055556,11.347917
4,1234567890,33.945038,-22.908249,1212428,2016-06-05 16:42:06,-69.3,DUMMY,495523.918761,35620.619295,285,316,500420.026386,57642.342082,1.261111,10.086806,11.347917


## Point for Neighborhood

In [3]:
# Query point as a single (lat, lon) row in degrees; kept 2-D with shape (1, 2).
pt = np.array([44.7529478, -7.0088461]).reshape(1, -1)

### Baseline version (row-wise `apply`)

In [5]:
import math
def distance(lat_orig, lon_orig, lat_dest, lon_dest, radius=6371.0):
    """Return the great-circle (haversine) distance between two points.

    Parameters
    ----------
    lat_orig, lon_orig : float
        Latitude and longitude of the origin, in degrees.
    lat_dest, lon_dest : float
        Latitude and longitude of the destination, in degrees.
    radius : float, optional
        Radius of the sphere. The result is expressed in the same unit;
        the default of 6371.0 gives kilometres on Earth.

    Returns
    -------
    float
        Distance along the sphere's surface. NaN inputs propagate to a NaN
        result.
    """
    dlat = math.radians(lat_dest - lat_orig)
    dlon = math.radians(lon_dest - lon_orig)

    sin_half_dlat = math.sin(dlat / 2)
    sin_half_dlon = math.sin(dlon / 2)

    a = sin_half_dlat * sin_half_dlat + math.cos(math.radians(lat_orig)) \
        * math.cos(math.radians(lat_dest)) * sin_half_dlon * sin_half_dlon

    # Floating-point rounding can push `a` marginally above 1 for (near-)
    # antipodal points, which would make sqrt(1 - a) raise ValueError.
    # A plain comparison is used (not min/max) so that NaN is left untouched.
    if a > 1.0:
        a = 1.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c

%timeit df['distance'] = df[['lat','lon']].apply(lambda x: distance(x['lat'],x['lon'], pt[0,0], pt[0,1]), axis=1)
df.sort_values(by='distance')[:5]

1 loop, best of 3: 25.5 s per loop


Unnamed: 0,trip_id,lat,lon,imo,timestamp_position,sog,name,dist_orig,dist_dest,port_id_orig,port_id_dest,cum_dist_orig,cum_dist_dest,time_to_arrive,time_from_departure,time_port_to_port,distances_cython,distance
482060,1234567890,22.261665,-72.12282,9332169,2016-10-26 15:26:34,12.9,DUMMY2,343556.44246,5369220.0,159,316,349856.94027,5667730.0,11.218056,0.625,11.843056,4971.920974,55.37072
470707,1234567890,22.28968,-72.279598,9332169,2016-10-26 15:58:53,13.0,DUMMY2,356447.489122,5356404.0,159,316,362754.19393,5655686.0,11.195833,0.647222,11.843056,4977.209267,55.806292
482058,1234567890,22.232594,-6.977557,9332169,2016-10-26 14:56:13,12.7,DUMMY2,331512.184176,5381220.0,159,316,337812.505745,5680057.0,11.238889,0.604167,11.843056,4966.718585,57.913957
470708,1234567890,22.322998,-72.464509,9332169,2016-10-26 16:37:13,12.9,DUMMY2,371653.756314,5341282.0,159,316,377965.506905,5642788.0,11.16875,0.674306,11.843056,4983.531876,59.915695
793504,1234567890,22.203129,-6.957217,9433808,2016-07-26 00:23:33,14.3,DUMMY4,329059.974783,5383893.0,159,316,336655.209965,5964044.0,10.070139,0.527778,10.597917,4963.211649,61.274151


### Vectorised version

In [6]:
DISTANCE = np.vectorize(distance, excluded=['lat_dest', 'lon_dest'])

pt[0,0], pt[0,1]
%timeit df['distances_vect'] = DISTANCE(df['lat'], df['lon'], pt[0,0], pt[0,1])
df.sort_values(by='distances_vect')[:5]

1 loop, best of 3: 3.63 s per loop


Unnamed: 0,trip_id,lat,lon,imo,timestamp_position,sog,name,dist_orig,dist_dest,port_id_orig,port_id_dest,cum_dist_orig,cum_dist_dest,time_to_arrive,time_from_departure,time_port_to_port,distances_cython,distance,distances_vect
482060,1234567890,22.261665,-72.12282,9332169,2016-10-26 15:26:34,12.9,DUMMY2,343556.44246,5369220.0,159,316,349856.94027,5667730.0,11.218056,0.625,11.843056,4971.920974,55.37072,55.37072
470707,1234567890,22.28968,-72.279598,9332169,2016-10-26 15:58:53,13.0,DUMMY2,356447.489122,5356404.0,159,316,362754.19393,5655686.0,11.195833,0.647222,11.843056,4977.209267,55.806292,55.806292
482058,1234567890,22.232594,-6.977557,9332169,2016-10-26 14:56:13,12.7,DUMMY2,331512.184176,5381220.0,159,316,337812.505745,5680057.0,11.238889,0.604167,11.843056,4966.718585,57.913957,57.913957
470708,1234567890,22.322998,-72.464509,9332169,2016-10-26 16:37:13,12.9,DUMMY2,371653.756314,5341282.0,159,316,377965.506905,5642788.0,11.16875,0.674306,11.843056,4983.531876,59.915695,59.915695
793504,1234567890,22.203129,-6.957217,9433808,2016-07-26 00:23:33,14.3,DUMMY4,329059.974783,5383893.0,159,316,336655.209965,5964044.0,10.070139,0.527778,10.597917,4963.211649,61.274151,61.274151


### Cythonised version

In [4]:
from distance_fast import dist

%timeit df['distances_cython'] = dist(df[['lat','lon']].values, pt)
df.sort_values(by='distances_cython')[:5]

1 loop, best of 3: 222 ms per loop


Unnamed: 0,trip_id,lat,lon,imo,timestamp_position,sog,name,dist_orig,dist_dest,port_id_orig,port_id_dest,cum_dist_orig,cum_dist_dest,time_to_arrive,time_from_departure,time_port_to_port,distances_cython
858953,1234567890,-0.738852,-0.417325,9467809,2016-08-06 20:06:50,13.8,DUMMY3,1536115.0,8696452.0,770,316,1542084.0,8936040.0,16.886806,2.522917,19.409722,94.355449
858954,1234567890,-0.639873,-0.640408,9467809,2016-08-06 21:11:01,13.8,DUMMY3,1563249.0,8670651.0,770,316,1569220.0,8881155.0,16.841667,2.568056,19.409722,100.663267
294224,1234567890,0.910767,-0.0086,9270749,2016-08-10 13:26:00,12.4,DUMMY5,888576.5,8609174.0,714,316,1141748.0,9105396.0,20.407639,2.11875,22.526389,101.277146
858955,1234567890,-0.606103,-0.716865,9467809,2016-08-06 21:33:13,13.6,DUMMY3,1572543.0,8661820.0,770,316,1578514.0,8854018.0,16.826389,2.583333,19.409722,104.383407
858952,1234567890,-0.939317,0.033767,9467809,2016-08-06 17:56:40,13.7,DUMMY3,1481235.0,8748655.0,770,316,1487199.0,8948106.0,16.977083,2.432639,19.409722,104.514709


## Geographical NN Search: BallTree

In [8]:
from sklearn.neighbors import BallTree

EARTH_RADIUS_KM = 6371.0    # same sphere radius the scalar distance() uses
SEARCH_RADIUS_KM = 200.0    # neighbourhood radius around the query point

# The haversine metric works on (lat, lon) pairs expressed in radians.
b = BallTree(np.deg2rad(df[['lat','lon']].values), metric='haversine')

# Reuse the query point defined earlier; it is already 2-D with shape (1, 2),
# which is the layout query_radius expects (one row per query).
pt_rad = np.deg2rad(pt)

# The radius is given as an angle (km / Earth radius). The distances are bound
# to `dist_rad`, not `dist`, so the `dist` function imported from
# distance_fast is not overwritten.
ind, dist_rad = b.query_radius(pt_rad, SEARCH_RADIUS_KM / EARTH_RADIUS_KM, return_distance=True, sort_results=True)

# query_radius returns row *positions*; translate them to index labels before
# using label-based .loc (the removed .ix indexer mixed the two silently).
neighbour_labels = df.index[ind[0]]

# Rows outside the search radius keep NaN.
df['distance_balltree'] = np.nan
df.loc[neighbour_labels, 'distance_balltree'] = dist_rad[0] * EARTH_RADIUS_KM
df.loc[neighbour_labels][:5]



Unnamed: 0,trip_id,lat,lon,imo,timestamp_position,sog,name,dist_orig,dist_dest,port_id_orig,port_id_dest,cum_dist_orig,cum_dist_dest,time_to_arrive,time_from_departure,time_port_to_port,distances_cython,distance,distances_vect,distance_balltree
482060,1234567890,22.261665,-72.12282,9332169,2016-10-26 15:26:34,12.9,DUMMY2,343556.44246,5369220.0,159,316,349856.94027,5667730.0,11.218056,0.625,11.843056,4971.920974,55.37072,55.37072,55.37072
470707,1234567890,22.28968,-72.279598,9332169,2016-10-26 15:58:53,13.0,DUMMY2,356447.489122,5356404.0,159,316,362754.19393,5655686.0,11.195833,0.647222,11.843056,4977.209267,55.806292,55.806292,55.806292
482058,1234567890,22.232594,-6.977557,9332169,2016-10-26 14:56:13,12.7,DUMMY2,331512.184176,5381220.0,159,316,337812.505745,5680057.0,11.238889,0.604167,11.843056,4966.718585,57.913957,57.913957,57.913957
470708,1234567890,22.322998,-72.464509,9332169,2016-10-26 16:37:13,12.9,DUMMY2,371653.756314,5341282.0,159,316,377965.506905,5642788.0,11.16875,0.674306,11.843056,4983.531876,59.915695,59.915695,59.915695
793504,1234567890,22.203129,-6.957217,9433808,2016-07-26 00:23:33,14.3,DUMMY4,329059.974783,5383893.0,159,316,336655.209965,5964044.0,10.070139,0.527778,10.597917,4963.211649,61.274151,61.274151,61.274151
