In [1]:
import pandas as pd
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=16)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### Read Data

In [2]:
parquet = "/home/foolingeng/ruien/data/grab-posisi/city=Singapore/part-00000-8bbff892-97d2-4011-9961-703e38972569.c000.snappy.parquet"
df = pd.read_parquet(parquet)
df

Unnamed: 0,trj_id,driving_mode,osname,pingtimestamp,rawlat,rawlng,speed,bearing,accuracy
0,70014,car,android,1554943236,1.342326,103.888969,18.910000,248,3.9
1,73573,car,android,1555582623,1.321781,103.856366,17.719076,44,4.0
2,75567,car,android,1555141026,1.327088,103.861273,14.021548,34,3.9
3,1410,car,android,1555731693,1.262482,103.823794,13.026521,181,4.0
4,4354,car,android,1555584497,1.283799,103.807210,14.812943,93,3.9
...,...,...,...,...,...,...,...,...,...
3034548,67126,car,android,1554971237,1.349722,103.792840,18.255438,93,6.0
3034549,11786,car,android,1555248352,1.386651,103.774858,23.341484,352,6.0
3034550,21686,car,ios,1555805714,1.329604,103.822454,23.468313,111,5.0
3034551,70139,car,android,1555685891,1.322202,103.886999,20.248346,266,6.0


In [3]:
mall_coords = "/home/foolingeng/ruien/data/grab-posisi/amenities/shoppingmall_coordinates_clean.csv"
mall_coords = pd.read_csv(mall_coords)
mall_coords

Unnamed: 0,address,LATITUDE,LONGITUDE
0,Katong Square,1.304853,103.904574
1,PLQ Mall,1.317482,103.892832
2,Capitol Piazza,1.293063,103.851293
3,Bugis Cube,1.298195,103.855655
4,HillV2,1.363551,103.764236
...,...,...,...
154,CityLink Mall,1.292379,103.854663
155,The Clementi Mall,1.315497,103.764570
156,The Paragon,1.303949,103.835844
157,Tiong Bahru Plaza,1.286471,103.827158


In [4]:
import geopandas as gpd
mall_gdf = gpd.GeoDataFrame(mall_coords, 
    geometry=gpd.points_from_xy(mall_coords.LATITUDE, mall_coords.LONGITUDE))
gdf = gpd.GeoDataFrame(df, 
    geometry=gpd.points_from_xy(df['rawlat'], df['rawlng']))
mall_gdf, gdf

(               address  LATITUDE   LONGITUDE               geometry
 0        Katong Square  1.304853  103.904574  POINT (1.305 103.905)
 1             PLQ Mall  1.317482  103.892832  POINT (1.317 103.893)
 2       Capitol Piazza  1.293063  103.851293  POINT (1.293 103.851)
 3           Bugis Cube  1.298195  103.855655  POINT (1.298 103.856)
 4               HillV2  1.363551  103.764236  POINT (1.364 103.764)
 ..                 ...       ...         ...                    ...
 154      CityLink Mall  1.292379  103.854663  POINT (1.292 103.855)
 155  The Clementi Mall  1.315497  103.764570  POINT (1.315 103.765)
 156        The Paragon  1.303949  103.835844  POINT (1.304 103.836)
 157  Tiong Bahru Plaza  1.286471  103.827158  POINT (1.286 103.827)
 158       Jurong Point  1.339452  103.706685  POINT (1.339 103.707)
 
 [159 rows x 4 columns],
         trj_id driving_mode   osname  pingtimestamp    rawlat      rawlng  \
 0        70014          car  android     1554943236  1.342326  103

### Get nearest distance

In [7]:
from scipy.spatial import cKDTree
import numpy as np

def ckdnearest(gdA, gdB):

    nA = np.array(list(gdA.geometry.apply(lambda x: (x.x, x.y))))
    nB = np.array(list(gdB.geometry.apply(lambda x: (x.x, x.y))))
    btree = cKDTree(nB)
    dist, idx = btree.query(nA, k=1)
    gdB_nearest = gdB.iloc[idx].drop(columns="geometry").reset_index(drop=True)
    gdf = pd.concat(
        [
            gdA.reset_index(drop=True),
            gdB_nearest,
            pd.Series(dist, name='dist')
        ], 
        axis=1)

    return gdf

malls = ckdnearest(gdf, mall_gdf)
malls

Unnamed: 0,trj_id,driving_mode,osname,pingtimestamp,rawlat,rawlng,speed,bearing,accuracy,geometry,address,LATITUDE,LONGITUDE,dist
0,70014,car,android,1554943236,1.342326,103.888969,18.910000,248,3.9,POINT (1.342 103.889),Upper Serangoon Shopping Centre,1.353481,103.878828,0.015076
1,73573,car,android,1555582623,1.321781,103.856366,17.719076,44,4.0,POINT (1.322 103.856),City Square Mall,1.311477,103.856781,0.010312
2,75567,car,android,1555141026,1.327088,103.861273,14.021548,34,3.9,POINT (1.327 103.861),Zhongshan Mall,1.326945,103.846554,0.014720
3,1410,car,android,1555731693,1.262482,103.823794,13.026521,181,4.0,POINT (1.262 103.824),VivoCity,1.264395,103.821809,0.002757
4,4354,car,android,1555584497,1.283799,103.807210,14.812943,93,3.9,POINT (1.284 103.807),Alexandra Central,1.287456,103.805415,0.004074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3034548,67126,car,android,1554971237,1.349722,103.792840,18.255438,93,6.0,POINT (1.350 103.793),Beauty World Centre,1.342413,103.776539,0.017865
3034549,11786,car,android,1555248352,1.386651,103.774858,23.341484,352,6.0,POINT (1.387 103.775),Fajar Shopping Centre,1.384006,103.771030,0.004653
3034550,21686,car,ios,1555805714,1.329604,103.822454,23.468313,111,5.0,POINT (1.330 103.822),Balestier Hill Shopping Centre,1.326124,103.843710,0.021538
3034551,70139,car,android,1555685891,1.322202,103.886999,20.248346,266,6.0,POINT (1.322 103.887),Paya Lebar Square,1.318676,103.892551,0.006577


In [11]:
malls = malls.rename(columns={'LATITUDE': 'mall_lat', 'LONGITUDE': 'mall_lng', 'dist': 'mall_dst'})
# malls['mall_dst'] = malls['mall_dst'] * 111139
# malls.drop(['geometry'], axis=1).to_parquet('test.parquet')
malls

  pd.Int64Index,


In [None]:
# reading and writing for geofiles are slow ~5min++
# gdf.to_file(f'test.gpkg', layer='test', driver="GPKG")
# gdf2 = gpd.read_file('test.gpkg', layer='test')
# gdf2