In [34]:
import pandas as pd 
from geolib import geohash
from haversine import haversine, Unit
from math import radians, cos, sin, asin, sqrt
import numpy as np 

In [32]:
def calculate_distance(lat1, lon1, lat2, lon2 ):
    # haversine formula 


    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c * 1000 # in meter 
    return km

In [7]:
# load dataset 

df_poi = pd.read_csv('../google_poi.csv')

df_zomato = pd.read_csv('3_clean_zomato_feat.csv')


In [26]:
# preprocessing POI

df_poi['geohash6'] = df_poi['geohash_poi_8'].apply(lambda x:x[:6])

#filter jakarta 
df_jak_poi = df_poi[(df_poi['city'].str.contains("Jak"))]
df_jak_poi = df_jak_poi.reset_index()

In [27]:
df_jak_poi.head()

Unnamed: 0,index,city,poi_type,name,lat_poi,long_poi,geohash_poi_8,geohash6
0,32,Jaksel,park,RPTRA Intan,-6.294798,106.804146,qqgunqh6,qqgunq
1,33,Jaksel,convenience_store,Toko Mahkota Jaya,-6.330934,106.797629,qqggyprx,qqggyp
2,34,Jaksel,bank,Bank Danamon - KC Ampera Raya,-6.287712,106.818734,qqgunxr5,qqgunx
3,35,Jaksel,supermarket,Akila Cell,-6.253428,106.760147,qqgumms7,qqgumm
4,45,Jakbar,lodging,Motel Bandung,-6.122592,106.686335,qqgv5tqt,qqgv5t


In [17]:
# preprocessing zomato 
df_zomato['geohash'] = df_zomato.apply(lambda x:geohash.encode(x['lat'],x['long'],6),axis=1)


In [35]:
# perform the geo extraction 
distance = 500 
df_zom_geo = df_zomato[['index','lat','long','geohash']]
df_zom_geo['neighbors'] = df_zom_geo['geohash'].apply(lambda x:list(geohash.neighbours(x)))
df_zom_geo = df_zom_geo.explode('neighbors')

df_zom_poi = pd.merge(df_zom_geo, df_jak_poi, left_on = 'neighbors', right_on = 'geohash6', how='left')
df_zom_poi['distance'] = df_zom_poi.apply(lambda x:calculate_distance(x['lat'],x['long'],x['lat_poi'],x['long_poi']), axis= 1)

df_zom_poi = df_zom_poi[df_zom_poi['distance'] < distance]


df_zom_poi_gb = df_zom_poi.groupby(['index_x','poi_type']).agg({'index_y':'nunique'}).reset_index()
df_zom_poi_gb.columns = ['index','poi_type','total']
df_zom_poi_gb['total'] = df_zom_poi_gb['total'].astype(int)

df_zom_poi_pivot = df_zom_poi_gb.pivot_table(index='index', columns='poi_type', values='total').fillna(0).reset_index()
prefix_name = 'd_{}_'.format(distance)
df_zom_poi_pivot = df_zom_poi_pivot.add_prefix(prefix_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_zom_geo['neighbors'] = df_zom_geo['geohash'].apply(lambda x:list(geohash.neighbours(x)))


In [69]:
df_zom_poi_pivot.head()

poi_type,d_500_index,d_500_airport,d_500_atm,d_500_bank,d_500_cafe,d_500_convenience_store,d_500_gas_station,d_500_hospital,d_500_lodging,d_500_meal_takeaway,d_500_mosque,d_500_park,d_500_restaurant,d_500_school,d_500_store,d_500_supermarket,d_500_train_station
0,0,0.0,20.0,26.0,9.0,11.0,0.0,6.0,6.0,2.0,0.0,2.0,40.0,10.0,22.0,6.0,0.0
1,1,0.0,104.0,51.0,49.0,5.0,0.0,2.0,20.0,14.0,4.0,2.0,82.0,12.0,48.0,20.0,0.0
2,2,0.0,20.0,14.0,31.0,6.0,0.0,11.0,15.0,6.0,2.0,1.0,32.0,13.0,19.0,10.0,0.0
3,3,0.0,23.0,19.0,8.0,3.0,0.0,0.0,5.0,2.0,0.0,6.0,8.0,7.0,12.0,4.0,0.0
4,4,0.0,43.0,28.0,17.0,6.0,0.0,3.0,13.0,5.0,0.0,2.0,14.0,7.0,18.0,11.0,0.0
