In [1]:
import pandas as pd 
from geolib import geohash
from haversine import haversine, Unit
from math import radians, cos, sin, asin, sqrt
import numpy as np 

In [2]:
def calculate_distance(lat1, lon1, lat2, lon2 ):
    # haversine formula 


    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c * 1000 # in meter 
    return km

In [3]:
# load dataset 

df_poi = pd.read_csv('../google_poi.csv')

df_zomato = pd.read_csv('data/3_clean_zomato_feat.csv')


In [4]:
# preprocessing POI

df_poi['geohash6'] = df_poi['geohash_poi_8'].apply(lambda x:x[:6])

#filter jakarta 
df_jak_poi = df_poi[(df_poi['city'].str.contains("Jak"))]
df_jak_poi = df_jak_poi.reset_index()

In [5]:
df_jak_poi.head()

Unnamed: 0,index,city,poi_type,name,lat_poi,long_poi,geohash_poi_8,geohash6
0,32,Jaksel,park,RPTRA Intan,-6.294798,106.804146,qqgunqh6,qqgunq
1,33,Jaksel,convenience_store,Toko Mahkota Jaya,-6.330934,106.797629,qqggyprx,qqggyp
2,34,Jaksel,bank,Bank Danamon - KC Ampera Raya,-6.287712,106.818734,qqgunxr5,qqgunx
3,35,Jaksel,supermarket,Akila Cell,-6.253428,106.760147,qqgumms7,qqgumm
4,45,Jakbar,lodging,Motel Bandung,-6.122592,106.686335,qqgv5tqt,qqgv5t


In [6]:
# preprocessing zomato 
df_zomato['geohash'] = df_zomato.apply(lambda x:geohash.encode(x['lat'],x['long'],6),axis=1)


In [7]:
def density_gof(df_zomato, df_poi, distance):
    # data preparation 
    df_zom_geo = df_zomato[['index','lat','long','geohash']]
    df_zom_geo['neighbors'] = df_zom_geo['geohash'].apply(lambda x:list(geohash.neighbours(x)))
    df_zom_geo = df_zom_geo.explode('neighbors')

    # merge with poi 
    df_zom_poi = pd.merge(df_zom_geo, df_poi, left_on = 'neighbors', right_on = 'geohash6', how='left')
    df_zom_poi['distance'] = df_zom_poi.apply(lambda x:calculate_distance(x['lat'],x['long'],x['lat_poi'],x['long_poi']), axis= 1)

    # filter distance 
    df_zom_poi = df_zom_poi[df_zom_poi['distance'] < distance]

    # aggregate  
    df_zom_poi_gb = df_zom_poi.groupby(['index_x','poi_type']).agg({'index_y':'nunique'}).reset_index()
    df_zom_poi_gb.columns = ['index','poi_type','total']
    df_zom_poi_gb['total'] = df_zom_poi_gb['total'].astype(int)

    # pivot 
    df_zom_poi_pivot = df_zom_poi_gb.pivot_table(index='index', columns='poi_type', values='total').fillna(0).reset_index()
    prefix_name = 'd_{}_'.format(distance)
    df_zom_poi_pivot = df_zom_poi_pivot.add_prefix(prefix_name)

    return df_zom_poi_pivot 

In [10]:
df_poi.columns

Index(['city', 'poi_type', 'name', 'lat_poi', 'long_poi', 'geohash_poi_8',
       'geohash6'],
      dtype='object')

In [11]:
df_gof_dens_500 = density_gof(df_zomato, df_jak_poi, 500)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_zom_geo['neighbors'] = df_zom_geo['geohash'].apply(lambda x:list(geohash.neighbours(x)))


In [12]:
df_gof_dens_500.head()

poi_type,d_500_index,d_500_airport,d_500_atm,d_500_bank,d_500_cafe,d_500_convenience_store,d_500_gas_station,d_500_hospital,d_500_lodging,d_500_meal_takeaway,d_500_mosque,d_500_park,d_500_restaurant,d_500_school,d_500_store,d_500_supermarket,d_500_train_station
0,0,0.0,20.0,26.0,9.0,11.0,0.0,6.0,6.0,2.0,0.0,2.0,40.0,10.0,22.0,6.0,0.0
1,1,0.0,104.0,51.0,49.0,5.0,0.0,2.0,20.0,14.0,4.0,2.0,82.0,12.0,48.0,20.0,0.0
2,2,0.0,20.0,14.0,31.0,6.0,0.0,11.0,15.0,6.0,2.0,1.0,32.0,13.0,19.0,10.0,0.0
3,3,0.0,23.0,19.0,8.0,3.0,0.0,0.0,5.0,2.0,0.0,6.0,8.0,7.0,12.0,4.0,0.0
4,4,0.0,43.0,28.0,17.0,6.0,0.0,3.0,13.0,5.0,0.0,2.0,14.0,7.0,18.0,11.0,0.0


In [13]:
df_gof_dens_500.shape, df_zomato.shape

((5788, 17), (5803, 13))

In [28]:
df_zomato_gof_dens_500 = pd.merge(df_zomato, df_gof_dens_500, left_on='index', right_on='d_500_index', how='left')
df_zomato_gof_dens_500 = df_zomato_gof_dens_500.fillna(0)
df_zomato_gof_dens_500 = df_zomato_gof_dens_500.drop('d_500_index',axis=1)

In [29]:
df_zomato_gof_dens_500.head()

Unnamed: 0,index,url,rest_price_idr,review,lat,long,is_chain,rating,new_code_res_type,rank_res_type,...,d_500_hospital,d_500_lodging,d_500_meal_takeaway,d_500_mosque,d_500_park,d_500_restaurant,d_500_school,d_500_store,d_500_supermarket,d_500_train_station
0,0,https://www.zomato.com/jakarta/wakacao-1-kelap...,150000.0,56.0,-6.167531,106.901752,0,3.8,4,0,...,6.0,6.0,2.0,0.0,2.0,40.0,10.0,22.0,6.0,0.0
1,1,https://www.zomato.com/jakarta/the-coffee-bean...,110000.0,61.0,-6.157341,106.907888,1,3.0,0,91,...,2.0,20.0,14.0,4.0,2.0,82.0,12.0,48.0,20.0,0.0
2,2,https://www.zomato.com/jakarta/angke-kelapa-ga...,450000.0,363.0,-6.152638,106.892576,0,4.5,2,2,...,11.0,15.0,6.0,2.0,1.0,32.0,13.0,19.0,10.0,0.0
3,3,https://www.zomato.com/jakarta/ikan-nila-pak-u...,100000.0,113.0,-6.163588,106.903689,0,3.9,0,20,...,0.0,5.0,2.0,0.0,6.0,8.0,7.0,12.0,4.0,0.0
4,4,https://www.zomato.com/jakarta/hong-kong-sheng...,220000.0,257.0,-6.157255,106.90843,1,4.1,0,275,...,3.0,13.0,5.0,0.0,2.0,14.0,7.0,18.0,11.0,0.0


In [31]:
df_zomato_gof_dens_500.columns

Index(['index', 'url', 'rest_price_idr', 'review', 'lat', 'long', 'is_chain',
       'rating', 'new_code_res_type', 'rank_res_type', 'new_code_fac',
       'rank_fac', 'geohash', 'd_500_airport', 'd_500_atm', 'd_500_bank',
       'd_500_cafe', 'd_500_convenience_store', 'd_500_gas_station',
       'd_500_hospital', 'd_500_lodging', 'd_500_meal_takeaway',
       'd_500_mosque', 'd_500_park', 'd_500_restaurant', 'd_500_school',
       'd_500_store', 'd_500_supermarket', 'd_500_train_station'],
      dtype='object')

In [34]:
df_zomato_gof_dens_500.to_csv('data/3_clean_zomato_gof_500.csv',index=False)