In [1]:
import pandas as pd 
from geolib import geohash
from haversine import haversine, Unit
from math import radians, cos, sin, asin, sqrt
from scipy.stats import entropy
import numpy as np 
from numpy.linalg import norm

In [2]:
def calculate_distance(lat1, lon1, lat2, lon2 ):
    # haversine formula 


    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c * 1000 # in meter 
    return km

def compute_entropy(df_input):
    
    return entropy(df_input)


def density_gof(df_zomato, df_poi, distance):
    # data preparation 
    df_zom_geo = df_zomato[['index','lat','long','geohash']]
    df_zom_geo['neighbors'] = df_zom_geo['geohash'].apply(lambda x:list(geohash.neighbours(x)))
    df_zom_geo = df_zom_geo.explode('neighbors')

    # merge with poi 
    df_zom_poi = pd.merge(df_zom_geo, df_poi, left_on = 'neighbors', right_on = 'geohash6', how='left')
    df_zom_poi['distance'] = df_zom_poi.apply(lambda x:calculate_distance(x['lat'],x['long'],x['lat_poi'],x['long_poi']), axis= 1)

    # filter distance 
    df_zom_poi = df_zom_poi[df_zom_poi['distance'] < distance]

    # aggregate  
    df_zom_poi_gb = df_zom_poi.groupby(['index_x','poi_type']).agg({'index_y':'nunique'}).reset_index()
    df_zom_poi_gb.columns = ['index','poi_type','total']
    df_zom_poi_gb['total'] = df_zom_poi_gb['total'].astype(int)

    # pivot 
    df_zom_poi_pivot = df_zom_poi_gb.pivot_table(index='index', columns='poi_type', values='total').fillna(0).reset_index()
    prefix_name = 'd_{}_'.format(distance)
    df_zom_poi_pivot = df_zom_poi_pivot.add_prefix(prefix_name)

    return df_zom_poi_pivot 

def compt_gof(df_zomato_1, df_zomato_2, distance):
    # data preparation 
    df_zom_geo = df_zomato_1[['index','lat','long','geohash','encode']]
    df_zom_geo['neighbors'] = df_zom_geo['geohash'].apply(lambda x:list(geohash.neighbours(x)))
    df_zom_geo = df_zom_geo.explode('neighbors')

    #Preprocessing for df_zomato_2

    df_zom_geo2 = df_zomato_2[['index','lat','long','geohash','encode']]
    df_zom_geo2.columns = ['index2','lat2','long2','geohash2','encode2']

    print(df_zom_geo.shape)
    print(df_zom_geo2.shape)

    print(df_zom_geo.columns)
    print(df_zom_geo2.columns)

    # # merge with poi 
    df_zom_zom = pd.merge(df_zom_geo, df_zom_geo2, left_on = 'neighbors', right_on = 'geohash2', how='inner')
    df_zom_zom['distance'] = df_zom_zom.apply(lambda x:calculate_distance(x['lat'],x['long'],x['lat2'],x['long2']), axis= 1)

    print(df_zom_zom.shape)
    print(df_zom_zom.columns)
    # filter distance 
    df_zom_zom = df_zom_zom[df_zom_zom['distance'] < distance]

    print(df_zom_zom.shape)
    print(df_zom_zom.columns)
    
    # compute competitiveness 
    df_zom_zom['comp_score'] = df_zom_zom.appply(lambda x:compute_competitiveness(x['encode'],x['encode2']))

    return df_zom_zom 

def compute_competitiveness(rest_a, rest_b):

    # convert to list 
    list_a = list(map(lambda x:int(x), list(rest_a)))
    list_b = list(map(lambda x:int(x), list(rest_b)))

    # convert to array 
    arr_a = np.array(list_a)
    arr_b = np.array(list_b)

    # compute similarity 
    cosine = np.dot(arr_a,arr_b)/(norm(arr_a)*norm(arr_b))

    return cosine


In [3]:
# df_tmp = pd.read_csv('data/3_clean_zomato_gof_500.csv')
# df_tmp = df_tmp.drop(['url', 'rest_price_idr', 'review', 'lat', 'long', 'is_chain',
#        'rating', 'new_code_res_type', 'rank_res_type', 'new_code_fac',
#        'rank_fac', 'geohash'],axis=1)

# geo_cols = df_tmp.columns[1:]
# df_tmp.head()

In [4]:
# distance=500
# df_tmp['en_{}'.format(distance)] = df_tmp[geo_cols].apply(lambda x:compute_entropy(x),axis=1)

# df_tmp['en_500'].plot(kind='hist')

In [5]:
# load dataset 

df_poi = pd.read_csv('../google_poi.csv')

df_zomato = pd.read_csv('data/3_clean_zomato_feat.csv')

df_zomato_type = pd.read_csv('data/2_res_type.csv')

df_zomato = pd.merge(df_zomato, df_zomato_type[['encode','rank']], left_on='rank_res_type', right_on='rank', how='left')


In [6]:
# preprocessing POI

df_poi['geohash6'] = df_poi['geohash_poi_8'].apply(lambda x:x[:6])

#filter jakarta 
df_jak_poi = df_poi[(df_poi['city'].str.contains("Jak"))]
df_jak_poi = df_jak_poi.reset_index()

In [6]:
# preprocessing zomato 
df_zomato['geohash'] = df_zomato.apply(lambda x:geohash.encode(x['lat'],x['long'],6),axis=1)


In [7]:
# find surrounding zomato restaurant 

df_gof_compt = compt_gof(df_zomato,df_zomato,1000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_zom_geo['neighbors'] = df_zom_geo['geohash'].apply(lambda x:list(geohash.neighbours(x)))


(4905496, 6)
(613187, 5)
Index(['index', 'lat', 'long', 'geohash', 'encode', 'neighbors'], dtype='object')
Index(['index2', 'lat2', 'long2', 'geohash2', 'encode2'], dtype='object')


MemoryError: Unable to allocate 132. GiB for an array with shape (17767175874,) and data type int64

In [29]:
# perform geo extraction zomato and POI 

# density 
distance = 250
df_gof_dens = density_gof(df_zomato, df_jak_poi, distance)

# entropy 
geo_cols = df_gof_dens.columns[1:]

df_gof_dens['en_{}'.format(distance)] = df_gof_dens[geo_cols].apply(lambda x:compute_entropy(x),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_zom_geo['neighbors'] = df_zom_geo['geohash'].apply(lambda x:list(geohash.neighbours(x)))


KeyboardInterrupt: 

In [18]:
df_zomato_gof_dens = pd.merge(df_zomato, df_gof_dens, left_on='index', right_on='d_{}_index'.format(distance), how='left')
df_zomato_gof_dens = df_zomato_gof_dens.fillna(0)
df_zomato_gof_dens = df_zomato_gof_dens.drop('d_{}_index'.format(distance),axis=1)

In [19]:
df_zomato_gof_dens.head()

Unnamed: 0,index,url,rest_price_idr,review,lat,long,is_chain,rating,new_code_res_type,rank_res_type,...,d_250_lodging,d_250_meal_takeaway,d_250_mosque,d_250_park,d_250_restaurant,d_250_school,d_250_store,d_250_supermarket,d_250_train_station,en_250
0,0,https://www.zomato.com/jakarta/wakacao-1-kelap...,150000.0,56.0,-6.167531,106.901752,0,3.8,4,0,...,2.0,0.0,0.0,0.0,12.0,0.0,2.0,0.0,0.0,1.420572
1,1,https://www.zomato.com/jakarta/the-coffee-bean...,110000.0,61.0,-6.157341,106.907888,1,3.0,0,91,...,5.0,11.0,1.0,0.0,48.0,7.0,38.0,8.0,0.0,1.976479
2,2,https://www.zomato.com/jakarta/angke-kelapa-ga...,450000.0,363.0,-6.152638,106.892576,0,4.5,2,2,...,6.0,2.0,0.0,0.0,23.0,8.0,12.0,4.0,0.0,2.175134
3,3,https://www.zomato.com/jakarta/ikan-nila-pak-u...,100000.0,113.0,-6.163588,106.903689,0,3.9,0,20,...,0.0,2.0,0.0,0.0,5.0,2.0,1.0,2.0,0.0,1.806074
4,4,https://www.zomato.com/jakarta/hong-kong-sheng...,220000.0,257.0,-6.157255,106.90843,1,4.1,0,275,...,2.0,2.0,0.0,2.0,4.0,4.0,10.0,5.0,0.0,2.084743


In [20]:
df_zomato_gof_dens.to_csv('data/3_clean_zomato_gof_{}.csv'.format(distance),index=False)