In [1]:
import pandas as pd
import numpy as np
from geopy.distance import distance

In [2]:
# Load the review-level feature table (one row per review; hotel address,
# coordinates and city are among its columns).
df = pd.read_csv('./data/df_features.gz')

I read the dataframe containing the detailed list of the top landmarks in each city.

In [3]:
# Landmark coordinates, one row per landmark, read from the 'final long' sheet.
# `lm` is used as a module-level lookup table by landmarks_around() below.
lm = pd.read_excel('./data/landmarks.xlsx', sheet_name='final long')
lm.head(3)

Unnamed: 0,lat,long,City
0,41.403634,2.174219,Barcelona
1,41.395301,2.161633,Barcelona
2,41.386937,2.169763,Barcelona


In [4]:
# List the available columns to locate the hotel key and coordinate fields.
df.columns

Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Negative_Review', 'Review_Total_Negative_Word_Counts',
       'Total_Number_of_Reviews', 'Positive_Review',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score', 'Tags',
       'days_since_review', 'lat', 'lng', 'Diff', 'Review_Month',
       'Review_Year', 'Country', 'City', 'Pet', 'Purpose', 'Whom', 'Room',
       'Length', 'Device', 'Room_Recode', 'Nationality_Recode',
       'Length_Recode'],
      dtype='object')

I create a function that counts the landmarks located within 1.5 km of a hotel, considering only the landmarks of the hotel's own city.

In [5]:
def landmarks_around(x, radius_km=1.5):
    """Count the landmarks of a hotel's city that lie within ``radius_km`` of it.

    Parameters
    ----------
    x : sequence or pandas.Series
        Hotel record, read positionally as ``(latitude, longitude, city)``.
    radius_km : float, default 1.5
        Search radius in kilometres. A landmark is counted only when its
        distance to the hotel is strictly below this value.

    Returns
    -------
    int
        Number of matching landmarks; ``0`` when the city has no rows in ``lm``.

    Notes
    -----
    Reads the module-level ``lm`` frame. Its first two columns are taken as the
    landmark latitude and longitude, and its ``City`` column as the city name.
    """
    # Go through ``.iloc`` when it exists: plain ``x[0]`` on a label-indexed
    # Series depends on a deprecated integer-key positional fallback.
    fields = x.iloc if hasattr(x, 'iloc') else x
    lat, long, city = fields[0], fields[1], fields[2]

    city_landmarks = lm[lm['City'] == city]
    landmark_coords = zip(city_landmarks.iloc[:, 0], city_landmarks.iloc[:, 1])

    # Summing a generator yields 0 for a city without landmarks; a row-wise
    # ``apply`` on the empty frame would return a DataFrame and break the count.
    # ``.km`` makes the unit of the comparison explicit.
    return sum(
        distance((lat, long), coords).km < radius_km
        for coords in landmark_coords
    )

I apply the function to the dataframe, using one row per hotel address so that each hotel is processed only once.

In [6]:
# Collapse the review-level frame to one row per hotel address, then discard
# hotels with any missing value (e.g. no coordinates).
# NOTE(review): min() is taken per column independently, which presumes each
# address maps to a single lat/lng/City — confirm.
df_compact = (
    df[['Hotel_Address', 'lat', 'lng', 'City']]
    .groupby('Hotel_Address')
    .min()
    .reset_index()
    .dropna()
)
df_compact.head()

Unnamed: 0,Hotel_Address,lat,lng,City
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968,Amsterdam
1,1 15 Templeton Place Earl s Court Kensington a...,51.491888,-0.194971,London
2,1 2 Serjeant s Inn Fleet Street City of London...,51.513734,-0.108751,London
3,1 3 Queens Garden Westminster Borough London W...,51.514218,-0.180903,London
4,1 3 Rue d Argentine 16th arr 75116 Paris France,48.874348,2.289733,Paris


In [7]:
# Row-wise count of nearby landmarks; each row is handed to the function as
# (lat, lng, City).
hotel_positions = df_compact[['lat', 'lng', 'City']]
close_landmarks = hotel_positions.apply(landmarks_around, axis=1)

In [8]:
# Store the counts on the per-hotel frame; `close_landmarks` was computed from
# df_compact's own rows, so it shares the same index.
df_compact['Close_Landmarks'] = close_landmarks

In [9]:
# Average number of nearby landmarks per hotel, broken down by city.
df_compact.groupby('City')[['Close_Landmarks']].mean()

Unnamed: 0_level_0,Close_Landmarks
City,Unnamed: 1_level_1
Amsterdam,4.847619
Barcelona,2.990385
London,1.546366
Milan,3.018519
Paris,1.920879
Vienna,3.993197


In [10]:
# Spot-check the per-hotel frame, including the Close_Landmarks column.
df_compact.head()

Unnamed: 0,Hotel_Address,lat,lng,City,Close_Landmarks
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,52.360576,4.915968,Amsterdam,1
1,1 15 Templeton Place Earl s Court Kensington a...,51.491888,-0.194971,London,0
2,1 2 Serjeant s Inn Fleet Street City of London...,51.513734,-0.108751,London,4
3,1 3 Queens Garden Westminster Borough London W...,51.514218,-0.180903,London,1
4,1 3 Rue d Argentine 16th arr 75116 Paris France,48.874348,2.289733,Paris,2


Join the per-hotel landmark counts back onto the original review-level dataset.

In [11]:
# Sizes before the join: review-level frame vs. per-hotel frame.
df.shape, df_compact.shape

((515738, 31), (1476, 5))

In [12]:
# Attach the per-hotel landmark count to every review row.
#
# `df` is loaded from the same file this notebook overwrites at the end, so on
# a re-run it may already carry 'Close_Landmarks'. Dropping it first keeps the
# merge from emitting 'Close_Landmarks_x' / 'Close_Landmarks_y'.
#
# how='left' keeps every review (hotels removed by dropna get NaN), and
# validate='many_to_one' raises if an address appears twice on the right side
# instead of silently multiplying review rows.
result = pd.merge(
    df.drop(columns=['Close_Landmarks'], errors='ignore'),
    df_compact[['Hotel_Address', 'Close_Landmarks']],
    on='Hotel_Address',
    how='left',
    validate='many_to_one',
)

In [13]:
# Sanity check: the left join should keep the row count of `df` and add
# exactly one column.
result.shape

(515738, 32)

In [14]:
# Confirm 'Close_Landmarks' is now part of the feature set.
result.columns

Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Negative_Review', 'Review_Total_Negative_Word_Counts',
       'Total_Number_of_Reviews', 'Positive_Review',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score', 'Tags',
       'days_since_review', 'lat', 'lng', 'Diff', 'Review_Month',
       'Review_Year', 'Country', 'City', 'Pet', 'Purpose', 'Whom', 'Room',
       'Length', 'Device', 'Room_Recode', 'Nationality_Recode',
       'Length_Recode', 'Close_Landmarks'],
      dtype='object')

In [15]:
# Persist the enriched features. This writes to the same path the notebook
# loads `df` from, so the input file is overwritten and a later run will read a
# frame that already contains 'Close_Landmarks'.
# NOTE(review): index_label=False still writes the index values but omits the
# header field for them — confirm downstream readers expect that layout.
result.to_csv("./data/df_features.gz", index_label=False, compression="gzip")