# Load Data

In [1]:
import math
from math import radians

import numpy as np
import pandas as pd
from sklearn.neighbors import BallTree

This file contains all of the points of interest in London, along with their coordinates.

In [4]:
# POI table for London; the CSV's first column is used as the row index.
pois_df = pd.read_csv(
    "../resources/data/London_pois.csv",
    index_col=0,
)

In [6]:
pois_df.head()

Unnamed: 0,geometry,poi_type_clean,amenities,lon,lat,lon_rad,lat_rad
0,POINT (-0.1935029029846191 51.60203170776367),restaurant,restaurants,-0.193503,51.602032,-0.003377,0.900625
1,POINT (-0.1156141981482506 51.52107620239258),library,books,-0.115614,51.521076,-0.002018,0.899212
2,POINT (-0.0206975005567074 51.44462585449219),library,books,-0.020698,51.444626,-0.000361,0.897878
3,POINT (-0.3340173959732056 51.411006927490234),cafe,coffee,-0.334017,51.411007,-0.00583,0.897291
4,POINT (-0.1189566031098366 51.5173454284668),bar,restaurants,-0.118957,51.517345,-0.002076,0.899147


**TODO:** Add a map of the points of interest here.

# Algorithm

In [None]:
def get_walk_score(oa_longitude, oa_latitude, df):
    """Compute a walk score for one location from its nearest amenities.

    For every category in the module-level ``amenity_weights`` the function
    finds the nearest POIs of that category (as many as the category has
    weights), converts each great-circle distance to metres, applies
    ``distance_decay`` and multiplies the result by the matching weight (the
    nearest POI gets the first weight). Each weighted value is scaled by 6.67
    and everything is summed.

    Parameters
    ----------
    oa_longitude, oa_latitude : float
        Coordinates of the location in decimal degrees.
    df : pandas.DataFrame
        POI table with an ``amenities`` column and ``lon_rad`` / ``lat_rad``
        columns holding the POI coordinates in radians. It is not modified.

    Returns
    -------
    float
        The total walk score. Categories with fewer POIs than weights simply
        contribute fewer terms; categories with no POI contribute nothing.
    """
    earth_radius = 6371000  # mean Earth radius in metres
    score_scale = 6.67  # same multiplier the rest of the notebook applies to the summed scores

    # scikit-learn's haversine metric expects points as [latitude, longitude]
    # in radians, so both the tree data and the query point use that order.
    query_point = [[radians(oa_latitude), radians(oa_longitude)]]

    walk_score = 0.0
    for amenity, weights in amenity_weights.items():
        amenity_coords = df.loc[df['amenities'] == amenity, ['lat_rad', 'lon_rad']].values

        # BallTree cannot be built on zero points and cannot return more
        # neighbours than it holds, so clamp k and skip empty categories.
        k = min(len(weights), len(amenity_coords))
        if k == 0:
            continue

        tree = BallTree(amenity_coords, metric='haversine')
        # Distances come back in radians, sorted nearest-first.
        distances, _ = tree.query(query_point, k=k, return_distance=True)

        for weight, distance_rad in zip(weights, distances[0]):
            distance_in_meters = distance_rad * earth_radius
            walk_score += weight * float(distance_decay(distance_in_meters)) * score_scale

    return walk_score

def distance_decay(distance):
    """Map a walking distance in metres to a decay factor in (0, 1].

    The factor is ``e ** -((1.25 * d_km) ** 5)``: it stays close to 1 for
    short distances (about 0.97 at 400 m), equals ``1/e`` at 800 m and is
    effectively 0 beyond roughly 1.2 km.

    Parameters
    ----------
    distance : float
        Distance in metres (expected to be non-negative).

    Returns
    -------
    float
        The decay multiplier for that distance.
    """
    distance_km = distance / 1000
    # The base (-5 * d / 4) is negative for d > 0 and the odd power keeps the
    # sign, so the exponent is <= 0 and the result never exceeds 1.
    return math.e ** ((-5.0 * (distance_km / 4)) ** 5.0)

# Load Ball

In [9]:
from sklearn.neighbors import BallTree

# The ball tree is a spatial index over every POI's coordinates (in radians). It is
# queried below for the k nearest POIs to a property under the haversine metric.
# NOTE(review): scikit-learn documents the haversine metric as taking
# [latitude, longitude]; the columns here are passed as [lon, lat] and the queries
# below use the same order — confirm and, if needed, swap both together.
ball_tree = BallTree(pois_df[['lon_rad', 'lat_rad']].values, metric='haversine')

# Load property

In [10]:
import json

# Read the sample property record (a JSON document) from disk.
# Note: the name `property` shadows the Python builtin; later cells rely on it.
with open("../resources/data/property.json", "r") as file:
    property = json.loads(file.read())

In [42]:
# Pull both coordinates out of the nested `location` record in one step.
latitude, longitude = (
    property['location'][axis] for axis in ('latitude', 'longitude')
)

In [None]:
# Fixed test coordinates (decimal degrees); they match POI 0 in the table shown above.
latitude, longitude = 51.602032, -0.193503

## Process property

In [47]:
from math import radians

In [48]:
# The tree stores coordinates in radians, so convert the property's degrees first.
radian_longitude, radian_latitude = radians(longitude), radians(latitude)

k = 100  # number of nearest POIs to fetch

# Point order matches the column order the tree was built with.
distances, indices = ball_tree.query(
    [[radian_longitude, radian_latitude]],
    k=k,
    return_distance=True,
)

In [None]:
# Label each distance with the tree position of its POI so it can be joined to pois_df.
dist_series = pd.Series(distances[0], index=indices[0], name='distance')

# One-column frame named after the series ('distance').
results_df = dist_series.to_frame()

### Closest amenities to the property

This DataFrame holds the haversine distance (in radians) to each of the closest amenities, indexed by the amenity's row position in `pois_df`.

In [51]:
results_df.head()

Unnamed: 0,distance
0,5.374171e-09
216,4.935937e-06
5618,1.5944e-05
11303,1.614258e-05
5620,2.179708e-05


### Perform a join to get the amenity type of the closest amenities.

In [52]:
# Rebuild from dist_series rather than joining onto results_df in place: re-running
# the in-place version fails because the `amenities` column already exists.
results_df = pd.DataFrame(dist_series).join(pois_df['amenities'], how='left')

In [53]:
results_df.head()

Unnamed: 0,distance,amenities
0,5.374171e-09,restaurants
216,4.935937e-06,restaurants
5618,1.5944e-05,restaurants
11303,1.614258e-05,grocery
5620,2.179708e-05,restaurants


In [54]:
earth_radius = 6371000  # mean Earth radius in metres; turns haversine radians into metres


def distance_decay(distance):
    """Map a walking distance in metres to a decay factor in (0, 1].

    The factor is ``e ** -((1.25 * d_km) ** 5)``: it stays close to 1 for
    short distances (about 0.97 at 400 m), equals ``1/e`` at 800 m and is
    effectively 0 beyond roughly 1.2 km.

    Parameters
    ----------
    distance : float
        Distance in metres (expected to be non-negative).

    Returns
    -------
    float
        The decay multiplier for that distance.
    """
    distance_km = distance / 1000
    # The base (-5 * d / 4) is negative for d > 0 and the odd power keeps the
    # sign, so the exponent is <= 0 and the result never exceeds 1.
    return math.e ** ((-5.0 * (distance_km / 4)) ** 5.0)

# Haversine distances are in radians; scaling by the Earth's radius gives metres.
results_df['distance_in_metres'] = results_df['distance'] * earth_radius

In [55]:
import math
# Apply the decay curve to every distance (metres in, factor out).
results_df['distance_decayed'] = results_df['distance_in_metres'].map(
    lambda metres: float(distance_decay(metres))
)

We now have the distances in metres and we have decayed them according to our decay function.

In [57]:
results_df.head()

Unnamed: 0,distance,amenities,distance_in_metres,distance_decayed
0,5.374171e-09,restaurants,0.034239,1.0
216,4.935937e-06,restaurants,31.446854,1.0
5618,1.5944e-05,restaurants,101.579218,0.999967
11303,1.614258e-05,grocery,102.844363,0.999965
5620,2.179708e-05,restaurants,138.869221,0.999842


# Weights

In [59]:
# One weight per counted POI in each category, nearest POI first.
# The weights total 15, so the 6.67 multiplier used when summing the
# category scores puts the maximum walk score at roughly 100.
amenity_weights = {
    "grocery": [3],
    "restaurants": [0.75, 0.45, 0.25, 0.25, 0.225, 0.225, 0.225, 0.225, 0.2, 0.2],
    "shopping": [0.5, 0.45, 0.4, 0.35, 0.3],
    "coffee": [1.25, 0.75],
    "banks": [1],
    "parks": [1],
    "schools": [1],
    "books": [1],
    "entertainment": [1],
}

In [64]:
# Sanity check: show each category's weights and how many neighbours (k) they imply.
for key, values in amenity_weights.items():
    k = len(values)
    print(key, values)
    print(k)

grocery [3]
1
restaurants [0.75, 0.45, 0.25, 0.25, 0.225, 0.225, 0.225, 0.225, 0.2, 0.2]
10
shopping [0.5, 0.45, 0.4, 0.35, 0.3]
5
coffee [1.25, 0.75]
2
banks [1]
1
parks [1]
1
schools [1]
1
books [1]
1
entertainment [1]
1


## Calculate walk score, with vector multiplication

In [139]:
import numpy as np

In [250]:
def process_results_df(distance_series):
    """Turn a series of nearest-POI distances into a scored table.

    Parameters
    ----------
    distance_series : pandas.Series
        Series named ``distance`` holding haversine distances in radians,
        indexed so that it lines up with the index of ``pois_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``distance``, ``amenities`` (looked up from ``pois_df``),
        ``distance_in_metres`` and ``distance_decayed``, in the row order of
        ``distance_series``.
    """
    nearest = pd.DataFrame(distance_series).join(pois_df['amenities'], how='left')

    # Radians -> metres, then metres -> decay factor.
    metres = nearest['distance'] * earth_radius
    decayed = metres.map(lambda m: float(distance_decay(m)))

    return nearest.assign(distance_in_metres=metres, distance_decayed=decayed)

In [251]:
def calculate_amenity_walk_score(property_distance_df, amenity, weights):
    """Weighted sum of decay factors for one amenity category.

    Parameters
    ----------
    property_distance_df : pandas.DataFrame
        Table with ``amenities`` and ``distance_decayed`` columns. The first
        matching rows are the ones scored, so rows are presumably expected in
        nearest-first order (as produced by the tree query) — confirm at the caller.
    amenity : str
        Category to score.
    weights : sequence of numbers
        One weight per POI to count; the i-th matching row gets the i-th weight.

    Returns
    -------
    numpy.float64
        Sum of ``weight * distance_decayed`` over the counted POIs; slots with
        no matching POI contribute zero.
    """
    weight_array = np.array(weights)

    # Keep at most len(weights) rows of the requested category, in table order.
    is_amenity = property_distance_df['amenities'] == amenity
    nearest_decayed = property_distance_df.loc[is_amenity, 'distance_decayed'].values[:weight_array.size]

    # Right-pad with zeros so absent POIs score nothing against their weights.
    missing = weight_array.size - nearest_decayed.size
    padded = np.pad(nearest_decayed, (0, missing), 'constant')

    return (padded * weight_array).sum()

In [298]:
def calculuate_walk_score(property, k=100):
    """Score one property per amenity category from its nearest POIs.

    The ``k`` nearest POIs (of any category) are fetched from the module-level
    ``ball_tree``, converted to decay factors by ``process_results_df`` and
    then scored category by category with ``calculate_amenity_walk_score``.

    Parameters
    ----------
    property : dict
        Must provide ``property['location']['latitude']`` and
        ``property['location']['longitude']`` in decimal degrees.
    k : int, default 100
        Number of nearest POIs to fetch. A category with no POI among these
        ``k`` scores 0, even if one lies just beyond them.

    Returns
    -------
    dict
        Maps every key of ``amenity_weights`` to its unscaled category score.
        Callers multiply the sum of the values by 6.67 to get the walk score.
    """
    location = property['location']

    # NOTE(review): the point is passed as [lon, lat] to match how ball_tree is
    # built in this notebook; scikit-learn documents the haversine metric as
    # [lat, lon] — confirm, and change the tree and this query together.
    query_point = [[radians(location['longitude']), radians(location['latitude'])]]

    distances, indices = ball_tree.query(query_point, k=k, return_distance=True)

    # Index by tree position so the distances can be joined to the POI table.
    dist_series = pd.Series(distances[0], index=indices[0], name='distance')
    results_df = process_results_df(dist_series)

    return {
        amenity: calculate_amenity_walk_score(results_df, amenity, weights)
        for amenity, weights in amenity_weights.items()
    }

In [299]:
scores_df

NameError: name 'scores_df' is not defined

### Test

In [300]:
# Minimal property record for a manual test; the coordinates equal POI 0 in the table above.
property = dict(
    location=dict(latitude=51.602032, longitude=-0.193503),
    id=12345,
)

In [301]:
property

{'location': {'latitude': 51.602032, 'longitude': -0.193503}, 'id': 12345}

In [302]:
scores_dict = calculuate_walk_score(property)

In [309]:
sum(scores_dict.values()) * 6.67

78.01155246151885

In [None]:
[key, value

# Speed test

## Connect to Mongo DB

In [259]:
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/") # Hosted with Docker

In [260]:
# Database that holds the scraped listings.
db = client.get_database("rightmove")

# Collection of property documents within it.
collection = db.get_collection("properties")

#### Fetch 1000 properties

In [262]:
# Projection: only the fields the walk score needs (1 = include, 0 = exclude).
fields = {'id': 1, 'location': 1}
query = {}  # empty filter: match every document

# Ask for at most 1000 documents from the collection.
data = collection.find(filter=query, projection=fields, limit=1000)

# Materialise the cursor into a pandas DataFrame.
df = pd.DataFrame(list(data))

In [265]:
df

Unnamed: 0,_id,id,location
0,656dfa586037a12e52f660b6,142473758,"{'latitude': 57.156601, 'longitude': -2.10075}"
1,656dfa586037a12e52f660b7,86375754,"{'latitude': 57.16631, 'longitude': -2.11688}"
2,656dfa586037a12e52f660b8,141848477,"{'latitude': 57.15214, 'longitude': -2.0856}"
3,656dfa586037a12e52f660b9,142066406,"{'latitude': 57.14584, 'longitude': -2.098261}"
4,656dfa586037a12e52f660ba,142557041,"{'latitude': 57.15393, 'longitude': -2.09585}"
...,...,...,...
995,656dfa8d6037a12e52f66499,142627973,"{'latitude': 52.609923, 'longitude': -1.835996}"
996,656dfa8d6037a12e52f6649a,142627952,"{'latitude': 52.575893, 'longitude': -1.878751}"
997,656dfa8d6037a12e52f6649b,142627976,"{'latitude': 52.579851, 'longitude': -1.901134}"
998,656dfa8d6037a12e52f6649c,142628000,"{'latitude': 52.550468, 'longitude': -1.857056}"


In [313]:
%%time
# Score every fetched property and collect the enriched records.
# NOTE(review): the sample rows shown above include coordinates far from London
# (e.g. latitude 57.16) while the POI table is described as London-only; that is
# fine for timing, but confirm before relying on these scores.
properties_processed = []
for row in df.itertuples(index=False):
    property = {
        "id": row.id,
        "location": row.location
    }
    scores_dict = calculuate_walk_score(property)

    # Total walk score: summed category scores times the 6.67 multiplier used above.
    walk_score = sum(scores_dict.values()) * 6.67
    scores_dict['walk_score'] = walk_score

    property['scores'] = scores_dict
    properties_processed.append(property)

CPU times: user 1.78 s, sys: 26.7 ms, total: 1.81 s
Wall time: 1.84 s


In [None]:
properties_processed