In [None]:
import os
from pathlib import Path
import pandas as pd
import requests
import time
from geopy.distance import geodesic

In [44]:
# Foursquare credentials and endpoint used by get_nearby_categories.
# The key is read from the environment so it never appears in the notebook;
# a missing FOURSQUARE_API_KEY variable raises KeyError on this line.
FOURSQUARE_API_KEY = os.environ['FOURSQUARE_API_KEY']
# Place-search endpoint that is queried with a GET request.
FOURSQUARE_URL = 'https://api.foursquare.com/v3/places/search'

In [45]:
# All locations are resolved relative to the current user's home directory.
home_dir = Path.home()
# Directory that holds the input, cache and output CSV files.
inside_airbnb_data_dir = home_dir.joinpath(
    'Programming/data/inside-airbnb/london')
# Project working directory (not referenced by the cells visible here).
inside_airbnb_work_dir = home_dir.joinpath(
    'Programming/Python/machine-learning-exercises/short-term-rents-in-london')


In [46]:
# Input listings (already enriched with tube-station distances).
inside_airbnb_data_file = inside_airbnb_data_dir.joinpath(
    'selected_short_term_rentals_with_distances.csv')
# Amenity lookups that have already been performed, keyed by coordinates.
cache_file = inside_airbnb_data_dir.joinpath('amenities_cache.csv')
# Listings with the nearest_amenity column filled in; also used to resume.
output_file = inside_airbnb_data_dir.joinpath(
    'selected_short_term_rentals_with_distances_and_amenities.csv')

In [47]:
# Resume from a previous run's output when it exists; otherwise start from
# the distances-only input file.
if os.path.exists(output_file):
    inside_airbnb_df = pd.read_csv(output_file)
    if 'nearest_amenity' in inside_airbnb_df.columns:
        # The "no amenity found" result is stored as the literal string
        # 'None'. pandas' default NA parsing reads that string back as NaN,
        # which makes already-processed rows look unprocessed on every
        # resume. Re-read just this column verbatim so 'None' (processed)
        # and '' (not processed yet) stay distinguishable.
        inside_airbnb_df['nearest_amenity'] = pd.read_csv(
            output_file,
            usecols=['nearest_amenity'],
            dtype=str,
            keep_default_na=False,
        )['nearest_amenity']
else:
    inside_airbnb_df = pd.read_csv(inside_airbnb_data_file)

In [48]:
# Preview the first rows of the loaded listings as a quick sanity check.
inside_airbnb_df.head()

Unnamed: 0,borough,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,price,availability_365,days_from_last_review,crime_rate,distance_to_nearest_tube_station,nearest_amenity
0,Hammersmith and Fulham,51.49392,-0.22754,Entire rental unit,Entire home/apt,2,1.5,1.0,89.0,22,25,103.14,0.168,Pub
1,Hammersmith and Fulham,51.49547,-0.22864,Entire rental unit,Entire home/apt,2,1.0,1.0,88.0,149,28,103.14,0.304,"Preschool, Fast Food Restaurant"
2,Lambeth,51.46156,-0.11183,Entire condo,Entire home/apt,6,2.0,2.0,120.0,32,144,137.98,0.181,"Cocktail Bar, Dim Sum Restaurant, Caribbean Re..."
3,Camden,51.54417,-0.14651,Private room in home,Private room,4,1.5,1.0,83.0,147,11,108.07,0.599,"Elementary School, Education"
4,Lambeth,51.45371,-0.11009,Private room in rental unit,Private room,2,0.5,2.0,32.0,0,65,137.98,1.012,


In [49]:
# Number of decimal places used when rounding coordinates into cache keys.
resolution = 3 # ~111m at equator per 0.001 deg
if os.path.exists(cache_file):
    cache_df = pd.read_csv(cache_file)
    # Lookups that found nothing are stored as the literal string 'None',
    # which pandas' default NA parsing reads back as NaN. NaN is truthy, so
    # it would pass an `if cached:` check and be copied into the listings as
    # a missing value. Restore the sentinel so cached values are always
    # strings (this also repairs the file the next time it is rewritten).
    cache_df['nearest_amenity'] = cache_df['nearest_amenity'].fillna('None')
    # In-memory lookup table: (rounded lat, rounded lon) -> amenity string.
    cache = {
        (round(lat, resolution), round(lon, resolution)): amenity
        for lat, lon, amenity in zip(cache_df['latitude'],
                                     cache_df['longitude'],
                                     cache_df['nearest_amenity'])
        }
else:
    # No cache on disk yet: start with an empty table and an empty frame
    # that has the same columns the cache file will be written with.
    cache_df = pd.DataFrame(columns=['latitude', 'longitude', 'nearest_amenity'])
    cache = {}

In [50]:
def is_within_radius(lat1, lon1, lat2, lon2, radius_meters):
    """Tell whether two coordinates lie no further apart than a radius.

    The separation is measured with ``geodesic`` and compared in metres;
    a distance exactly equal to ``radius_meters`` counts as inside.
    """
    first_point = (lat1, lon1)
    second_point = (lat2, lon2)
    separation = geodesic(first_point, second_point)
    return separation.meters <= radius_meters

In [51]:
def find_cached_category(lat, lon, cache_radius_meters=100):
    """Look up an amenity already cached for a nearby coordinate.

    Walks the module-level ``cache`` in insertion order and returns the
    value of the first entry whose coordinates are within
    ``cache_radius_meters`` of ``(lat, lon)``; returns ``None`` when no
    entry is close enough.
    """
    nearby_amenities = (
        stored_amenity
        for (stored_lat, stored_lon), stored_amenity in cache.items()
        if is_within_radius(lat, lon, stored_lat, stored_lon,
                            cache_radius_meters)
    )
    # The generator is lazy, so scanning stops at the first match.
    return next(nearby_amenities, None)

In [52]:
def get_nearby_categories(lat, lon, limit=1, radius=100, timeout=10):
    """Return the primary categories of the places closest to a coordinate.

    Sends a GET request to ``FOURSQUARE_URL`` asking for places sorted by
    distance and collects the first category name of every returned place.

    Args:
        lat: Latitude of the search centre.
        lon: Longitude of the search centre.
        limit: Maximum number of places requested from the API.
        radius: Search radius forwarded to the API (metres according to the
            Foursquare place-search reference).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        str: Unique category names joined with ``', '`` in the order the API
        returned them (nearest first), or the literal string ``'None'`` when
        no place with a named category was found.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-success HTTP status.
    """
    headers = {
        'Authorization': FOURSQUARE_API_KEY,
        'Accept': 'application/json'
    }

    params = {
        'll': f'{lat},{lon}',
        'limit': limit,
        'radius': radius,
        'sort': 'DISTANCE'
    }

    response = requests.get(
        FOURSQUARE_URL, headers=headers, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # A list (not a set) keeps the nearest-first order of the response and
    # makes the joined string reproducible between runs.
    categories = []
    for place in data.get('results', []):
        category_list = place.get('categories') or []
        if not category_list:
            continue
        top_category = category_list[0].get('name')
        # Skip unnamed categories: a None entry would break ', '.join().
        if top_category and top_category not in categories:
            categories.append(top_category)

    return ', '.join(categories) if categories else 'None'

In [53]:
def get_category_with_retry(lat, lon, retries=5):
    """Fetch nearby categories, retrying failed requests with backoff.

    Calls ``get_nearby_categories`` for up to three places within a radius
    of 100. When the request raises ``requests.RequestException`` the call
    is repeated after an exponentially growing pause (1, 2, 4, ... seconds).

    Args:
        lat: Latitude of the search centre.
        lon: Longitude of the search centre.
        retries: Maximum number of attempts.

    Returns:
        str: The category string from ``get_nearby_categories``, or the
        literal string ``'None'`` when every attempt failed.
    """
    for attempt in range(retries):
        try:
            return get_nearby_categories(lat, lon, limit=3, radius=100)
        except requests.RequestException as e:
            print(f'Network error at ({lat}, {lon}): {e} - retry {attempt + 1}')
            # Back off only when another attempt follows; pausing after the
            # final failure would just delay giving up.
            if attempt < retries - 1:
                time.sleep(2 ** attempt)
    print(f'Failed after {retries} attempts: ({lat}, {lon})')
    return 'None'

In [54]:
def process_dataframe(df, save_every=1):
    """Fill ``df['nearest_amenity']`` for every row that has no value yet.

    Each unfilled row is first looked up with ``find_cached_category``; on a
    miss the value is fetched with ``get_category_with_retry`` and recorded
    in the module-level ``cache`` dict and ``cache_df`` frame. Progress is
    checkpointed to ``cache_file`` and ``output_file`` so that an
    interrupted run can be resumed.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns. It is
            modified in place; ``nearest_amenity`` is created when missing.
        save_every: Number of newly filled rows between checkpoints. The
            default of 1 rewrites the CSV files after every row; larger
            values reduce disk I/O. Pending changes are always written when
            the loop finishes or is interrupted by an exception.

    Raises:
        ValueError: If ``save_every`` is smaller than 1.

    Note:
        Progress messages count processed rows; with ``save_every > 1`` they
        can run ahead of what is on disk until the next checkpoint.
    """
    if save_every < 1:
        raise ValueError('save_every must be at least 1')

    counter_cache = 0
    counter_saved_data = 0
    counter_using_category_from_cache = 0
    unsaved_rows = 0
    cache_dirty = False

    if 'nearest_amenity' not in df.columns:
        df['nearest_amenity'] = ''
    elif df['nearest_amenity'].dtype != object:
        # A column read back as all-NaN is float64; writing strings into it
        # relies on a deprecated implicit upcast, so convert it up front.
        df['nearest_amenity'] = df['nearest_amenity'].astype(object)

    def flush():
        """Write pending cache entries and filled rows to disk."""
        nonlocal unsaved_rows, cache_dirty
        if cache_dirty:
            cache_df.to_csv(cache_file, index=False)
            cache_dirty = False
        if unsaved_rows:
            df.to_csv(output_file, index=False)
            unsaved_rows = 0

    try:
        for idx, row in df.iterrows():
            # Rows that already carry a value were handled by an earlier run.
            if pd.notna(row['nearest_amenity']) and row['nearest_amenity'] != '':
                continue

            lat, lon = row['latitude'], row['longitude']
            cached = find_cached_category(lat, lon)
            if cached is not None and pd.isna(cached):
                # A 'None' sentinel that went through a CSV round trip comes
                # back as NaN; restore it rather than writing a missing
                # value into the frame.
                cached = 'None'
            if cached:
                counter_using_category_from_cache += 1
                category = cached
                if counter_using_category_from_cache % 10 == 0:
                    print(f'Used {counter_using_category_from_cache} '
                          f'results from {cache_file.name}')
            else:
                category = get_category_with_retry(lat, lon)
                cache[(round(lat, resolution), round(lon, resolution))] = category
                cache_df.loc[len(cache_df)] = [lat, lon, category]
                cache_dirty = True
                counter_cache += 1
                if counter_cache % 100 == 0:
                    print(f'Saved {counter_cache} results to {cache_file.name}')

            df.at[idx, 'nearest_amenity'] = category
            unsaved_rows += 1
            counter_saved_data += 1
            if unsaved_rows >= save_every:
                flush()
            if counter_saved_data % 100 == 0:
                print(f'Saved {counter_saved_data} results to {output_file.name}')
    finally:
        # Never lose work that was done but not yet checkpointed.
        flush()

In [55]:
# Fill in nearest_amenity for every row that is still missing it; the cache
# and output CSV files are rewritten as rows are processed.
process_dataframe(inside_airbnb_df)

Used 10 results from amenities_cache.csv
Used 20 results from amenities_cache.csv
Used 30 results from amenities_cache.csv
Used 40 results from amenities_cache.csv
Used 50 results from amenities_cache.csv
Used 60 results from amenities_cache.csv
Used 70 results from amenities_cache.csv
Used 80 results from amenities_cache.csv
Used 90 results from amenities_cache.csv
Used 100 results from amenities_cache.csv
Saved 100 results to selected_short_term_rentals_with_distances_and_amenities.csv
Used 110 results from amenities_cache.csv
Used 120 results from amenities_cache.csv
Used 130 results from amenities_cache.csv
Used 140 results from amenities_cache.csv
Used 150 results from amenities_cache.csv
Used 160 results from amenities_cache.csv
Used 170 results from amenities_cache.csv
Used 180 results from amenities_cache.csv
Used 190 results from amenities_cache.csv
Used 200 results from amenities_cache.csv
Saved 200 results to selected_short_term_rentals_with_distances_and_amenities.csv
Used 