In [None]:
import json
import os
from decimal import Decimal
from pathlib import Path

import pandas as pd
import requests

In [2]:
# Show every row when a frame or series is displayed (no truncation).
pd.set_option('display.max_rows', None)

In [3]:
# The API key is read from the environment so it never appears in the
# notebook; a missing FOURSQUARE_API_KEY variable raises KeyError here.
FOURSQUARE_API_KEY = os.environ['FOURSQUARE_API_KEY']
# Endpoint that get_nearby_categories sends its GET requests to.
FOURSQUARE_URL = 'https://api.foursquare.com/v3/places/search'

In [4]:
# Directory layout used by the notebook; everything hangs off the current
# user's home directory.
home_dir = Path.home()
# Raw Inside Airbnb data for London.
inside_airbnb_data_dir = home_dir.joinpath(
    'Programming/data/inside-airbnb/london')
# Crime-rate tables.
crime_rate_dir = home_dir.joinpath('Programming/data/crime-rate/')
# Working directory of this project (category list, location cache).
inside_airbnb_work_dir = home_dir.joinpath(
    'Programming/Python/machine-learning-exercises/short-term-rents-in-london')

In [32]:
# Concrete input/output files, resolved against the directories defined above.
inside_airbnb_data_file = inside_airbnb_data_dir.joinpath('listings.csv')
crime_rate_data_file = crime_rate_dir.joinpath(
    'crimerate-pro-data-table-rmp-region-towns-cities.csv')
foursquare_categories_file = inside_airbnb_work_dir.joinpath(
    'foursquare_categories/foursquare_categories.json')
# JSON file read by load_cache / written by save_cache.
location_cache_json_file = inside_airbnb_work_dir.joinpath(
    'location_cache.json')

In [7]:
# Load the per-borough crime rate, normalise the column names, and discard the
# row whose borough cell holds the literal text 'DownloadCSVExcelTSV'.
crime_rate_df = (
    pd.read_csv(crime_rate_data_file, usecols=['Borough', 'Crime Rate'])
    .rename(columns={'Borough': 'borough', 'Crime Rate': 'crime_rate'})
    .loc[lambda df: df['borough'] != 'DownloadCSVExcelTSV']
)

In [8]:
# Only the columns used later in the notebook are read from the listings file.
columns_list = [
    'neighbourhood_cleansed',
    'latitude',
    'longitude',
    'accommodates',
    'bedrooms',
    'bathrooms',
    'property_type',
    'room_type',
    'availability_365',
    'calendar_last_scraped',
    'last_review',
    'price',
]
# Both date columns are parsed with an explicit day/month/year format, and the
# neighbourhood column is renamed so it matches crime_rate_df's join key.
inside_airbnb_df = (
    pd.read_csv(
        inside_airbnb_data_file,
        usecols=columns_list,
        parse_dates=['calendar_last_scraped', 'last_review'],
        date_format="%d/%m/%Y")
    .rename(columns={'neighbourhood_cleansed': 'borough'})
)
# Strip the currency symbol from the price text.
# NOTE(review): the column is left as text here; any thousands separators
# would also remain -- confirm whether a numeric conversion is intended.
inside_airbnb_df['price'] = inside_airbnb_df['price'].str.replace('$', '')

In [9]:
# Inspect dtypes and non-null counts of the freshly loaded listings.
inside_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95144 entries, 0 to 95143
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   borough                95144 non-null  object        
 1   latitude               95144 non-null  float64       
 2   longitude              95144 non-null  float64       
 3   property_type          95144 non-null  object        
 4   room_type              95144 non-null  object        
 5   accommodates           95144 non-null  int64         
 6   bathrooms              62744 non-null  float64       
 7   bedrooms               82794 non-null  float64       
 8   price                  62777 non-null  object        
 9   availability_365       95144 non-null  int64         
 10  calendar_last_scraped  95144 non-null  datetime64[ns]
 11  last_review            70560 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(4), int64(2), object(4)
memory

In [10]:
# Keep only listings that have a value in every one of these four columns.
inside_airbnb_df = inside_airbnb_df.dropna(
    subset=['bathrooms', 'bedrooms', 'price', 'last_review'])

In [11]:
# Check how many rows remain after removing listings with missing values.
inside_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48186 entries, 0 to 95065
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   borough                48186 non-null  object        
 1   latitude               48186 non-null  float64       
 2   longitude              48186 non-null  float64       
 3   property_type          48186 non-null  object        
 4   room_type              48186 non-null  object        
 5   accommodates           48186 non-null  int64         
 6   bathrooms              48186 non-null  float64       
 7   bedrooms               48186 non-null  float64       
 8   price                  48186 non-null  object        
 9   availability_365       48186 non-null  int64         
 10  calendar_last_scraped  48186 non-null  datetime64[ns]
 11  last_review            48186 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(4), int64(2), object(4)
memory usag

In [12]:
# Derive the number of whole days between the scrape date and the most recent
# review, then discard the two date columns it was computed from.
# `assign` + `drop` build a new frame instead of writing a column into the
# filtered frame from the previous step, which avoids pandas'
# SettingWithCopyWarning and any in-place mutation.
inside_airbnb_df = (
    inside_airbnb_df
    .assign(days_from_last_review=lambda df: (
        df['calendar_last_scraped'] - df['last_review']).dt.days)
    .drop(columns=['calendar_last_scraped', 'last_review'])
)

In [13]:
# Confirm the date columns were replaced by days_from_last_review.
inside_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48186 entries, 0 to 95065
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   borough                48186 non-null  object 
 1   latitude               48186 non-null  float64
 2   longitude              48186 non-null  float64
 3   property_type          48186 non-null  object 
 4   room_type              48186 non-null  object 
 5   accommodates           48186 non-null  int64  
 6   bathrooms              48186 non-null  float64
 7   bedrooms               48186 non-null  float64
 8   price                  48186 non-null  object 
 9   availability_365       48186 non-null  int64  
 10  days_from_last_review  48186 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 4.4+ MB


In [14]:
# Keep listings whose latest review is less than roughly six months older
# than the scrape date.
six_months_in_days = 182
inside_airbnb_df = inside_airbnb_df.loc[
    lambda df: df['days_from_last_review'].lt(six_months_in_days)]

In [15]:
# Row count after keeping only recently reviewed listings.
inside_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38262 entries, 0 to 95065
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   borough                38262 non-null  object 
 1   latitude               38262 non-null  float64
 2   longitude              38262 non-null  float64
 3   property_type          38262 non-null  object 
 4   room_type              38262 non-null  object 
 5   accommodates           38262 non-null  int64  
 6   bathrooms              38262 non-null  float64
 7   bedrooms               38262 non-null  float64
 8   price                  38262 non-null  object 
 9   availability_365       38262 non-null  int64  
 10  days_from_last_review  38262 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 3.5+ MB


In [16]:
# Keep listings whose availability_365 value is below 275 days.
year_minus_three_months_in_days = 275
inside_airbnb_df = inside_airbnb_df.loc[
    lambda df: df['availability_365'].lt(year_minus_three_months_in_days)]

In [17]:
# Row count after the availability filter.
inside_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27024 entries, 0 to 95065
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   borough                27024 non-null  object 
 1   latitude               27024 non-null  float64
 2   longitude              27024 non-null  float64
 3   property_type          27024 non-null  object 
 4   room_type              27024 non-null  object 
 5   accommodates           27024 non-null  int64  
 6   bathrooms              27024 non-null  float64
 7   bedrooms               27024 non-null  float64
 8   price                  27024 non-null  object 
 9   availability_365       27024 non-null  int64  
 10  days_from_last_review  27024 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 2.5+ MB


In [18]:
# Number of listings per property type, most frequent first.
inside_airbnb_sr = (
    inside_airbnb_df
    .groupby('property_type')['property_type']
    .count()
    .sort_values(ascending=False)
)
# Threshold on the listing count: a property type is kept only when it has
# strictly more listings than this.
limit_num_categories = 30
inside_airbnb_sr_30 = inside_airbnb_sr.loc[
    inside_airbnb_sr.gt(limit_num_categories)]
# Names of the property types that pass the threshold.
inside_airbnb_property_type_list = inside_airbnb_sr_30.index.tolist()

In [19]:
# Drop listings whose property type is not in the list of frequent types.
inside_airbnb_df = inside_airbnb_df.loc[
    lambda df: df['property_type'].isin(inside_airbnb_property_type_list)]

In [20]:
# Attach each listing's borough crime rate; a left join keeps every listing
# even when its borough has no row in crime_rate_df.
inside_airbnb_df = pd.merge(
    inside_airbnb_df,
    crime_rate_df,
    how='left',
    on='borough')

In [21]:
# Verify that crime_rate was added and check its non-null count.
inside_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26630 entries, 0 to 26629
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   borough                26630 non-null  object 
 1   latitude               26630 non-null  float64
 2   longitude              26630 non-null  float64
 3   property_type          26630 non-null  object 
 4   room_type              26630 non-null  object 
 5   accommodates           26630 non-null  int64  
 6   bathrooms              26630 non-null  float64
 7   bedrooms               26630 non-null  float64
 8   price                  26630 non-null  object 
 9   availability_365       26630 non-null  int64  
 10  days_from_last_review  26630 non-null  int64  
 11  crime_rate             26630 non-null  float64
dtypes: float64(5), int64(3), object(4)
memory usage: 2.4+ MB


In [22]:
# Preview the first rows of the merged frame.
inside_airbnb_df.head()

Unnamed: 0,borough,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,price,availability_365,days_from_last_review,crime_rate
0,Hammersmith and Fulham,51.49392,-0.22754,Entire rental unit,Entire home/apt,2,1.5,1.0,89.0,22,25,103.14
1,Hammersmith and Fulham,51.49547,-0.22864,Entire rental unit,Entire home/apt,2,1.0,1.0,88.0,149,28,103.14
2,Lambeth,51.46156,-0.11183,Entire condo,Entire home/apt,6,2.0,2.0,120.0,32,144,137.98
3,Camden,51.54417,-0.14651,Private room in home,Private room,4,1.5,1.0,83.0,147,11,108.07
4,Lambeth,51.45371,-0.11009,Private room in rental unit,Private room,2,0.5,2.0,32.0,0,65,137.98


In [27]:
def round_coord(lat, lon, resolution=0.001):
    """Snap a coordinate pair to the nearest multiple of ``resolution``.

    Parameters
    ----------
    lat, lon : float
        Coordinate values to snap.
    resolution : float, default 0.001
        Grid step; each value is moved to the nearest multiple of it.

    Returns
    -------
    tuple of (float, float)
        The snapped ``(lat, lon)`` pair.
    """
    # Multiplying the grid index back by ``resolution`` leaves binary
    # floating-point noise (e.g. 3 * 0.1 == 0.30000000000000004). Rounding to
    # the number of decimal places ``resolution`` is written with removes it,
    # so the result is the clean grid value.
    decimals = max(0, -Decimal(str(resolution)).as_tuple().exponent)

    def _snap(value):
        # Nearest grid index first, then back to a clean coordinate value.
        return round(round(value / resolution) * resolution, decimals)

    return (_snap(lat), _snap(lon))

In [28]:
def load_cache(path):
    """Read a JSON cache file and return its decoded content.

    Returns an empty dict when nothing exists at ``path``.
    """
    if not os.path.exists(path):
        return {}
    with open(path, "r") as cache_file:
        return json.load(cache_file)

In [34]:
def save_cache(cache, path):
    """Write ``cache`` to ``path`` as JSON without clobbering it on failure.

    The data is first dumped to a sibling ``<path>.tmp`` file, which is then
    moved over ``path`` with ``os.replace``. If serialisation fails, the
    temporary file is removed, any existing file at ``path`` is left
    untouched, and the exception propagates to the caller.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w") as f:
            json.dump(cache, f)
    except BaseException:
        # Never leave a half-written temp file behind; re-raise unchanged.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    # Only a fully written file ever replaces the previous cache.
    os.replace(tmp_path, path)

In [None]:
# In-memory cache read and filled by get_nearby_categories; starts empty.
location_cache = dict()

In [35]:
def get_nearby_categories(lat, lon, radius=100, limit=3, timeout=10):
    """Return the primary categories of places found near a coordinate.

    Successful lookups are memoised in the module-level ``location_cache``
    under a string key built from the rounded coordinate. String keys are
    used because JSON object keys are always strings, so the cache can be
    dumped to JSON and still match after being loaded back.

    Parameters
    ----------
    lat, lon : float
        Coordinate to search around; sent to the API as the ``ll`` parameter.
    radius : int, default 100
        Sent unchanged as the ``radius`` query parameter.
    limit : int, default 3
        Sent unchanged as the ``limit`` query parameter.
    timeout : float, default 10
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    str
        Comma-separated, alphabetically sorted category names; ``"None"``
        when no returned place has a named category; ``"API key missing"``
        when ``FOURSQUARE_API_KEY`` is empty; ``"API error: ..."`` when the
        request or the decoding of its response fails. Only the first two
        kinds of result are stored in the cache.
    """
    # Fixed-width formatting also hides float noise left by the rounding.
    cache_key = "{:.6f},{:.6f}".format(*round_coord(lat, lon))
    if cache_key in location_cache:
        return location_cache[cache_key]

    if not FOURSQUARE_API_KEY:
        return "API key missing"

    headers = {
        "Authorization": FOURSQUARE_API_KEY,
        "Accept": "application/json"
    }
    params = {
        "ll": f"{lat},{lon}",
        "radius": radius,
        "limit": limit
    }

    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(
            FOURSQUARE_URL, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network/HTTP failures and undecodable bodies are reported as text
        # (and deliberately not cached) so one bad lookup does not abort a
        # long batch.
        return f"API error: {str(e)}"

    # Only the first listed category of each place is used.
    categories = set()
    for place in data.get('results', []):
        cats = place.get('categories', [])
        if cats:
            name = cats[0].get('name')
            if name:  # skip categories without a usable name
                categories.add(name)

    # Sorting makes the joined string deterministic across runs.
    result = ', '.join(sorted(categories)) if categories else "None"
    location_cache[cache_key] = result
    return result

In [36]:
# Fill the in-memory cache from the JSON file on disk; load_cache returns an
# empty dict when that file does not exist.
location_cache = load_cache(location_cache_json_file)

In [None]:
# Look up the nearby place categories of every listing. Iterating over the two
# coordinate columns avoids building a Series object per row.
inside_airbnb_df['amenities'] = [
    get_nearby_categories(lat, lon)
    for lat, lon in zip(inside_airbnb_df['latitude'],
                        inside_airbnb_df['longitude'])
]

# Persist the lookups so a fresh kernel does not have to repeat every API
# request. JSON object keys must be strings, so keys are normalised with
# str() to guarantee the dump cannot fail on a non-string key.
save_cache(
    {str(key): value for key, value in location_cache.items()},
    location_cache_json_file)

In [None]:
# Preview the frame with the new amenities column.
inside_airbnb_df.head()

In [None]:
# Renumber the rows 0..n-1 and discard the previous index.
inside_airbnb_df = inside_airbnb_df.reset_index(drop=True)

In [None]:
# Remove the row labelled 1490.
# NOTE(review): the reason for excluding this specific row is not recorded in
# the notebook, and the label depends on the exact input data and filters
# above -- confirm and replace with an explicit condition if possible.
inside_airbnb_df = inside_airbnb_df.drop(index=1490)

In [None]:
# Final preview before the frame is written to disk.
inside_airbnb_df.head()

In [None]:
# Write the selected listings next to the raw data, without the index column.
inside_airbnb_df.to_csv(
    inside_airbnb_data_dir.joinpath('selected_short_term_rentals.csv'),
    index=False)