In [76]:
from pathlib import Path
import pandas as pd
import requests
import os
import re
from collections import Counter

In [2]:
pd.options.display.max_rows = None

In [3]:
home_dir = Path.home()
inside_airbnb_data_dir = home_dir / 'Programming/data/inside-airbnb/london'
crime_rate_dir = home_dir / 'Programming/data/crime-rate/'

In [4]:
FOURSQUARE_API_KEY = os.environ['FOURSQUARE_API_KEY']
FOURSQUARE_URL = "https://api.foursquare.com/v3/places/search"

In [5]:
inside_airbnb_data_file = inside_airbnb_data_dir / 'listings.csv'
crime_rate_data_file = crime_rate_dir / 'crimerate-pro-data-table-rmp-region-towns-cities.csv'

In [6]:
crime_rate_df = pd.read_csv(crime_rate_data_file, usecols=['Borough', 'Crime Rate'])
crime_rate_df.rename(columns={'Borough': 'borough', 'Crime Rate': 'crime_rate'}, inplace=True)
crime_rate_df = crime_rate_df[crime_rate_df.borough != 'DownloadCSVExcelTSV']

In [7]:
columns_list = ['neighbourhood_cleansed', 'bathrooms', 'bedrooms', 'latitude', 'longitude',
                'room_type', 'latitude', 'longitude', 'property_type', 'price', 'minimum_nights']
inside_airbnb_df = pd.read_csv(inside_airbnb_data_file, usecols=columns_list)
inside_airbnb_df.rename(columns={'neighbourhood_cleansed': 'borough'}, inplace=True)
inside_airbnb_df.price = inside_airbnb_df.price.str.replace('$', '')

In [8]:
inside_airbnb_df = inside_airbnb_df.loc[inside_airbnb_df.room_type == 'Entire home/apt']

In [9]:
inside_airbnb_df = inside_airbnb_df.loc[inside_airbnb_df.minimum_nights >= 30]

In [10]:
inside_airbnb_df = inside_airbnb_df.merge(crime_rate_df, on='borough', how='left')

In [12]:
inside_airbnb_df = inside_airbnb_df.loc[(inside_airbnb_df.bathrooms.notna() & inside_airbnb_df.bedrooms.notna() & inside_airbnb_df.price.notna())]

In [79]:
BROAD_CATEGORIES = [
    ("Grocery Store", ["supermarket", "grocery", "convenience store", "gourmet", "butcher"]),
    ("Restaurant", ["restaurant", "bbq", "steakhouse", "diner", "sushi", "cuisine", "brasserie", "joint", "buffet",
                    "pizzeria", "parlor", "fish", "chips", "bistro", "dining", "buffet", "deli"]),
    ("Cafe", ["coffee", "cafe", "tea", "bakery", "dessert", "café", "drinking", "breakfast", "gelato shop",
              "bagel", "sandwich", "snack", "cupcake", "pastry"]),
    ("Nightlife", ["bar", "pub", "club", "lounge", "casino", "speakeasy", "brewery", "roof deck"]),
    ("Retail", ["shopping", "store", "mall", "market", "food", "beverage", "boutique", "office", "plaza"]),
    ("Fitness", ["gym", "fitness", "yoga", "crossfit", "martial arts", "tennis", "sports", "football", "cricket", "stable",
                 "swimming", "bowling", "skating", "sporting", "sport", "soccer"]),
    ("Wellness", ["spa", "massage", "therapy", "sauna", "escape room", "psychic", "astrologer"]),
    ("Entertainment", ["theater", "cinema", "concert", "comedy", "recreation", "bingo", "music", "auditorium", "jazz",
                       "blues", "stadium", "gun", "race", "track"]),
    ("Cultural", ["museum", "art", "gallery", "library", "historic", "landmarks", "monument", "tour", "opera", "exhibit", "memorial"]),
    ("Outdoor", ["park", "trail", "beach", "zoo", "hiking", "playground", "outdoors", "tunnel", "fountain",
                 "scenic", "nature", "aquarium", "campground", "camp", "farm", "canal"]),
    ("Transport", ["train", "bus", "subway", "parking", "taxi", "tube", "dealership", "automotive", "car rental",
                   "shipping", "motorcycle", "fuel station", "harbor", "marina"]),
    ("Healthcare", ["hospital", "clinic", "pharmacy", "dentist", "veterinary", "medicine", "doctor", "surgeon", "surgery", "healthcare",
                    "physiotherapist", "physician", "psycho", "assisted living", "medical", "nutritionist", "ambulance"]),
    ("Services", ["bank", "atm", "post", "salon", "barber", "laundry", "child care", "agency", "photographer", "chimney",
                  "veterinarian", "telecommunication", "pet", "wedding", "architecture", "upholstery", "cleaning", "computer",
                  "photography", "audiovisual", "manufacturer", "auction", "designer", "event", "renewable energy", "hotel", "wholesaler"]),
    ("Organization", ["community", "government", "assistance", "legal", "environmental", "non-profit", "charity", "youth", "city hall",
                      "disabled", "military", "embassy", "consulate", "agriculture", "forestry", "courthouse", "police", "fire", "station"]),
    ("Education", ["school", "learning", "tutoring", "preschool", "kindergarten", "university", "college", "education"]),
    ("Religion", ["church", "cathedral", "seminary", "mosque", "temple", "synagogue", "faith", "monastery", "cemetery", "spiritual", "kingdom hall"]),
    ("Home Improvement", ["hvac", "home", "heating ventilating air conditioning", "landscape", "garden", "smith", "contractor",
                          "construction", "carpenter", "builder", "plumber", "housing", "electrician", "locksmith", "real estate"]),
]

In [17]:
def classify_category(category_name):
    category_name_lower = category_name.lower()
    for broad_category, keywords in BROAD_CATEGORIES:
        if any(keyword in category_name_lower for keyword in keywords):
            return broad_category
    return category_name

In [18]:
def get_nearby_categories(lat, lon, radius=100, limit=3):
    if not FOURSQUARE_API_KEY:
        return "API key missing"

    headers = {
        "Authorization": FOURSQUARE_API_KEY,
        "Accept": "application/json"
    }
    params = {
        "ll": f"{lat},{lon}",
        "radius": radius,
        "limit": limit
    }
    
    response = requests.get(FOURSQUARE_URL, headers=headers, params=params)
    
    if response.status_code == 200:
        data = response.json()
        categories = set()
        for place in data.get('results', []):
            category_list = place.get('categories', [])
            if category_list:
                category_name = category_list[0]['name']
                broad_category = classify_category(category_name)
                categories.add(broad_category)
        
        return ', '.join(categories) if categories else "None"
    else:
        return f"API error: {response.status_code}"

In [22]:
inside_airbnb_df['amenities'] = inside_airbnb_df.apply(lambda row: get_nearby_categories(row['latitude'], row['longitude']), axis=1)

In [36]:
inside_airbnb_df.reset_index(inplace=True, drop=True)

In [62]:
inside_airbnb_df.drop(index=1490, inplace=True, axis=0)

In [69]:
inside_airbnb_df.head()

Unnamed: 0,borough,latitude,longitude,property_type,room_type,bathrooms,bedrooms,price,minimum_nights,crime_rate,amenities
0,Lambeth,51.491476,-0.111514,Entire rental unit,Entire home/apt,1.0,1.0,150.0,30.0,137.98,Education
1,Kensington and Chelsea,51.48566,-0.18415,Entire rental unit,Entire home/apt,2.0,2.0,195.0,91.0,118.02,"Nightlife, Restaurant"
2,Brent,51.53899,-0.19744,Entire rental unit,Entire home/apt,1.5,2.0,83.0,30.0,117.59,Grocery Store
3,Kensington and Chelsea,51.51732,-0.2005,Entire rental unit,Entire home/apt,1.0,2.0,288.0,91.0,118.02,Restaurant
4,Westminster,51.49695,-0.13888,Entire rental unit,Entire home/apt,3.0,4.0,901.0,186.0,132.94,"Restaurant, Retail"


In [74]:
def map_categories(amenities):
    categories = set()
    
    for amenity in amenities.split(', '):
        found = False
        for broad_category, keywords in BROAD_CATEGORIES:
            if any(re.search(rf"\b{keyword}\b", amenity, re.IGNORECASE) for keyword in keywords):
                categories.add(broad_category)
                found = True
                break
        if not found:
            categories.add(amenity)
    
    return ', '.join(sorted(categories))

In [80]:
inside_airbnb_df['amenities'] = inside_airbnb_df['amenities'].apply(map_categories)

In [81]:
flatten_arr = [category for item in inside_airbnb_df.amenities.to_list() for category in item.split(', ')]
counts = Counter(flatten_arr)
counts

Counter({'Restaurant': 543,
         'Nightlife': 328,
         'Cafe': 300,
         'Retail': 269,
         'Grocery Store': 147,
         'Education': 146,
         'Cultural': 136,
         'Healthcare': 119,
         'Organization': 117,
         'None': 95,
         'Religion': 74,
         'Home Improvement': 65,
         'Outdoor': 54,
         'Services': 50,
         'Entertainment': 47,
         'Fitness': 34,
         'Transport': 34,
         'Wellness': 6})

In [84]:
inside_airbnb_df.to_csv(inside_airbnb_data_dir / 'selected_short_term_rentals.csv', index=False)