In [1]:
from pathlib import Path
import pandas as pd
import requests
import os
import re
from collections import Counter
from datetime import timedelta

In [2]:
# Lift pandas' row-display cap so displayed frames/series are not truncated.
pd.set_option('display.max_rows', None)

In [3]:
# Input data lives under the current user's home directory.
home_dir = Path.home()
inside_airbnb_data_dir = home_dir.joinpath('Programming/data/inside-airbnb/london')
crime_rate_dir = home_dir.joinpath('Programming/data/crime-rate/')

In [4]:
# Foursquare Places search endpoint used for the nearby-venue lookups.
FOURSQUARE_URL = "https://api.foursquare.com/v3/places/search"
# The key is read from the environment rather than stored in the notebook;
# indexing os.environ raises KeyError when the variable is not set.
FOURSQUARE_API_KEY = os.environ['FOURSQUARE_API_KEY']

In [5]:
# Concrete input files inside the two data directories.
crime_rate_data_file = crime_rate_dir.joinpath('crimerate-pro-data-table-rmp-region-towns-cities.csv')
inside_airbnb_data_file = inside_airbnb_data_dir.joinpath('listings.csv')

In [6]:
# Load the per-borough crime rate table, normalise the column names, and drop
# the row whose borough cell holds the literal text 'DownloadCSVExcelTSV'
# (presumably export-widget residue from the source page - confirm).
crime_rate_df = (
    pd.read_csv(crime_rate_data_file, usecols=['Borough', 'Crime Rate'])
    .rename(columns={'Borough': 'borough', 'Crime Rate': 'crime_rate'})
    .loc[lambda frame: frame.borough != 'DownloadCSVExcelTSV']
)

In [18]:
# Columns kept from the listings export; everything else is skipped at read time.
columns_list = [
    'neighbourhood_cleansed', 'latitude', 'longitude',
    'accommodates', 'bedrooms', 'bathrooms',
    'property_type', 'room_type', 'availability_365',
    'calendar_last_scraped', 'last_review', 'price',
]
# Both date columns are parsed as day/month/year; the neighbourhood column is
# renamed to 'borough' so it lines up with the crime-rate table's key.
inside_airbnb_df = pd.read_csv(
    inside_airbnb_data_file,
    usecols=columns_list,
    parse_dates=['calendar_last_scraped', 'last_review'],
    date_format="%d/%m/%Y",
).rename(columns={'neighbourhood_cleansed': 'borough'})
# Only the '$' sign is stripped here; price is still a string column afterwards.
inside_airbnb_df['price'] = inside_airbnb_df['price'].str.replace('$', '')

In [19]:
# Summarise dtypes and non-null counts of the freshly loaded listings.
inside_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95144 entries, 0 to 95143
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   borough                95144 non-null  object        
 1   latitude               95144 non-null  float64       
 2   longitude              95144 non-null  float64       
 3   property_type          95144 non-null  object        
 4   room_type              95144 non-null  object        
 5   accommodates           95144 non-null  int64         
 6   bathrooms              62744 non-null  float64       
 7   bedrooms               82794 non-null  float64       
 8   price                  62777 non-null  object        
 9   availability_365       95144 non-null  int64         
 10  calendar_last_scraped  95144 non-null  datetime64[ns]
 11  last_review            70560 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(4), int64(2), object(4)
memory

In [20]:
# Keep only listings where bathrooms, bedrooms, price and last_review are all present.
inside_airbnb_df = inside_airbnb_df.dropna(subset=['bathrooms', 'bedrooms', 'price', 'last_review'])

In [21]:
# Re-check row and non-null counts after removing rows with missing values.
inside_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48186 entries, 0 to 95065
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   borough                48186 non-null  object        
 1   latitude               48186 non-null  float64       
 2   longitude              48186 non-null  float64       
 3   property_type          48186 non-null  object        
 4   room_type              48186 non-null  object        
 5   accommodates           48186 non-null  int64         
 6   bathrooms              48186 non-null  float64       
 7   bedrooms               48186 non-null  float64       
 8   price                  48186 non-null  object        
 9   availability_365       48186 non-null  int64         
 10  calendar_last_scraped  48186 non-null  datetime64[ns]
 11  last_review            48186 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(4), int64(2), object(4)
memory usag

In [22]:
# Keep listings whose last review is less than 182 days before the calendar scrape date.
inside_airbnb_df = inside_airbnb_df.loc[
    lambda frame: (frame.calendar_last_scraped - frame.last_review) < timedelta(days=182)
]

In [23]:
# Keep only property types that occur in more than 30 of the remaining listings.
inside_airbnb_df = inside_airbnb_df.loc[
    lambda frame: frame.groupby('property_type')['property_type'].transform('size') > 30
]

In [24]:
# Re-check row counts after the review-recency and property-type filters.
inside_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37871 entries, 0 to 95065
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   borough                37871 non-null  object        
 1   latitude               37871 non-null  float64       
 2   longitude              37871 non-null  float64       
 3   property_type          37871 non-null  object        
 4   room_type              37871 non-null  object        
 5   accommodates           37871 non-null  int64         
 6   bathrooms              37871 non-null  float64       
 7   bedrooms               37871 non-null  float64       
 8   price                  37871 non-null  object        
 9   availability_365       37871 non-null  int64         
 10  calendar_last_scraped  37871 non-null  datetime64[ns]
 11  last_review            37871 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(4), int64(2), object(4)
memory usag

In [26]:
# Keep listings where 365 minus availability_365 exceeds 90, i.e. availability_365 < 275.
# NOTE(review): presumably availability_365 counts bookable days in the coming year,
# making this a "more than 90 unavailable days" filter - confirm against the data dictionary.
inside_airbnb_df = inside_airbnb_df.loc[inside_airbnb_df.availability_365 < 365 - 90]

In [27]:
# Re-check row counts after the availability filter.
inside_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26767 entries, 0 to 95065
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   borough                26767 non-null  object        
 1   latitude               26767 non-null  float64       
 2   longitude              26767 non-null  float64       
 3   property_type          26767 non-null  object        
 4   room_type              26767 non-null  object        
 5   accommodates           26767 non-null  int64         
 6   bathrooms              26767 non-null  float64       
 7   bedrooms               26767 non-null  float64       
 8   price                  26767 non-null  object        
 9   availability_365       26767 non-null  int64         
 10  calendar_last_scraped  26767 non-null  datetime64[ns]
 11  last_review            26767 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(4), int64(2), object(4)
memory usag

In [35]:
# Attach each borough's crime rate to its listings. The left join keeps every
# listing even when its borough has no crime-rate row. validate='many_to_one'
# makes pandas raise MergeError if a borough appears more than once in
# crime_rate_df, instead of silently duplicating listing rows.
inside_airbnb_df = inside_airbnb_df.merge(
    crime_rate_df,
    on='borough',
    how='left',
    validate='many_to_one',
)

In [36]:
# Confirm the merge added crime_rate and check its non-null count.
inside_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26767 entries, 0 to 26766
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   borough                26767 non-null  object        
 1   latitude               26767 non-null  float64       
 2   longitude              26767 non-null  float64       
 3   property_type          26767 non-null  object        
 4   room_type              26767 non-null  object        
 5   accommodates           26767 non-null  int64         
 6   bathrooms              26767 non-null  float64       
 7   bedrooms               26767 non-null  float64       
 8   price                  26767 non-null  object        
 9   availability_365       26767 non-null  int64         
 10  calendar_last_scraped  26767 non-null  datetime64[ns]
 11  last_review            26767 non-null  datetime64[ns]
 12  crime_rate             26767 non-null  float64       
dtypes

In [37]:
# Preview the first rows of the merged frame.
inside_airbnb_df.head()

Unnamed: 0,borough,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,price,availability_365,calendar_last_scraped,last_review,crime_rate
0,Hammersmith and Fulham,51.49392,-0.22754,Entire rental unit,Entire home/apt,2,1.5,1.0,89.0,22,2024-12-12,2024-11-17,103.14
1,Hammersmith and Fulham,51.49547,-0.22864,Entire rental unit,Entire home/apt,2,1.0,1.0,88.0,149,2024-12-12,2024-11-14,103.14
2,Hammersmith and Fulham,51.49368,-0.22774,Room in aparthotel,Private room,2,1.0,1.0,75.0,62,2024-12-11,2024-11-26,103.14
3,Lambeth,51.46156,-0.11183,Entire condo,Entire home/apt,6,2.0,2.0,120.0,32,2024-12-11,2024-07-20,137.98
4,Camden,51.54417,-0.14651,Private room in home,Private room,4,1.5,1.0,83.0,147,2024-12-12,2024-12-01,108.07


In [None]:
# Ordered (broad category, keywords) pairs used to collapse fine-grained venue
# category names into a small set of amenity groups.
#
# Order matters: the lookups that iterate this list stop at the first entry
# with a matching keyword, so earlier entries take precedence when a name
# matches several groups. Keywords are lowercase; keep each keyword list free
# of duplicates.
BROAD_CATEGORIES = [
    ("Grocery Store", ["supermarket", "grocery", "convenience store", "gourmet", "butcher"]),
    ("Restaurant", ["restaurant", "bbq", "steakhouse", "diner", "sushi", "cuisine", "brasserie", "joint", "buffet",
                    "pizzeria", "parlor", "fish", "chips", "bistro", "dining", "deli"]),
    ("Cafe", ["coffee", "cafe", "tea", "bakery", "dessert", "café", "drinking", "breakfast", "gelato shop",
              "bagel", "sandwich", "snack", "cupcake", "pastry"]),
    ("Nightlife", ["bar", "pub", "club", "lounge", "casino", "speakeasy", "brewery", "roof deck"]),
    ("Retail", ["shopping", "store", "mall", "market", "food", "beverage", "boutique", "office", "plaza"]),
    ("Fitness", ["gym", "fitness", "yoga", "crossfit", "martial arts", "tennis", "sports", "football", "cricket", "stable",
                 "swimming", "bowling", "skating", "sporting", "sport", "soccer"]),
    ("Wellness", ["spa", "massage", "therapy", "sauna", "escape room", "psychic", "astrologer"]),
    ("Entertainment", ["theater", "cinema", "concert", "comedy", "recreation", "bingo", "music", "auditorium", "jazz",
                       "blues", "stadium", "gun", "race", "track"]),
    ("Cultural", ["museum", "art", "gallery", "library", "historic", "landmarks", "monument", "tour", "opera", "exhibit", "memorial"]),
    ("Outdoor", ["park", "trail", "beach", "zoo", "hiking", "playground", "outdoors", "tunnel", "fountain",
                 "scenic", "nature", "aquarium", "campground", "camp", "farm", "canal"]),
    ("Transport", ["train", "bus", "subway", "parking", "taxi", "tube", "dealership", "automotive", "car rental",
                   "shipping", "motorcycle", "fuel station", "harbor", "marina"]),
    ("Healthcare", ["hospital", "clinic", "pharmacy", "dentist", "veterinary", "medicine", "doctor", "surgeon", "surgery", "healthcare",
                    "physiotherapist", "physician", "psycho", "assisted living", "medical", "nutritionist", "ambulance"]),
    ("Services", ["bank", "atm", "post", "salon", "barber", "laundry", "child care", "agency", "photographer", "chimney",
                  "veterinarian", "telecommunication", "pet", "wedding", "architecture", "upholstery", "cleaning", "computer",
                  "photography", "audiovisual", "manufacturer", "auction", "designer", "event", "renewable energy", "hotel", "wholesaler"]),
    ("Organization", ["community", "government", "assistance", "legal", "environmental", "non-profit", "charity", "youth", "city hall",
                      "disabled", "military", "embassy", "consulate", "agriculture", "forestry", "courthouse", "police", "fire", "station"]),
    ("Education", ["school", "learning", "tutoring", "preschool", "kindergarten", "university", "college", "education"]),
    ("Religion", ["church", "cathedral", "seminary", "mosque", "temple", "synagogue", "faith", "monastery", "cemetery", "spiritual", "kingdom hall"]),
    ("Home Improvement", ["hvac", "home", "heating ventilating air conditioning", "landscape", "garden", "smith", "contractor",
                          "construction", "carpenter", "builder", "plumber", "housing", "electrician", "locksmith", "real estate"]),
]

In [None]:
def classify_category(category_name):
    """Return the first broad category whose keyword occurs in ``category_name``.

    The name is lower-cased and each keyword is tested as a plain substring,
    walking BROAD_CATEGORIES in order; the first group with any hit wins.
    When nothing matches, the original ``category_name`` is returned unchanged.

    NOTE(review): because the test is a substring check, short keywords also
    hit inside longer words (e.g. "bar" inside "barbershop") - confirm this is
    the intended behaviour.
    """
    lowered = category_name.lower()
    return next(
        (
            label
            for label, terms in BROAD_CATEGORIES
            if any(term in lowered for term in terms)
        ),
        category_name,
    )

In [None]:
def get_nearby_categories(lat, lon, radius=100, limit=3, timeout=10):
    """Look up venues around a coordinate and summarise their broad categories.

    Queries the Foursquare places-search endpoint at FOURSQUARE_URL and maps
    the first category of every returned place through ``classify_category``.

    Args:
        lat: Latitude, sent as the first half of the ``ll`` parameter.
        lon: Longitude, sent as the second half of the ``ll`` parameter.
        radius: Value sent as the ``radius`` request parameter.
        limit: Value sent as the ``limit`` request parameter.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang a long row-wise run.

    Returns:
        A ', '-joined string of the distinct broad categories in sorted order,
        the string "None" when no place carries a category, "API key missing"
        when FOURSQUARE_API_KEY is empty, or an "API error: ..." string when
        the request fails, returns a non-200 status, or returns invalid JSON.
    """
    if not FOURSQUARE_API_KEY:
        return "API key missing"

    headers = {
        "Authorization": FOURSQUARE_API_KEY,
        "Accept": "application/json"
    }
    params = {
        "ll": f"{lat},{lon}",
        "radius": radius,
        "limit": limit
    }

    # Failures are reported in-band, like HTTP errors, so one bad request does
    # not abort a long batch of lookups.
    try:
        response = requests.get(FOURSQUARE_URL, headers=headers, params=params, timeout=timeout)
        if response.status_code != 200:
            return f"API error: {response.status_code}"
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        return f"API error: {type(exc).__name__}"

    categories = set()
    for place in data.get('results', []):
        category_list = place.get('categories', [])
        if category_list:
            # Only the first listed category of each place is considered.
            categories.add(classify_category(category_list[0]['name']))

    # Sort so the joined string does not depend on set iteration order.
    return ', '.join(sorted(categories)) if categories else "None"

In [None]:
def _amenities_for_listing(row):
    """Fetch the nearby broad categories for one listing row."""
    return get_nearby_categories(row['latitude'], row['longitude'])


# One lookup per listing row; results are stored in the 'amenities' column.
inside_airbnb_df['amenities'] = inside_airbnb_df.apply(_amenities_for_listing, axis=1)

In [None]:
# Renumber rows 0..n-1 and discard the old index.
inside_airbnb_df = inside_airbnb_df.reset_index(drop=True)

In [None]:
# Remove the row labelled 1490.
# NOTE(review): the label is hard-coded and the reason for excluding this row
# is not recorded here - confirm it still targets the intended listing.
inside_airbnb_df = inside_airbnb_df.drop(index=1490)

In [None]:
# Preview the first rows, including the amenities column.
inside_airbnb_df.head()

In [None]:
def map_categories(amenities):
    """Normalise a ', '-separated amenities string to broad category names.

    Each entry is tested against BROAD_CATEGORIES in order; the first group
    with a keyword that appears as a whole word (case-insensitive) replaces
    the entry. Entries with no match are kept as they are.

    Args:
        amenities: String of entries separated by ', '.

    Returns:
        The distinct resulting names, sorted and joined with ', '.
    """
    resolved = set()

    for amenity in amenities.split(', '):
        for broad_category, keywords in BROAD_CATEGORIES:
            # re.escape keeps every keyword literal, so a keyword containing
            # regex metacharacters cannot change or break the pattern.
            if any(
                re.search(rf"\b{re.escape(keyword)}\b", amenity, re.IGNORECASE)
                for keyword in keywords
            ):
                resolved.add(broad_category)
                break
        else:
            # No group matched: keep the entry unchanged.
            resolved.add(amenity)

    return ', '.join(sorted(resolved))

In [None]:
# Second pass: rewrite every amenities string through map_categories.
inside_airbnb_df['amenities'] = inside_airbnb_df.amenities.map(map_categories)

In [None]:
# Flatten every listing's ', '-separated amenities into one list and tally the entries.
flatten_arr = []
for _listing_amenities in inside_airbnb_df.amenities.to_list():
    flatten_arr.extend(_listing_amenities.split(', '))
counts = Counter(flatten_arr)
counts

In [None]:
# Write the filtered, amenity-enriched listings next to the source data, without the row index.
selected_rentals_file = inside_airbnb_data_dir / 'selected_short_term_rentals.csv'
inside_airbnb_df.to_csv(selected_rentals_file, index=False)