In [1]:
from pathlib import Path
import pandas as pd
import requests
import os
import re
from collections import Counter
from geopy.distance import geodesic
from scipy.spatial import KDTree
import numpy as np

In [2]:
# Lift pandas' row cap so displayed frames are never truncated.
pd.set_option("display.max_rows", None)

In [3]:
# Directory holding the Inside Airbnb London files, resolved under the
# current user's home directory.
home_dir = Path.home()
inside_airbnb_data_dir = home_dir.joinpath('Programming/data/inside-airbnb/london')

In [4]:
# TfL endpoint requested below for tube stop points; the JSON response is
# read through its "stopPoints" list.
TFL_API_URL = "https://api.tfl.gov.uk/StopPoint/Mode/tube"

In [5]:
# Read the selected short-term rental listings from the data directory.
inside_airbnb_data_file = inside_airbnb_data_dir.joinpath('selected_short_term_rentals.csv')
inside_airbnb_df = pd.read_csv(filepath_or_buffer=inside_airbnb_data_file)

In [6]:
# Download the tube stop points from the TfL API.
# Without an explicit timeout, requests waits indefinitely on a stalled
# connection and the notebook hangs; 30 seconds bounds both the connect
# phase and each wait for data from the server.
response = requests.get(TFL_API_URL, timeout=30)
if response.status_code != 200:
    # Fail loudly rather than continue with a missing or partial payload.
    raise Exception(f"API error: {response.status_code}")
data = response.json()

In [7]:
# One (name, latitude, longitude) tuple per stop point in the API payload.
tube_stations = [
    (stop["commonName"], stop["lat"], stop["lon"])
    for stop in data["stopPoints"]
]

In [8]:
# Tabulate the stations, then index their (latitude, longitude) pairs in a
# KD-tree so nearest-neighbour lookups do not have to scan every station.
station_columns = ["Station", "Latitude", "Longitude"]
tube_df = pd.DataFrame(tube_stations, columns=station_columns)
tube_coords = tube_df.loc[:, ["Latitude", "Longitude"]].to_numpy()
tree = KDTree(tube_coords)

In [9]:
def find_nearest_station(lat, lon):
    """Return the closest tube station to a coordinate and its distance in km.

    ``tree`` indexes stations by raw (latitude, longitude) degrees, so its
    nearest neighbour is only nearest in "degree space". A degree of
    longitude spans fewer kilometres than a degree of latitude (by roughly
    cos(latitude)), so the tree's answer can differ from the station that is
    actually closest on the ground. The tree hit is therefore used only as
    an upper bound: every station inside a radius wide enough to contain
    anything closer is re-ranked by geodesic distance.

    Args:
        lat: Latitude of the query point, in decimal degrees.
        lon: Longitude of the query point, in decimal degrees.

    Returns:
        A ``(station_name, distance_km)`` tuple. ``station_name`` has the
        " Underground Station" / " Station" suffix removed; ``distance_km``
        is the geodesic distance from the query point to that station.
    """
    # Conservative (slightly low) length of one degree of latitude in km, so
    # the search radius derived from it errs on the wide side.
    km_per_degree = 110.5
    query_point = [lat, lon]

    # First guess: nearest station in degree space.
    _, seed_index = tree.query(query_point)
    nearest_station = tube_df.iloc[seed_index]
    distance = geodesic(
        (lat, lon), (nearest_station.Latitude, nearest_station.Longitude)
    ).kilometers

    # Any station closer than `distance` on the ground lies within this many
    # degrees of the query point. Longitude is the most compressed axis, so
    # it sets the bound; the 5% margin absorbs the approximation error and
    # the floor on the cosine avoids dividing by zero at the poles.
    km_per_degree_lon = km_per_degree * max(np.cos(np.radians(lat)), 1e-6)
    search_radius_deg = 1.05 * distance / km_per_degree_lon

    for candidate_index in tree.query_ball_point(query_point, search_radius_deg):
        if candidate_index == seed_index:
            continue
        candidate = tube_df.iloc[candidate_index]
        candidate_distance = geodesic(
            (lat, lon), (candidate.Latitude, candidate.Longitude)
        ).kilometers
        if candidate_distance < distance:
            nearest_station, distance = candidate, candidate_distance

    station_name = (
        nearest_station.Station
        .replace(" Underground Station", "")
        .replace(" Station", "")
    )
    return station_name, distance

In [10]:
# Attach the nearest station and its distance to every listing.
def _station_for_listing(listing):
    """Look up the nearest station for one listing row."""
    return find_nearest_station(listing['latitude'], listing['longitude'])


station_lookup = inside_airbnb_df.apply(_station_for_listing, axis=1, result_type='expand')
inside_airbnb_df[['nearest_station', 'distance_to_station']] = station_lookup

In [11]:
# Save the enriched listings in the same directory as the input file.
output_file = inside_airbnb_data_dir / 'selected_short_term_rentals_with_distances.csv'
inside_airbnb_df.to_csv(output_file, index=False)