In [48]:
import os
import xml.etree.ElementTree as ET
from datetime import datetime

import pandas as pd
import requests
from IPython.display import display, HTML


In [49]:
## bike-docks real-time data url

# Point at the TfL live cycle-hire feed directly. An e-mail "Safe Links"
# redirect wrapper is not a stable endpoint for programmatic access and embeds
# the mail recipient's address in the query string.
url = "https://tfl.gov.uk/tfl/syndication/feeds/cycle-hire/livecyclehireupdates.xml"


In [95]:
## categorize time

def categorize_time(hour):
    """Map an hour of the day to a named part of the day.

    Each bucket is a half-open interval ``[start, end)``:

    * 5-8   -> "morning"
    * 8-12  -> "late morning"
    * 12-14 -> "noon"
    * 14-18 -> "afternoon"
    * 18-21 -> "evening"
    * 21-24 -> "night"

    Any value outside those intervals (e.g. 0-4) yields "late night".
    """
    buckets = (
        (5, 8, "morning"),
        (8, 12, "late morning"),
        (12, 14, "noon"),
        (14, 18, "afternoon"),
        (18, 21, "evening"),
        (21, 24, "night"),
    )
    for start, end, label in buckets:
        if start <= hour < end:
            return label
    # Nothing matched: small hours of the night (or an out-of-range value).
    return "late night"
        
# Bucket the current local hour; the bare name on the last line makes the
# notebook display the hour itself.
current_hour = datetime.now().hour
time_category = categorize_time(hour=current_hour)
current_hour

22

TfL's cycle-hire station feed provides the longitude and latitude of every docking station. These coordinates are passed to the OpenWeatherMap API to retrieve real-time weather for each station: temperature, humidity, wind speed, and a weather description. This requires one API call per station.

Because the cell makes a large number of API calls in quick succession, execution may fail and the cell may need to be re-run.

In [96]:
## combine dock data with weather data, both real-time

# This cell makes one OpenWeatherMap request per station, so it is slow and
# individual requests can fail. A failed weather lookup is recorded with
# placeholder values instead of aborting the whole run.

REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled request hangs the cell
WEATHER_URL = "https://api.openweathermap.org/data/2.5/weather"

# Credentials must not live in the notebook: read the key from the environment.
api_key = os.environ.get("OPENWEATHERMAP_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OPENWEATHERMAP_API_KEY environment variable before running this cell."
    )


def _fetch_weather(session, lat, lon):
    """Look up current weather for one coordinate.

    Parameters
    ----------
    session : requests.Session
        Session used for the request (reuses the HTTP connection across calls).
    lat, lon : float
        Station latitude and longitude.

    Returns
    -------
    tuple
        ``(temperature_celsius, humidity, weather, wind_speed)``. When the
        request raises a ``requests.RequestException`` or the response status
        is not 200, the placeholders ``(None, None, "Unknown", None)`` are
        returned instead.
    """
    placeholder = (None, None, "Unknown", None)
    try:
        weather_response = session.get(
            WEATHER_URL,
            params={"lat": lat, "lon": lon, "appid": api_key},
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException:
        # Network-level failure (timeout, DNS, connection reset, ...).
        return placeholder

    if weather_response.status_code != 200:
        return placeholder

    weather_data = weather_response.json()
    return (
        weather_data['main']['temp'] - 273.15,  # Convert Kelvin to Celsius
        weather_data['main']['humidity'],
        weather_data['weather'][0]['main'],
        weather_data['wind']['speed'],
    )


# fetch XML data from URL
response = requests.get(url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # fail loudly rather than try to parse an error page as XML
xml_data = response.content

# parse XML data
root = ET.fromstring(xml_data)

# Take the timestamp once so every row of this snapshot carries the same
# Date/Hour, even if the weather lookups run across an hour boundary.
snapshot_time = datetime.now()
date = snapshot_time.date()
current_hour = snapshot_time.hour
is_weekend = snapshot_time.weekday() >= 5
is_holiday = False   # placeholder for holiday, need additional logic to determine holidays
land_type = "Urban"  # placeholder for land type

# extract relevant data and convert to DataFrame
stations = []
with requests.Session() as session:
    for station in root.findall('.//station'):
        name = station.find('name').text
        lon = float(station.find('long').text)
        lat = float(station.find('lat').text)
        available_bikes = int(station.find('nbBikes').text)
        free_slots = int(station.find('nbEmptyDocks').text)

        # fetch weather information from "openweathermap"
        temperature, humidity, weather, wind_speed = _fetch_weather(session, lat, lon)

        stations.append({
            'Station': name,
            'Lon': lon,
            'Lat': lat,
            'Available Bikes': available_bikes,
            'Free Slots': free_slots,
            'Date': date,
            'Hour': current_hour,
            'Temperature': temperature,
            'Humidity': humidity,
            'Weather': weather,
            'Wind Speed': wind_speed,
            'Is Weekend': is_weekend,
            'Is Holiday': is_holiday,
            'Land Type': land_type
        })

df = pd.DataFrame(stations)

df.head()


Unnamed: 0,Station,Lon,Lat,Available Bikes,Free Slots,Date,Hour,Temperature,Humidity,Weather,Wind Speed,Is Weekend,Is Holiday,Land Type
0,"River Street , Clerkenwell",-0.109971,51.529163,15,0,2024-06-29,22,17.92,68,Clouds,2.06,True,False,Urban
1,"Phillimore Gardens, Kensington",-0.197574,51.499607,25,9,2024-06-29,22,18.44,66,Clouds,2.06,True,False,Urban
2,"Christopher Street, Liverpool Street",-0.084606,51.521284,0,32,2024-06-29,22,18.19,67,Clouds,2.06,True,False,Urban
3,"St. Chad's Street, King's Cross",-0.120974,51.530059,15,8,2024-06-29,22,18.27,66,Clouds,2.06,True,False,Urban
4,"Sedding Street, Sloane Square",-0.156876,51.49313,3,22,2024-06-29,22,18.26,67,Clouds,2.06,True,False,Urban


In [97]:
# save data to csv

# Name the file after the snapshot it contains (month-day-hour) so re-running
# the notebook at another time does not overwrite an earlier snapshot under a
# stale label. The stamp is taken from the data itself when available.
output_dir = "work/Documents/UCL_CASA"
snapshot_time = datetime.now()
snapshot_date = snapshot_time.date() if df.empty else df['Date'].iloc[0]
snapshot_hour = snapshot_time.hour if df.empty else int(df['Hour'].iloc[0])

csv_file_path = (
    f"{output_dir}/london_bike_stations_"
    f"{snapshot_date.month}-{snapshot_date.day}-{snapshot_hour}.csv"
)

# Create the target directory on first use instead of failing with OSError.
os.makedirs(output_dir, exist_ok=True)
df.to_csv(csv_file_path, index=False)

Root Mean Squared Error: 9.30976180471948


In [87]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# The snapshot files are assumed to contain the following columns:
# Date, Hour, Temperature, Humidity, Wind Speed, Weather, Land Type, Free Slots, Name, Is Weekend, Is Holiday
# NOTE(review): the code below groups by a 'Station' column rather than 'Name' - confirm the list above.

# 1. Load data
files = [
            "work/Documents/UCL_CASA/london_bike_stations_6-29-17.csv",
            "work/Documents/UCL_CASA/london_bike_stations_6-29-18.csv",
            "work/Documents/UCL_CASA/london_bike_stations_6-29-19.csv"
        ]

# Read every snapshot and stack them with a single concat. Growing a DataFrame
# inside the loop re-copies all accumulated rows on each iteration, and using a
# comprehension avoids rebinding the notebook-level name `df`.
data = pd.concat(
    [pd.read_csv(file, parse_dates=['Date']) for file in files],
    ignore_index=True,
)

# 2. Prepare the time and weather columns
# Make sure the date column has a datetime dtype
data['Date'] = pd.to_datetime(data['Date'])

# Put each station's rows in chronological order BEFORE building the lag
# feature: shift(1) returns "the previous row" of the group, which is only the
# previous observation in time when the rows are ordered by time.
data = data.sort_values(['Station', 'Date', 'Hour']).reset_index(drop=True)

# Treat Hour, Weather and Land Type as categorical variables
data['Hour'] = data['Hour'].astype('category')
data['Weather'] = data['Weather'].astype('category')
data['Land Type'] = data['Land Type'].astype('category')

# Alternative (not used): integer-encode Hour / Weather / Land Type with
# sklearn's LabelEncoder instead of the pandas category dtype.
# Is Weekend / Is Holiday are used as read from the CSV, without remapping.

# 3. Add the lag feature
# For every station, the previous observation of Free Slots (the demand proxy).
# This is a one-hour lag provided consecutive snapshots are one hour apart.
data['Demand_Lag1'] = data.groupby('Station')['Free Slots'].shift(1)

# 4. Per-station chronological split into training and test sets
train_parts = []
test_parts = []

# Walk over each station's rows
for name, group in data.groupby('Station'):
    # Order the station's rows by time so the first 80% really is the earlier
    # period and the last 20% the later one (no look-ahead into the test span).
    group = group.sort_values(['Date', 'Hour'])
    split_index = int(len(group) * 0.8)  # first 80% for training, remaining 20% for testing

    train_parts.append(group.iloc[:split_index])
    test_parts.append(group.iloc[split_index:])

# Concatenate once instead of growing the frames inside the loop, then order
# the combined sets by time.
train_data = pd.concat(train_parts).sort_values(by=['Date', 'Hour'])
test_data = pd.concat(test_parts).sort_values(by=['Date', 'Hour'])

# 5. Build the feature matrices and target vectors
# The target, the station identifier and the raw date are not used as features.
# NOTE(review): every other column stays in X. If the frames carry an
# 'Available Bikes' column recorded at the same instant as 'Free Slots', it may
# leak the target - confirm this is intended.
target_col = 'Free Slots'
non_feature_cols = [target_col, 'Station', 'Date']

X_train = train_data.drop(columns=non_feature_cols)
y_train = train_data[target_col]

X_test = test_data.drop(columns=non_feature_cols)
y_test = test_data[target_col]

# 6. Fit the XGBoost regressor (category-dtype columns are passed through as-is)
model = XGBRegressor(enable_categorical=True)
model.fit(X_train, y_train)

# 7. Predict on the held-out rows and report the error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Possible next step: tune the hyper-parameters, e.g. with a grid search.

# 8. Optional: persist the fitted model
# model.save_model('xgb_model.bin')

# This completes building and evaluating the per-station time-series model.


                                Station       Lon        Lat  Available Bikes  \
103   Abbey Orchard Street, Westminster -0.132102  51.498126               11   
512       Abbotsbury Road, Holland Park -0.205991  51.501391               12   
364     Aberdeen Place, St. John's Wood -0.176268  51.524826                5   
508            Aberfeldy Street, Poplar -0.005659  51.513548               14   
532         Abingdon Green, Westminster -0.125972  51.497640                7   
...                                 ...       ...        ...              ...   
1181               Wren Street, Holborn -0.116279  51.524564                9   
936           Wright's Lane, Kensington -0.193068  51.500398                8   
1556              Wynne Road, Stockwell -0.112687  51.469217                6   
1192           York Hall, Bethnal Green -0.055894  51.528936               24   
1563              York Way, Kings Cross -0.125441  51.541596                5   

      Free Slots       Date