In [None]:
# Import 
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
from geopy.distance import geodesic

In [None]:
# ----------------------------
# Step 1: Load and preprocess data
# ----------------------------
# Read the raw bike trip records.
data = pd.read_csv('zhong.csv')

# Parse the trip start timestamp and derive the hour-of-day it falls in.
data['start_time'] = pd.to_datetime(data['start_time'])
data['hour'] = data['start_time'].dt.hour

# Trip counts per (station, hour-of-day): arrivals are keyed by the end station,
# departures by the start station. The grouping ignores the date, so despite the
# `daily_` prefix these are hour-of-day totals over the whole file.
# NOTE(review): `hour` is taken from `start_time`, so arrivals are bucketed by
# the hour the trip started, not the hour it ended -- confirm this is intended.
daily_inflow = data.groupby(['end_station_name', 'hour']).size().rename('inflow')
daily_outflow = data.groupby(['start_station_name', 'hour']).size().rename('outflow')

# Put both counts side by side; a (station, hour) pair present in only one
# series gets 0 for the other. Total demand is the sum of the two raw counts.
demand_columns = ['inflow', 'outflow', 'total_demand']
demand_data = (
    pd.concat([daily_inflow, daily_outflow], axis=1)
    .fillna(0)
    .assign(total_demand=lambda counts: counts['inflow'] + counts['outflow'])
)

# Rescale the three demand columns with a min-max scaler. The fitted scaler is
# kept in `scaler` so scaled values can be converted back later.
# NOTE(review): the scaler is fitted on every row, before any train/test split.
scaler = MinMaxScaler()
demand_data[demand_columns] = scaler.fit_transform(demand_data[demand_columns])

# Turn the (station, hour) index into ordinary columns. The two groupbys name
# their station level differently, so after the concat that level comes out of
# reset_index() as 'level_0'; give it a meaningful name.
demand_data = demand_data.reset_index().rename(columns={'level_0': 'station_name'})

# Integer id per distinct station name (node-index mapping).
demand_data['station_id'] = pd.factorize(demand_data['station_name'])[0]

# ----------------------------
# Step 2: Create graph structure
# ----------------------------
# Stations no further apart than this (geodesic distance) are linked by an edge.
MAX_EDGE_DISTANCE_KM = 1.0
# Floor applied to a distance before inverting it, so two stations recorded at
# the same coordinates do not raise ZeroDivisionError.
MIN_EDGE_DISTANCE_KM = 1e-3

# One coordinate per station. Rows lacking a name or coordinate are dropped and
# repeated sightings of a station are averaged. A plain drop_duplicates() keeps
# one row per distinct (name, lat, lng) triple, which yields several "stations"
# for a single name whenever its recorded position varies between trips.
station_coords = (
    data[['start_station_name', 'start_lat', 'start_lng']]
    .dropna()
    .groupby('start_station_name', as_index=False)
    .mean()
)
station_coords.columns = ['station_name', 'lat', 'lng']

# Station-level proximity edges. Each unordered pair is measured once and then
# stored in both directions, because the distance is symmetric.
stations = station_coords.to_dict('records')
station_edges = []
station_edge_weights = []
for i, station1 in enumerate(stations):
    for station2 in stations[i + 1:]:
        distance = geodesic((station1['lat'], station1['lng']), (station2['lat'], station2['lng'])).kilometers
        if distance <= MAX_EDGE_DISTANCE_KM:
            # Weight inversely proportional to distance.
            weight = 1 / max(distance, MIN_EDGE_DISTANCE_KM)
            station_edges.append((station1['station_name'], station2['station_name']))
            station_edges.append((station2['station_name'], station1['station_name']))
            station_edge_weights.extend((weight, weight))

# Station name -> station id lookup (station-level ids only; not row positions).
station_index_map = dict(zip(demand_data['station_name'], demand_data['station_id']))

# Row position of every (station, hour) pair in demand_data. The node feature
# matrix is built from demand_data row by row, so edge_index has to address row
# positions. Indexing edges by station_id would link unrelated rows, because
# demand_data holds one row per (station, hour), not one row per station.
node_row = {
    key: row
    for row, key in enumerate(zip(demand_data['station_name'], demand_data['hour']))
}
hours = sorted(demand_data['hour'].unique())

# Expand each station-level edge into one edge per hour for which both stations
# have a row, connecting the two same-hour rows.
edges = []
edge_weights = []
for (name1, name2), weight in zip(station_edges, station_edge_weights):
    for hour in hours:
        src = node_row.get((name1, hour))
        dst = node_row.get((name2, hour))
        if src is not None and dst is not None:
            edges.append((src, dst))
            edge_weights.append(weight)

# PyTorch Geometric expects edge_index with shape (2, num_edges). An empty edge
# list needs an explicit (2, 0) tensor; torch.tensor([]).t() would be 1-D.
if edges:
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
else:
    edge_index = torch.empty((2, 0), dtype=torch.long)
edge_attr = torch.tensor(edge_weights, dtype=torch.float)

# Every edge endpoint must be a valid row of demand_data.
assert edge_index.numel() == 0 or int(edge_index.max()) < len(demand_data)

# ----------------------------
# Step 3: Prepare node features and labels
# ----------------------------
# One feature row per demand_data row: scaled inflow, scaled outflow and the
# (unscaled) hour.
# NOTE(review): the label below is total_demand, which was computed as
# inflow + outflow before scaling, so the target is derivable from the first
# two features -- confirm this is the intended prediction task.
feature_columns = ['inflow', 'outflow', 'hour']
node_features = demand_data[feature_columns].to_numpy()
x = torch.tensor(node_features, dtype=torch.float)

# Regression target: scaled total demand, shaped as a column vector so it lines
# up with the model's one-value-per-row output.
labels = demand_data['total_demand'].to_numpy()
y = torch.tensor(labels, dtype=torch.float).reshape(-1, 1)


In [None]:
# ----------------------------
# Step 4: Split data into train/test
# ----------------------------
# Randomly hold out 20% of the rows of x for testing (fixed random_state, so the
# split is the same on every run). Despite the `_mask` names, both tensors hold
# row indices, not boolean masks.
node_ids = np.arange(x.size(0))
train_ids, test_ids = train_test_split(node_ids, test_size=0.2, random_state=42)
train_mask = torch.tensor(train_ids, dtype=torch.long)
test_mask = torch.tensor(test_ids, dtype=torch.long)

# ----------------------------
# Step 5: Define GNN model
# ----------------------------
class GCN(nn.Module):
    """Two-layer graph convolutional network producing one output row per node.

    Args:
        input_dim: Number of features per node.
        hidden_dim: Width of the representation produced by the first layer.
        output_dim: Number of values predicted per node.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index, edge_weight=None):
        """Run both graph convolutions with a ReLU in between.

        Args:
            x: Node feature matrix, one row per node.
            edge_index: Edge list passed through to both GCNConv layers.
            edge_weight: Optional per-edge weights passed through to both
                GCNConv layers. Defaults to None, which keeps the original
                unweighted behaviour for callers that pass only two arguments.

        Returns:
            The output of the second convolution (no final activation).
        """
        x = self.conv1(x, edge_index, edge_weight)
        x = torch.relu(x)
        x = self.conv2(x, edge_index, edge_weight)
        return x

# Seed PyTorch's random number generator so the model starts from the same
# initial weights on every run; without it each kernel restart trains a
# different model. 42 matches the random_state used for the train/test split.
torch.manual_seed(42)

# Initialize the model
input_dim = x.size(1)  # one input unit per column of the node feature matrix
hidden_dim = 16
output_dim = 1  # a single predicted value per node
model = GCN(input_dim, hidden_dim, output_dim)

# Define loss and optimizer: mean squared error, optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
# ----------------------------
# Step 6: Train the model
# ----------------------------
NUM_EPOCHS = 200
LOG_INTERVAL = 10  # print the training loss every this many epochs

# Full-batch training: each epoch runs every node through the model, but only
# the rows listed in train_mask contribute to the loss.
model.train()
for epoch in range(NUM_EPOCHS):
    optimizer.zero_grad()
    out = model(x, edge_index)
    loss = criterion(out[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()

    if (epoch + 1) % LOG_INTERVAL != 0:
        continue
    print(f'Epoch {epoch + 1}, Loss: {loss.item():.4f}')

Epoch 10, Loss: 0.1013
Epoch 20, Loss: 0.0706
Epoch 30, Loss: 0.0184
Epoch 40, Loss: 0.0185
Epoch 50, Loss: 0.0118
Epoch 60, Loss: 0.0087
Epoch 70, Loss: 0.0062
Epoch 80, Loss: 0.0047
Epoch 90, Loss: 0.0035
Epoch 100, Loss: 0.0026
Epoch 110, Loss: 0.0020
Epoch 120, Loss: 0.0015
Epoch 130, Loss: 0.0012
Epoch 140, Loss: 0.0010
Epoch 150, Loss: 0.0009
Epoch 160, Loss: 0.0008
Epoch 170, Loss: 0.0008
Epoch 180, Loss: 0.0008
Epoch 190, Loss: 0.0007
Epoch 200, Loss: 0.0007


In [None]:
# ----------------------------
# Step 7: Evaluate the model
# ----------------------------
def _denormalize_total_demand(values_scaled):
    """Convert scaled total-demand values back to the scaler's original units.

    `scaler` was fitted on three columns (inflow, outflow, total_demand), so
    inverse_transform() needs a 3-column array. The values are written into the
    total_demand column (index 2) of a zero-filled array and only that column
    of the result is returned; the other two columns are padding.

    Args:
        values_scaled: Tensor of scaled total-demand values.

    Returns:
        1-D NumPy array with one de-normalized value per input element.
    """
    padded = np.zeros((values_scaled.numel(), 3))
    padded[:, 2] = values_scaled.detach().cpu().numpy().ravel()
    return scaler.inverse_transform(padded)[:, 2]


model.eval()
with torch.no_grad():
    predictions = model(x, edge_index)
    test_loss = criterion(predictions[test_mask], y[test_mask])
    print(f'Test Loss: {test_loss.item():.4f}')

predictions_denormalized = _denormalize_total_demand(predictions)
actual_denormalized = _denormalize_total_demand(y)

# Print sample predictions vs actual values. The count is capped at the size of
# the test split so a small dataset does not raise IndexError.
num_samples = min(20, len(test_mask))
for i in range(num_samples):
    node = int(test_mask[i])
    print(f'Predicted: {predictions_denormalized[node]:.2f}, Actual: {actual_denormalized[node]:.2f}')

Test Loss: 0.0007
Predicted: 2.16, Actual: 2.00
Predicted: 20.78, Actual: 22.00
Predicted: 2.52, Actual: 2.00
Predicted: 16.51, Actual: 1.00
Predicted: 39.50, Actual: 38.00
Predicted: 74.41, Actual: 74.00
Predicted: 5.29, Actual: 6.00
Predicted: 33.89, Actual: 31.00
Predicted: 46.28, Actual: 43.00
Predicted: 8.93, Actual: 8.00
Predicted: 88.54, Actual: 87.00
Predicted: 3.33, Actual: 3.00
Predicted: 14.29, Actual: 15.00
Predicted: 16.93, Actual: 17.00
Predicted: 17.54, Actual: 18.00
Predicted: 2.13, Actual: 2.00
Predicted: 55.65, Actual: 56.00
Predicted: 46.27, Actual: 46.00
Predicted: 5.97, Actual: 5.00
Predicted: 46.13, Actual: 45.00
