In [None]:
# Install required packages
!pip install pandas osmnx scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Uninstall numpy, scikit-learn, and osmnx
!pip uninstall -y numpy scikit-learn osmnx

# Reinstall numpy, scikit-learn, and osmnx in a specific order
!pip install numpy
!pip install scikit-learn
!pip install osmnx

Found existing installation: numpy 1.22.4
Uninstalling numpy-1.22.4:
  Successfully uninstalled numpy-1.22.4
Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting numpy
  Downloading numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
yellowbrick 1.5 requires scikit-learn>=1.0.0, which is not installed.
sklearn-pandas 2.2.0 requires scikit-learn>=0.23.0, which is not installed.
qudida 0.0.4 requires scikit-learn>=0.19.1, which is not i

In [None]:
import os
import zipfile
from datetime import datetime

import pandas as pd
import osmnx as ox
from sklearn.linear_model import LinearRegression
from google.colab import files
import concurrent.futures

# Upload kaggle.json
# Opens the Colab file-upload widget; `uploaded` maps each uploaded file name
# to its contents, and the loop only reports name and size.
uploaded = files.upload()
for fn in uploaded.keys():
    print(f'User uploaded file "{fn}" with length {len(uploaded[fn])} bytes')

# Set up Kaggle API
# Moves the credentials file into ~/.kaggle and restricts it to the owner.
# The `mv` expects the uploaded file to be named exactly kaggle.json.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

# Download dataset
# The archive is downloaded into the current directory and then extracted
# there (-o overwrites existing files, -q suppresses the file listing).
!kaggle datasets download -d arashnic/microsoft-geolife-gps-trajectory-dataset
!unzip -q -o microsoft-geolife-gps-trajectory-dataset.zip
 
def load_geolife_trajectory(zip_path, file):
    """Read one GeoLife ``.plt`` trajectory out of a zip archive.

    A PLT data row has seven comma-separated fields: latitude, longitude,
    an unused constant, altitude, date as a fractional day count, date string
    and time string. All seven must be named when parsing; with fewer names
    pandas silently turns the leading columns (the real latitude/longitude)
    into an index and shifts every remaining label.

    Args:
        zip_path: Path (or file-like object) of the zip archive.
        file: Name of the ``.plt`` member inside the archive.

    Returns:
        DataFrame with columns ``lat``, ``lon``, ``alt`` and ``timestamp``
        (``timestamp`` parsed from the date and time strings).
    """
    with zipfile.ZipFile(zip_path, 'r') as zfile:
        with zfile.open(file) as plt_file:
            raw = pd.read_csv(
                plt_file,
                skiprows=6,  # the first six lines of a PLT file are header text
                header=None,
                names=['lat', 'lon', 'unused', 'alt', 'days', 'date', 'time'],
                usecols=['lat', 'lon', 'alt', 'date', 'time'],
            )

    trajectory = raw[['lat', 'lon', 'alt']].copy()
    trajectory['timestamp'] = pd.to_datetime(raw['date'] + ' ' + raw['time'])
    return trajectory

def parallel_load_geolife_trajectories(zip_path, max_files=1000, n_workers=None):
    """Load up to ``max_files`` GeoLife trajectories from a zip in parallel.

    Args:
        zip_path: Path of the zip archive containing ``.plt`` members.
        max_files: Maximum number of ``.plt`` members to load, taken in
            archive listing order.
        n_workers: Worker process count passed to ``ProcessPoolExecutor``
            (``None`` lets the executor choose).

    Returns:
        List of trajectory DataFrames, in the same order as the selected
        archive members, so repeated runs yield identically ordered results.
    """
    with zipfile.ZipFile(zip_path, 'r') as zfile:
        plt_files = [name for name in zfile.namelist() if name.endswith(".plt")]

    selected = plt_files[:max_files]
    if not selected:
        # Nothing to load: avoid spinning up worker processes for no work.
        return []

    with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor:
        tasks = [executor.submit(load_geolife_trajectory, zip_path, name) for name in selected]
        # Collect in submission order rather than completion order so the
        # output order is deterministic.
        return [task.result() for task in tasks]

# Trajectories are read directly from the downloaded zip archive
# (path is specific to the Colab /content working directory).
geolife_zip = "/content/microsoft-geolife-gps-trajectory-dataset.zip"
trajectories = parallel_load_geolife_trajectories(geolife_zip)


# Download and create the graph
# Fetches the drivable street network for the named place via OSMnx and
# writes a copy of it to disk as a GeoPackage.
place_name = "Beijing, China"
graph = ox.graph_from_place(place_name, network_type='drive')
ox.io.save_graph_geopackage(graph, filepath='osmnx_data/Beijing')

import numpy as np
from scipy.spatial import KDTree

def build_kdtree(graph):
    """Index the graph's nodes in a KD-tree keyed by (y, x) coordinates.

    Args:
        graph: Object whose ``nodes(data=True)`` yields ``(node_id, attrs)``
            pairs, where ``attrs`` has ``'y'`` and ``'x'`` entries.

    Returns:
        ``(kdtree, node_ids)``: the tree and a tuple of node ids such that
        ``node_ids[i]`` is the node stored at tree index ``i``.
    """
    # Unzip the (node_id, attrs) pairs into two parallel tuples.
    node_ids, attr_dicts = zip(*graph.nodes(data=True))
    yx_pairs = [(attrs['y'], attrs['x']) for attrs in attr_dicts]
    tree = KDTree(np.array(yx_pairs))
    return tree, node_ids

def map_gps_points_to_nodes(trajectory, kdtree, nodes, graph):
    """Snap every GPS point of one trajectory to its nearest graph node.

    Args:
        trajectory: DataFrame with ``lat``, ``lon`` and ``timestamp`` columns.
        kdtree: KD-tree built over node coordinates in (lat, lon) order.
        nodes: Sequence of node ids; ``nodes[i]`` is the node at tree index ``i``.
        graph: Unused; kept so existing callers keep working.

    Returns:
        List of ``(timestamp, node_id)`` tuples, one per trajectory row, in
        row order. Empty list for an empty trajectory.
    """
    if len(trajectory) == 0:
        return []

    # One batched nearest-neighbour query for the whole trajectory instead of
    # a Python-level loop with one query per row.
    coords = trajectory[['lat', 'lon']].to_numpy()
    _, nearest_idx = kdtree.query(coords)

    return [(timestamp, nodes[idx]) for timestamp, idx in zip(trajectory['timestamp'], nearest_idx)]

def parallel_map_gps_points_to_nodes(trajectories, graph, n_workers=None):
    """Map every trajectory's GPS points to graph nodes using a thread pool.

    Args:
        trajectories: Iterable of trajectory DataFrames.
        graph: Street graph used to build the KD-tree of node coordinates.
        n_workers: Thread count passed to ``ThreadPoolExecutor``
            (``None`` lets the executor choose).

    Returns:
        List of mapped trajectories where element ``i`` corresponds to
        ``trajectories[i]``.
    """
    kdtree, nodes = build_kdtree(graph)
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
        tasks = [
            executor.submit(map_gps_points_to_nodes, trajectory, kdtree, nodes, graph)
            for trajectory in trajectories
        ]
        # Collect in submission order (not completion order) so the output
        # stays aligned with the input trajectories.
        mapped_trajectories = [task.result() for task in tasks]
    return mapped_trajectories

mapped_trajectories = parallel_map_gps_points_to_nodes(trajectories, graph)



import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

input_size = 10  # Maximum number of points kept from each trajectory
embedding_size = 20  # Width of the node embedding and of the target vectors

# Prepare input data for the model
def prepare_input_data(mapped_trajectories, max_len=None):
    """Turn mapped trajectories into 1-D tensors of node ids.

    Args:
        mapped_trajectories: Iterable of trajectories, each a sequence of
            ``(timestamp, node_id)`` pairs.
        max_len: Number of leading points to keep per trajectory. Defaults to
            the notebook-level ``input_size``.

    Returns:
        List with one ``torch.long`` tensor per trajectory holding the node
        ids of its first ``max_len`` points (timestamps are dropped).
    """
    if max_len is None:
        max_len = input_size
    return [
        torch.tensor([node_id for _, node_id in trajectory[:max_len]], dtype=torch.long)
        for trajectory in mapped_trajectories
    ]

# Seed the RNG so the random targets (and later weight initialisation) are
# reproducible across runs.
torch.manual_seed(42)

node_sequences = prepare_input_data(mapped_trajectories)

# The node ids are the raw keys of the street graph, not dense indices.
# Sizing an embedding table by the largest raw id would allocate one row per
# possible id, so remap the ids that actually occur to 1..V and keep 0 free
# for padding.
node_vocab = torch.unique(torch.cat(node_sequences))  # sorted unique raw ids
node_sequences = [torch.searchsorted(node_vocab, seq) + 1 for seq in node_sequences]

inputs = pad_sequence(node_sequences, batch_first=True, padding_value=0)
# Random placeholder targets, one vector per trajectory.
labels = torch.randn(len(mapped_trajectories), embedding_size)

# Largest index present in `inputs`; the model sizes its embedding table as
# max_node_id + 1.
max_node_id = int(inputs.max())


# Define a TensorDataset to hold the data
dataset = TensorDataset(inputs, labels)

# Define a DataLoader to iterate over the data in batches
batch_size = 1
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define the ST2Vec model with spatial and temporal modeling and STCF
class ST2Vec(nn.Module):
    """Sequence encoder with two parallel branches over a shared node embedding.

    Each node index is embedded, passed through a two-layer linear+ReLU
    "spatial" branch and a two-layer linear+ReLU "temporal" branch, the two
    branch outputs are concatenated and fused by a linear layer, and the
    result is averaged over the sequence.

    Index 0 is treated as padding: its embedding is fixed at zero and padded
    positions are excluded from the sequence average.

    Args:
        input_size: Unused; kept so existing callers keep working.
        embedding_size: Width of the embedding and of the output vector.
        num_nodes: Number of rows in the embedding table. Defaults to the
            notebook-level ``max_node_id + 1``.
    """

    def __init__(self, input_size, embedding_size, num_nodes=None):
        super().__init__()
        if num_nodes is None:
            # Backward-compatible default: size the table from the
            # notebook-level max_node_id computed during data preparation.
            num_nodes = int(max_node_id) + 1
        self.embedding = nn.Embedding(
            num_embeddings=num_nodes,
            embedding_dim=embedding_size,
            padding_idx=0,
        )
        self.spatial_gc1 = nn.Linear(embedding_size, embedding_size)
        self.spatial_gc2 = nn.Linear(embedding_size, embedding_size)
        self.temporal_gc1 = nn.Linear(embedding_size, embedding_size)
        self.temporal_gc2 = nn.Linear(embedding_size, embedding_size)
        self.stcf = nn.Linear(embedding_size * 2, embedding_size)

    def forward(self, x):
        """Encode a batch of index sequences.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)``; 0 marks padding.

        Returns:
            Float tensor of shape ``(batch, embedding_size)``.
        """
        # Both branches start from the same lookup, so do it once.
        embedded = self.embedding(x)

        spatial_x = F.relu(self.spatial_gc1(embedded))
        spatial_x = F.relu(self.spatial_gc2(spatial_x))

        temporal_x = F.relu(self.temporal_gc1(embedded))
        temporal_x = F.relu(self.temporal_gc2(temporal_x))

        fused = torch.cat((spatial_x, temporal_x), dim=2)
        fused = F.relu(self.stcf(fused))

        # Average over real positions only, so the amount of padding in a
        # batch does not change a sequence's representation.
        mask = (x != 0).unsqueeze(-1).to(fused.dtype)
        counts = mask.sum(dim=1).clamp(min=1.0)  # guard all-padding rows
        return (fused * mask).sum(dim=1) / counts

# Instantiate an object of the ST2Vec class
model = ST2Vec(input_size, embedding_size)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    # Use batch-specific names: reusing `inputs`/`labels` as loop variables
    # would overwrite the full dataset tensors that are embedded after
    # training, leaving only the last mini-batch.
    for batch_inputs, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Compute the validation loss after each epoch
    # NOTE: this batch is drawn from train_loader, not from held-out data, so
    # the reported number is a training-set estimate.
    with torch.no_grad():
        val_inputs, val_labels = next(iter(train_loader))
        val_outputs = model(val_inputs)
        val_loss = criterion(val_outputs, val_labels)

    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {running_loss/len(train_loader):.4f}, Validation Loss: {val_loss:.4f}")

# Use the model to compute the embedding for every input sequence
with torch.no_grad():
    embedding = model(inputs)

# Compute the t-SNE visualization of the embedding
# The iteration count is left at the library default (1000) because the name
# of that keyword differs between scikit-learn releases.
tsne = TSNE(n_components=2, perplexity=1)
embedding_tsne = tsne.fit_transform(embedding.numpy())

# Visualize the t-SNE embedding
plt.scatter(embedding_tsne[:, 0], embedding_tsne[:, 1])
plt.title('t-SNE visualization of ST2Vec embeddings')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()




RuntimeError: ignored

RuntimeError: ignored

ImportError: ignored

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

input_size = 10  # Maximum number of points kept from each trajectory
embedding_size = 20  # Width of the node embedding and of the target vectors

# Prepare input data for the model
def prepare_input_data(mapped_trajectories, max_len=None):
    """Turn mapped trajectories into 1-D tensors of node ids.

    Args:
        mapped_trajectories: Iterable of trajectories, each a sequence of
            ``(timestamp, node_id)`` pairs.
        max_len: Number of leading points to keep per trajectory. Defaults to
            the notebook-level ``input_size``.

    Returns:
        List with one ``torch.long`` tensor per trajectory holding the node
        ids of its first ``max_len`` points (timestamps are dropped).
    """
    if max_len is None:
        max_len = input_size
    return [
        torch.tensor([node_id for _, node_id in trajectory[:max_len]], dtype=torch.long)
        for trajectory in mapped_trajectories
    ]

# Seed the RNG so the random targets (and later weight initialisation) are
# reproducible across runs.
torch.manual_seed(42)

node_sequences = prepare_input_data(mapped_trajectories)

# The node ids are the raw keys of the street graph, not dense indices.
# Sizing an embedding table by the largest raw id would allocate one row per
# possible id, so remap the ids that actually occur to 1..V and keep 0 free
# for padding.
node_vocab = torch.unique(torch.cat(node_sequences))  # sorted unique raw ids
node_sequences = [torch.searchsorted(node_vocab, seq) + 1 for seq in node_sequences]

inputs = pad_sequence(node_sequences, batch_first=True, padding_value=0)
# Random placeholder targets, one vector per trajectory.
labels = torch.randn(len(mapped_trajectories), embedding_size)

# Largest index present in `inputs`; the model sizes its embedding table as
# max_node_id + 1.
max_node_id = int(inputs.max())


# Define a TensorDataset to hold the data
dataset = TensorDataset(inputs, labels)

# Define a DataLoader to iterate over the data in batches
batch_size = 1
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define the ST2Vec model with spatial and temporal modeling and STCF
class ST2Vec(nn.Module):
    """Sequence encoder with two parallel branches over a shared node embedding.

    Each node index is embedded, passed through a two-layer linear+ReLU
    "spatial" branch and a two-layer linear+ReLU "temporal" branch, the two
    branch outputs are concatenated and fused by a linear layer, and the
    result is averaged over the sequence.

    Index 0 is treated as padding: its embedding is fixed at zero and padded
    positions are excluded from the sequence average.

    Args:
        input_size: Unused; kept so existing callers keep working.
        embedding_size: Width of the embedding and of the output vector.
        num_nodes: Number of rows in the embedding table. Defaults to the
            notebook-level ``max_node_id + 1``.
    """

    def __init__(self, input_size, embedding_size, num_nodes=None):
        super().__init__()
        if num_nodes is None:
            # Backward-compatible default: size the table from the
            # notebook-level max_node_id computed during data preparation.
            num_nodes = int(max_node_id) + 1
        self.embedding = nn.Embedding(
            num_embeddings=num_nodes,
            embedding_dim=embedding_size,
            padding_idx=0,
        )
        self.spatial_gc1 = nn.Linear(embedding_size, embedding_size)
        self.spatial_gc2 = nn.Linear(embedding_size, embedding_size)
        self.temporal_gc1 = nn.Linear(embedding_size, embedding_size)
        self.temporal_gc2 = nn.Linear(embedding_size, embedding_size)
        self.stcf = nn.Linear(embedding_size * 2, embedding_size)

    def forward(self, x):
        """Encode a batch of index sequences.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)``; 0 marks padding.

        Returns:
            Float tensor of shape ``(batch, embedding_size)``.
        """
        # Both branches start from the same lookup, so do it once.
        embedded = self.embedding(x)

        spatial_x = F.relu(self.spatial_gc1(embedded))
        spatial_x = F.relu(self.spatial_gc2(spatial_x))

        temporal_x = F.relu(self.temporal_gc1(embedded))
        temporal_x = F.relu(self.temporal_gc2(temporal_x))

        fused = torch.cat((spatial_x, temporal_x), dim=2)
        fused = F.relu(self.stcf(fused))

        # Average over real positions only, so the amount of padding in a
        # batch does not change a sequence's representation.
        mask = (x != 0).unsqueeze(-1).to(fused.dtype)
        counts = mask.sum(dim=1).clamp(min=1.0)  # guard all-padding rows
        return (fused * mask).sum(dim=1) / counts

# Instantiate an object of the ST2Vec class
model = ST2Vec(input_size, embedding_size)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    # Use batch-specific names: reusing `inputs`/`labels` as loop variables
    # would overwrite the full dataset tensors that are embedded after
    # training, leaving only the last mini-batch.
    for batch_inputs, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Compute the validation loss after each epoch
    # NOTE: this batch is drawn from train_loader, not from held-out data, so
    # the reported number is a training-set estimate.
    with torch.no_grad():
        val_inputs, val_labels = next(iter(train_loader))
        val_outputs = model(val_inputs)
        val_loss = criterion(val_outputs, val_labels)

    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {running_loss/len(train_loader):.4f}, Validation Loss: {val_loss:.4f}")

# Use the model to compute the embedding for every input sequence
with torch.no_grad():
    embedding = model(inputs)

# Compute the t-SNE visualization of the embedding
# The iteration count is left at the library default (1000) because the name
# of that keyword differs between scikit-learn releases.
tsne = TSNE(n_components=2, perplexity=1)
embedding_tsne = tsne.fit_transform(embedding.numpy())

# Visualize the t-SNE embedding
plt.scatter(embedding_tsne[:, 0], embedding_tsne[:, 1])
plt.title('t-SNE visualization of ST2Vec embeddings')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()


In [None]:
def build_kdtree(graph):
    """Build a KD-tree over node coordinates in (lon, lat) order.

    A node's position is taken from its ``'latlon'`` attribute (a
    ``(lat, lon)`` pair) when present, otherwise from its ``'y'``/``'x'``
    attributes. Nodes with neither are skipped.

    Args:
        graph: Object whose ``nodes(data=True)`` yields ``(node_id, attrs)``
            pairs with dict-like ``attrs``.

    Returns:
        ``(nodes, kdtree)``: ``nodes`` is a list of ``(node_id, (lat, lon))``
        tuples and ``kdtree`` indexes the same nodes, in the same order, by
        ``(lon, lat)``.

    Raises:
        ValueError: If no node carries usable coordinates.
    """
    nodes = []
    for node, attrs in graph.nodes(data=True):
        latlon = attrs.get('latlon')
        if latlon is None and 'x' in attrs and 'y' in attrs:
            latlon = (attrs['y'], attrs['x'])
        if latlon is not None:
            nodes.append((node, latlon))

    if not nodes:
        raise ValueError("graph has no nodes with coordinates ('latlon' or 'x'/'y')")

    # Kept as a plain list: packing (id, (lat, lon)) pairs into one numpy
    # array would require a ragged array.
    xy = np.array([(latlon[1], latlon[0]) for _, latlon in nodes], dtype=float)
    kdtree = KDTree(xy)
    return nodes, kdtree

def map_gps_points_to_nodes(trajectories, graph, distance=50):
    """Snap the GPS points of several trajectories to nearby graph nodes.

    Args:
        trajectories: Iterable of DataFrames with ``lon``, ``lat`` and
            ``timestamp`` columns.
        graph: Graph handed to ``build_kdtree``.
        distance: Upper bound passed to the KD-tree query; points with no
            node within it are dropped.

    Returns:
        One list per trajectory of ``(timestamp, node_id)`` pairs for the
        points that found a node.
    """
    # NOTE(review): the tree is queried in lon/lat coordinates, so `distance`
    # is measured in those units; the default of 50 looks like it was meant
    # as metres -- TODO confirm.
    # NOTE(review): an earlier cell defines a function with this same name but
    # a different signature (one trajectory plus a prebuilt tree); running
    # this cell replaces it.
    nodes, kdtree = build_kdtree(graph)
    missing = len(nodes)  # index the query reports when nothing is in range

    def _snap(trajectory):
        lonlat = trajectory[['lon', 'lat']].values
        _, hit_idx = kdtree.query(lonlat, distance_upper_bound=distance)
        found = hit_idx != missing

        stamps = trajectory.loc[found, 'timestamp'].values
        node_ids = np.array([nodes[i][0] for i in hit_idx[found]])
        return list(zip(stamps, node_ids))

    return [_snap(trajectory) for trajectory in trajectories]
