# Imports

In [None]:
!pip install -q wandb
!pip install -q torch-geometric
!pip install -q pytorch-lightning

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.1/309.1 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m866.2/866.2 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import re
import math
import pickle
from typing import Dict, Iterator, List, Union, Optional
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import torch
import torch.nn.functional as F
import torch.optim as optim
import torchmetrics
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import (
    MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler, Normalizer
)
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
import wandb

import torch_geometric
from torch_geometric.data import Data, Batch
from torch_geometric.utils import from_networkx
from torch_geometric.nn import GCNConv,GATConv, global_mean_pool

# Single seed value shared by the whole notebook for reproducibility.
SEED: int = 42
# Seed the random number generators through PyTorch Lightning's helper
# (the cell output confirms "Seed set to 42").
pl.seed_everything(SEED)

INFO:lightning_fabric.utilities.seed:Seed set to 42


42

# Device

In [None]:
# Shell escape: print the GPU status of the current runtime.
! nvidia-smi
# Use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

Tue Aug 27 17:39:18 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Preprocessing
Since the preprocessing is computationally expensive, we download the already preprocessed dataset and the other required artifacts directly from a git repository.
The following cells therefore only define the functions we used for preprocessing; the calls that ran them are left commented out for reference.

## Create Dataframe

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# columns = ['User ID',
#  'Venue ID',
#  'Venue category ID',
#  'Venue category name',
#  'Latitude',
#  'Longitude',
#  'Timezone',
#  'UTC time']

# df = pd.read_csv('/content/drive/MyDrive/Deep Learning Project/data/dataset_tsmc2014/dataset_TSMC2014_NYC.txt', sep='\t', encoding='latin-1', names=columns)

## Encode dates and POI

In [None]:
def encode_date(time):
    """
    Encode a list of date strings into integers in the format YYYYMMDDHHMMSS.

    Each string is split on whitespace and read positionally: the second token
    is the abbreviated English month name, the third is the day of the month,
    the fourth is the ``HH:MM:SS`` clock time and the last token is the year.
    Tokens in between (for example a ``+0000`` offset) are ignored.

    Day, hour, minute and second are zero-padded to two digits, so that inputs
    such as ``"3"`` or ``"8:05:09"`` still produce a 14-digit value that sorts
    chronologically and can be parsed back with ``'%Y%m%d%H%M%S'``.

    Parameters
    ----------
    time : list of str
        List of date strings, e.g. ``"Tue Apr 03 18:00:09 +0000 2012"``.

    Returns
    -------
    list of int
        List of encoded dates as integers.

    Raises
    ------
    KeyError
        If the month token is not a three-letter English abbreviation.
    IndexError
        If a string has too few tokens or the clock token has fewer than
        three ``:``-separated parts.
    """
    month_names = (
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
    )
    # 'Jan' -> '01', ..., 'Dec' -> '12'
    month_codes = {name: f"{number:02d}" for number, name in enumerate(month_names, start=1)}

    encoded = []
    for stamp in time:
        tokens = stamp.split()
        year = tokens[-1]
        month = month_codes[tokens[1]]
        # Zero-pad so every field has a fixed width in the concatenation below.
        day = tokens[2].zfill(2)
        clock = tokens[3].split(':')
        hour = clock[0].zfill(2)
        minute = clock[1].zfill(2)
        second = clock[2].zfill(2)
        encoded.append(int(year + month + day + hour + minute + second))

    return encoded


def create_venue_dict(df):
    """
    Build an integer code for every distinct venue category.

    Codes start at 1 and follow the order in which the categories first
    appear in the 'Venue category name' column.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing a column named 'Venue category name'.

    Returns
    -------
    dict
        Mapping from each distinct venue category to its integer code.
    """
    categories = df['Venue category name'].unique()
    return {category: code for code, category in enumerate(categories, start=1)}


def encode_venues(df, venue_dictionary):
    """
    Translate every row's venue category into its integer code.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing a column named 'Venue category name'.
    venue_dictionary : dict
        Mapping from venue category to integer code. A category missing from
        the mapping raises ``KeyError``.

    Returns
    -------
    list
        One code per row of ``df``, in row order.
    """
    encoded_venues = []
    for category in df['Venue category name'].tolist():
        encoded_venues.append(venue_dictionary[category])
    return encoded_venues

In [None]:
# # Create the venue dictionary
# venue_dict = create_venue_dict(df)

# # Encode the venues and add columns
# encoded_venues = encode_venues(df, venue_dict)
# df['Encoded Venue'] = encoded_venues

# # Encode the dates
# dates = df['UTC time'].tolist()
# encoded_dates = encode_date(dates)
# df['Encoded Date'] = encoded_dates

# df = df.sort_values(by=['User ID', 'Encoded Date']).reset_index(drop = True)

## Drop and Filter
Drop irrelevant columns to speed up processing, and filter the dataframe to keep only users with at least 20 distinct POIs.

In [None]:
def drop_irrelevant_columns(df):
    """
    Return a copy of the DataFrame without the columns that later steps ignore.

    The removed columns are 'Venue category ID', 'Venue ID',
    'Venue category name', 'Timezone' and 'UTC time'.

    Parameters
    ----------
    df : pandas.DataFrame
        The original DataFrame; it must contain all five columns.

    Returns
    -------
    pandas.DataFrame
        The DataFrame after dropping the irrelevant columns.
    """
    unused_columns = [
        'Venue category ID',
        'Venue ID',
        'Venue category name',
        'Timezone',
        'UTC time',
    ]
    # `columns=` already selects the column axis, so no `axis` argument is needed.
    return df.drop(columns=unused_columns)


def filter_min_pois(df, min_pois=20):
    """
    Keep only the users who visited at least ``min_pois`` distinct POIs.

    Parameters
    ----------
    df : pandas.DataFrame
        Visit data with 'User ID' and 'Encoded Venue' columns.
    min_pois : int, optional
        Minimum number of distinct 'Encoded Venue' values a user needs in
        order to be kept, by default 20.

    Returns
    -------
    pandas.DataFrame
        The rows of the qualifying users, with a fresh 0..n-1 index.
    """
    distinct_pois = df.groupby('User ID')['Encoded Venue'].nunique()
    kept_users = distinct_pois.loc[distinct_pois >= min_pois].index
    keep_row = df['User ID'].isin(kept_users)
    return df[keep_row].reset_index(drop=True)

In [None]:
# # Drop irrelevant columns
# df = drop_irrelevant_columns(df)

# # Filter users with at least 20 unique POIs
# df = filter_min_pois(df, min_pois=20)


## Time Slots

In [None]:
def create_time_slots(start_date, end_date):
    """
    Enumerate weekly time-slot boundaries from ``start_date`` to ``end_date``.

    The first boundary is ``start_date`` itself; each following boundary is
    one week later, and boundaries are generated while they do not exceed
    ``end_date``.

    Parameters
    ----------
    start_date : datetime
        First slot boundary.
    end_date : datetime
        Latest admissible slot boundary (inclusive).

    Returns
    -------
    dict
        Maps each boundary, formatted as 'YYYYMMDDHHMMSS', to its 1-based
        position in the sequence.
    """
    one_week = timedelta(weeks=1)

    # Collect the boundaries first, then number them.
    boundaries = []
    cursor = start_date
    while cursor <= end_date:
        boundaries.append(cursor)
        cursor = cursor + one_week

    return {
        boundary.strftime('%Y%m%d%H%M%S'): position
        for position, boundary in enumerate(boundaries, start=1)
    }


def find_closest_slot(time_slots, date_str):
    """
    Return the index of the earliest slot that lies strictly after a date.

    Parameters
    ----------
    time_slots : dict
        Maps slot boundaries, written as 'YYYYMMDDHHMMSS' strings, to indices.
    date_str : str
        The date to look up, in the same 'YYYYMMDDHHMMSS' format.

    Returns
    -------
    int or None
        The value stored for the nearest boundary strictly later than
        ``date_str``, or ``None`` when no boundary is later.
    """
    stamp_format = '%Y%m%d%H%M%S'
    reference = datetime.strptime(date_str, stamp_format)

    nearest_key = None
    nearest_time = None
    for slot_key in time_slots:
        slot_time = datetime.strptime(slot_key, stamp_format)
        # Only boundaries strictly after the reference date qualify.
        if slot_time <= reference:
            continue
        # Strict '<' keeps the first key seen when two boundaries coincide.
        if nearest_time is None or slot_time < nearest_time:
            nearest_key, nearest_time = slot_key, slot_time

    if nearest_time is None:
        return None
    return time_slots[nearest_key]


def get_time_slot(df, time_slots):
    """
    Look up the time slot of every row's encoded date.

    Each value of the 'Encoded Date' column is converted to a string and
    passed to ``find_closest_slot``.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with an 'Encoded Date' column.
    time_slots : dict
        Maps slot boundaries ('YYYYMMDDHHMMSS' strings) to slot indices.

    Returns
    -------
    list
        One entry per row, in row order: whatever ``find_closest_slot``
        returns for that row's date.
    """
    return [
        find_closest_slot(time_slots, str(encoded_date))
        for encoded_date in df['Encoded Date'].tolist()
    ]

In [None]:
# # Define the start and end dates
# start_date = datetime(2012, 4, 12)
# end_date = datetime(2013, 3, 16)

# # Create the dictionary of time slots
# time_slots = create_time_slots(start_date, end_date)

# # Map encoded dates to time slots and create a new column in the DataFrame
# df['Time Slot'] = get_time_slot(df, time_slots)

# # Drop the 'Encoded Date' column as it's no longer needed
# df = df.drop(['Encoded Date'], axis=1)
# df.head()

## Spatial Graphs

In [None]:
def compute_haversine_distance(lat1, lon1, lat2, lon2):
    """
    Great-circle distance between two points given in decimal degrees.

    Uses the Haversine formula with an Earth radius of 6371 km.

    Parameters
    ----------
    lat1 : float
        Latitude of the first point, in degrees.
    lon1 : float
        Longitude of the first point, in degrees.
    lat2 : float
        Latitude of the second point, in degrees.
    lon2 : float
        Longitude of the second point, in degrees.

    Returns
    -------
    float
        The distance between the two points in kilometers.
    """
    earth_radius_km = 6371.0

    # Work in radians from here on.
    phi1, lam1, phi2, lam2 = map(math.radians, (lat1, lon1, lat2, lon2))
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # Haversine of the central angle between the two points.
    hav = (
        math.sin(delta_phi / 2) ** 2
        + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lam / 2) ** 2
    )
    central_angle = 2 * math.atan2(math.sqrt(hav), math.sqrt(1 - hav))

    return earth_radius_km * central_angle


def extract_unique_coordinates(df):
    """
    Extract each user's distinct POIs together with one coordinate pair per POI.

    For every user, the rows are de-duplicated on 'Encoded Venue', keeping the
    first occurrence, so the three returned lists are index-aligned:
    ``pois[u][k]`` is located at ``(lats[u][k], longs[u][k])``. Taking
    ``unique()`` of each column independently would yield lists of different
    lengths whose positions do not correspond, which breaks consumers such as
    ``build_spatial_graphs`` that index the lists in parallel.

    When several rows of the same user share an 'Encoded Venue' but differ in
    coordinates, the coordinates of the first such row are used.

    Parameters
    ----------
    df : pandas.DataFrame
        Visit data with 'User ID', 'Encoded Venue', 'Longitude' and
        'Latitude' columns.

    Returns
    -------
    tuple
        Three lists (POIs, longitudes, latitudes), each holding one inner list
        per user in ``groupby('User ID')`` order.
    """
    pois, longs, lats = [], [], []

    for _, visits in df.groupby('User ID'):
        # One row per distinct POI, in order of first appearance.
        first_visits = visits.drop_duplicates(subset='Encoded Venue', keep='first')

        pois.append(first_visits['Encoded Venue'].tolist())
        longs.append(first_visits['Longitude'].tolist())
        lats.append(first_visits['Latitude'].tolist())

    return pois, longs, lats


def build_spatial_graphs(pois, longs, lats, max_distance=5):
    """
    Build one spatial graph per user based on the Haversine distance.

    Every POI of a user becomes a node; two different POIs are connected by an
    edge when their Haversine distance is at most ``max_distance`` kilometers.

    The three inputs are read in parallel: ``pois[i][j]`` is taken to be
    located at latitude ``lats[i][j]`` and longitude ``longs[i][j]``.

    Parameters
    ----------
    pois : list
        List of POIs for each user.
    longs : list
        List of longitudes for each user.
    lats : list
        List of latitudes for each user.
    max_distance : float, optional
        The maximum distance (in km) between POIs to create an edge, by default 5 km.

    Returns
    -------
    list
        A list of NetworkX graphs representing the spatial relationships for each user.
    """
    spatial_graphs = []

    for i in range(len(pois)):
        user_pois, user_lats, user_longs = pois[i], lats[i], longs[i]
        G = nx.Graph()

        for j in range(len(user_pois)):
            # Latitude comes from `lats`, longitude from `longs`.
            place_1, lat_1, lon_1 = user_pois[j], user_lats[j], user_longs[j]
            G.add_node(place_1)

            # Start from j + 1 so each unordered pair is evaluated only once.
            for q in range(j + 1, len(user_pois)):
                place_2, lat_2, lon_2 = user_pois[q], user_lats[q], user_longs[q]
                G.add_node(place_2)

                if place_1 != place_2:
                    distance = compute_haversine_distance(lat_1, lon_1, lat_2, lon_2)

                    if distance <= max_distance:
                        G.add_edge(place_1, place_2)

        spatial_graphs.append(G)

    return spatial_graphs

In [None]:
# # Extract encoded venues (POIs) and coordinates
# pois, longs, lats = extract_unique_coordinates(df)

# # Create spatial graphs
# spatial_graphs = build_spatial_graphs(pois, longs, lats, max_distance=5)

# # Node features
# num_nodes = len(venue_dict)
# num_features = 100
# node_features = torch.randn(num_nodes, num_features)

## Similarity

In [None]:
def compute_user_poi_frequency(df, venue_dict):
    """
    Count, for every user, how many times each encoded venue was visited.

    Parameters
    ----------
    df : pandas.DataFrame
        Visit data with 'User ID' and 'Encoded Venue' columns.
    venue_dict : dict
        Venue dictionary; only its size is used, to fix the vector length.

    Returns
    -------
    list
        One list per user (in ``groupby('User ID')`` order) of length
        ``len(venue_dict) + 1``; position ``k`` holds the number of rows whose
        'Encoded Venue' equals ``k``.
    """
    per_user_counts = []

    for _, visits in df.groupby('User ID'):
        # One extra slot so that an encoded venue can be used directly as index.
        counts = [0 for _ in range(len(venue_dict) + 1)]
        for venue_code in visits['Encoded Venue']:
            counts[venue_code] = counts[venue_code] + 1
        per_user_counts.append(counts)

    return per_user_counts


def normalize_poi_frequency(user_poi_frequency, venue_dict):
    """
    Scale every visit count by the size of the venue dictionary.

    Parameters
    ----------
    user_poi_frequency : list
        One list of visit counts per user.
    venue_dict : dict
        Venue dictionary; each count is divided by ``len(venue_dict)``.

    Returns
    -------
    list
        A list with the same shape as ``user_poi_frequency`` holding the
        scaled counts.
    """
    return [
        [count / len(venue_dict) for count in user_counts]
        for user_counts in user_poi_frequency
    ]


def minmax_normalize_frequency(user_poi_frequency):
    """
    Rescale the POI frequency vectors with scikit-learn's ``MinMaxScaler``.

    A scaler with default settings is fitted on the given data and applied to
    that same data.

    Parameters
    ----------
    user_poi_frequency : list
        One list of visit counts per user (rows = users, columns = POIs).

    Returns
    -------
    np.ndarray
        The array returned by ``MinMaxScaler().fit_transform``.
    """
    return MinMaxScaler().fit_transform(user_poi_frequency)

In [None]:
# # Compute POI frequency for each user
# user_poi_frequency = compute_user_poi_frequency(df, venue_dict)

# # Normalize the POI frequency by dividing by the number of POIs
# normalized_poi_frequency = normalize_poi_frequency(user_poi_frequency, venue_dict)

# # Apply MinMax normalization
# minmax_poi_frequency = minmax_normalize_frequency(user_poi_frequency)

# # Example: print the min-max normalized frequency
# for p in minmax_poi_frequency:
#     print(p)

### KMeans

In [None]:
def plot_2d_pca(data, title="2D Visualization using PCA"):
    """
    Show a scatter plot of the first two columns of ``data``.

    Parameters
    ----------
    data : np.ndarray
        Two-dimensional array; column 0 is drawn on the x axis and column 1
        on the y axis (typically the output of a 2-component PCA).
    title : str, optional
        Title of the plot (default is "2D Visualization using PCA").
    """
    first_component, second_component = data[:, 0], data[:, 1]

    plt.scatter(first_component, second_component)
    plt.title(title)
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.show()

def apply_kmeans(data, n_clusters, random_state=42):
    """
    Applies K-Means clustering to the data and returns centroids and labels.

    Parameters
    ----------
    data : np.ndarray
        Data to cluster.
    n_clusters : int
        Number of clusters for K-Means.
    random_state : int or None, optional
        Seed forwarded to ``KMeans`` so that the centroid initialization, and
        therefore the resulting clusters, are reproducible regardless of the
        order in which notebook cells were executed. Defaults to 42, the same
        value as the notebook-wide seed. Pass ``None`` to fall back to the
        global NumPy random state.

    Returns
    -------
    tuple
        Centroids and labels from K-Means clustering.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(data)
    return kmeans.cluster_centers_, kmeans.labels_

def plot_kmeans_clusters(data, centroids, title="2D Visualization with K-means Centroids"):
    """
    Show a scatter plot of 2D data with the cluster centroids overlaid.

    Parameters
    ----------
    data : np.ndarray
        Two-dimensional array; column 0 is drawn on the x axis and column 1
        on the y axis (typically the output of a 2-component PCA).
    centroids : np.ndarray
        Centroid coordinates, read the same way as ``data``; drawn as red
        crosses.
    title : str, optional
        Title of the plot (default is "2D Visualization with K-means Centroids").
    """
    # Data points first, centroids second, so the crosses are drawn on top.
    plt.scatter(data[:, 0], data[:, 1], label='Data points')
    plt.scatter(
        centroids[:, 0],
        centroids[:, 1],
        c='red',
        marker='x',
        s=100,
        label='Centroids',
    )

    plt.title(title)
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.legend()
    plt.show()

In [None]:
# # Execute PCA reducing to 2 dimensions
# pca = PCA(n_components=2)
# data_reduced_2d = pca.fit_transform(minmax_poi_frequency)

# # Visualization of 2D data
# plot_2d_pca(data_reduced_2d)

# # Apply Kmeans on 2D reduced data
# n_clusters = 6  # Desired cluster number
# centroids, labels = apply_kmeans(data_reduced_2d, n_clusters)

# # Visualization of 2D data with kmeans clusters
# plot_kmeans_clusters(data_reduced_2d, centroids)

### User Similarity

In [None]:
def calculate_cosine_similarity(vector1, vector2):
    """
    Cosine similarity of two vectors, returned as a Python number.

    Args:
        vector1 (torch.Tensor): The first vector.
        vector2 (torch.Tensor): The second vector.

    Returns:
        float: The cosine similarity between the two vectors.
    """
    # Add a leading batch dimension so the similarity is taken along dim 1.
    batch1 = torch.unsqueeze(vector1, 0)
    batch2 = torch.unsqueeze(vector2, 0)
    similarity = F.cosine_similarity(batch1, batch2)
    return similarity.item()


def find_most_similar_vectors(normalized_tensor):
    """
    For every row of a tensor, find the other row with the highest cosine similarity.

    The returned user number is the row position plus one.
    NOTE(review): this assumes row ``k`` belongs to user ID ``k + 1``, i.e.
    that user IDs are the consecutive integers starting at 1 — confirm against
    how the tensor is built.

    Args:
        normalized_tensor (torch.Tensor): One vector per row.

    Returns:
        list: One ``(most_similar_user, similarity)`` tuple per row, where
        ``most_similar_user`` is the 1-based position of the best matching row
        and ``similarity`` is the corresponding cosine similarity as a float.
    """
    results = []

    for row_index, row in enumerate(normalized_tensor):
        # Similarity of this row against every row (itself included).
        scores = F.cosine_similarity(row.unsqueeze(0), normalized_tensor)

        # Overwrite the self-similarity with -1 so it is not picked as best match.
        scores[row_index] = -1

        best_index = int(scores.argmax())
        results.append((best_index + 1, scores[best_index].item()))

    return results


def print_similar_users(most_similar_vectors):
    """
    Print each user's most similar user and collect those users in a list.

    Users are numbered from 1 in the order of ``most_similar_vectors``.

    Args:
        most_similar_vectors (list): A list of ``(user, similarity)`` tuples,
        one per user.

    Returns:
        list: The first element of every tuple, in input order.
    """
    matched_users = []

    for i, pair in enumerate(most_similar_vectors):
        idx, similarity = pair
        print(f"User {i + 1} is similar to User {idx} with a similarity of: {similarity:.4f}")
        matched_users.append(idx)

    return matched_users

In [None]:
# # Convert the normalized frequency list into a PyTorch tensor
# normalized_tensor = torch.tensor(minmax_poi_frequency, dtype=torch.float32)

# # Example calculation of cosine similarity between two specific vectors
# vector1 = normalized_tensor[8]
# vector2 = normalized_tensor[5]
# similarity = calculate_cosine_similarity(vector1, vector2)
# print(f"Cosine similarity: {similarity:.4f}")

# # Find the most similar vectors for each user
# most_similar_vectors = find_most_similar_vectors(normalized_tensor)

# # Print and store the most similar users
# similar_users = print_similar_users(most_similar_vectors)

## Labels

In [None]:
def compute_total_repetitions(df):
    """
    Count each user's visits per venue, most visited venue first.

    Args:
        df (pd.DataFrame): Visit data with 'User ID' and 'Encoded Venue' columns.

    Returns:
        list: One dictionary per user (in ``groupby('User ID')`` order) mapping
        encoded venue to visit count. Entries are ordered by descending count;
        venues with equal counts keep their order of first appearance.
    """
    per_user_counts = []

    for _, visits in df.groupby('User ID'):
        tally = {}
        for venue in visits['Encoded Venue']:
            tally[venue] = tally.get(venue, 0) + 1

        # sorted() is stable, so ties stay in first-seen order.
        ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
        per_user_counts.append(dict(ranked))

    return per_user_counts


def extract_top_pois(total_repetitions, top_n=20):
    """
    Take the first ``top_n`` keys of every user's visit-count dictionary.

    Because the dictionaries are expected to be ordered by descending visit
    count, these keys are the user's most visited POIs.

    Args:
        total_repetitions (list): One dictionary of venue visit counts per user.
        top_n (int): The number of POIs to keep per user. Default is 20.

    Returns:
        list: One list per user with at most ``top_n`` POIs.
    """
    return [
        list(visit_counts.keys())[:top_n]
        for visit_counts in total_repetitions
    ]


In [None]:
# # Count the number of visits of each POI for each user
# total_repetitions = compute_total_repetitions(df)

# # Extract the top_n visited POI from the full list for each user
# labels = extract_top_pois(total_repetitions, top_n=20)

## Train, Test and Validation DF

In [None]:
def filter_data(df, min_time_slots=20):
    """
    Filters the DataFrame to retain only users who were active in at least
    ``min_time_slots`` distinct time slots.

    Parameters:
      df (pd.DataFrame): The original DataFrame containing 'User ID' and
        'Time Slot' columns.
      min_time_slots (int): Minimum number of distinct 'Time Slot' values a
        user must have to be kept. Default is 20.

    Returns:
      final_df (pd.DataFrame): The rows of the qualifying users. The original
      index is preserved (it is not reset).
    """
    # Number of distinct time slots per user, indexed by User ID
    slot_counts = df.groupby('User ID')['Time Slot'].nunique()

    # Users that reach the threshold
    good_users = slot_counts[slot_counts >= min_time_slots].index

    # Keep only the rows of these users
    final_df = df[df['User ID'].isin(good_users)]

    return final_df


def split_data_by_user(df, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """
    Splits the DataFrame into training, validation, and test sets by dividing users (User ID),
    ensuring that each user appears in only one set.

    Parameters:
    - df (pd.DataFrame): The filtered DataFrame containing user and time slot information.
    - train_size (float): The proportion of users to include in the training set. Default is 0.7.
    - val_size (float): The proportion of users to include in the validation set. Default is 0.15.
    - test_size (float): The proportion of users to include in the test set. Default is 0.15.
    - random_state (int or None): Seed passed to both ``train_test_split`` calls so the
      split is reproducible. Default is 42.

    Returns:
    - train_df (pd.DataFrame): The training DataFrame.
    - val_df (pd.DataFrame): The validation DataFrame.
    - test_df (pd.DataFrame): The test DataFrame.

    Raises:
    - ValueError: If any proportion is not strictly between 0 and 1, or if the
      three proportions do not sum to 1 (within 1e-5).
    """
    # Validate the two conditions separately so the error names the actual problem
    proportions = (train_size, val_size, test_size)
    if not all(0 < proportion < 1 for proportion in proportions):
        raise ValueError("train_size, val_size, and test_size must each be strictly between 0 and 1.")
    if abs(sum(proportions) - 1) >= 1e-5:
        raise ValueError("The sum of train_size, val_size, and test_size must equal 1.")

    # Get the unique list of User IDs
    user_ids = df['User ID'].unique()

    # Split users into the training set and a temporary set holding the remainder
    train_users, temp_users = train_test_split(
        user_ids, test_size=(1 - train_size), random_state=random_state
    )

    # Share of the temporary set that goes to validation
    val_proportion = val_size / (val_size + test_size)

    # Split the temporary set into validation and test sets
    val_users, test_users = train_test_split(
        temp_users, test_size=(1 - val_proportion), random_state=random_state
    )

    # Separate data based on User IDs
    train_df = df[df['User ID'].isin(train_users)].reset_index(drop=True)
    val_df = df[df['User ID'].isin(val_users)].reset_index(drop=True)
    test_df = df[df['User ID'].isin(test_users)].reset_index(drop=True)

    return train_df, val_df, test_df


In [None]:
# # Filter the data
# final_df = filter_data(df)

# # Split the data by User ID into train, val, and test sets with specified proportions
# train_df, val_df, test_df = split_data_by_user(final_df, train_size=0.7, val_size=0.15, test_size=0.15)
# print(train_df.shape)
# print(val_df.shape)
# print(test_df.shape)

# Download preprocessed data

In [None]:
# Names of the preprocessed artifacts and the URLs they are downloaded from.
# Each key is also used as the local file stem and as the name of the
# module-level variable the loaded object is bound to below.
files = {
    "train_df": "https://github.com/andreuni/Next-POI-prediction/raw/main/data/train_df.pkl",
    "val_df": "https://github.com/andreuni/Next-POI-prediction/raw/main/data/val_df.pkl",
    "test_df": "https://github.com/andreuni/Next-POI-prediction/raw/main/data/test_df.pkl",
    "venue_vocab": "https://github.com/andreuni/Next-POI-prediction/raw/main/data/venue_vocab.pkl",
    "centroid_labels": "https://github.com/andreuni/Next-POI-prediction/raw/main/data/centroid_labels.pkl",
    "labels": "https://github.com/andreuni/Next-POI-prediction/raw/main/data/labels.pkl",
    "similar_users": "https://github.com/andreuni/Next-POI-prediction/raw/main/data/similar_users.pkl",
    "spatial_graphs": "https://github.com/andreuni/Next-POI-prediction/raw/main/data/spatial_graphs.pkl"
}

# Download each file into the working directory as `<name>.pkl`.
# `!wget` is an IPython shell escape; `{name}` and `{url}` are filled in from Python.
for name, url in files.items():
    !wget -O {name}.pkl {url}
    print(f"Downloaded {name}.pkl, size: {os.path.getsize(f'{name}.pkl')} bytes")

# Unpickle each file and bind the object to a global variable named after its
# key (e.g. `train_df`, `labels`), so later cells can use those names directly.
# NOTE(review): `pickle.load` can execute arbitrary code contained in the file;
# only load pickles from a source you trust.
for name in files.keys():
    with open(f"{name}.pkl", 'rb') as f:
        globals()[name] = pickle.load(f)
        print(f"Loaded {name}")


--2024-08-27 17:39:18--  https://github.com/andreuni/Next-POI-prediction/raw/main/data/train_df.pkl
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/andreuni/Next-POI-prediction/main/data/train_df.pkl [following]
--2024-08-27 17:39:19--  https://raw.githubusercontent.com/andreuni/Next-POI-prediction/main/data/train_df.pkl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1077346 (1.0M) [application/octet-stream]
Saving to: ‘train_df.pkl’


2024-08-27 17:39:19 (25.9 MB/s) - ‘train_df.pkl’ saved [1077346/1077346]

Downloaded train_df.pkl, size: 1077346 bytes
--2024-08-27 17:39:19--  https://github.co

# New Dataset

In [None]:
class MyDataset(Dataset):
    """
    Dataset of per-(user, time slot) samples combining a temporal POI sequence,
    the user's spatial graph, a similar user's spatial graph, the user's labels
    and the user's centroid (cluster) label.
    """

    # Value appended to temporal sequences shorter than the padding dimension.
    PADDING_INDEX = 1000

    def __init__(self, df, spatial_graphs, labels, venue_vocab, padding_dim, similar_users, centroid_labels):
        """
        Initializes the dataset with user data, spatial graphs, labels, and other attributes.

        Parameters:
        - df (pd.DataFrame): DataFrame with 'User ID', 'Time Slot' and 'Encoded Venue' columns.
        - spatial_graphs (list): List of spatial graphs, indexed by user ID - 1.
        - labels (list): List of labels, indexed by user ID - 1.
        - venue_vocab (list): List of venue IDs; its length is the number of graph nodes / classes.
        - padding_dim (int): Fixed length of every temporal sequence.
        - similar_users (list): Similar-user ID for each user, indexed by user ID - 1.
        - centroid_labels (list): Centroid label for each user, indexed by user ID - 1.
        """
        self.df = df
        self.spatial_graphs = spatial_graphs
        self.labels = labels
        self.venue_vocab = venue_vocab
        self.padding_dim = padding_dim
        self.similar_users = similar_users
        self.centroid_labels = centroid_labels

        self.data = {}
        self.graph_data = []

        # Group the encoded venues by user and time slot, preserving row order.
        for _, row in df.iterrows():
            user_id = int(row['User ID'])
            encoded_venue = int(row['Encoded Venue'])
            time_slot = int(row['Time Slot'])

            slot = self.data.setdefault(user_id, {}).setdefault(time_slot, {'encoded_venue': []})
            slot['encoded_venue'].append(encoded_venue)

        # Build one sample per (user, time slot); per-user objects are shared across that user's samples.
        for user in self.data:
            time_slots = list(self.data[user].keys())
            label = torch.tensor(self.labels[user - 1])  # User ID starts from 1, list index starts from 0
            num_nodes = len(self.venue_vocab)
            one_hot_label = torch.zeros(num_nodes)
            one_hot_label[label] = 1                    # Set a 1 at every labelled venue index

            spa_graph = self.spatial_graphs[user - 1]
            spa_data = self.from_g_to_data(spa_graph)
            centroid = self.centroid_labels[user - 1]
            similar_user = self.similar_users[user - 1]
            similar_spa_data = self.from_g_to_data(self.spatial_graphs[similar_user - 1])

            for time_slot in time_slots:
                temporal_sequence = self.get_temporal_sequence(self.data[user][time_slot]['encoded_venue'], self.padding_dim)
                t_graph = self.create_temporal_graph(user, time_slot, time_slots)

                self.graph_data.append([t_graph, spa_graph, temporal_sequence, spa_data,
                                        user, label, one_hot_label, similar_spa_data, centroid])

    def get_temporal_sequence(self, temporal_pois, padding_dim, padding_value=PADDING_INDEX):
        """
        Pads (or truncates) the temporal POIs list to exactly `padding_dim` entries.

        Sequences longer than `padding_dim` keep only their first `padding_dim` entries so
        that every sample has the same length and can be stacked into a batch.

        Parameters:
        - temporal_pois (list): List of temporal POIs.
        - padding_dim (int): Target length of the sequence.
        - padding_value (int): Value used for padding (defaults to PADDING_INDEX, i.e. 1000).

        Returns:
        - torch.Tensor: Tensor of length `padding_dim`.
        """
        visits = list(temporal_pois)[:padding_dim]
        padding_needed = padding_dim - len(visits)
        return torch.tensor(visits + [padding_value] * padding_needed)

    def from_g_to_data(self, G):
        """
        Converts a NetworkX graph to a PyTorch Geometric Data object.

        The edge index is taken directly from the graph's edge list, and node features are
        freshly sampled random values (a new sample on every call).

        Parameters:
        - G (nx.Graph): NetworkX graph.

        Returns:
        - Data: PyTorch Geometric Data object with `len(venue_vocab)` nodes.
        """
        # reshape(-1, 2) keeps the (2, num_edges) layout even when the graph has no edges.
        edge_index = torch.tensor(list(G.edges()), dtype=torch.long).reshape(-1, 2).t().contiguous()
        num_nodes = len(self.venue_vocab)
        num_features = 100  # Example feature size; adjust as needed
        node_features = torch.randn(num_nodes, num_features)

        data = from_networkx(G)
        data.edge_index = edge_index
        data.x = node_features
        data.num_nodes = num_nodes

        return data

    def create_temporal_graph(self, user, time_slot, time_slots):
        """
        Creates a directed graph linking consecutive venues visited by a user in one time slot.

        Parameters:
        - user (int): User ID.
        - time_slot (int): Time slot.
        - time_slots (list): List of all time slots for the user (currently unused).

        Returns:
        - nx.DiGraph: Temporal graph.
        """
        places = self.data[user][time_slot]['encoded_venue']
        G = nx.DiGraph()

        if places:
            G.add_node(places[0])
            for current_place, next_place in zip(places, places[1:]):
                G.add_edge(current_place, next_place)

        return G

    def __len__(self):
        """
        Returns the number of samples in the dataset.

        There is one sample per (user, time slot) pair, matching what `__getitem__` indexes.

        Returns:
        - int: Number of samples.
        """
        return len(self.graph_data)

    def __getitem__(self, index):
        """
        Retrieves an item from the dataset.

        Parameters:
        - index (int): Index of the item.

        Returns:
        - list: A list containing the following elements:
            - t_graph (nx.DiGraph): Temporal graph for the specific user and time slot.
            - spa_graph (nx.Graph): Spatial graph of the user.
            - temporal_sequence (torch.Tensor): Padded tensor of the sequence of visited POIs.
            - spa_data (Data): PyTorch Geometric Data object built from the spatial graph.
            - user (int): User ID.
            - label (torch.Tensor): Label tensor of the user.
            - one_hot_label (torch.Tensor): Vector over all venues with a 1 at every labelled venue.
            - similar_spa_data (Data): PyTorch Geometric Data object built from a similar user's spatial graph.
            - centroid (int): Centroid label of the user.
        """
        return self.graph_data[index]

# Datamodule

In [None]:
class MyDataModule(pl.LightningDataModule):
    """
    A PyTorch Lightning DataModule for managing datasets with spatial and temporal information.

    Builds one MyDataset per split and serves them through DataLoaders that share the same
    batch size and collate function.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        test_df: pd.DataFrame,
        spatial_graphs: List[nx.Graph],
        venue_vocab: List[int],
        labels: List[int],
        padding_dim: int,
        similar_users: List[int],
        centroid_labels: List[int],
        batch_size: int
    ) -> None:
        """
        Initializes the DataModule with dataset and configuration parameters.

        Parameters:
        - train_df (pd.DataFrame): Training dataset.
        - val_df (pd.DataFrame): Validation dataset.
        - test_df (pd.DataFrame): Test dataset.
        - spatial_graphs (List[nx.Graph]): List of spatial graphs for users.
        - venue_vocab (List[int]): Vocabulary of venue IDs.
        - labels (List[int]): Labels for each user.
        - padding_dim (int): Padding dimension for sequences.
        - similar_users (List[int]): List of indices for similar users.
        - centroid_labels (List[int]): List of centroid labels for users.
        - batch_size (int): Batch size for data loaders.
        """
        super().__init__()

        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.spatial_graphs = spatial_graphs
        self.venue_vocab = venue_vocab
        self.labels = labels
        self.padding_dim = padding_dim
        self.similar_users = similar_users
        self.centroid_labels = centroid_labels
        self.batch_size = batch_size

    def _build_dataset(self, df: pd.DataFrame) -> "MyDataset":
        """Builds a MyDataset for one split using the shared graphs, labels and vocabulary."""
        return MyDataset(
            df, self.spatial_graphs, self.labels,
            self.venue_vocab, self.padding_dim, self.similar_users,
            self.centroid_labels
        )

    def _make_loader(self, dataset) -> DataLoader:
        """Wraps a dataset in a DataLoader with the settings shared by every split."""
        return DataLoader(
            dataset, batch_size=self.batch_size,
            shuffle=False, drop_last=True, collate_fn=self.prepare_batch
        )

    def setup(self, stage: Optional[str] = None) -> None:
        """
        Sets up the datasets for training, validation, and testing based on the stage.

        Parameters:
        - stage (Optional[str]): 'fit' builds the training and validation datasets,
          'validate' builds only the validation dataset, 'test' builds only the test
          dataset, and None builds all three.
        """
        if stage in (None, 'fit'):
            self.train_dataset = self._build_dataset(self.train_df)
        if stage in (None, 'fit', 'validate'):
            self.validation_dataset = self._build_dataset(self.val_df)
        if stage in (None, 'test'):
            self.test_dataset = self._build_dataset(self.test_df)

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        """
        Returns the DataLoader for the training dataset.
        """
        return self._make_loader(self.train_dataset)

    def val_dataloader(self, *args, **kwargs) -> DataLoader:
        """
        Returns the DataLoader for the validation dataset.
        """
        return self._make_loader(self.validation_dataset)

    def test_dataloader(self, *args, **kwargs) -> DataLoader:
        """
        Returns the DataLoader for the test dataset.
        """
        return self._make_loader(self.test_dataset)

    def prepare_batch(self, batch):
        """
        Collates dataset items into batched tensors and batched spatial graph data.

        Each item is the list produced by the dataset; only positions 2, 3, 5, 6, 7 and 8
        (temporal sequence, spatial data, labels, one-hot labels, similar spatial data,
        centroid) are used.

        Parameters:
        - batch (List[list]): Dataset items to collate.

        Returns:
        - Tuple: (t_data, spa_data, similar_spa_data, centroid, labels, one_hot_labels).
        """
        t_data = torch.stack([item[2] for item in batch])            # Temporal sequences
        spa_data = Batch.from_data_list([item[3] for item in batch])  # Spatial graph data
        similar_spa_data = Batch.from_data_list([item[7] for item in batch])  # Similar spatial graph data
        centroid = torch.tensor([item[8] for item in batch])         # Centroid labels
        labels = torch.stack([item[5] for item in batch])             # Labels
        one_hot_labels = torch.stack([item[6] for item in batch])     # One-hot labels

        return t_data, spa_data, similar_spa_data, centroid, labels, one_hot_labels

# Model

In [None]:
class My_Model(pl.LightningModule):
    """
    A PyTorch Lightning model that combines temporal and spatial graph features for classification.

    Three feature streams are concatenated and passed through one linear output layer:
    - the final LSTM hidden state over the embedded temporal POI sequence,
    - a 0.7 / 0.3 blend of the mean-pooled GNN encodings of the user's spatial graph and of a
      similar user's spatial graph,
    - a small network applied to the user's cluster embedding.
    """

    GNN_HIDDEN_DIM = 75  # Output width of the first GNN layer (per head for GAT)
    GNN_OUT_DIM = 50     # Output width of the second GNN layer
    GAT_HEADS = 4        # Attention heads of the first GAT layer

    def __init__(
        self,
        max_nodes: int,
        num_nodes: int,
        embedding_dim: int,
        hidden_dimension: int,
        cluster_embedding_dim: int,
        cluster_hidden_dim: int,
        n_clusters: int,
        k: int,
        gcn_or_gat: str,
        batch_size: int
    ) -> None:
        """
        Initializes the model with specified parameters.

        Parameters:
        - max_nodes (int): Size of the temporal embedding table (must exceed the padding index 1000).
        - num_nodes (int): Number of nodes in spatial graphs; also the number of output classes.
        - embedding_dim (int): Dimension of node embeddings and of the GNN input features.
        - hidden_dimension (int): Dimension of LSTM hidden states.
        - cluster_embedding_dim (int): Dimension of cluster embeddings.
        - cluster_hidden_dim (int): Dimension of hidden layer in cluster network.
        - n_clusters (int): Number of clusters.
        - k (int): Value of K for accuracy@K.
        - gcn_or_gat (str): Type of GNN layer ('gcn' selects GCNConv; any other value selects GATConv).
        - batch_size (int): Batch size for data loaders.
        """
        super(My_Model, self).__init__()
        self.max_nodes = max_nodes
        self.num_nodes = num_nodes
        self.embedding_dim = embedding_dim
        self.hidden_dimension = hidden_dimension
        self.cluster_embedding_dim = cluster_embedding_dim
        self.cluster_hidden_dim = cluster_hidden_dim
        self.n_clusters = n_clusters
        self.k = k
        self.batch_size = batch_size
        self.gcn_or_gat = gcn_or_gat

        self.dropout = nn.Dropout(0.3)

        # Define embeddings
        self.temporal_node_embedding = nn.Embedding(self.max_nodes, self.embedding_dim, padding_idx=1000)
        # Not referenced by forward(); kept so the set of registered parameters is unchanged.
        self.spatial_node_embedding = nn.Embedding(self.num_nodes * self.batch_size, self.embedding_dim)
        self.cluster_embedding = nn.Embedding(self.n_clusters, self.cluster_embedding_dim)

        # Define LSTM
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_dimension,
            batch_first=True
        )

        # Define GNN layers
        if self.gcn_or_gat == 'gcn':
            self.gnn1 = GCNConv(self.embedding_dim, self.GNN_HIDDEN_DIM)
            self.gnn2 = GCNConv(self.GNN_HIDDEN_DIM, self.GNN_OUT_DIM)
        else:
            self.gnn1 = GATConv(self.embedding_dim, self.GNN_HIDDEN_DIM, heads=self.GAT_HEADS)
            self.gnn2 = GATConv(self.GNN_HIDDEN_DIM * self.GAT_HEADS, self.GNN_OUT_DIM, heads=1)

        # Define cluster network
        self.cluster_net = nn.Sequential(
            nn.Linear(self.cluster_embedding_dim, self.cluster_hidden_dim),
            nn.ReLU(),
            self.dropout
        )

        # Define output layer; its input is the concatenation built in forward():
        # LSTM hidden state + pooled GNN output + cluster network output.
        self.fc1 = nn.Linear(
            self.hidden_dimension + self.GNN_OUT_DIM + self.cluster_hidden_dim,
            self.num_nodes
        )

        # Define loss function
        self.loss_fn = nn.BCEWithLogitsLoss()

        # Per-epoch buffers for the metrics printed at epoch end
        self.train_acc_at_k = []
        self.val_acc_at_k = []
        self.train_mrr = []
        self.val_mrr = []
        self.train_loss = []
        self.val_loss = []

    def _encode_spatial(self, graph_batch):
        """Runs a batched graph through both GNN layers (ReLU + dropout each) and mean-pools per graph."""
        x = F.relu(self.gnn1(graph_batch.x, graph_batch.edge_index))
        x = self.dropout(x)
        x = F.relu(self.gnn2(x, graph_batch.edge_index))
        x = self.dropout(x)
        return global_mean_pool(x, graph_batch.batch)

    def forward(self, t_data, spa_data, similar_spa_data, centroid):
        """
        Forward pass through the network.

        Parameters:
        - t_data: Temporal sequence data (indices into the temporal embedding table).
        - spa_data: Batched spatial graph data.
        - similar_spa_data: Batched spatial graph data of the similar users.
        - centroid: Cluster indices.

        Returns:
        - x_combined: Output of the final linear layer, one score per class.
        """
        # Temporal branch: embed the sequence and keep the LSTM's final hidden state
        x_temporal = self.temporal_node_embedding(t_data)
        _, (h, _) = self.lstm(x_temporal)
        x_temporal = self.dropout(h.squeeze(0))

        # Spatial branches share the same GNN weights; the user's own graph is weighted higher
        x_spatial = self._encode_spatial(spa_data)
        x_similar_spatial = self._encode_spatial(similar_spa_data)
        x_total_spatial = 0.7 * x_spatial + 0.3 * x_similar_spatial

        # Cluster branch
        x_cluster = self.cluster_net(self.cluster_embedding(centroid))

        # Concatenate features and pass through final layer
        x_combined = torch.cat((x_temporal, x_total_spatial, x_cluster), dim=1)
        return self.fc1(x_combined)

    def _shared_step(self, batch):
        """Computes (loss, accuracy@k, MRR) for one batch; shared by the train/val/test steps."""
        t_data, spa_data, similar_spa_data, centroid, labels, one_hot_labels = batch
        labels = labels.to(self.device).long()

        outputs = self.forward(t_data, spa_data, similar_spa_data, centroid)
        acc = self.accuracy_at_k(outputs, labels, self.k)
        mrr = self.mean_reciprocal_rank(outputs, one_hot_labels)
        loss_1 = self.loss_fn(outputs, one_hot_labels)
        loss_2 = self.multi_label_cross_entropy_loss(outputs, labels)
        loss = 0.85 * loss_1 + 0.15 * loss_2
        return loss, acc, mrr

    def training_step(self, batch, batch_idx):
        """
        Training step for each batch.

        Parameters:
        - batch: Input data batch.
        - batch_idx: Index of the batch.

        Returns:
        - loss: Calculated loss for the batch.
        """
        loss, acc, mrr = self._shared_step(batch)

        # Buffer metrics for the epoch summary; detach so the autograd graph is not kept alive
        self.train_acc_at_k.append(acc)
        self.train_mrr.append(mrr)
        self.train_loss.append(loss.detach())

        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log("train_acc", acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """
        Validation step for each batch.

        Parameters:
        - batch: Input data batch.
        - batch_idx: Index of the batch.

        Returns:
        - loss: Calculated loss for the batch.
        """
        loss, acc, mrr = self._shared_step(batch)

        # Buffer metrics for the epoch summary
        self.val_acc_at_k.append(acc)
        self.val_mrr.append(mrr)
        self.val_loss.append(loss.detach())

        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log("val_acc", acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def test_step(self, batch, batch_idx):
        """
        Test step for each batch.

        Parameters:
        - batch: Input data batch.
        - batch_idx: Index of the batch.

        Returns:
        - loss: Calculated loss for the batch.
        """
        loss, acc, _ = self._shared_step(batch)

        self.log("test_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log("test_acc", acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def accuracy_at_k(self, output, target, k):
        """
        Calculate accuracy at K for multi-label classification.

        The value is the fraction of the top-K predicted classes that appear among the first
        K true labels of the same sample, averaged over the whole batch.

        Parameters:
        - output: Tensor of shape (batch_size, num_classes) with predicted scores.
        - target: Tensor of shape (batch_size, num_labels) with true labels
                  (a 1-D tensor is treated as one label per sample).
        - k: The value of K for accuracy@K.

        Returns:
        - acc_k: Accuracy at K as a Python float.
        """
        if target.dim() == 1:
            target = target.unsqueeze(1)

        # topk cannot request more entries than there are classes
        k = min(k, output.size(1))

        # Keep at most the first k labels; slicing is a no-op when fewer labels are present
        target = target[:, :k]

        _, pred = output.topk(k, dim=1, largest=True, sorted=True)

        # Broadcast (batch, labels, 1) against (batch, 1, k): entry [b, i, j] is True when
        # the j-th prediction of sample b equals its i-th label.
        correct = pred.unsqueeze(1).eq(target.unsqueeze(2))

        # A prediction counts as correct when it matches any label
        acc_k = correct.any(dim=1).float().mean().item()
        return acc_k

    def mean_reciprocal_rank(self, output, target):
        """
        Calculates the Mean Reciprocal Rank (MRR) for a batch of predictions.

        For every sample the classes are ranked by descending score and the reciprocal of the
        rank (1-based) of the best-ranked relevant class is taken. Samples without any
        relevant class are skipped.

        Args:
        output: Tensor of shape (batch_size, num_classes) containing the predicted scores for each class.
        target: Tensor of shape (batch_size, num_classes) containing binary labels (0 or 1),
                where 1 marks a relevant class for that sample.

        Returns:
        mrr: The average reciprocal rank for the batch, or 0 if no sample has a relevant class.
        """
        if target.numel() == 0:
            return 0

        # Relevance flags re-ordered by descending predicted score
        order = output.argsort(dim=1, descending=True)
        hits = target.gather(1, order) != 0

        # 1-based rank of the first relevant class per sample; num_classes + 1 marks "none"
        num_classes = hits.size(1)
        positions = torch.arange(1, num_classes + 1, device=hits.device).expand_as(hits)
        no_hit = torch.full_like(positions, num_classes + 1)
        first_hit = torch.where(hits, positions, no_hit).min(dim=1).values

        has_relevant = first_hit <= num_classes
        if not has_relevant.any():
            return 0

        return first_hit[has_relevant].double().reciprocal().mean().item()

    def multi_label_cross_entropy_loss(self, output, target):
        """
        We tried combining BCE with cross-entropy but even if the loss results were more accurate, the accuracy dropped
        Calculates the mean cross-entropy over the first `self.k` labels of every sample.

        Each (sample, label) pair contributes the negative log-softmax score of that label,
        which is what a per-pair cross-entropy evaluates to; all pairs are computed at once.

        Args:
        output: Tensor of shape (batch_size, num_classes) containing the predicted scores (logits).
        target: Tensor of shape (batch_size, num_labels) containing the true labels for each example.

        Returns:
        loss: Mean cross-entropy over all (sample, label) pairs.
        """
        # Select the first k labels of each sample
        target = target[:, :self.k]

        log_probs = F.log_softmax(output, dim=1)
        return -log_probs.gather(1, target).mean()

    @staticmethod
    def _average(values):
        """Returns the arithmetic mean of a list, or 0 for an empty list."""
        return sum(values) / len(values) if values else 0

    def on_train_epoch_end(self):
        """Prints the epoch averages of the buffered training metrics and clears the buffers."""
        avg_train_acc_at_k = self._average(self.train_acc_at_k)
        avg_train_mrr = self._average(self.train_mrr)
        avg_train_loss = self._average(self.train_loss)

        print(f'Epoch {self.current_epoch} Train\n'
              f'- Train Accuracy@{self.k}: {avg_train_acc_at_k:.4f},\n'
              f'- Train MRR: {avg_train_mrr:.4f},\n'
              f'- Train Loss: {avg_train_loss:.4f}')

        self.train_acc_at_k.clear()
        self.train_mrr.clear()
        self.train_loss.clear()

    def on_validation_epoch_end(self):
        """Prints the epoch averages of the buffered validation metrics and clears the buffers."""
        avg_val_acc_at_k = self._average(self.val_acc_at_k)
        avg_val_mrr = self._average(self.val_mrr)
        avg_val_loss = self._average(self.val_loss)

        print(f'Epoch {self.current_epoch} Validation\n'
              f'- Validation Accuracy@{self.k}: {avg_val_acc_at_k:.4f},\n'
              f'- Validation MRR: {avg_val_mrr:.4f},\n'
              f'- Validation Loss: {avg_val_loss:.4f}')

        self.val_acc_at_k.clear()
        self.val_mrr.clear()
        self.val_loss.clear()

    def configure_optimizers(self):
        """Returns an Adam optimizer over all model parameters with learning rate 0.001."""
        return torch.optim.Adam(self.parameters(), lr=0.001)

# Setup

**Weights & Biases**

We used Weights & Biases (wandb) for plotting, but since it requires a login key, we commented it out in the final version of this Colab notebook to avoid errors when the code is reviewed.

In [None]:
# wandb.login()   # use this when you do the first run
# wandb.finish()      # use this for a new run

# # Initialize WandbLogger
# wandb_logger = WandbLogger(
#     project='deep_learning',
#     log_model='all'
# )

In [None]:
# Stop training once the validation accuracy has not improved for 5 consecutive checks
early_stopping = pl.callbacks.EarlyStopping(
    monitor='val_acc',
    mode='max',
    patience=5,
    verbose=True,
)

# Trainer capped at 100 epochs; early stopping usually ends the run sooner
trainer = pl.Trainer(
    callbacks=[early_stopping],
    max_epochs=100,
    # logger=wandb_logger
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Hyperparameters shared by the model and the data module
batch_size = 64
n_clusters = 6              # Number of clusters
k = 20                      # K used for accuracy@K
gcn_or_gat = 'gcn'          # 'gcn' selects GCNConv layers; any other value selects GATConv
embedding_dim = 100
hidden_dimension = 50
cluster_embedding_dim = 25
cluster_hidden_dim = 10

# Build the model
my_model = My_Model(
    max_nodes=1001,
    num_nodes=len(venue_vocab),
    n_clusters=n_clusters,
    embedding_dim=embedding_dim,
    hidden_dimension=hidden_dimension,
    cluster_embedding_dim=cluster_embedding_dim,
    cluster_hidden_dim=cluster_hidden_dim,
    gcn_or_gat=gcn_or_gat,
    k=k,
    batch_size=batch_size,
)

# Build the data module from the loaded splits and per-user resources
my_dm = MyDataModule(
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    labels=labels,
    venue_vocab=venue_vocab,
    spatial_graphs=spatial_graphs,
    similar_users=similar_users,
    centroid_labels=centroid_labels,
    padding_dim=100,
    batch_size=batch_size,
)

# Fit

In [None]:
# Run training (with validation after each epoch) using the data module's loaders
trainer.fit(model=my_model, datamodule=my_dm)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name                    | Type              | Params | Mode 
----------------------------------------------------------------------
0 | dropout                 | Dropout           | 0      | train
1 | temporal_node_embedding | Embedding         | 100 K  | train
2 | spatial_node_embedding  | Embedding         | 1.6 M  | train
3 | cluster_embedding       | Embedding         | 150    | train
4 | lstm                    | LSTM              | 30.4 K | train
5 | gnn1                    | GCNConv           | 7.6 K  | train
6 | gnn2                    | GCNConv           | 3.8 K  | train
7 | cluster_net             | Sequential        | 260    | train
8 | fc1                     | Linear            | 27.9 K | train
9 | loss_fn                 | BCEWithLogitsLoss | 0      | train
----------------------------------------------------------------------
1.8 M     Tr

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Epoch 0 Validation
- Validation Accuracy@20: 0.0785,
- Validation MRR: 0.0302,
- Validation Loss: 1.4156


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved. New best score: 0.103


Epoch 0 Validation
- Validation Accuracy@20: 0.1031,
- Validation MRR: 0.1693,
- Validation Loss: 1.3792
Epoch 0 Train
- Train Accuracy@20: 0.0846,
- Train MRR: 0.1547,
- Train Loss: 1.4030


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.146 >= min_delta = 0.0. New best score: 0.249


Epoch 1 Validation
- Validation Accuracy@20: 0.2487,
- Validation MRR: 0.2240,
- Validation Loss: 1.3424
Epoch 1 Train
- Train Accuracy@20: 0.1568,
- Train MRR: 0.1954,
- Train Loss: 1.3633


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.099 >= min_delta = 0.0. New best score: 0.348


Epoch 2 Validation
- Validation Accuracy@20: 0.3479,
- Validation MRR: 0.2274,
- Validation Loss: 1.3006
Epoch 2 Train
- Train Accuracy@20: 0.2742,
- Train MRR: 0.2209,
- Train Loss: 1.3219


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.055 >= min_delta = 0.0. New best score: 0.403


Epoch 3 Validation
- Validation Accuracy@20: 0.4026,
- Validation MRR: 0.2356,
- Validation Loss: 1.2520
Epoch 3 Train
- Train Accuracy@20: 0.3693,
- Train MRR: 0.2310,
- Train Loss: 1.2749


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.027 >= min_delta = 0.0. New best score: 0.429


Epoch 4 Validation
- Validation Accuracy@20: 0.4292,
- Validation MRR: 0.2210,
- Validation Loss: 1.1964
Epoch 4 Train
- Train Accuracy@20: 0.4182,
- Train MRR: 0.2271,
- Train Loss: 1.2217


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.003 >= min_delta = 0.0. New best score: 0.432


Epoch 5 Validation
- Validation Accuracy@20: 0.4320,
- Validation MRR: 0.2133,
- Validation Loss: 1.1354
Epoch 5 Train
- Train Accuracy@20: 0.4349,
- Train MRR: 0.2260,
- Train Loss: 1.1623


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.001 >= min_delta = 0.0. New best score: 0.433


Epoch 6 Validation
- Validation Accuracy@20: 0.4331,
- Validation MRR: 0.2070,
- Validation Loss: 1.0723
Epoch 6 Train
- Train Accuracy@20: 0.4404,
- Train MRR: 0.2104,
- Train Loss: 1.0999


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.009 >= min_delta = 0.0. New best score: 0.442


Epoch 7 Validation
- Validation Accuracy@20: 0.4417,
- Validation MRR: 0.2052,
- Validation Loss: 1.0112
Epoch 7 Train
- Train Accuracy@20: 0.4474,
- Train MRR: 0.2106,
- Train Loss: 1.0388


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.004 >= min_delta = 0.0. New best score: 0.446


Epoch 8 Validation
- Validation Accuracy@20: 0.4461,
- Validation MRR: 0.2070,
- Validation Loss: 0.9590
Epoch 8 Train
- Train Accuracy@20: 0.4479,
- Train MRR: 0.2072,
- Train Loss: 0.9846


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9 Validation
- Validation Accuracy@20: 0.4456,
- Validation MRR: 0.1976,
- Validation Loss: 0.9319
Epoch 9 Train
- Train Accuracy@20: 0.4510,
- Train MRR: 0.1876,
- Train Loss: 0.9473


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.004 >= min_delta = 0.0. New best score: 0.450


Epoch 10 Validation
- Validation Accuracy@20: 0.4503,
- Validation MRR: 0.1369,
- Validation Loss: 0.9111
Epoch 10 Train
- Train Accuracy@20: 0.4521,
- Train MRR: 0.1580,
- Train Loss: 0.9262


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 11 Validation
- Validation Accuracy@20: 0.4482,
- Validation MRR: 0.1353,
- Validation Loss: 0.8898
Epoch 11 Train
- Train Accuracy@20: 0.4526,
- Train MRR: 0.1538,
- Train Loss: 0.9054


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.001 >= min_delta = 0.0. New best score: 0.452


Epoch 12 Validation
- Validation Accuracy@20: 0.4516,
- Validation MRR: 0.1331,
- Validation Loss: 0.8711
Epoch 12 Train
- Train Accuracy@20: 0.4544,
- Train MRR: 0.1383,
- Train Loss: 0.8821


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 13 Validation
- Validation Accuracy@20: 0.4503,
- Validation MRR: 0.1322,
- Validation Loss: 0.8576
Epoch 13 Train
- Train Accuracy@20: 0.4607,
- Train MRR: 0.1492,
- Train Loss: 0.8656


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.002 >= min_delta = 0.0. New best score: 0.453


Epoch 14 Validation
- Validation Accuracy@20: 0.4534,
- Validation MRR: 0.1915,
- Validation Loss: 0.8486
Epoch 14 Train
- Train Accuracy@20: 0.4581,
- Train MRR: 0.1428,
- Train Loss: 0.8510


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 15 Validation
- Validation Accuracy@20: 0.4523,
- Validation MRR: 0.1952,
- Validation Loss: 0.8424
Epoch 15 Train
- Train Accuracy@20: 0.4672,
- Train MRR: 0.1559,
- Train Loss: 0.8420


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.003 >= min_delta = 0.0. New best score: 0.457


Epoch 16 Validation
- Validation Accuracy@20: 0.4568,
- Validation MRR: 0.1921,
- Validation Loss: 0.8375
Epoch 16 Train
- Train Accuracy@20: 0.4721,
- Train MRR: 0.1743,
- Train Loss: 0.8346


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.006 >= min_delta = 0.0. New best score: 0.463


Epoch 17 Validation
- Validation Accuracy@20: 0.4628,
- Validation MRR: 0.1925,
- Validation Loss: 0.8333
Epoch 17 Train
- Train Accuracy@20: 0.4695,
- Train MRR: 0.1690,
- Train Loss: 0.8305


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.004 >= min_delta = 0.0. New best score: 0.467


Epoch 18 Validation
- Validation Accuracy@20: 0.4672,
- Validation MRR: 0.1921,
- Validation Loss: 0.8296
Epoch 18 Train
- Train Accuracy@20: 0.4862,
- Train MRR: 0.1775,
- Train Loss: 0.8249


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.002 >= min_delta = 0.0. New best score: 0.470


Epoch 19 Validation
- Validation Accuracy@20: 0.4695,
- Validation MRR: 0.1957,
- Validation Loss: 0.8267
Epoch 19 Train
- Train Accuracy@20: 0.4799,
- Train MRR: 0.1767,
- Train Loss: 0.8212


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 20 Validation
- Validation Accuracy@20: 0.4682,
- Validation MRR: 0.1967,
- Validation Loss: 0.8246
Epoch 20 Train
- Train Accuracy@20: 0.4820,
- Train MRR: 0.1819,
- Train Loss: 0.8194


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 21 Validation
- Validation Accuracy@20: 0.4661,
- Validation MRR: 0.1966,
- Validation Loss: 0.8231
Epoch 21 Train
- Train Accuracy@20: 0.4878,
- Train MRR: 0.1669,
- Train Loss: 0.8151


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 22 Validation
- Validation Accuracy@20: 0.4669,
- Validation MRR: 0.1965,
- Validation Loss: 0.8218
Epoch 22 Train
- Train Accuracy@20: 0.4938,
- Train MRR: 0.1965,
- Train Loss: 0.8142


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.006 >= min_delta = 0.0. New best score: 0.476


Epoch 23 Validation
- Validation Accuracy@20: 0.4758,
- Validation MRR: 0.1956,
- Validation Loss: 0.8207
Epoch 23 Train
- Train Accuracy@20: 0.4924,
- Train MRR: 0.1787,
- Train Loss: 0.8123


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.004 >= min_delta = 0.0. New best score: 0.480


Epoch 24 Validation
- Validation Accuracy@20: 0.4799,
- Validation MRR: 0.1971,
- Validation Loss: 0.8195
Epoch 24 Train
- Train Accuracy@20: 0.4919,
- Train MRR: 0.1828,
- Train Loss: 0.8106


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.004 >= min_delta = 0.0. New best score: 0.484


Epoch 25 Validation
- Validation Accuracy@20: 0.4839,
- Validation MRR: 0.1973,
- Validation Loss: 0.8185
Epoch 25 Train
- Train Accuracy@20: 0.5060,
- Train MRR: 0.1668,
- Train Loss: 0.8073


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.004 >= min_delta = 0.0. New best score: 0.488


Epoch 26 Validation
- Validation Accuracy@20: 0.4878,
- Validation MRR: 0.1982,
- Validation Loss: 0.8175
Epoch 26 Train
- Train Accuracy@20: 0.5068,
- Train MRR: 0.1625,
- Train Loss: 0.8057


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.005 >= min_delta = 0.0. New best score: 0.492


Epoch 27 Validation
- Validation Accuracy@20: 0.4924,
- Validation MRR: 0.2004,
- Validation Loss: 0.8165
Epoch 27 Train
- Train Accuracy@20: 0.5133,
- Train MRR: 0.1734,
- Train Loss: 0.8033


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.001 >= min_delta = 0.0. New best score: 0.493


Epoch 28 Validation
- Validation Accuracy@20: 0.4930,
- Validation MRR: 0.2080,
- Validation Loss: 0.8155
Epoch 28 Train
- Train Accuracy@20: 0.5125,
- Train MRR: 0.1776,
- Train Loss: 0.8004


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_acc improved by 0.001 >= min_delta = 0.0. New best score: 0.493


Epoch 29 Validation
- Validation Accuracy@20: 0.4935,
- Validation MRR: 0.2098,
- Validation Loss: 0.8145
Epoch 29 Train
- Train Accuracy@20: 0.5242,
- Train MRR: 0.1713,
- Train Loss: 0.7965


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 30 Validation
- Validation Accuracy@20: 0.4927,
- Validation MRR: 0.2097,
- Validation Loss: 0.8135
Epoch 30 Train
- Train Accuracy@20: 0.5208,
- Train MRR: 0.1693,
- Train Loss: 0.7960


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 31 Validation
- Validation Accuracy@20: 0.4935,
- Validation MRR: 0.2119,
- Validation Loss: 0.8126
Epoch 31 Train
- Train Accuracy@20: 0.5372,
- Train MRR: 0.1533,
- Train Loss: 0.7934


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 32 Validation
- Validation Accuracy@20: 0.4932,
- Validation MRR: 0.2126,
- Validation Loss: 0.8116
Epoch 32 Train
- Train Accuracy@20: 0.5359,
- Train MRR: 0.1758,
- Train Loss: 0.7897


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 33 Validation
- Validation Accuracy@20: 0.4857,
- Validation MRR: 0.2120,
- Validation Loss: 0.8106
Epoch 33 Train
- Train Accuracy@20: 0.5359,
- Train MRR: 0.1566,
- Train Loss: 0.7895


Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_acc did not improve in the last 5 records. Best score: 0.493. Signaling Trainer to stop.


Epoch 34 Validation
- Validation Accuracy@20: 0.4888,
- Validation MRR: 0.2127,
- Validation Loss: 0.8097
Epoch 34 Train
- Train Accuracy@20: 0.5453,
- Train MRR: 0.1668,
- Train Loss: 0.7851


In [None]:
# Run the test loop for `my_model` using `my_dm`; the returned list of metric
# dicts is the cell's displayed output.
# NOTE(review): no `ckpt_path` is passed, so this presumably evaluates the
# weights currently held by `my_model` (i.e. after the final epoch, whose logged
# validation accuracy was 0.4888) rather than the early-stopping best
# (val_acc 0.493). Confirm, and consider `ckpt_path="best"` if a
# ModelCheckpoint callback is configured on `trainer`.
trainer.test(my_model, my_dm)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss_epoch': 0.8076265454292297, 'test_acc_epoch': 0.4934895932674408}]