In [1]:
#libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import requests
import multiprocessing
import time

In [2]:
# Floyd-Warshall algorithm
def floyd_warshall(AdjMatrix):
    """Compute all-pairs shortest-path costs with the Floyd-Warshall algorithm.

    Parameters:
    AdjMatrix: square 2D array-like of edge weights in which 0 means "no edge".
        Integer and float inputs are both accepted; the input is not modified.

    Returns:
    cost: float ndarray with the same shape as AdjMatrix. cost[i, j] is the
        cheapest walk from i to j that uses at least one edge, or np.inf when
        no such walk exists. Because every zero (diagonal included) is treated
        as a missing edge, cost[i, i] is the cheapest closed walk through i,
        not 0.
    """
    # Work on a float copy: an integer array cannot hold np.inf, so marking the
    # missing edges would fail for integer adjacency matrices.
    cost = np.array(AdjMatrix, dtype=float)
    cost[cost == 0] = np.inf
    for k in range(len(cost)):
        # Relax every (i, j) pair through pivot k in a single broadcast step.
        # With non-negative weights, row k and column k cannot improve during
        # iteration k, so this matches the element-by-element update.
        np.minimum(cost, cost[:, k, None] + cost[None, k, :], out=cost)
    return cost

In [3]:
def shortest_path_kernel1(S1, S2, k_walk):
    """Accumulate k_walk over all pairs of finite upper-triangular entries.

    Parameters:
    S1, S2: square 2D numpy arrays; only the upper triangle (diagonal included)
        is read, and non-finite entries are skipped
    k_walk: function called as k_walk(d1, d2) with one entry from S1 and one from S2

    Returns:
    K: running sum of the k_walk values, starting from the integer 0
    """
    def finite_upper(S):
        # Upper-triangle values in row-major order, keeping only the finite ones.
        upper = S[np.triu_indices_from(S)]
        return upper[np.isfinite(upper)]

    lengths_1 = finite_upper(S1)
    lengths_2 = finite_upper(S2)

    # Plain nested loops: no intermediate list of pair values is materialised.
    total = 0
    for a in lengths_1:
        for b in lengths_2:
            total += k_walk(a, b)
    return total

def shortest_path_kernel2(S1, S2, k_walk):
    """Sum k_walk over all pairs of finite upper-triangular entries via np.sum.

    Note: another cell further down this notebook defines a function with the
    same name; whichever cell ran last is the one in effect.

    Parameters:
    S1, S2: square 2D numpy arrays; only the upper triangle (diagonal included)
        is read, and non-finite entries are skipped
    k_walk: function called as k_walk(d1, d2) with one entry from S1 and one from S2

    Returns:
    K: np.sum of the list holding every k_walk(d1, d2) value
    """
    rows_1, cols_1 = np.triu_indices_from(S1)
    rows_2, cols_2 = np.triu_indices_from(S2)

    # Boolean masks selecting the finite upper-triangle entries.
    keep_1 = np.isfinite(S1[rows_1, cols_1])
    keep_2 = np.isfinite(S2[rows_2, cols_2])

    lengths_1 = S1[rows_1[keep_1], cols_1[keep_1]]
    lengths_2 = S2[rows_2[keep_2], cols_2[keep_2]]

    # Collect every pairwise value first, then reduce once with numpy.
    pair_values = []
    for a in lengths_1:
        for b in lengths_2:
            pair_values.append(k_walk(a, b))
    return np.sum(pair_values)

def shortest_path_kernel3(S1, S2, k_walk):
    """Sum k_walk over all pairs of finite upper-triangular entries via np.fromiter.

    Parameters:
    S1, S2: square 2D numpy arrays; only the upper triangle (diagonal included)
        is read, and non-finite entries are skipped
    k_walk: function called as k_walk(d1, d2) with one entry from S1 and one from S2

    Returns:
    K: np.sum of a float array filled with every k_walk(d1, d2) value
    """
    upper_1 = S1[np.triu_indices_from(S1)]
    upper_2 = S2[np.triu_indices_from(S2)]
    lengths_1 = upper_1[np.isfinite(upper_1)]
    lengths_2 = upper_2[np.isfinite(upper_2)]

    def pair_values():
        # Lazily yield the kernel value of every (S1 entry, S2 entry) pair.
        for a in lengths_1:
            for b in lengths_2:
                yield k_walk(a, b)

    return np.sum(np.fromiter(pair_values(), dtype=float))

In [10]:
def _compute_kernel_element(args):
    """Helper function for parallel computation of kernel values."""
    d1, S2_finite, k_walk = args
    return sum(k_walk(d1, d2) for d2 in S2_finite)

def parallel_shortest_path_kernel1(S1, S2, k_walk, num_processes=min(multiprocessing.cpu_count(), 8)):
    """
    Shortest path kernel of two matrices, with one pool task per finite S1 entry.

    Parameters:
    S1, S2: 2D numpy arrays (matrices of shortest paths); only finite entries of
        the upper triangle (diagonal included) take part
    k_walk: kernel function that takes two distances as input. It is handed to
        the worker processes, so it presumably has to be picklable (lambdas
        typically are not) — verify for the start method in use
    num_processes: int, size of the multiprocessing pool; the default is
        evaluated once, when the function is defined

    Returns:
    K: kernel value, the builtin sum of the per-entry partial sums
    """
    upper_1 = S1[np.triu_indices_from(S1)]
    upper_2 = S2[np.triu_indices_from(S2)]
    lengths_1 = upper_1[np.isfinite(upper_1)]
    lengths_2 = upper_2[np.isfinite(upper_2)]

    # Each task pairs one S1 distance with the full vector of S2 distances.
    tasks = [(length, lengths_2, k_walk) for length in lengths_1]

    with multiprocessing.Pool(processes=num_processes) as workers:
        partial_sums = workers.map(_compute_kernel_element, tasks)

    return sum(partial_sums)


In [14]:
# SHORTEST PATH KERNEL

def shortest_path_kernel1(S1, S2, k_walk):
    """Accumulate k_walk over all pairs of finite upper-triangular entries.

    Parameters:
    S1, S2: square 2D numpy arrays; only the upper triangle (diagonal included)
        is read, and non-finite entries are skipped
    k_walk: function called as k_walk(d1, d2) with one entry from S1 and one from S2

    Returns:
    K: running sum of the k_walk values, starting from the integer 0
    """
    triangle_1 = S1[np.triu_indices_from(S1)]
    triangle_2 = S2[np.triu_indices_from(S2)]

    # Drop unreachable pairs (infinite distances) before pairing the values up.
    reachable_1 = triangle_1[np.isfinite(triangle_1)]
    reachable_2 = triangle_2[np.isfinite(triangle_2)]

    # Nested loops keep memory flat: no list of pair values is built.
    accumulated = 0
    for dist_a in reachable_1:
        for dist_b in reachable_2:
            accumulated += k_walk(dist_a, dist_b)
    return accumulated

def shortest_path_kernel2(S1, S2, k_walk):
    """Index-based shortest path kernel over the upper triangles of S1 and S2.

    Note: an earlier cell of this notebook defines a function with the same
    name; whichever cell ran last is the one in effect.

    Parameters:
    S1, S2: 2D numpy arrays indexed as S[i, j]; entries with j >= i are read and
        non-finite entries are skipped
    k_walk: function called as k_walk(d1, d2) with one entry from S1 and one from S2

    Returns:
    K: running sum of the k_walk values, starting from the integer 0
    """
    K = 0
    n = len(S1)
    m = len(S2)
    for i in range(n):
        for j in range(i, n):
            d1 = S1[i, j]
            # The S1 entry is fixed for the whole scan of S2, so test it once
            # here instead of once per (ii, jj) pair.
            if not np.isfinite(d1):
                continue
            for ii in range(m):
                for jj in range(ii, m):
                    d2 = S2[ii, jj]
                    if np.isfinite(d2):
                        K += k_walk(d1, d2)
    return K

def shortest_path_kernel(S1, S2, k_walk):
    """Shortest path kernel with a fallback implementation.

    Tries shortest_path_kernel1 first; if it raises any Exception, the error is
    printed and shortest_path_kernel2 is used instead.

    Parameters:
    S1, S2: matrices of shortest paths
    k_walk: kernel function that takes two distances as input

    Returns:
    K: kernel value from whichever implementation succeeded
    """
    try:
        result = shortest_path_kernel1(S1, S2, k_walk)
    except Exception as e:
        # Deliberate best-effort: report the failure, then retry with the
        # index-based variant.
        print(f"Error: {e}, trying another approach")
        result = shortest_path_kernel2(S1, S2, k_walk)
    return result

In [8]:
def dirac_kernel(a, b):
    """Dirac (delta) kernel: 1 when both arguments compare equal, otherwise 0."""
    if a == b:
        return 1
    return 0

def gaussian_kernel(a, b, sigma=1/10):
    """Gaussian-type kernel exp(-(a - b)^2 * sigma).

    Parameters:
    a, b: values to compare
    sigma: factor that multiplies the squared difference (it scales the
        exponent directly; it is not used as a standard deviation)

    Returns:
    np.exp of the negated, scaled squared difference
    """
    difference = a - b
    exponent = -(difference ** 2) * sigma
    return np.exp(exponent)

In [9]:
def count_trips_mibici(data_user, threshold = 5, complement = False):
    """Count trips per unordered station pair and keep the pairs selected by threshold.

    Parameters:
    data_user: DataFrame with the columns 'Origen_Id' and 'Destino_Id'; it is not modified
    threshold: minimum number of trips for a pair to be kept
    complement: boolean, if True keep the pairs with fewer than threshold trips instead

    Returns:
    viajes_user: DataFrame with the columns 'Est_A' (smaller station id),
        'Est_B' (larger station id), 'counts' and 'prob' (counts divided by the
        total of the kept counts), sorted by 'prob' in descending order; None
        when no pair passes the filter
    """
    endpoints = data_user[['Origen_Id', 'Destino_Id']]
    # Grouping by (min, max) makes A->B and B->A fall into the same pair.
    viajes_user = (
        data_user
        .groupby([endpoints.min(axis=1), endpoints.max(axis=1)])
        .size()
        .reset_index(name='counts')
    )
    viajes_user.columns = ['Est_A', 'Est_B', 'counts']

    if complement:
        keep = viajes_user['counts'] < threshold
    else:
        keep = viajes_user['counts'] >= threshold
    viajes_user = viajes_user[keep]
    if viajes_user.empty:
        return None

    total = viajes_user['counts'].sum()
    # assign() returns a new frame, so no column is written onto a filtered
    # slice (which is what triggers pandas' SettingWithCopyWarning).
    viajes_user = viajes_user.assign(prob=viajes_user['counts'] / total)
    return viajes_user.sort_values(by='prob', ascending=False).reset_index(drop=True)

def compute_matrix(counter_user, normalized = False, self_loops = False):
    """Build the symmetric weighted adjacency matrix of a station-pair count table.

    Parameters:
    counter_user: DataFrame with the columns 'Est_A', 'Est_B' and 'counts', or
        None (treated as a graph without vertices)
    normalized: boolean, if True return D^(-1/2) A D^(-1/2) where D holds the row sums of A
    self_loops: boolean, if False rows with Est_A == Est_B are dropped first

    Returns:
    matrix: float ndarray of shape (n_vertices, n_vertices). Rows and columns
        follow the station ids in ascending order. If a pair appears more than
        once, the last row wins.
    """
    # count_trips_mibici returns None when nothing passes its filter; map that
    # to an empty graph instead of failing on the column lookup below.
    if counter_user is None:
        return np.zeros((0, 0))
    if not self_loops:
        counter_user = counter_user[counter_user['Est_A'] != counter_user['Est_B']]

    # Sorted ids give a reproducible layout; the dict replaces a linear
    # list.index() search per edge with a constant-time lookup.
    vertex = sorted(set(counter_user['Est_A'].tolist()) | set(counter_user['Est_B'].tolist()))
    position = {station: idx for idx, station in enumerate(vertex)}

    matrix = np.zeros((len(vertex), len(vertex)))
    edges = zip(counter_user['Est_A'], counter_user['Est_B'], counter_user['counts'])
    for est_a, est_b, count in edges:
        a, b = position[est_a], position[est_b]
        matrix[a, b] = count
        matrix[b, a] = count

    if normalized:
        degree = matrix.sum(axis=1)
        # Scale by 1/sqrt(degree) directly rather than inverting a dense
        # diagonal matrix; vertices with zero degree keep all-zero rows/columns.
        scale = np.zeros_like(degree)
        connected = degree > 0
        scale[connected] = 1.0 / np.sqrt(degree[connected])
        matrix = matrix * scale[:, None] * scale[None, :]
    return matrix

In [16]:
# function to compute the gram matrix

def gram_matrix(data, k_function, normalized = False, save = False, directory = None):
    """This function computes the gram matrix of the data using the kernel function k_function
    Parameters:
    data: list of matrices
    k_function: kernel function which takes two matrices as input; it is
        evaluated once per unordered pair and the value is mirrored
    normalized: boolean, if True the gram matrix is normalized so that
        gram[i, j] becomes gram[i, j] / sqrt(gram[i, i] * gram[j, j])
    save: boolean, if True the gram matrix is saved as 'gram_matrix.npy'
    directory: string, directory where the gram matrix is saved (created if
        missing); the current directory is used when it is None or empty
    Returns:
    gram: gram matrix of the data
    Raises:
    numpy.linalg.LinAlgError: if normalized is True and the diagonal contains a zero
    """
    n = len(data)
    gram = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):
            gram[i, j] = gram[j, i] = k_function(data[i], data[j])
    if normalized:
        diagonal = np.diag(gram)
        if np.any(diagonal == 0):
            raise np.linalg.LinAlgError(
                "cannot normalize: the gram matrix has a zero on its diagonal"
            )
        # Element-wise scaling is equivalent to D @ gram @ D with
        # D = diag(1 / sqrt(diagonal)), without a dense matrix inverse.
        scale = 1.0 / np.sqrt(diagonal)
        gram = gram * scale[:, None] * scale[None, :]
    if save:
        target_dir = directory or os.curdir
        os.makedirs(target_dir, exist_ok=True)
        # os.path.join adds the separator when directory lacks a trailing one.
        np.save(os.path.join(target_dir, 'gram_matrix.npy'), gram)
    return gram

In [11]:
# Base directory of the cleaned data files. The value is a machine-specific
# absolute path and must end with '/', because file names are appended to it
# by plain string formatting.
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of
# the notebook session.
# Alternative location used on another machine:
#dir = '/home/user/Desktop/Datos/'
dir = '/Users/antoniomendez/Desktop/Tesis/Datos/datos_limpios/'

In [12]:
# Load the 2019 trip records from '<dir>mibici/2019.csv' into a DataFrame.
data_2019 = pd.read_csv(f'{dir}mibici/2019.csv')

In [13]:
# generate data: one shortest-path matrix per day, for 2019-01-01 to 2019-01-03
dates = [f"2019-01-{day:02d}" for day in range(1, 4)]
data = []
for date in dates:
    print("Procesando fecha: ", date)
    # Trips whose start timestamp string begins with the current date.
    current_data = data_2019[data_2019['Inicio_del_viaje'].str.startswith(date)]
    current_counter = count_trips_mibici(current_data)
    if current_counter is None:
        # count_trips_mibici returns None when no station pair reaches the
        # threshold; skip the day rather than fail while building the matrix.
        # Note that `data` then has fewer entries than `dates`.
        print("Sin viajes suficientes para la fecha: ", date)
        continue
    current_matrix = compute_matrix(current_counter, self_loops=True)
    current_s = floyd_warshall(current_matrix)
    data.append(current_s)

Procesando fecha:  2019-01-01
Procesando fecha:  2019-01-02
Procesando fecha:  2019-01-03


Tests with the original functions

In [10]:
# Timing comparison: build the gram matrix with each kernel implementation,
# using the Dirac kernel on path lengths, and print the elapsed seconds.

kernel1 = lambda x, y: shortest_path_kernel1(x, y, dirac_kernel)
kernel2 = lambda x, y: shortest_path_kernel2(x, y, dirac_kernel)
kernel3 = lambda x, y: shortest_path_kernel3(x, y, dirac_kernel)

timed_kernels = (
    ("Time kernel 1: ", kernel1),
    ("Time kernel 2: ", kernel2),
    ("Time kernel 3: ", kernel3),
)
for label, kernel in timed_kernels:
    start = time.time()
    gram = gram_matrix(data, kernel)
    print(label, time.time() - start)
    print(gram)

Time kernel 1:  25.920093774795532
[[2.134000e+03 3.461800e+04 4.316500e+04]
 [3.461800e+04 2.293064e+06 2.805713e+06]
 [4.316500e+04 2.805713e+06 3.509210e+06]]
Time kernel 2:  38.85942792892456
[[2.134000e+03 3.461800e+04 4.316500e+04]
 [3.461800e+04 2.293064e+06 2.805713e+06]
 [4.316500e+04 2.805713e+06 3.509210e+06]]
Time kernel 3:  37.73068904876709
[[2.134000e+03 3.461800e+04 4.316500e+04]
 [3.461800e+04 2.293064e+06 2.805713e+06]
 [4.316500e+04 2.805713e+06 3.509210e+06]]


In [108]:
# Same timing comparison as for the Dirac kernel, now with the Gaussian kernel
# on path lengths; `s` is passed as its sigma argument.
s = 0.07856742013183861

gaussian_s = lambda a, b: gaussian_kernel(a, b, s)

kernel1 = lambda x, y: shortest_path_kernel1(x, y, gaussian_s)
kernel2 = lambda x, y: shortest_path_kernel2(x, y, gaussian_s)
kernel3 = lambda x, y: shortest_path_kernel3(x, y, gaussian_s)

timed_kernels = (
    ("Time kernel 1: ", kernel1),
    ("Time kernel 2: ", kernel2),
    ("Time kernel 3: ", kernel3),
)
for label, kernel in timed_kernels:
    start = time.time()
    gram = gram_matrix(data, kernel)
    print(label, time.time() - start)
    print(gram)

Time kernel 1:  217.63507294654846
[[6.06383916e+03 1.81904621e+05 2.27235929e+05]
 [1.81904621e+05 1.34589840e+07 1.64293635e+07]
 [2.27235929e+05 1.64293635e+07 2.02710210e+07]]
Time kernel 2:  225.69025683403015
[[6.06383916e+03 1.81904621e+05 2.27235929e+05]
 [1.81904621e+05 1.34589840e+07 1.64293636e+07]
 [2.27235929e+05 1.64293636e+07 2.02710210e+07]]
Time kernel 3:  214.90464687347412
[[6.06383916e+03 1.81904621e+05 2.27235929e+05]
 [1.81904621e+05 1.34589840e+07 1.64293636e+07]
 [2.27235929e+05 1.64293636e+07 2.02710210e+07]]


In [109]:
# Send a "process finished" message to the ntfy.sh topic once the run is done.
# The timeout keeps this cell from blocking indefinitely if the service does
# not answer; requests applies no timeout unless one is given.
requests.post("https://ntfy.sh/My_Computer", data="Proceso terminado".encode('utf-8'), timeout=10)

<Response [200]>