In [25]:
#libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import time

In [38]:
# Floyd-Warshall algorithm
def floyd_warshall(AdjMatrix):
    n = len(AdjMatrix)
    cost = np.copy(AdjMatrix)
    cost[cost == 0] = np.inf
    for k in range(n):
        for i in range(n):
            for j in range(n):
                cost[i, j] = min(cost[i, j], cost[i, k] + cost[k, j])
    return cost

In [41]:
# shortest path kernel

def shortest_path_kernel(S1, S2, k_walk):
    n = len(S1)
    m = len(S2)
    K = 0
    for i1 in range(n):
        for j1 in range(i1, n):
            for i2 in range(m):
                for j2 in range(i2, m):
                    if S1[i1,j1] != np.inf and S2[i2,j2] != np.inf:
                        K += k_walk(S1[i1,j1], S2[i2,j2])
    return K

In [13]:
# kernel walk functions
def dirac_kernel(a, b):
    if a == b:
        return 1
    else:
        return 0

In [15]:
def count_trips_mibici(data_user, threshold = 5, complement = False):
    viajes_user = data_user.groupby([data_user[['Origen_Id', 'Destino_Id']].min(axis=1), data_user[['Origen_Id', 'Destino_Id']].max(axis=1)]).size().reset_index(name='counts')
    viajes_user.columns = ['Est_A', 'Est_B', 'counts']
    if not complement:
        viajes_user = viajes_user[viajes_user['counts'] >= threshold]
    else:
        viajes_user = viajes_user[viajes_user['counts'] < threshold]
    if viajes_user.empty:
        return None
    total = viajes_user['counts'].sum()
    viajes_user['prob'] = viajes_user['counts']/total
    viajes_user = viajes_user.sort_values(by = 'prob', ascending = False).reset_index(drop=True)
    return viajes_user

def compute_matrix(counter_user, normalized = False, self_loops = False):
    if not self_loops:
        counter_user = counter_user[counter_user['Est_A'] != counter_user['Est_B']]
    vertex = list(set(counter_user['Est_A'].unique().tolist() + counter_user['Est_B'].unique().tolist()))
    matrix = np.zeros((len(vertex), len(vertex)))
    for i in range(len(counter_user)):
        current_trip = counter_user.iloc[i]
        count = current_trip["counts"]
        estA = current_trip["Est_A"]
        estB = current_trip["Est_B"]

        matrix[vertex.index(estA)][vertex.index(estB)] = count
        matrix[vertex.index(estB)][vertex.index(estA)] = count
    if normalized:
        D = np.sum(matrix, axis = 1)
        D = np.diag(D)
        D = np.linalg.inv(np.sqrt(D))
        matrix = np.sqrt(D) @ matrix @ np.sqrt(D)
    return matrix

In [16]:
#dir = '/home/user/Desktop/Datos/'
dir = '/Users/antoniomendez/Desktop/Tesis/Datos/datos_limpios/'

In [19]:
data_2019 = pd.read_csv(f'{dir}mibici/2019.csv')
data1 = data_2019[data_2019['Inicio_del_viaje'].str.startswith('2019-01-01')]
data2 = data_2019[data_2019['Inicio_del_viaje'].str.startswith('2019-01-02')]

In [20]:
counter_user1 = count_trips_mibici(data1)
counter_user2 = count_trips_mibici(data2)

In [21]:
m1 = compute_matrix(counter_user1, self_loops=True)
m2 = compute_matrix(counter_user2, self_loops=True)

In [43]:
s1 = floyd_warshall(m1)
s2 = floyd_warshall(m2)

In [46]:
print(len(s1[s1 != np.inf]))

262


In [48]:
print(len(s2[s2 != np.inf]))

14946


In [42]:
inicio = time.time()    
K = shortest_path_kernel(s1, s1, dirac_kernel)
print(K)
fin = time.time()
print(fin - inicio)
inicio = time.time()
K = shortest_path_kernel(s1, s2, dirac_kernel)
print(K)
fin = time.time()
print(fin - inicio)
inicio = time.time()
K = shortest_path_kernel(s2, s2, dirac_kernel)
print(K)
fin = time.time()
print(fin - inicio)

2134
0.17227387428283691
34618
2.3209142684936523
2293064
52.57573914527893


In [49]:
34618 /(np.sqrt(2134*2293064))

0.494876325459772