In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd

In [3]:
from schemas import agency, calendar_dates, stop_times, routes, trips, stops

In [4]:
def read_file(path: str, schema: dict) -> pd.DataFrame:
    return pd.read_csv(filepath_or_buffer=path, dtype=schema, header=0)

In [5]:
calendar_df = read_file('../data/calendar_dates.txt', calendar_dates.schema)
stop_times_df = read_file('../data/stop_times.txt', stop_times.schema)
stops_df = read_file('../data/stops.txt', stops.schema)
trips_df = read_file('../data/trips.txt', trips.schema)
routes_df = read_file('../data/routes.txt', routes.schema)

In [6]:
active_services_df = calendar_df[calendar_df[calendar_dates.date] == "20220318"] [calendar_dates.service_id]

In [7]:
routes_related_stops = stop_times_df\
.merge(trips_df, on=trips.trip_id)\
.merge(active_services_df,on=trips.service_id)\
.merge(routes_df, on=routes.route_id)[
    [stop_times.trip_id,
    routes.route_short_name,
    stop_times.stop_id,
    stop_times.stop_sequence
    ]
]\
.drop_duplicates(ignore_index=True)


In [8]:
from gtfs_utils.harvesine import harvesine_distance

In [9]:
harvesine_dist = "harvesine_dist"
prev_stop_id = "prev_stop_id"
prev_stop_name = "prev_stop_name"
prev_stop_lon = "prev_stop_lon"
prev_stop_lat = "prev_stop_lat"

In [10]:
def calculate_distance_to_previous_stop(df: pd.DataFrame) -> pd.DataFrame:
    df[[prev_stop_id, prev_stop_name,prev_stop_lon,prev_stop_lat ]] = df\
        .sort_values(by=[stop_times.stop_sequence], ascending=True)\
        .groupby([stop_times.trip_id])\
        .shift(-1)[[stops.stop_id, stops.stop_name, stops.stop_lon, stops.stop_lat]]
    
    
    df[harvesine_dist] = df.apply(lambda x: harvesine_distance(
                x[stops.stop_lon],
                x[stops.stop_lat],
                x[prev_stop_lon],
                x[prev_stop_lat]
            ), axis=1
        )
    
    return df

In [11]:
ranked_results = calculate_distance_to_previous_stop(routes_related_stops
    .merge(stops_df, on=stops.stop_id))\
    .query(f"{stops.stop_id} != {prev_stop_id}")\
    .query(f"{stops.stop_name} !={prev_stop_name}")


In [12]:
ranked_results['rank'] = ranked_results[harvesine_dist].rank(na_option='bottom', method='dense')

In [13]:
ranked_results = ranked_results\
.drop(trips.trip_id, axis=1)\
.drop_duplicates(ignore_index=True)\
.sort_values(by=['rank'], ascending=True)

In [14]:
ranked_results

Unnamed: 0,route_short_name,stop_id,stop_sequence,stop_name,stop_lat,stop_lon,stop_url,prev_stop_id,prev_stop_name,prev_stop_lon,prev_stop_lat,harvesine_dist,rank
3886,52,1354,29,NUEVAS TECNOLOGÍAS,28.5192,-16.3282,http://www.titsa.com/correspondencias.php?idc=...,1301,I. NUEVAS TECNOLOGÍAS,-16.3281,28.5193,0.014802,1.0
12111,468,8165,28,FLAMINGO,28.0263,-16.7034,http://www.titsa.com/correspondencias.php?idc=...,8100,PALM MAR (T),-16.7037,28.0263,0.029446,2.0
2053,610,2613,11,EL PUENTE,28.4789,-16.3115,http://www.titsa.com/correspondencias.php?idc=...,1401,LOS MENCEYES,-16.3117,28.4787,0.029609,3.0
777,23,1227,41,EL POLE,28.4845,-16.4114,http://www.titsa.com/correspondencias.php?idc=...,1947,GUARDIA CIVIL,-16.4110,28.4844,0.040644,4.0
12625,972,9453,1,CUESTA PIEDRA,28.4682,-16.2796,http://www.titsa.com/correspondencias.php?idc=...,9200,EMILIO SERRA RUS,-16.2792,28.4681,0.040650,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8231,330,5095,72,LOS REALEJOS (T),28.3842,-16.5831,http://www.titsa.com/correspondencias.php?idc=...,,,,,,4405.0
8230,330,5095,51,LOS REALEJOS (T),28.3842,-16.5831,http://www.titsa.com/correspondencias.php?idc=...,,,,,,4405.0
8229,330,5095,64,LOS REALEJOS (T),28.3842,-16.5831,http://www.titsa.com/correspondencias.php?idc=...,,,,,,4405.0
456,11,2625,40,INTERCAMBIADOR LAGUNA (T),28.4805,-16.3170,http://www.titsa.com/correspondencias.php?idc=...,,,,,,4405.0


In [15]:
ranked_results\
.query(f"{routes.route_short_name} == 51")

Unnamed: 0,route_short_name,stop_id,stop_sequence,stop_name,stop_lat,stop_lon,stop_url,prev_stop_id,prev_stop_name,prev_stop_lon,prev_stop_lat,harvesine_dist,rank
1063,51,1949,54,CAPITOL,28.4777,-16.4144,http://www.titsa.com/correspondencias.php?idc=...,1268,PLAZA DE LA ESTACIÓN (T),-16.4149,28.4770,0.091907,66.0
4048,51,1932,37,MOYA,28.5213,-16.3831,http://www.titsa.com/correspondencias.php?idc=...,1933,EL LOMO,-16.3843,28.5209,0.125394,168.0
842,51,2586,71,TITSA LOS RODEOS,28.4895,-16.3503,http://www.titsa.com/correspondencias.php?idc=...,1275,EL PÚLPITO,-16.3487,28.4900,0.165957,439.0
4076,51,2155,13,LAS LAJAS,28.5158,-16.3124,http://www.titsa.com/correspondencias.php?idc=...,2156,FARIA,-16.3136,28.5170,0.177627,528.0
3538,51,1298,11,LA GORGOLANA,28.5140,-16.3083,http://www.titsa.com/correspondencias.php?idc=...,2154,JESÚS JIMÉNEZ,-16.3095,28.5152,0.177628,530.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3509,51,1294,8,REPÚBLICA ARGENTINA,28.5027,-16.3125,http://www.titsa.com/correspondencias.php?idc=...,2395,CARRETERA LAS CANTERAS,-16.3097,28.5085,0.700566,3691.0
567,51,2549,2,LEOCADIO MACHADO,28.4834,-16.3207,http://www.titsa.com/correspondencias.php?idc=...,1723,AVENIDA LA CANDELARIA,-16.3257,28.4882,0.723649,3740.0
1946,51,1945,50,LOS PERALES (T),28.4941,-16.4086,http://www.titsa.com/correspondencias.php?idc=...,1946,CALLEJÓN GRANDE,-16.4092,28.4870,0.791659,3824.0
867,51,2169,73,LUNA LLENA,28.4896,-16.3418,http://www.titsa.com/correspondencias.php?idc=...,2170,EL GOFIO (T),-16.3332,28.4900,0.841649,3866.0


# Most lines on the same stop 

In [16]:
stop_times_routes = stop_times_df\
.merge(trips_df, on=trips.trip_id)\
.merge(active_services_df, on=trips.service_id)\
.merge(routes_df, on=routes.route_id)


In [17]:
diff_routes = "diff_routes"

top_routes = stop_times_routes\
.groupby(stops.stop_id)\
.agg(diff_routes=(routes.route_id,'nunique') )


In [18]:
top_routes\
.merge(stops_df, on=stops.stop_id)\
[
    [stops.stop_id,
    stops.stop_name,
    diff_routes]
]\
.sort_values(by=diff_routes,ascending=False)

Unnamed: 0,stop_id,stop_name,diff_routes
3508,9181,INTERCAMBIADOR STA.CRUZ,44
1219,2625,INTERCAMBIADOR LAGUNA (T),36
3708,9413,MERIDIANO,25
3735,9450,INTERCAMBIADOR STA.CRUZ,23
1190,2582,COROMOTO (T),22
...,...,...,...
2111,4807,BARRANQUILLO,1
2110,4806,LA CARRERA,1
2104,4775,ROTONDA VIA RONDA,1
2103,4774,CUATRO CAMINOS,1


In [19]:
diff_stops = 'diff_stops'

top_routes\
.merge(stops_df, on=stops.stop_id)\
[[
    stops.stop_id,
    stops.stop_name,
    diff_routes
]]\
.groupby(stops.stop_name)\
.agg(stop_id=(stops.stop_id, lambda x: list(x)), diff_stops=(stops.stop_id, 'nunique'))\
.sort_values(by=diff_stops,ascending=False)


Unnamed: 0_level_0,stop_id,diff_stops
stop_name,Unnamed: 1_level_1,Unnamed: 2_level_1
CEMENTERIO,"[1137, 1141, 1204, 1225, 1376, 4074, 4124, 492...",16
CENTRO DE SALUD,"[1219, 1883, 1924, 1928, 2587, 2789, 7257, 736...",12
EL PINO,"[1636, 1647, 2130, 2145, 2314, 2704, 4957, 757...",11
EL CALVARIO,"[1203, 1226, 1258, 1259, 4016, 4035, 4217, 435...",10
EL MOLINO,"[1519, 1571, 1971, 1977, 2573, 2574, 4301, 430...",9
...,...,...
ISMAEL DOMÍNGUEZ,[1951],1
ISORANA,[7954],1
JARDIN BOTANICO,[4151],1
JARDINA,[2070],1


# Maximum Time Predicted for a Route

In [20]:
from gtfs_utils.time import fix_gtfs_time

In [21]:
stop_times_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 716138 entries, 0 to 716137
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   trip_id         716138 non-null  int64 
 1   arrival_time    716138 non-null  object
 2   departure_time  716138 non-null  object
 3   stop_id         716138 non-null  object
 4   stop_sequence   716138 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 27.3+ MB


In [22]:
elapsed = "elapsed"
rank = "rank"

elapsed_time = fix_gtfs_time(stop_times_df, [stop_times.arrival_time, stop_times.departure_time], '2022-03-18')\
.groupby(stop_times.trip_id)\
.agg(max_depart =(stop_times.departure_time, 'max'), min_arrival = (stop_times.arrival_time, 'min'))\

elapsed_time[elapsed] = elapsed_time['max_depart'] - elapsed_time['min_arrival']



In [23]:
elapsed_time = elapsed_time\
.merge(trips_df, on=trips.trip_id)\
.merge(routes_df, on=routes.route_id)[[
    routes.route_short_name,
    elapsed
]]\
.drop_duplicates(ignore_index=True)\

elapsed_time[rank] = elapsed_time[elapsed].rank(na_option='bottom', method='dense', ascending=False)

In [24]:
elapsed_time.sort_values(by=[rank], ascending=True)

Unnamed: 0,route_short_name,elapsed,rank
1399,330,0 days 02:47:22,1.0
1394,330,0 days 02:44:55,2.0
1382,325,0 days 02:36:24,3.0
1426,343,0 days 02:28:52,4.0
3563,342,0 days 02:14:25,5.0
...,...,...,...
1965,381,0 days 00:02:13,2378.0
1771,363,0 days 00:02:08,2379.0
1961,381,0 days 00:02:08,2379.0
1946,381,0 days 00:02:03,2380.0
