In [1]:
import os
import requests
from datetime import datetime, timedelta
import pandas as pd
import json
from google.transit import gtfs_realtime_pb2


FUENTES = {
    "ACES": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-ace",
        "lineas": ["A", "C", "E", "Sr"]
    },
    "BDFMS": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-bdfm",
        "lineas": ["B", "D", "F", "M", "Sf"]
    },
    "G": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-g",
        "lineas": ["G"]
    },
    "JZ": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-jz",
        "lineas": ["J", "Z"]
    },
    "NQRW": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-nqrw",
        "lineas": ["N", "Q", "R", "W"]
    },
    "L": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-l",
        "lineas": ["L"]
    },
    "1234567S": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs",
        "lineas": ["1", "2", "3", "4", "5", "6", "7", "S"]
    },
    "SIR": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-si",
        "lineas": ["SIR"]
    }
}

In [2]:
def extraccion_linea(url, linea):
    """
    Extrae los datos de una línea
    """
    response = requests.get(url)
    fuentes = gtfs_realtime_pb2.FeedMessage()
    fuentes.ParseFromString(response.content)

    datos_linea = []
    for entity in fuentes.entity:
        if entity.HasField('trip_update'):
            trayecto = entity.trip_update

            if trayecto.trip.route_id == linea:
                for stop in trayecto.stop_time_update:
                    campos = {
                        'viaje_id': trayecto.trip.trip_id,
                        'linea_id': trayecto.trip.route_id,
                        'hora_inicio': trayecto.trip.start_time if trayecto.trip.HasField('start_date') else None,
                        'fecha_inicio': trayecto.trip.start_date if trayecto.trip.HasField('start_time') else None,
                        'direccion': trayecto.trip.direction_id if trayecto.trip.HasField('direction_id') else None,
                        'parada_id': stop.stop_id,
                        'orden_parada': stop.stop_sequence if stop.HasField('stop_sequence') else None,
                        'hora_llegada': datetime.fromtimestamp(stop.arrival.time) if stop.HasField('arrival') else None,
                        'retraso_llegada': stop.arrival.delay if stop.HasField('arrival') and stop.arrival.HasField('delay') else None,
                        'hora_partida': datetime.fromtimestamp(stop.departure.time) if stop.HasField('departure') else None,
                        'retraso_partida': stop.departure.delay if stop.HasField('departure') and stop.departure.HasField('delay') else None,                       
                    }

                    datos_linea.append(campos)
    return datos_linea


In [3]:
def extraccion_datos():
    """
    Extrae todas las líneas y las convierte a DF
    """

    todas_las_lineas = []
    for info in FUENTES.values():
        todas_las_lineas.extend(info['lineas'])
    
    todos_los_datos = []
    for linea in todas_las_lineas:
        for grupo, info in FUENTES.items():
            if linea in info['lineas']:
                fuentes_url = info['url']
            todos_los_datos.extend(extraccion_linea(fuentes_url, linea))  

    return pd.DataFrame(todos_los_datos)
        

In [4]:
df = extraccion_datos()

In [5]:
df

Unnamed: 0,viaje_id,linea_id,hora_inicio,fecha_inicio,direccion,parada_id,orden_parada,hora_llegada,retraso_llegada,hora_partida,retraso_partida
0,019850_A..S,A,03:18:30,20260218,,H06S,,2026-02-18 10:49:22,,2026-02-18 10:49:22,
1,019850_A..S,A,03:18:30,20260218,,H07S,,2026-02-18 10:53:07,,2026-02-18 10:53:07,
2,019850_A..S,A,03:18:30,20260218,,H08S,,2026-02-18 10:56:22,,2026-02-18 10:56:22,
3,019850_A..S,A,03:18:30,20260218,,H09S,,2026-02-18 10:57:52,,2026-02-18 10:57:52,
4,019850_A..S,A,03:18:30,20260218,,H10S,,2026-02-18 10:59:52,,2026-02-18 10:59:52,
...,...,...,...,...,...,...,...,...,...,...,...
51476,031750_7..S,7,,,,721S,,2026-02-18 11:42:10,,2026-02-18 11:42:30,
51477,031750_7..S,7,,,,723S,,2026-02-18 11:47:10,,2026-02-18 11:47:30,
51478,031750_7..S,7,,,,724S,,2026-02-18 11:48:10,,2026-02-18 11:48:30,
51479,031750_7..S,7,,,,725S,,2026-02-18 11:49:40,,2026-02-18 11:50:00,


In [6]:
tot_filas = len(df)

for columna in df.columns:
    nulos = df[columna].isnull().sum()
    proporcion = nulos/tot_filas
    print(f"Proporcion nulos en {columna}: {proporcion}")


Proporcion nulos en viaje_id: 0.0
Proporcion nulos en linea_id: 0.0
Proporcion nulos en hora_inicio: 0.0
Proporcion nulos en fecha_inicio: 0.3313843942425361
Proporcion nulos en direccion: 0.9895106932654766
Proporcion nulos en parada_id: 0.0
Proporcion nulos en orden_parada: 0.9895106932654766
Proporcion nulos en hora_llegada: 0.0069151725879450674
Proporcion nulos en retraso_llegada: 0.9898020628969911
Proporcion nulos en hora_partida: 0.012645442007731008
Proporcion nulos en retraso_partida: 0.9900934325285057


In [7]:
df = df.drop(['direccion', 'orden_parada', 'retraso_llegada', 'retraso_partida', 'hora_inicio', 'fecha_inicio'], axis = 1)

In [8]:
df['hora_llegada'] = df['hora_llegada'].dt.tz_localize('UTC').dt.tz_convert('America/New_York')  
df['hora_partida'] = df['hora_partida'].dt.tz_localize('UTC').dt.tz_convert('America/New_York')

In [9]:
df['hora_llegada'] = df['hora_llegada'].dt.strftime('%H:%M')
df['hora_partida'] = df['hora_partida'].dt.strftime('%H:%M')

In [10]:
df

Unnamed: 0,viaje_id,linea_id,parada_id,hora_llegada,hora_partida
0,019850_A..S,A,H06S,05:49,05:49
1,019850_A..S,A,H07S,05:53,05:53
2,019850_A..S,A,H08S,05:56,05:56
3,019850_A..S,A,H09S,05:57,05:57
4,019850_A..S,A,H10S,05:59,05:59
...,...,...,...,...,...
51476,031750_7..S,7,721S,06:42,06:42
51477,031750_7..S,7,723S,06:47,06:47
51478,031750_7..S,7,724S,06:48,06:48
51479,031750_7..S,7,725S,06:49,06:50


In [11]:
norte = (df['parada_id'].str[-1] == 'N')
sur = (df['parada_id'].str[-1] == 'S')

df.loc[norte, 'direccion'] = 1
df.loc[sur, 'direccion'] = 0

df['direccion'] = df['direccion'].astype('Int64')


In [12]:
df

Unnamed: 0,viaje_id,linea_id,parada_id,hora_llegada,hora_partida,direccion
0,019850_A..S,A,H06S,05:49,05:49,0
1,019850_A..S,A,H07S,05:53,05:53,0
2,019850_A..S,A,H08S,05:56,05:56,0
3,019850_A..S,A,H09S,05:57,05:57,0
4,019850_A..S,A,H10S,05:59,05:59,0
...,...,...,...,...,...,...
51476,031750_7..S,7,721S,06:42,06:42,0
51477,031750_7..S,7,723S,06:47,06:47,0
51478,031750_7..S,7,724S,06:48,06:48,0
51479,031750_7..S,7,725S,06:49,06:50,0


In [13]:
df = df.dropna()
df

Unnamed: 0,viaje_id,linea_id,parada_id,hora_llegada,hora_partida,direccion
0,019850_A..S,A,H06S,05:49,05:49,0
1,019850_A..S,A,H07S,05:53,05:53,0
2,019850_A..S,A,H08S,05:56,05:56,0
3,019850_A..S,A,H09S,05:57,05:57,0
4,019850_A..S,A,H10S,05:59,05:59,0
...,...,...,...,...,...,...
51475,031750_7..S,7,720S,06:41,06:42,0
51476,031750_7..S,7,721S,06:42,06:42,0
51477,031750_7..S,7,723S,06:47,06:47,0
51478,031750_7..S,7,724S,06:48,06:48,0
