In [1]:
import os
import requests
from datetime import datetime, timedelta
import pandas as pd
import json
from google.transit import gtfs_realtime_pb2


FUENTES = {
    "ACES": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-ace",
        "lineas": ["A", "C", "E", "Sr"]
    },
    "BDFMS": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-bdfm",
        "lineas": ["B", "D", "F", "M", "Sf"]
    },
    "G": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-g",
        "lineas": ["G"]
    },
    "JZ": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-jz",
        "lineas": ["J", "Z"]
    },
    "NQRW": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-nqrw",
        "lineas": ["N", "Q", "R", "W"]
    },
    "L": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-l",
        "lineas": ["L"]
    },
    "1234567S": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs",
        "lineas": ["1", "2", "3", "4", "5", "6", "7", "S"]
    },
    "SIR": {
        "url": "https://api-endpoint.mta.info/Dataservice/mtagtfsfeeds/nyct%2Fgtfs-si",
        "lineas": ["SIR"]
    }
}

In [2]:
def extraccion_linea(url, linea):
    """
    Extrae los datos de una línea
    """
    response = requests.get(url)
    fuentes = gtfs_realtime_pb2.FeedMessage()
    fuentes.ParseFromString(response.content)

    datos_linea = []
    for entity in fuentes.entity:
        if entity.HasField('trip_update'):
            trayecto = entity.trip_update

            if trayecto.trip.route_id == linea:
                for stop in trayecto.stop_time_update:
                    campos = {
                        'viaje_id': trayecto.trip.trip_id,
                        'linea_id': trayecto.trip.route_id,
                        'hora_inicio': trayecto.trip.start_time if trayecto.trip.HasField('start_date') else None,
                        'fecha_inicio': trayecto.trip.start_date if trayecto.trip.HasField('start_time') else None,
                        'direccion': trayecto.trip.direction_id if trayecto.trip.HasField('direction_id') else None,
                        'parada_id': stop.stop_id,
                        'orden_parada': stop.stop_sequence if stop.HasField('stop_sequence') else None,
                        'hora_llegada': datetime.fromtimestamp(stop.arrival.time) if stop.HasField('arrival') else None,
                        'retraso_llegada': stop.arrival.delay if stop.HasField('arrival') and stop.arrival.HasField('delay') else None,
                        'hora_partida': datetime.fromtimestamp(stop.departure.time) if stop.HasField('departure') else None,
                        'retraso_partida': stop.departure.delay if stop.HasField('departure') and stop.departure.HasField('delay') else None,                       
                    }

                    datos_linea.append(campos)
    return datos_linea


In [3]:
def extraccion_datos():
    """
    Extrae todas las líneas y las convierte a DF
    """

    todas_las_lineas = []
    for info in FUENTES.values():
        todas_las_lineas.extend(info['lineas'])
    
    todos_los_datos = []
    for linea in todas_las_lineas:
        for grupo, info in FUENTES.items():
            if linea in info['lineas']:
                fuentes_url = info['url']
            todos_los_datos.extend(extraccion_linea(fuentes_url, linea))  

    return pd.DataFrame(todos_los_datos)
        

In [4]:
df = extraccion_datos()

In [5]:
df

Unnamed: 0,viaje_id,linea_id,hora_inicio,fecha_inicio,direccion,parada_id,orden_parada,hora_llegada,retraso_llegada,hora_partida,retraso_partida
0,059100_A..N55R,A,09:51:00,20260216,,A03N,,2026-02-16 17:26:52,,2026-02-16 17:26:52,
1,059100_A..N55R,A,09:51:00,20260216,,A02N,,2026-02-16 17:29:02,,2026-02-16 17:29:02,
2,060600_A..S58X054,A,10:06:00,20260216,,H02S,,2026-02-16 17:26:52,,2026-02-16 17:26:52,
3,060600_A..S58X054,A,10:06:00,20260216,,H03S,,2026-02-16 17:28:47,,2026-02-16 17:28:47,
4,060600_A..S58X054,A,10:06:00,20260216,,H04S,,2026-02-16 17:35:17,,2026-02-16 17:35:17,
...,...,...,...,...,...,...,...,...,...,...,...
68430,072050_7..N,7,,,,706N,,2026-02-16 18:30:40,,2026-02-16 18:31:00,
68431,072050_7..N,7,,,,705N,,2026-02-16 18:31:40,,2026-02-16 18:32:00,
68432,072050_7..N,7,,,,702N,,2026-02-16 18:34:10,,2026-02-16 18:34:30,
68433,072050_7..N,7,,,,701N,,2026-02-16 18:36:40,,NaT,


In [6]:
tot_filas = len(df)

for columna in df.columns:
    nulos = df[columna].isnull().sum()
    proporcion = nulos/tot_filas
    print(f"Proporcion nulos en {columna}: {proporcion}")


Proporcion nulos en viaje_id: 0.0
Proporcion nulos en linea_id: 0.0
Proporcion nulos en hora_inicio: 0.0
Proporcion nulos en fecha_inicio: 0.340936655220282
Proporcion nulos en direccion: 0.9779498794476511
Proporcion nulos en parada_id: 0.0
Proporcion nulos en orden_parada: 0.9779498794476511
Proporcion nulos en hora_llegada: 0.006721706729012932
Proporcion nulos en retraso_llegada: 0.978475926061226
Proporcion nulos en hora_partida: 0.01689194125812815
Proporcion nulos en retraso_partida: 0.9791773215459926


In [7]:
df = df.drop(['direccion', 'orden_parada', 'retraso_llegada', 'retraso_partida', 'hora_inicio', 'fecha_inicio'], axis = 1)

In [8]:
df['hora_llegada'] = df['hora_llegada'].dt.tz_localize('UTC').dt.tz_convert('America/New_York')  
df['hora_partida'] = df['hora_partida'].dt.tz_localize('UTC').dt.tz_convert('America/New_York')

In [9]:
df['hora_llegada'] = df['hora_llegada'].dt.strftime('%H:%M')
df['hora_partida'] = df['hora_partida'].dt.strftime('%H:%M')

In [10]:
df

Unnamed: 0,viaje_id,linea_id,parada_id,hora_llegada,hora_partida
0,059100_A..N55R,A,A03N,12:26,12:26
1,059100_A..N55R,A,A02N,12:29,12:29
2,060600_A..S58X054,A,H02S,12:26,12:26
3,060600_A..S58X054,A,H03S,12:28,12:28
4,060600_A..S58X054,A,H04S,12:35,12:35
...,...,...,...,...,...
68430,072050_7..N,7,706N,13:30,13:31
68431,072050_7..N,7,705N,13:31,13:32
68432,072050_7..N,7,702N,13:34,13:34
68433,072050_7..N,7,701N,13:36,


In [11]:
norte = (df['parada_id'].str[-1] == 'N')
sur = (df['parada_id'].str[-1] == 'S')

df.loc[norte, 'direccion'] = 1
df.loc[sur, 'direccion'] = 0

df['direccion'] = df['direccion'].astype('Int64')


In [12]:
df

Unnamed: 0,viaje_id,linea_id,parada_id,hora_llegada,hora_partida,direccion
0,059100_A..N55R,A,A03N,12:26,12:26,1
1,059100_A..N55R,A,A02N,12:29,12:29,1
2,060600_A..S58X054,A,H02S,12:26,12:26,0
3,060600_A..S58X054,A,H03S,12:28,12:28,0
4,060600_A..S58X054,A,H04S,12:35,12:35,0
...,...,...,...,...,...,...
68430,072050_7..N,7,706N,13:30,13:31,1
68431,072050_7..N,7,705N,13:31,13:32,1
68432,072050_7..N,7,702N,13:34,13:34,1
68433,072050_7..N,7,701N,13:36,,1


In [13]:
tot_filas = len(df)

for columna in df.columns:
    nulos = df[columna].isnull().sum()
    proporcion = nulos/tot_filas
    print(f"Proporcion nulos en {columna}: {proporcion}")

Proporcion nulos en viaje_id: 0.0
Proporcion nulos en linea_id: 0.0
Proporcion nulos en parada_id: 0.0
Proporcion nulos en hora_llegada: 0.006721706729012932
Proporcion nulos en hora_partida: 0.01689194125812815
Proporcion nulos en direccion: 0.0


In [14]:
df = df.dropna()
df

Unnamed: 0,viaje_id,linea_id,parada_id,hora_llegada,hora_partida,direccion
0,059100_A..N55R,A,A03N,12:26,12:26,1
1,059100_A..N55R,A,A02N,12:29,12:29,1
2,060600_A..S58X054,A,H02S,12:26,12:26,0
3,060600_A..S58X054,A,H03S,12:28,12:28,0
4,060600_A..S58X054,A,H04S,12:35,12:35,0
...,...,...,...,...,...,...
68428,072050_7..N,7,708N,13:28,13:28,1
68429,072050_7..N,7,707N,13:29,13:30,1
68430,072050_7..N,7,706N,13:30,13:31,1
68431,072050_7..N,7,705N,13:31,13:32,1
