1. Setup

In [43]:
import os
from datetime import datetime
from getpass import getpass

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.dialects.mysql import TINYINT
from sqlalchemy.engine import URL
from sqlalchemy.types import VARCHAR, DOUBLE, INTEGER, DATE, TIME

2. Data Import & Push to Database

In [55]:
# Directory that holds the GTFS text files.
# Set the GTFS_DATA_DIR environment variable to point somewhere else (e.g. on
# another machine); when it is unset, the original location is used.
files_path = os.environ.get(
    "GTFS_DATA_DIR",
    "../../../../../../Volumes/LACIE SHARE/ML_DATA_PROJECTS/Toronto_Transit_Dashboards/Go_Transit_Data/GO-GTFS-07-2025/",
)

# Load each GTFS file into a DataFrame.
# os.path.join is used so the directory works with or without a trailing slash.
# Adjustments for some columns to ensure correct data types: shape_id and
# stop_headsign are read as strings instead of letting pandas infer a type.
agency          = pd.read_csv(os.path.join(files_path, "agency.txt"))
stops           = pd.read_csv(os.path.join(files_path, "stops.txt"))
routes          = pd.read_csv(os.path.join(files_path, "routes.txt"))
trips           = pd.read_csv(os.path.join(files_path, "trips.txt"), dtype={'shape_id': str})
stop_times      = pd.read_csv(os.path.join(files_path, "stop_times.txt"), dtype={'stop_headsign': str})
calendar_dates  = pd.read_csv(os.path.join(files_path, "calendar_dates.txt"))
shapes          = pd.read_csv(os.path.join(files_path, "shapes.txt"), dtype={'shape_id': str})
fare_attributes = pd.read_csv(os.path.join(files_path, "fare_attributes.txt"))
fare_rules      = pd.read_csv(os.path.join(files_path, "fare_rules.txt"))
transfers       = pd.read_csv(os.path.join(files_path, "transfers.txt"))
# NOTE(review): "stop_amentities.txt" is presumably the spelling used by the feed
# itself (the SQL type mapping uses the same key) -- confirm before "fixing" it.
stop_amenities  = pd.read_csv(os.path.join(files_path, "stop_amentities.txt"))
feed_info       = pd.read_csv(os.path.join(files_path, "feed_info.txt"))

In [65]:
# Prepare the date columns for SQL.
# Each listed column is parsed with the YYYYMMDD format and then reduced to
# plain datetime.date values (via .dt.date), replacing the column in place.
date_columns = [
    (calendar_dates, 'date'),
    (feed_info, 'feed_start_date'),
    (feed_info, 'feed_end_date'),
]
for frame, column in date_columns:
    frame[column] = pd.to_datetime(frame[column], format='%Y%m%d').dt.date

In [18]:
# Set up the database connection.
# The password is never stored in the notebook: it is taken from the
# GTFS_DB_PASSWORD environment variable, or prompted for when that is unset.
# The remaining settings can be overridden through GTFS_DB_* variables and
# default to the values this notebook was written against.
# URL.create escapes special characters in the credentials, which a
# hand-formatted connection string would not.
db_url = URL.create(
    drivername="mysql+pymysql",
    username=os.environ.get("GTFS_DB_USER", "root"),
    password=os.environ.get("GTFS_DB_PASSWORD") or getpass("MySQL password: "),
    host=os.environ.get("GTFS_DB_HOST", "localhost"),
    port=int(os.environ.get("GTFS_DB_PORT", "3306")),
    database=os.environ.get("GTFS_DB_NAME", "gtfs_db"),
    query={"charset": "utf8mb4"},
)
engine = create_engine(db_url)

In [67]:
# SQL column types for every GTFS table.
# Each key is a GTFS text file name; each value is a pair of
# (target table name, {column name: SQLAlchemy type}) that is handed to
# DataFrame.to_sql through its `dtype` argument.
gtfs_mapping = {
    # Transit agency details.
    'agency.txt': (
        'agency',
        {
            'agency_id': VARCHAR(50),
            'agency_name': VARCHAR(255),
            'agency_url': VARCHAR(255),
            'agency_timezone': VARCHAR(50),
            'agency_lang': VARCHAR(50),
            'agency_phone': VARCHAR(50),
            'agency_fare_url': VARCHAR(255),
        },
    ),

    # Stops and stations.
    'stops.txt': (
        'stops',
        {
            'stop_id': VARCHAR(50),
            'stop_name': VARCHAR(255),
            'stop_lat': DOUBLE,
            'stop_lon': DOUBLE,
            'zone_id': INTEGER,
            'stop_url': VARCHAR(255),
            'location_type': TINYINT,
            'parent_station': VARCHAR(50),
            'wheelchair_boarding': TINYINT,
            'stop_code': DOUBLE,
        },
    ),

    # Routes.
    'routes.txt': (
        'routes',
        {
            'route_id': VARCHAR(50),
            'agency_id': VARCHAR(50),
            'route_short_name': VARCHAR(50),
            'route_long_name': VARCHAR(255),
            'route_type': TINYINT,
            'route_color': VARCHAR(10),
            'route_text_color': VARCHAR(10),
        },
    ),

    # Trips.
    'trips.txt': (
        'trips',
        {
            'route_id': VARCHAR(50),
            'service_id': VARCHAR(50),
            'trip_id': VARCHAR(50),
            'trip_headsign': VARCHAR(255),
            'trip_short_name': VARCHAR(50),
            'direction_id': TINYINT,
            'block_id': VARCHAR(50),
            'shape_id': VARCHAR(50),
            'wheelchair_accessible': TINYINT,
            'bikes_allowed': TINYINT,
            'route_variant': VARCHAR(50),
        },
    ),

    # Stop times.
    'stop_times.txt': (
        'stop_times',
        {
            'trip_id': VARCHAR(50),
            'arrival_time': TIME,
            'departure_time': TIME,
            'stop_id': VARCHAR(50),
            'stop_sequence': INTEGER,
            'pickup_type': TINYINT,
            'drop_off_type': TINYINT,
            'stop_headsign': VARCHAR(255),
        },
    ),

    # Calendar dates.
    'calendar_dates.txt': (
        'calendar_dates',
        {
            'service_id': VARCHAR(50),
            'date': DATE,
            'exception_type': TINYINT,
        },
    ),

    # Shape points.
    'shapes.txt': (
        'shapes',
        {
            'shape_id': VARCHAR(50),
            'shape_pt_lat': DOUBLE,
            'shape_pt_lon': DOUBLE,
            'shape_pt_sequence': INTEGER,
        },
    ),

    # Fare attributes.
    'fare_attributes.txt': (
        'fare_attributes',
        {
            'fare_id': VARCHAR(50),
            'price': DOUBLE,
            'currency_type': VARCHAR(3),
            'payment_method': TINYINT,
            'transfers': TINYINT,
        },
    ),

    # Fare rules.
    'fare_rules.txt': (
        'fare_rules',
        {
            'fare_id': VARCHAR(50),
            'origin_id': INTEGER,
            'destination_id': INTEGER,
        },
    ),

    # Transfers between stops.
    'transfers.txt': (
        'transfers',
        {
            'from_stop_id': VARCHAR(50),
            'to_stop_id': VARCHAR(50),
            'transfer_type': TINYINT,
            'min_transfer_time': INTEGER,
        },
    ),

    # Stop amenities (the file-name key keeps its "amentities" spelling,
    # while the table is named stop_amenities).
    'stop_amentities.txt': (
        'stop_amenities',
        {
            'stop_id': VARCHAR(50),
            'shelter': TINYINT,
            'washroom': TINYINT,
            'bike_rack': TINYINT,
            'bench': TINYINT,
        },
    ),

    # Feed metadata.
    'feed_info.txt': (
        'feed_info',
        {
            'feed_publisher_name': VARCHAR(255),
            'feed_publisher_url': VARCHAR(255),
            'feed_lang': VARCHAR(50),
            'feed_start_date': DATE,
            'feed_end_date': DATE,
            'feed_version': VARCHAR(50),
        },
    ),
}

In [None]:
# Now we push the data to the database.
# Each loaded DataFrame is paired with its GTFS file name so that the table
# name and the column types both come from gtfs_mapping instead of being
# repeated by hand for every table. The dict order is the insertion order.
gtfs_frames = {
    'agency.txt': agency,
    'stops.txt': stops,
    'routes.txt': routes,
    'trips.txt': trips,
    'stop_times.txt': stop_times,
    'calendar_dates.txt': calendar_dates,
    'shapes.txt': shapes,
    'fare_attributes.txt': fare_attributes,
    'fare_rules.txt': fare_rules,
    'transfers.txt': transfers,
    'stop_amentities.txt': stop_amenities,
    'feed_info.txt': feed_info,
}

try:
    for file_name, frame in gtfs_frames.items():
        table_name, column_types = gtfs_mapping[file_name]
        # if_exists='append' adds rows to an existing table, so running this
        # cell a second time inserts the same rows again.
        frame.to_sql(name=table_name, con=engine, if_exists='append', index=False, dtype=column_types)
finally:
    # Close the database connection, even when one of the inserts fails.
    engine.dispose()

# Only reached when every table was written without an exception.
print("Data has been successfully imported into the database.")

Data has been successfully imported into the database.


In [70]:
# Sanity check: (rows, columns) of the trips DataFrame loaded from trips.txt.
trips.shape

(69634, 11)