# Dependencies

In [6]:
# pip install gtfs-realtime-bindings pandas requests psycopg2-binary sqlalchemy
import os
import time
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
import psycopg2
from google.protobuf.json_format import MessageToDict
from google.transit import gtfs_realtime_pb2
from requests import get
from sqlalchemy import create_engine

In [5]:
def request_api_rapidkl(category, watermark, timeout=30):
    """Fetch one vehicle-position snapshot from the Prasarana feed as a DataFrame.

    Parameters
    ----------
    category : str
        Value sent as the ``category`` query parameter (e.g. ``'rapid-bus-kl'``).
    watermark : Any
        Label appended to the status/error messages; callers pass the request time.
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` raises.

    Returns
    -------
    pandas.DataFrame
        One row per feed entity, flattened with ``pd.json_normalize``. Empty when
        the API answers with an error status or the feed contains no entities.
    """
    URL = 'https://api.data.gov.my/gtfs-realtime/vehicle-position/prasarana'

    # Let requests build the query string so `category` is URL-encoded, and
    # bound the wait so a stalled connection cannot hang the notebook.
    response = get(URL, params={'category': category}, timeout=timeout)
    if not response.ok:
        # An HTTP error body is not a protobuf payload; report it instead of parsing it.
        print(f'ERROR: HTTP {response.status_code} returned by API - {watermark}')
        return pd.DataFrame()

    # Parse the GTFS Realtime feed
    feed = gtfs_realtime_pb2.FeedMessage()
    feed.ParseFromString(response.content)

    # Flatten each entity's vehicle message into one row
    vehicle_positions = [MessageToDict(entity.vehicle) for entity in feed.entity]
    df = pd.json_normalize(vehicle_positions)

    if df.empty:
        print(f'ERROR: Dataframe is empty - {watermark}')
    else:
        print(f'STATUS: Dataframe created - {watermark}')

    return df


def generate_rapidkl_data(category, requests_amt, interval_sec=30):
    """Poll the vehicle-position feed several times and stack the snapshots.

    Parameters
    ----------
    category : str
        Feed category forwarded to ``request_api_rapidkl``.
    requests_amt : int
        Number of snapshots to request.
    interval_sec : float, optional
        Pause between consecutive requests, in seconds. No pause is made after
        the final request.

    Returns
    -------
    pandas.DataFrame or None
        Concatenation of all non-empty snapshots (original per-snapshot index is
        kept), or ``None`` when every snapshot came back empty.
    """
    dfs = []
    for attempt in range(requests_amt):
        # Wait only *between* requests; sleeping after the last one is wasted time.
        if attempt:
            time.sleep(interval_sec)
        dfs.append(request_api_rapidkl(category, datetime.now()))

    if all(df.empty for df in dfs):
        print('ERROR: All dataframe(s) is empty. Failed to generate dataset')
        return None

    # Empty snapshots contribute no rows; leaving them out keeps pandas from
    # warning about empty entries when inferring the result dtypes.
    return pd.concat([df for df in dfs if not df.empty])
    
# Collect five snapshots of the 'rapid-bus-kl' feed into a single frame.
df_fetch = generate_rapidkl_data(category='rapid-bus-kl', requests_amt=5)

STATUS: Dataframe created - 2025-01-20 08:55:24.893962
STATUS: Dataframe created - 2025-01-20 08:55:55.241082
STATUS: Dataframe created - 2025-01-20 08:56:25.512366
STATUS: Dataframe created - 2025-01-20 08:56:55.797025
STATUS: Dataframe created - 2025-01-20 08:57:26.144785


In [48]:
def rename_col(df):
    """Return a new frame whose dotted feed column names are replaced by snake_case names.

    Columns that are not listed in the mapping keep their current name, and the
    input frame itself is not modified.
    """
    column_aliases = {
        'trip.tripId': 'trip_id',
        'trip.startTime': 'start_time',
        'trip.startDate': 'start_date',
        'trip.routeId': 'route_id',
        'position.latitude': 'latitude',
        'position.longitude': 'longitude',
        'position.bearing': 'bearing',
        'position.speed': 'speed',
        'vehicle.id': 'vehicle_id',
        'vehicle.licensePlate': 'license_plate',
    }
    return df.rename(columns=column_aliases)

def convert_unixtime_to_standard(unixtime):
    """Turn a Unix timestamp (int, float or numeric string) into a ``datetime``.

    The value is truncated to whole seconds with ``int()``. No ``tz`` argument
    is passed to ``datetime.fromtimestamp``, so the result is a naive datetime
    in the local timezone of the machine running the notebook.
    """
    epoch_seconds = int(unixtime)
    return datetime.fromtimestamp(epoch_seconds)

In [49]:
# Rename the feed columns, then convert the epoch-seconds `timestamp` column to
# datetimes. Mapping over the single column avoids building a Series for every
# row (which `DataFrame.apply(axis=1)` does) and keeps the column in place.
df_rapid = rename_col(df_fetch).assign(
    timestamp=lambda frame: frame['timestamp'].map(convert_unixtime_to_standard)
)
# Keep a flat-file copy of the cleaned snapshot next to the notebook.
df_rapid.to_csv('rapid-kl-bus.csv', index=False)

# Connecting DB

In [43]:
def connect_db():
    """Open a new psycopg2 connection to the PostgreSQL instance.

    Connection settings are read from the standard libpq environment variables
    (``PGHOST``, ``PGDATABASE``, ``PGUSER``, ``PGPORT``, ``PGPASSWORD``) so that
    real credentials never have to be written into the notebook. When a variable
    is unset, the local development default is used.

    Returns
    -------
    psycopg2 connection
        The caller is responsible for closing it.
    """
    conn = psycopg2.connect(
        host=os.environ.get('PGHOST', 'localhost'),
        database=os.environ.get('PGDATABASE', 'postgres'),
        user=os.environ.get('PGUSER', 'postgres'),
        port=os.environ.get('PGPORT', '54320'),
        password=os.environ.get('PGPASSWORD', 'postgres'))
    return conn

def query_db(query:str):
    """
    Use for CREATE, INSERT syntax

    Opens a fresh connection, executes ``query`` and commits it. If execution
    or commit fails, the transaction is rolled back and the error is printed
    (it is not re-raised, so the notebook keeps running). The cursor and the
    connection are always closed, whether or not the statement succeeded.
    """
    conn = connect_db()
    try:
        cur = conn.cursor()
        try:
            cur.execute(query)
            print('syntax executed')
            conn.commit()
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still propagates.
            conn.rollback()
            print(f'ERROR: query failed and was rolled back - {exc}')
        finally:
            cur.close()
    finally:
        # Closing the cursor alone leaves the server connection open.
        conn.close()
        print('connection close')


def connect_db_v2():
    """Create a SQLAlchemy engine (psycopg2 driver) for the PostgreSQL instance.

    Settings come from the standard libpq environment variables (``PGHOST``,
    ``PGPORT``, ``PGDATABASE``, ``PGUSER``, ``PGPASSWORD``), falling back to the
    local development defaults when a variable is unset.

    Returns
    -------
    sqlalchemy Engine
        A new engine on every call; dispose of it when it is no longer needed.
    """
    host = os.environ.get('PGHOST', 'localhost')
    port = os.environ.get('PGPORT', '54320')
    database = os.environ.get('PGDATABASE', 'postgres')
    # User and password may contain characters such as '@' or ':' that would
    # otherwise break URL parsing, so they are percent-encoded.
    user = quote_plus(os.environ.get('PGUSER', 'postgres'))
    password = quote_plus(os.environ.get('PGPASSWORD', 'postgres'))

    engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}')
    return engine

def fetch_db(query:str):
    """
    Use for SELECT syntax

    Runs ``query`` through a freshly created engine and returns the result set
    as a pandas DataFrame.
    """
    engine = connect_db_v2()
    try:
        return pd.read_sql_query(query, con=engine)
    finally:
        # connect_db_v2() builds a new engine (and connection pool) per call;
        # dispose of it so repeated calls do not pile up open connections.
        engine.dispose()

In [8]:
# Display every row of `dim_drivers`. The table name carries no schema prefix,
# so which schema is read depends on the connection's search_path
# (NOTE(review): presumably `public` here — confirm).
fetch_db('SELECT * FROM dim_drivers')

Unnamed: 0,driver_id,plate_num
0,driver_0000,WVE5137
1,driver_0001,WVL602
2,driver_0002,WUW1483
3,driver_0003,VGG9462
4,driver_0004,WVB6987
...,...,...
205,driver_0205,SF2113398
206,driver_0206,VEN1373
207,driver_0207,WVE5068
208,driver_0208,WPC3856


In [44]:
# TESTING: manual smoke test for query_db() / fetch_db(); left commented out.
# query_db("""
#             CREATE TABLE ammar_test (
#             name TEXT, 
#             PRIMARY KEY (name)
#         );
#         """)
# query_db("""
#         INSERT INTO ammar_test 
#             (name) 
#             VALUES ('jack')
#         ;
#         """)
# fetch_db("SELECT * FROM ammar_test")

syntax executed
connection close
syntax executed
connection close


Unnamed: 0,name
0,jack


# Database Schema

In [59]:
# The cells below write to the `rapidkl` schema, so it has to exist before they
# run on a fresh database. IF NOT EXISTS keeps this cell safe to re-run.
query_db('CREATE SCHEMA IF NOT EXISTS rapidkl;')

# fact_daily_trip

In [56]:
# `engine` is created once here and reused by the dim_drivers / dim_busses cells.
engine = connect_db_v2()

# Replace rapidkl.fact_daily_trip with the current snapshot. The DataFrame index
# is written as well (to_sql default), which is why an `index` column shows up
# when the table is read back.
df_rapid.to_sql(
    name='fact_daily_trip',
    con=engine,
    schema='rapidkl',
    if_exists='replace',
)

402

In [62]:
# Read the table back from the `rapidkl` schema it was written to above; an
# unqualified name would be resolved through search_path and may hit a
# different (or missing) table.
fetch_db('SELECT * FROM rapidkl.fact_daily_trip')

Unnamed: 0,index,timestamp,trip_id,start_time,start_date,route_id,latitude,longitude,bearing,speed,vehicle_id,license_plate
0,0,2025-01-20 08:54:31,weekday_U1700_U170002_3,08:46:31,20250120,U1700,3.254817,101.693990,0.0,0.00,WVD4971,WVD4971
1,1,2025-01-20 08:54:28,weekday_P0010_P001002_3,08:47:58,20250120,P0010,3.084800,101.627830,92.0,59.26,WA3714M,WA3714M
2,2,2025-01-20 08:54:37,weekday_U6400_U640001_3,08:20:06,20250120,U6400,3.081413,101.666470,279.5,27.22,WPA5621,WPA5621
3,3,2025-01-20 08:54:49,weekday_U3030_U303002_1,08:31:28,20250120,U3030,3.159460,101.744630,278.6,5.00,WVJ8197,WVJ8197
4,4,2025-01-20 08:53:57,weekday_U6520_U652002_0,07:40:52,20250120,U6520,3.057708,101.688690,268.6,5.74,WPC8505,WPC8505
...,...,...,...,...,...,...,...,...,...,...,...,...
1397,271,2025-01-20 08:56:26,weekday_U5900_U590002_2,07:41:27,20250120,U5900,3.114600,101.706500,44.0,24.08,WPV6941,WPV6941
1398,272,2025-01-20 08:56:47,weekday_U2000_U200002_0,08:11:37,20250120,U2000,3.199194,101.703770,168.2,37.41,WVP2546,WVP2546
1399,273,2025-01-20 08:56:27,weekday_U7720_U772001_1,08:04:04,20250120,U7720,3.145340,101.538445,246.9,39.26,WVH7906,WVH7906
1400,274,2025-01-20 08:56:31,weekday_U3030_U303002_1,08:01:01,20250120,U3030,3.157180,101.706170,354.8,0.00,WVL574,WVL574


# dim_drivers

In [83]:
# Build one synthetic driver per distinct licence plate found in the trip table:
# ids run 1..N and names are 'driver_' plus the id zero-padded to five digits.
df_trip = fetch_db('SELECT * FROM rapidkl.fact_daily_trip')
bus_plates = df_trip['license_plate'].unique()
driver_names = [
    f'driver_{str(driver_no).zfill(5)}'
    for driver_no in range(1, len(bus_plates) + 1)
]
df_drivers = pd.DataFrame({
    'driver_id': list(range(1, len(bus_plates) + 1)),
    'driver_name': driver_names,
})
# Relies on the `engine` created in the fact_daily_trip cell.
df_drivers.to_sql('dim_drivers', con=engine, schema='rapidkl', if_exists='replace', index=False)

298

In [84]:
# Read rapidkl.dim_drivers back to check what was just written.
fetch_db("SELECT * FROM rapidkl.dim_drivers")

Unnamed: 0,driver_id,driver_name
0,1,driver_00001
1,2,driver_00002
2,3,driver_00003
3,4,driver_00004
4,5,driver_00005
...,...,...
293,294,driver_00294
294,295,driver_00295
295,296,driver_00296
296,297,driver_00297


# dim_busses

In [87]:
# Build the bus dimension: distinct licence plates in sorted order, numbered 1..N.
df_trip = fetch_db('SELECT * FROM rapidkl.fact_daily_trip')
bus_plates = sorted(df_trip['license_plate'].unique())
bus_id = list(range(1, len(bus_plates) + 1))
df_bus = pd.DataFrame({
    'bus_id': bus_id,
    'bus_plates': bus_plates,
})
# Relies on the `engine` created in the fact_daily_trip cell.
df_bus.to_sql('dim_busses', con=engine, schema='rapidkl', if_exists='replace', index=False)

298

In [88]:
# Read rapidkl.dim_busses back to check what was just written.
fetch_db("SELECT * FROM rapidkl.dim_busses")

Unnamed: 0,bus_id,bus_plates
0,1,BNG4014
1,2,CDH8296
2,3,CDH8332
3,4,PJK1473
4,5,PLA2875
...,...,...
293,294,WWC4592
294,295,WWC4624
295,296,WWC4681
296,297,WWC6423
