In [6]:
import pandas as pd
import re

from datetime import datetime
import os
import deltalake
from deltalake import DeltaTable
from deltalake.writer import write_deltalake
import duckdb
current_folder_path = os.path.abspath('.')

In [2]:
current_folder_path+ '\\data\\as_of\\'

'c:\\Users\\AzwanDesktop\\OneDrive\\Personal Data Projects\\DataGovMy\\data\\as_of\\'

In [2]:
def load_railway() -> pd.DataFrame:
    filters = [("frequency", "==", "daily")
            ]
    df = pd.read_parquet('https://storage.data.gov.my/dashboards/prasarana_timeseries.parquet', filters=filters)
    
    df['date'] = pd.to_datetime(df['date'], format="%Y-%B-%d")
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()

    df['station_code_origin'] = [re.search(r'(\w+):', x).group(1) for x in df['origin']]
    df['station_code_destination'] = [re.search(r'(\w+):', x).group(1) for x in df['destination']]
    df['station_name_origin'] = [re.search(r':\s*([\w\s]+)', x).group(1) + ' (origin)' for x in df['origin']]
    df['station_name_destination'] = [re.search(r':\s*([\w\s]+)', x).group(1) + ' (destination)' for x in df['destination']]

    df['line_origin'] = [re.search(r'(\D+)\d+',x).group(1) for x in df['station_code_origin']]
    df['line_destination'] = [re.search(r'(\D+)\d+',x).group(1) for x in df['station_code_destination']]

    df = df.loc[(df.line_origin != 'A') & (df.line_destination != 'A')].reset_index(drop=True)

    line_mapper_dict = {'A':'All Station', 
                        'AG':'LRT Ampang Line', 
                        'BRT':'Bus Rapid Transit', 
                        'KG':'MRT Kajang Line', 
                        'KJ':'LRT Kajang Line', 
                        'MR':'Monorail', 
                        'PYL':'MRT Putrajaya Line', 
                        'SP':'LRT Sri Petaling Line'}

    df['line_name_origin'] = df['line_origin'].map(line_mapper_dict)
    df['line_name_destination'] = df['line_destination'].map(line_mapper_dict)

    str_cols = [x for x in df.columns if x not in ('date', 'passengers')]
    df[str_cols]= df[str_cols].astype('string')
    print(f"Data contains {df.shape[0]} rows")
    return df

In [4]:
df = load_railway()
# write_deltalake(current_folder_path + '\\data\\ridership\\', df, mode='overwrite')
df.to_parquet('./data/railway/ridership/passengers_daily.parquet')


Data contains 1037172 rows


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1037172 entries, 0 to 1037171
Data columns (total 18 columns):
 #   Column                    Non-Null Count    Dtype         
---  ------                    --------------    -----         
 0   service                   1037172 non-null  string        
 1   frequency                 1037172 non-null  string        
 2   origin                    1037172 non-null  string        
 3   destination               1037172 non-null  string        
 4   date                      1037172 non-null  datetime64[ns]
 5   passengers                1037172 non-null  int64         
 6   year                      1037172 non-null  string        
 7   month                     1037172 non-null  string        
 8   day                       1037172 non-null  string        
 9   day_name                  1037172 non-null  string        
 10  station_code_origin       1037172 non-null  string        
 11  station_code_destination  1037172 non-null  string

In [25]:
def store_load_date(df:pd.DataFrame):
    data_as_of = df.date.max().date().isoformat()
    df_data_as_of = pd.DataFrame({'load_date':[data_as_of]})
    print('Data as of:',data_as_of)
    try: 
        pd.read_parquet("./data/as_of/").load_date.max()
        if pd.read_parquet("./data/as_of/").load_date.max() <= data_as_of:
            pass
        else:
            write_deltalake(current_folder_path + '\\data\\as_of\\', df_data_as_of, mode='append')
    except:
        write_deltalake(current_folder_path + '\\data\\as_of\\', df_data_as_of, mode='append')

store_load_date(df)

Data as of: 2024-01-25


In [41]:
df.sample(10)

Unnamed: 0,service,frequency,origin,destination,date,passengers,year,month,day,day_name,station_code_origin,station_code_destination,station_name_origin,station_name_destination,line_origin,line_destination,line_name_origin,line_name_destination
352498,rail,daily,KG34: Stadium Kajang,KJ27: Cgc Glenmarie,2023-12-10,0,2023,12,10,Sunday,KG34,KJ27,Stadium Kajang (origin),Cgc Glenmarie (destination),KG,KJ,MRT Kajang Line,LRT Kajang Line
756322,rail,daily,PYL13: Kampung Batu,SP12: Cheras,2024-01-15,5,2024,1,15,Monday,PYL13,SP12,Kampung Batu (origin),Cheras (destination),PYL,SP,MRT Putrajaya Line,LRT Sri Petaling Line
465703,rail,daily,KJ16: Bank Rakyat Bangsar,KG06: Kota Damansara,2023-12-13,29,2023,12,13,Wednesday,KJ16,KG06,Bank Rakyat Bangsar (origin),Kota Damansara (destination),KJ,KG,LRT Kajang Line,MRT Kajang Line
462609,rail,daily,KJ15: KL Sentral,MR03: Maharajalela,2024-01-23,0,2024,1,23,Tuesday,KJ15,MR03,KL Sentral (origin),Maharajalela (destination),KJ,MR,LRT Kajang Line,Monorail
475382,rail,daily,KJ17: Abdullah Hukum,KJ24: Kelana Jaya,2023-12-02,95,2023,12,2,Saturday,KJ17,KJ24,Abdullah Hukum (origin),Kelana Jaya (destination),KJ,KJ,LRT Kajang Line,LRT Kajang Line
824637,rail,daily,PYL24: Chan Sow Lin,SP22: Kinrara,2023-12-18,3,2023,12,18,Monday,PYL24,SP22,Chan Sow Lin (origin),Kinrara (destination),PYL,SP,MRT Putrajaya Line,LRT Sri Petaling Line
934704,rail,daily,SP15: Bandar Tasik Selatan,AG06: Bandaraya,2023-12-18,6,2023,12,18,Monday,SP15,AG06,Bandar Tasik Selatan (origin),Bandaraya (destination),SP,AG,LRT Sri Petaling Line,LRT Ampang Line
931878,rail,daily,SP14: Bandar Tun Razak,KJ23: Taman Bahagia,2024-01-11,1,2024,1,11,Thursday,SP14,KJ23,Bandar Tun Razak (origin),Taman Bahagia (destination),SP,KJ,LRT Sri Petaling Line,LRT Kajang Line
251545,rail,daily,KG17: Merdeka,SP18: Sri Petaling,2023-12-04,3,2023,12,4,Monday,KG17,SP18,Merdeka (origin),Sri Petaling (destination),KG,SP,MRT Kajang Line,LRT Sri Petaling Line
593325,rail,daily,KJ34: USJ 21,SP22: Kinrara,2023-12-12,16,2023,12,12,Tuesday,KJ34,SP22,USJ 21 (origin),Kinrara (destination),KJ,SP,LRT Kajang Line,LRT Sri Petaling Line


## Grouping

In [7]:

df_grouped = df.groupby(['origin',
            'line_origin',
            'line_name_origin',
            'date',
            'year',
            'month',
            'day',
            'day_name',]).agg({'passengers':'sum'}).reset_index()

In [49]:
df_grouped.sample(10)

Unnamed: 0,origin,line_origin,line_name_origin,date,year,month,day,day_name,passengers
6208,PYL11: Jinjang,PYL,MRT Putrajaya Line,2024-01-21,2024,1,21,Sunday,670
2817,KG31: Bukit Dukung,KG,MRT Kajang Line,2023-12-24,2023,12,24,Sunday,1962
4397,KJ25: Lembah Subang,KJ,LRT Kajang Line,2023-12-08,2023,12,8,Friday,3701
7840,SP15: Bandar Tasik Selatan,SP,LRT Sri Petaling Line,2023-12-31,2023,12,31,Sunday,6496
3074,KJ01: Gombak,KJ,LRT Kajang Line,2024-01-22,2024,1,22,Monday,4355
8117,SP20: Muhibbah,SP,LRT Sri Petaling Line,2023-12-23,2023,12,23,Saturday,1096
7911,SP16: Sungai Besi,SP,LRT Sri Petaling Line,2024-01-14,2024,1,14,Sunday,534
6550,PYL17: Titiwangsa,PYL,MRT Putrajaya Line,2024-01-21,2024,1,21,Sunday,455
6862,PYL24: Chan Sow Lin,PYL,MRT Putrajaya Line,2023-12-22,2023,12,22,Friday,824
1354,BRT06: South Quay-USJ 1,BRT,Bus Rapid Transit,2024-01-12,2024,1,12,Friday,1565


In [50]:
df_grouped.to_parquet('./data/railway/ridership/daily/passengers_by_station.parquet', )

# Test

df

In [21]:
fname = [x.date().isoformat() for x in df_grouped['date'].unique()]
fname

['2023-12-06',
 '2023-12-07',
 '2023-12-08',
 '2023-12-09',
 '2023-12-10',
 '2023-12-11',
 '2023-12-12',
 '2023-12-13',
 '2023-12-14',
 '2023-12-15',
 '2023-12-16',
 '2023-12-17',
 '2023-12-18',
 '2023-12-19',
 '2023-12-20',
 '2023-12-21',
 '2023-12-22',
 '2023-12-23',
 '2023-12-24',
 '2023-12-25',
 '2023-12-26',
 '2023-12-27',
 '2023-12-28',
 '2023-12-29',
 '2023-12-30',
 '2023-12-31',
 '2024-01-01',
 '2024-01-02',
 '2024-01-03',
 '2024-01-04',
 '2024-01-05',
 '2024-01-06',
 '2024-01-07',
 '2024-01-08',
 '2024-01-09',
 '2024-01-10',
 '2024-01-11',
 '2024-01-12',
 '2024-01-13',
 '2024-01-14',
 '2024-01-15',
 '2024-01-16',
 '2024-01-17',
 '2024-01-18',
 '2024-01-19',
 '2024-01-20',
 '2024-01-21',
 '2024-01-22',
 '2024-01-23',
 '2024-01-24',
 '2024-01-25',
 '2024-01-26',
 '2024-01-27',
 '2024-01-28',
 '2024-01-29',
 '2024-01-30',
 '2024-01-31']

In [24]:
df_grouped['date'] = df_grouped['date'].astype('str')

In [25]:
df_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8664 entries, 0 to 8663
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   origin            8664 non-null   string
 1   line_origin       8664 non-null   string
 2   line_name_origin  8664 non-null   string
 3   date              8664 non-null   object
 4   year              8664 non-null   string
 5   month             8664 non-null   string
 6   day               8664 non-null   string
 7   day_name          8664 non-null   string
 8   passengers        8664 non-null   int64 
dtypes: int64(1), object(1), string(7)
memory usage: 609.3+ KB


In [22]:
for t in fname:
    temp_ = df_grouped.loc[df_grouped['date'] == t].reset_index(drop=True)
    file_name = f'data/railway/daily_agg/ridership_{t}.parquet'
    temp_.to_parquet(file_name)