In [1]:
# Parameters
# Month of PriceCatcher data to process, formatted YYYY-MM.
# The loading cell reads this name and falls back to a default when it is not defined.
date = "2022-02"


In [2]:
import pandas as pd

from datetime import datetime
from pathlib import Path, PurePosixPath

In [3]:
# Note: this lets us change where the code runs (my PC, the Docker environment, or the Airflow engine).
# In Docker the path follows Linux  - e.g. /usr/local/airflow/working_files/data
# On a PC it follows Windows        - e.g. C:/Users/Azwan/Folder/DOSM/airflow/workingfiles/data/....

# In Docker
# -----------
# If run manually from inside a notebook in Docker, the notebook runs from  /usr/local/airflow/working_files
# If run from the Airflow engine, it runs from the entrypoint.sh $AIRFLOW_HOME variable, which is /usr/local/airflow
# Workaround: keep everything before the first 'working_files' in the current directory,
# then re-append the data folder so both launch locations resolve to the same path.

# The split leaves a trailing '/' when the notebook runs from inside working_files;
# strip it so the result is never '/usr/local/airflow//working_files/data'.
project_root = Path().absolute().as_posix().split('working_files')[0].rstrip('/')
data_path = project_root + '/working_files/data'
print(data_path)

/usr/local/airflow/working_files/data


In [4]:
# Take the "date" parameter passed in from Airflow (dag_ETL_pricecatcher.py).
try:
    date_of_file = date
except NameError:
    # Only a missing parameter should trigger the fallback; a bare `except`
    # would also hide unrelated errors (including KeyboardInterrupt).
    date_of_file = '2022-01'  # Format: YYYY-MM
    
# Fetch the monthly PriceCatcher survey records from the DOSM storage endpoint.
survey_url = f'https://storage.data.gov.my/pricecatcher/pricecatcher_{date_of_file}.parquet'
df = pd.read_parquet(survey_url)

# Parse the date column into datetime values when the file provides one.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])

print('Number of rows loaded...', len(df))

# Premise lookup table (joined to the price records on premise_code below).
premise = pd.read_parquet('https://storage.data.gov.my/pricecatcher/lookup_premise.parquet')
print(len(premise))
# A bare `.head()` in the middle of a cell is discarded; display() makes the preview visible.
display(premise.head())

# Item lookup table (joined to the price records on item_code below).
items = pd.read_parquet('https://storage.data.gov.my/pricecatcher/lookup_item.parquet')
print(len(items))
display(items.head())

# Enrich every price record with its premise details first, then with its item details.
# Left joins keep all price records even when a lookup entry is missing.
merged_data_premise = pd.merge(df, premise, how='left', on='premise_code')
merged_data = pd.merge(merged_data_premise, items, how='left', on='item_code')

# Only get AEON Subang Jaya data (premise_code 3178) and stamp each row with the ETL run time.
# .assign() returns a new frame, so the timestamp column is not written onto a
# filtered slice of merged_data (which triggers SettingWithCopyWarning).
aeon_subang = (
    merged_data
    .query('premise_code == 3178')
    .assign(etl_time=datetime.now().isoformat())
)

# Cap the preview size so a month with fewer than 5 matching rows does not raise.
display(aeon_subang.sample(min(5, len(aeon_subang))))

Number of rows loaded... 1862495
2897


757


Unnamed: 0,date,premise_code,item_code,price,premise,address,premise_type,state,district,item,unit,item_group,item_category,etl_time
579687,2022-02-11,3178,114,5.4,AEON BIG SUBANG JAYA,"NO 3 JALAN SS16/1,47500 SUBANG JAYA, SELANGOR ...",Hypermarket,Selangor,Petaling,TOMATO,1kg,BARANGAN SEGAR,SAYUR-SAYURAN,2024-05-29T19:53:12.597947
1729033,2022-02-28,3178,1440,3.59,AEON BIG SUBANG JAYA,"NO 3 JALAN SS16/1,47500 SUBANG JAYA, SELANGOR ...",Hypermarket,Selangor,Petaling,BAWANG BESAR IMPORT (INDIA),1kg,BARANGAN KERING,BAWANG,2024-05-29T19:53:12.597947
1729023,2022-02-28,3178,1136,4.01,AEON BIG SUBANG JAYA,"NO 3 JALAN SS16/1,47500 SUBANG JAYA, SELANGOR ...",Hypermarket,Selangor,Petaling,KICAP MASIN ADABI,340ml,BARANGAN BERBUNGKUS,KICAP DAN SOS,2024-05-29T19:53:12.597947
760981,2022-02-14,3178,1610,6.35,AEON BIG SUBANG JAYA,"NO 3 JALAN SS16/1,47500 SUBANG JAYA, SELANGOR ...",Hypermarket,Selangor,Petaling,SANTAN KELAPA JENAMA M&S,500 ml,BARANGAN BERBUNGKUS,SANTAN (KOTAK),2024-05-29T19:53:12.597947
905756,2022-02-15,3178,64,16.9,AEON BIG SUBANG JAYA,"NO 3 JALAN SS16/1,47500 SUBANG JAYA, SELANGOR ...",Hypermarket,Selangor,Petaling,KEPALA IKAN MERAH,1kg,BARANGAN SEGAR,BAHAN LAUT,2024-05-29T19:53:12.597947


In [5]:
# Save the AEON Subang data to a parquet file
output_file = Path(data_path + '/price_catcher_raw/aeon.parquet')
# Create the target folder if it is missing (e.g. first run in a fresh environment),
# otherwise the write fails on the missing directory.
output_file.parent.mkdir(parents=True, exist_ok=True)
aeon_subang.to_parquet(output_file)