In [5]:
import pandas as pd
import requests
from glob import glob

from deltalake import DeltaTable
from deltalake.writer import write_deltalake
import snowflake.connector
from datetime import datetime
from pathlib import Path, PurePosixPath

In [8]:
# Resolve the data directory so this notebook runs unchanged from my PC, from the
# Docker environment, or from the Airflow engine.
#   Docker (Linux): e.g. /usr/local/airflow/working_files/data
#   PC (Windows):   e.g. C:/Users/Azwan/Folder/DOSM/airflow/workingfiles/data/....
#
# When run manually from a notebook inside Docker, the working directory is
#   /usr/local/airflow/working_files
# When run by the Airflow engine, it is the $AIRFLOW_HOME set in entrypoint.sh:
#   /usr/local/airflow
# Workaround: keep only the part of the working directory that precedes
# 'working_files' and append the data folder to it.
def resolve_data_path(cwd_posix):
    """Return the '<root>/working_files/data' directory for a working directory.

    Parameters
    ----------
    cwd_posix : str
        Working directory written with forward slashes.

    Returns
    -------
    str
        Everything in ``cwd_posix`` before the first 'working_files' (the whole
        string when it does not occur), followed by '/working_files/data'.
    """
    # When the working directory is inside working_files, the prefix ends with
    # '/'; strip it so the result does not contain a doubled separator.
    root = cwd_posix.split('working_files')[0].rstrip('/')
    return root + '/working_files/data'


data_path = resolve_data_path(Path().absolute().as_posix())
print(data_path)

/usr/local/airflow//working_files/data


In [18]:
# Month of PriceCatcher data to load. `date` is not defined in this cell; when the
# name does not exist, fall back to a default month.
# NOTE(review): presumably `date` is injected by the scheduled run - confirm.
try:
    date_of_file = date
except NameError:  # catch only the "not injected" case, not every possible error
    date_of_file = '2022-01'  #Format: YYYY-MM

PRICECATCHER_URL = 'https://storage.data.gov.my/pricecatcher'
AEON_SUBANG_PREMISE_CODE = 3178  # shown as "AEON BIG SUBANG JAYA" in the lookup data

# Monthly price records.
df = pd.read_parquet(f'{PRICECATCHER_URL}/pricecatcher_{date_of_file}.parquet')
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])
print('Number of rows loaded...',len(df))

# Lookup tables used to enrich the price records.
premise = pd.read_parquet(f'{PRICECATCHER_URL}/lookup_premise.parquet')
print(len(premise))

items = pd.read_parquet(f'{PRICECATCHER_URL}/lookup_item.parquet')
print(len(items))

# Left joins keep every price record even when a lookup row is missing.
merged_data_premise = df.merge(premise, how='left', on='premise_code')
merged_data = merged_data_premise.merge(items, how='left', on='item_code')

# Keep a single premise and stamp the rows with the load time. `.assign` builds a
# new frame, so nothing is written into a slice of `merged_data`.
aeon_subang = (
    merged_data
    .query(f'premise_code == {AEON_SUBANG_PREMISE_CODE}')
    .assign(etl_time=datetime.now().isoformat())
)

# Cap the sample size so a month with fewer than 5 matching rows does not raise.
display(aeon_subang.sample(min(5, len(aeon_subang))))

Number of rows loaded... 2454101
2897
757


Unnamed: 0,date,premise_code,item_code,price,premise,address,premise_type,state,district,item,unit,item_group,item_category,etl_time
45265,2022-01-01,3178,1142,6.9,AEON BIG SUBANG JAYA,"NO 3 JALAN SS16/1,47500 SUBANG JAYA, SELANGOR ...",Hypermarket,Selangor,Petaling,SARDIN CAP ADABI (SOS TOMATO DENGAN CILI),425 g,BARANGAN BERBUNGKUS,IKAN DALAM TIN,2024-05-26T19:02:56.147359
1786011,2022-01-24,3178,1116,5.5,AEON BIG SUBANG JAYA,"NO 3 JALAN SS16/1,47500 SUBANG JAYA, SELANGOR ...",Hypermarket,Selangor,Petaling,SOS TIRAM NONA,510 g,BARANGAN BERBUNGKUS,KICAP DAN SOS,2024-05-26T19:02:56.147359
1700653,2022-01-23,3178,109,3.88,AEON BIG SUBANG JAYA,"NO 3 JALAN SS16/1,47500 SUBANG JAYA, SELANGOR ...",Hypermarket,Selangor,Petaling,LOBAK MERAH,1kg,BARANGAN SEGAR,SAYUR-SAYURAN,2024-05-26T19:02:56.147359
519189,2022-01-05,3178,160,5.9,AEON BIG SUBANG JAYA,"NO 3 JALAN SS16/1,47500 SUBANG JAYA, SELANGOR ...",Hypermarket,Selangor,Petaling,UBI KENTANG RUSSET,1kg,BARANGAN KERING,UBI KENTANG,2024-05-26T19:02:56.147359
866433,2022-01-10,3178,1437,20.9,AEON BIG SUBANG JAYA,"NO 3 JALAN SS16/1,47500 SUBANG JAYA, SELANGOR ...",Hypermarket,Selangor,Petaling,IKAN SIAKAP (ANTARA 2 HINGGA 4 EKOR SEKILOGRAM),1kg,BARANGAN SEGAR,BAHAN LAUT,2024-05-26T19:02:56.147359


In [21]:
# Write the filtered frame to <data_path>/price_catcher_raw/aeon.parquet.
# The target folder is created first so the write does not fail when it is missing.
output_file = Path(data_path) / 'price_catcher_raw' / 'aeon.parquet'
output_file.parent.mkdir(parents=True, exist_ok=True)
aeon_subang.to_parquet(output_file)