In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os
import glob
import re
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
import json

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ====== LOGGING ======
# Trial metadata plus the containers that accumulate settings for each pipeline stage.
trial_version = "0_17_2"  # identifier reused in the model directory, submission and log file names
description = "Try using Known Covariate: national v2 "
log_dataset_used = ""  # free-text dataset label; assigned in the data-loading cell
log_preprocessing_method = {}
log_feature_used = {}  # filled through logging(...) in the feature-selection cell
log_modeling = {}  # receives the AutoGluon config and the leaderboard results
log_after_inference_method = {}

def logging(log_type, key, value):
    """Store ``value`` under ``key`` in the log dictionary ``log_type``.

    The log dictionaries are dumped with ``json.dump`` at the end of the
    notebook, so pandas/NumPy containers and NumPy scalars are converted to
    plain Python objects first; every other value is stored unchanged.

    Args:
        log_type: Dictionary that collects the entries of one log section.
        key: Name of the entry.
        value: Value to record.

    Note:
        The function name shadows the standard-library ``logging`` module
        should that module ever be imported in this notebook.
    """
    if isinstance(value, (pd.Index, pd.Series, np.ndarray)):
        # Array-likes are not JSON serialisable; lists of Python scalars are.
        value = value.tolist()
    elif isinstance(value, np.generic):
        # NumPy scalars such as np.int64 are rejected by json.dump as well.
        value = value.item()
    log_type[key] = value

In [3]:
# Human-readable label of the dataset variant used for this trial.
log_dataset_used = "Time Series Transformed V1 + Ramadhan"

def to_snake_case(text):
    """Return ``text`` normalised to snake_case.

    Surrounding whitespace is stripped, 'DKI Jakarta' and 'DI Yogyakarta' are
    mapped to fixed spellings, whitespace runs become underscores, camelCase
    boundaries get an underscore, every character other than letters, digits
    and underscores is removed, repeated underscores are collapsed and the
    result is lower-cased.
    """
    cleaned = text.strip()

    # Province names that receive a fixed spelling before the generic rules run.
    for original, replacement in (
        ("DKI Jakarta", "dki_jakarta"),
        ("DI Yogyakarta", "di_yogyakarta"),
    ):
        cleaned = cleaned.replace(original, replacement)

    # Regex rewrites; the order matters and mirrors the description above.
    substitutions = (
        (r'\s+', '_'),                  # whitespace runs -> single underscore
        (r'([a-z])([A-Z])', r'\1_\2'),  # split camelCase boundaries
        (r'[^a-zA-Z0-9_]', ''),         # drop special characters
        (r'__+', '_'),                  # collapse repeated underscores
    )
    for pattern, repl in substitutions:
        cleaned = re.sub(pattern, repl, cleaned)

    return cleaned.lower()

def load_csv_from_directory(folder_path):
    """Load every CSV below ``folder_path`` into one DataFrame.

    The directory is searched recursively. For each file the column names are
    converted with ``to_snake_case`` and a ``bahan_pangan`` column holding the
    file name without its extension is added. Files are processed in sorted
    path order so the row order of the result is reproducible between runs.

    Args:
        folder_path: Directory to search for ``*.csv`` files.

    Returns:
        The concatenation of all successfully loaded files (fresh integer
        index), or an empty DataFrame when no file was found or none could be
        loaded. Files that fail to load are reported and skipped.
    """
    # glob returns matches in arbitrary order; sort for a deterministic result.
    csv_files = sorted(glob.glob(os.path.join(folder_path, "**", "*.csv"), recursive=True))

    if not csv_files:
        print(f"No CSV files found in {folder_path}")
        return pd.DataFrame()

    df_list = []
    for file in csv_files:
        try:
            df = pd.read_csv(file, low_memory=False)
            df.columns = [to_snake_case(col) for col in df.columns]  # Convert to snake_case
            df["bahan_pangan"] = os.path.splitext(os.path.basename(file))[0]  # Add filename without .csv
            df_list.append(df)
        except Exception as e:
            # Best effort: report the broken file and continue with the rest.
            print(f"Error loading {file}: {e}")

    final_df = pd.concat(df_list, ignore_index=True) if df_list else pd.DataFrame()

    # Report the number of files that actually made it into the result.
    print(f"Loaded {len(df_list)} files from {folder_path}, total rows: {len(final_df)}")
    skipped = len(csv_files) - len(df_list)
    if skipped:
        print(f"Skipped {skipped} of {len(csv_files)} files because of load errors")
    return final_df

def transform_to_timeseries(df):
    """Reshape a wide price table into long format.

    Every column other than ``date`` and ``bahan_pangan`` is unpivoted, giving
    the columns ['date', 'bahan_pangan', 'province', 'value']. Missing values
    are kept; no rows are dropped here.
    """
    return pd.melt(
        df,
        id_vars=["date", "bahan_pangan"],
        var_name="province",
        value_name="value",
    )


# Read each split from disk and reshape it straight into long format
# (date, bahan_pangan, province, value).
df_train = transform_to_timeseries(
    load_csv_from_directory("../data/raw/harga_bahan_pangan/train/")
)
df_test = transform_to_timeseries(
    load_csv_from_directory("../data/raw/harga_bahan_pangan/test/")
)


Loaded 13 files from ../data/raw/harga_bahan_pangan/train/, total rows: 13052
Loaded 13 files from ../data/raw/harga_bahan_pangan/test/, total rows: 1196


Create Ramadhan DataFrame

## Feature Selection

In [4]:
# Record the columns present at this point as the feature list of the trial.
# NOTE(review): this runs before `national` is merged into df_train, so the
# logged list does not contain the known covariate.
logging(log_feature_used, "feature", df_train.columns)

## Modeling

In [5]:
df_train.head()

Unnamed: 0,date,bahan_pangan,province,value
0,2022-01-01,bawang_merah,aceh,28970.0
1,2022-01-02,bawang_merah,aceh,29900.0
2,2022-01-03,bawang_merah,aceh,28970.0
3,2022-01-04,bawang_merah,aceh,29600.0
4,2022-01-05,bawang_merah,aceh,29540.0


In [None]:
# National-level price series (v2); merged into train/test below as the `national` known covariate.
df_national = pd.read_csv("../data/interim/national/national_v2.csv")
# Island-level series; apart from date parsing it is only used by the commented-out island merge.
df_islands = pd.read_csv("../data/interim/national/islands.csv")

In [7]:
df_national

Unnamed: 0,date,bahan_pangan,value
0,2022-01-01,bawang_merah,29203.235294
1,2022-01-02,bawang_merah,29352.647059
2,2022-01-03,bawang_merah,29137.058824
3,2022-01-04,bawang_merah,29277.647059
4,2022-01-05,bawang_merah,29241.470588
...,...,...,...
13691,2024-12-27 00:00:00,tepung_terigu_curah,10254.160118
13692,2024-12-28 00:00:00,tepung_terigu_curah,10252.664814
13693,2024-12-29 00:00:00,tepung_terigu_curah,10250.136340
13694,2024-12-30 00:00:00,tepung_terigu_curah,10249.479330


In [8]:
# Parse every date column to datetime so filters and merge keys compare consistently.
df_train['date'] = pd.to_datetime(df_train['date'])
# The national file mixes 'YYYY-MM-DD' and 'YYYY-MM-DD HH:MM:SS' strings, hence format='mixed'.
# NOTE(review): errors='coerce' silently turns unparseable values into NaT; such rows
# cannot match any date in the later merge — consider checking for NaT here.
df_national['date'] = pd.to_datetime(df_national['date'], format='mixed', errors='coerce')
df_islands['date'] = pd.to_datetime(df_islands['date'])
df_test['date'] = pd.to_datetime(df_test['date'])

In [9]:
df_train.head(1)


Unnamed: 0,date,bahan_pangan,province,value
0,2022-01-01,bawang_merah,aceh,28970.0


In [10]:
df_national.head(1)

Unnamed: 0,date,bahan_pangan,value
0,2022-01-01,bawang_merah,29203.235294


In [11]:
# Rename the price column so it does not clash with the target column `value`
# when merged into train/test. `rename` ignores a missing source column, so
# re-running this cell is harmless (the assign-then-drop form raised KeyError).
df_national = df_national.rename(columns={'value': 'national'})
df_national.head(1)

Unnamed: 0,date,bahan_pangan,national
0,2022-01-01,bawang_merah,29203.235294


In [None]:
# Keep only observations from 2022-07-14 onwards.
# The same filter is applied again after the covariate merges.
df_train = df_train.loc[df_train['date'] >= '2022-07-14']
df_test = df_test.loc[df_test['date'] >= '2022-07-14']

In [12]:
# Attach the national price series to every training row (left join keeps all training rows).
df_train = df_train.merge(df_national, how='left', on=['date', 'bahan_pangan'])

In [13]:
# def assign_island(province):
#     """Assigns an island category based on the province in snake_case format."""
#     sumatra = {"aceh", "sumatera_utara", "sumatera_barat", "riau", "jambi", "bengkulu", 
#                "sumatera_selatan", "lampung", "kepulauan_bangka_belitung", "kepulauan_riau"}
#     java = {"banten", "dki_jakarta", "jawa_barat", "jawa_tengah", "di_yogyakarta", "jawa_timur"}
#     kalimantan = {"kalimantan_barat", "kalimantan_tengah", "kalimantan_selatan", 
#                   "kalimantan_timur", "kalimantan_utara"}
#     sulawesi = {"sulawesi_utara", "sulawesi_tengah", "sulawesi_selatan", 
#                 "sulawesi_tenggara", "gorontalo", "sulawesi_barat"}
#     bali_nusa = {"bali", "nusa_tenggara_barat", "nusa_tenggara_timur"}
#     maluku_papua = {"maluku", "maluku_utara", "papua", "papua_barat"}

#     # province = to_snake_case(province)  # Remove this if it's already in snake_case

#     if province in sumatra:
#         return "Sumatra"
#     elif province in java:
#         return "Java"
#     elif province in kalimantan:
#         return "Kalimantan"
#     elif province in sulawesi:
#         return "Sulawesi"
#     elif province in bali_nusa:
#         return "Bali_Nusa_Tenggara"
#     elif province in maluku_papua:
#         return "Maluku_Papua"
#     else:
#         return "Unknown" 


# df_train['island'] = df_train['province'].apply(assign_island)


# df_train = df_train.merge(df_islands, on=['date', 'bahan_pangan', 'island'], how='left', suffixes=('', '_island'))

# df_train.head()

In [14]:
# Attach the national price series to the test rows; it supplies the future
# values of the known covariate at prediction time.
# (Island-level covariates are not merged in this trial.)
df_test = df_test.merge(df_national, how='left', on=['date', 'bahan_pangan'])

In [None]:
# Keep only observations from 2022-07-14 onwards
# (repeats the filter that also appears before the merges).
df_train = df_train.loc[df_train['date'] >= '2022-07-14']
df_test = df_test.loc[df_test['date'] >= '2022-07-14']

In [16]:
df_test.head()

Unnamed: 0,date,bahan_pangan,province,value,national
0,2024-10-01,bawang_merah,aceh,0.0,29794.004801
1,2024-10-02,bawang_merah,aceh,0.0,29798.202374
2,2024-10-03,bawang_merah,aceh,0.0,29801.261095
3,2024-10-04,bawang_merah,aceh,0.0,29826.961882
4,2024-10-05,bawang_merah,aceh,0.0,29860.877676


In [17]:
# Build the series identifier "<bahan_pangan>-<province>" on both splits;
# it becomes the item id of the time series frames.
for frame in (df_train, df_test):
    frame["bahan_pangan-province"] = frame["bahan_pangan"] + "-" + frame["province"]

In [18]:
# The combined identifier replaces its two source columns.
df_train = df_train.drop(columns=['bahan_pangan', 'province'])
df_test = df_test.drop(columns=['bahan_pangan', 'province'])

In [19]:
# Safeguard: both columns were already parsed to datetime in an earlier cell,
# so this should leave them unchanged.
df_train['date'] = pd.to_datetime(df_train['date'])
df_test['date'] = pd.to_datetime(df_test['date'])

In [20]:
# df_train = pd.merge(df_train, df_known, how='left', on='date')

In [21]:
df_train.head()

Unnamed: 0,date,value,national,bahan_pangan-province
194,2022-07-14,57580.0,63597.666667,bawang_merah-aceh
195,2022-07-15,57280.0,62530.588235,bawang_merah-aceh
196,2022-07-16,55800.0,62221.470588,bawang_merah-aceh
197,2022-07-17,55630.0,62194.705882,bawang_merah-aceh
198,2022-07-18,55860.0,61938.235294,bawang_merah-aceh


In [22]:
# Wrap the long-format training frame for AutoGluon: one series per
# "<bahan_pangan>-<province>" identifier, indexed by date. The remaining
# columns (`value`, `national`) travel along as target and covariate.
train_data = TimeSeriesDataFrame.from_data_frame(
    df_train,
    id_column="bahan_pangan-province",
    timestamp_column="date",   
)

In [23]:
# Settings shared by the predictor constructor and fit(); recorded in the
# modeling log so the trial can be reproduced.
autogluon_config = dict(
    eval_metric="MAPE",
    freq="D",
    presets="high_quality",
    time_limit=30 * 60,  # seconds (30 minutes)
)

logging(log_modeling, "autogluon_config", autogluon_config)

In [24]:
df_train.head()

Unnamed: 0,date,value,national,bahan_pangan-province
194,2022-07-14,57580.0,63597.666667,bawang_merah-aceh
195,2022-07-15,57280.0,62530.588235,bawang_merah-aceh
196,2022-07-16,55800.0,62221.470588,bawang_merah-aceh
197,2022-07-17,55630.0,62194.705882,bawang_merah-aceh
198,2022-07-18,55860.0,61938.235294,bawang_merah-aceh


In [26]:
# Configure the forecaster. `national` is declared as a known covariate, so its
# future values must be supplied when predicting.
predictor = TimeSeriesPredictor(
    prediction_length=92,  # horizon in `freq` steps; presumably the length of the test window — confirm
    path=os.path.join("../models",trial_version),  # model artefacts are stored per trial version
    target="value",
    eval_metric=autogluon_config['eval_metric'],
    freq=autogluon_config['freq'],
    known_covariates_names=['national'] 
)

# Train with the configured preset and time budget; the seed is fixed for repeatability.
predictor.fit(
    train_data,
    presets=autogluon_config['presets'],
    time_limit=autogluon_config['time_limit'],
    random_seed = 123
)

Beginning AutoGluon training... Time limit = 1800s
AutoGluon will save models to '/home/rayhanadi/project/arkav-2025/models/0_17_1'
AutoGluon Version:  1.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Tue Nov 5 00:21:55 UTC 2024
CPU Count:          12
GPU Count:          1
Memory Avail:       3.17 GB / 7.45 GB (42.5%)
Disk Space Avail:   809.56 GB / 1006.85 GB (80.4%)
Setting presets to: high_quality

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': MAPE,
 'freq': 'D',
 'hyperparameters': 'default',
 'known_covariates_names': ['national'],
 'num_val_windows': 1,
 'prediction_length': 92,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'value',
 'time_limit': 1800,
 'verbosity': 2}

Provided train_data has 358020 rows (NaN fraction=0.6%), 442 time series. Median time series le

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7f81f3c610c0>

In [14]:
# Summary table of the trained models as returned by the predictor.
score = predictor.leaderboard()

# Extract model names and validation scores as plain lists so they can be written to the JSON log.
model_val = score['model'].tolist()
score_val = score['score_val'].tolist()

logging(log_modeling, "model_name", model_val)
logging(log_modeling, "val_score", score_val)

### Predict

In [43]:
df_test.head()

Unnamed: 0,date,value,national,value_island,bahan_pangan-province
0,2024-10-01,0.0,29794.004801,27317.660336,bawang_merah-aceh
1,2024-10-02,0.0,29798.202374,27280.240811,bawang_merah-aceh
2,2024-10-03,0.0,29801.261095,27351.69081,bawang_merah-aceh
3,2024-10-04,0.0,29826.961882,27479.128834,bawang_merah-aceh
4,2024-10-05,0.0,29860.877676,27646.01387,bawang_merah-aceh


In [None]:
# df_test = pd.merge(df_test, df_known, how='left', on='date')

In [27]:
# Remove the target column from the test frame so that only the covariate
# and the identifier/timestamp columns remain.
df_test = df_test.drop(columns=['value'])

In [28]:
# Wrap the test frame the same way as the training frame; it is passed to
# predict() as the future values of the known covariate.
test_data = TimeSeriesDataFrame.from_data_frame(
    df_test,
    id_column="bahan_pangan-province",
    timestamp_column="date",
    
)

In [29]:
# Forecast the horizon that follows the end of each training series;
# `test_data` provides the `national` covariate for those future dates.
predictions = predictor.predict(train_data, known_covariates=test_data)

Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [30]:
predictions

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
bawang_merah-aceh,2024-10-01,28252.767852,25981.453576,26806.535936,27371.092966,27841.971053,28252.767852,28684.183013,29147.463665,29683.826697,30499.506087
bawang_merah-aceh,2024-10-02,28222.048452,25218.917409,26288.219397,27028.271611,27651.170379,28222.048452,28775.431259,29403.152385,30138.127676,31199.144404
bawang_merah-aceh,2024-10-03,28164.862506,24582.202881,25852.125172,26733.157260,27480.498472,28164.862506,28855.715800,29600.860792,30481.774362,31770.952200
bawang_merah-aceh,2024-10-04,28192.391962,24139.451290,25558.718722,26589.588936,27425.945431,28192.391962,28977.074199,29828.362516,30831.448998,32293.837180
bawang_merah-aceh,2024-10-05,28251.268286,23744.301817,25316.360008,26436.178252,27377.667371,28251.268286,29107.699322,30034.726359,31169.068087,32774.579831
...,...,...,...,...,...,...,...,...,...,...,...
cabai_rawit_merah-sumatera_utara,2024-12-27,39895.915938,27160.872187,31887.902348,34988.440957,37584.871194,39895.915938,42234.164716,44738.222319,47876.988511,52453.537604
cabai_rawit_merah-sumatera_utara,2024-12-28,39815.051414,27166.214181,31733.120065,34851.987199,37511.771494,39815.051414,42243.467291,44740.250344,47950.745915,52725.243467
cabai_rawit_merah-sumatera_utara,2024-12-29,40099.578644,27168.047105,31879.132602,35058.627639,37744.037339,40099.578644,42546.965262,45183.027747,48296.967370,53156.933559
cabai_rawit_merah-sumatera_utara,2024-12-30,39887.575976,26900.039763,31695.057143,34816.620241,37448.727395,39887.575976,42252.460879,44897.291657,48063.702298,52830.993685


In [31]:
# Turn the (item_id, timestamp) index back into ordinary columns.
predictions = predictions.reset_index()

# Recover the two parts of the "<bahan_pangan>-<province>" identifier.
# The province comes from a column name cleaned by to_snake_case and therefore
# never contains "-", so splitting once from the right always yields exactly
# two parts, even if a commodity file name happens to contain a hyphen.
predictions[["bahan_pangan", "province"]] = predictions["item_id"].str.rsplit("-", n=1, expand=True)

# Only the point forecast is needed for the submission.
predictions = predictions[['timestamp', 'bahan_pangan', 'province', 'mean']]
predictions

Unnamed: 0,timestamp,bahan_pangan,province,mean
0,2024-10-01,bawang_merah,aceh,28252.767852
1,2024-10-02,bawang_merah,aceh,28222.048452
2,2024-10-03,bawang_merah,aceh,28164.862506
3,2024-10-04,bawang_merah,aceh,28192.391962
4,2024-10-05,bawang_merah,aceh,28251.268286
...,...,...,...,...
40659,2024-12-27,cabai_rawit_merah,sumatera_utara,39895.915938
40660,2024-12-28,cabai_rawit_merah,sumatera_utara,39815.051414
40661,2024-12-29,cabai_rawit_merah,sumatera_utara,40099.578644
40662,2024-12-30,cabai_rawit_merah,sumatera_utara,39887.575976


In [32]:
submission = pd.read_csv("../data/raw/harga_bahan_pangan/sample_submission.csv")

# The id has the form "<bahan pangan>/<province>/<date>".
submission[["bahan_pangan", "province", "date"]] = submission["id"].str.split("/", expand=True)

# Normalise both keys with the same helper that cleaned the province column
# names of the price tables, so the merge keys are built by one rule. The
# helper also strips parentheses, e.g. "Tepung Terigu (Curah)" becomes
# "tepung_terigu_curah", which makes a hand-written special case unnecessary.
submission["bahan_pangan"] = submission["bahan_pangan"].map(to_snake_case, na_action="ignore")
submission["province"] = submission["province"].map(to_snake_case, na_action="ignore")

submission['date'] = pd.to_datetime(submission['date'])


In [33]:
submission.head()

Unnamed: 0,id,price,bahan_pangan,province,date
0,Bawang Merah/Aceh/2024-10-01,0,bawang_merah,aceh,2024-10-01
1,Bawang Merah/Aceh/2024-10-02,0,bawang_merah,aceh,2024-10-02
2,Bawang Merah/Aceh/2024-10-03,0,bawang_merah,aceh,2024-10-03
3,Bawang Merah/Aceh/2024-10-04,0,bawang_merah,aceh,2024-10-04
4,Bawang Merah/Aceh/2024-10-05,0,bawang_merah,aceh,2024-10-05


In [34]:
# Look up the forecast for every submission row. `validate` guards against
# duplicated forecast keys, which would silently multiply submission rows.
submission = pd.merge(
    submission,
    predictions,
    how='left',
    left_on=['bahan_pangan', 'province', 'date'],
    right_on=['bahan_pangan', 'province', 'timestamp'],
    validate='many_to_one',
)

# A left join leaves NaN where no forecast matched; fail loudly instead of
# writing an incomplete submission.
missing_forecasts = int(submission['mean'].isna().sum())
assert missing_forecasts == 0, f"{missing_forecasts} submission rows have no matching forecast"

In [35]:
submission.head()

Unnamed: 0,id,price,bahan_pangan,province,date,timestamp,mean
0,Bawang Merah/Aceh/2024-10-01,0,bawang_merah,aceh,2024-10-01,2024-10-01,28252.767852
1,Bawang Merah/Aceh/2024-10-02,0,bawang_merah,aceh,2024-10-02,2024-10-02,28222.048452
2,Bawang Merah/Aceh/2024-10-03,0,bawang_merah,aceh,2024-10-03,2024-10-03,28164.862506
3,Bawang Merah/Aceh/2024-10-04,0,bawang_merah,aceh,2024-10-04,2024-10-04,28192.391962
4,Bawang Merah/Aceh/2024-10-05,0,bawang_merah,aceh,2024-10-05,2024-10-05,28251.268286


In [36]:
predictions.bahan_pangan.value_counts()

bahan_pangan
bawang_merah                       3128
daging_ayam_ras                    3128
cabai_merah_keriting               3128
minyak_goreng_curah                3128
beras_premium                      3128
telur_ayam_ras                     3128
gula_konsumsi                      3128
beras_medium                       3128
minyak_goreng_kemasan_sederhana    3128
bawang_putih_bonggol               3128
tepung_terigu_curah                3128
daging_sapi_murni                  3128
cabai_rawit_merah                  3128
Name: count, dtype: int64

In [37]:
submission.bahan_pangan.value_counts()

bahan_pangan
bawang_merah                       3128
bawang_putih_bonggol               3128
beras_medium                       3128
beras_premium                      3128
cabai_merah_keriting               3128
cabai_rawit_merah                  3128
daging_ayam_ras                    3128
daging_sapi_murni                  3128
gula_konsumsi                      3128
minyak_goreng_curah                3128
minyak_goreng_kemasan_sederhana    3128
telur_ayam_ras                     3128
tepung_terigu_curah                3128
Name: count, dtype: int64

In [38]:
# Overwrite the placeholder price with the point forecast.
submission = submission.assign(price=submission['mean'])

In [39]:
submission.head()

Unnamed: 0,id,price,bahan_pangan,province,date,timestamp,mean
0,Bawang Merah/Aceh/2024-10-01,28252.767852,bawang_merah,aceh,2024-10-01,2024-10-01,28252.767852
1,Bawang Merah/Aceh/2024-10-02,28222.048452,bawang_merah,aceh,2024-10-02,2024-10-02,28222.048452
2,Bawang Merah/Aceh/2024-10-03,28164.862506,bawang_merah,aceh,2024-10-03,2024-10-03,28164.862506
3,Bawang Merah/Aceh/2024-10-04,28192.391962,bawang_merah,aceh,2024-10-04,2024-10-04,28192.391962
4,Bawang Merah/Aceh/2024-10-05,28251.268286,bawang_merah,aceh,2024-10-05,2024-10-05,28251.268286


In [40]:
# Reduce the frame to the two columns written to the submission file.
submission = submission.loc[:, ['id', 'price']]

In [41]:
submission

Unnamed: 0,id,price
0,Bawang Merah/Aceh/2024-10-01,28252.767852
1,Bawang Merah/Aceh/2024-10-02,28222.048452
2,Bawang Merah/Aceh/2024-10-03,28164.862506
3,Bawang Merah/Aceh/2024-10-04,28192.391962
4,Bawang Merah/Aceh/2024-10-05,28251.268286
...,...,...
40659,Tepung Terigu (Curah)/Sumatera Utara/2024-12-27,10471.362714
40660,Tepung Terigu (Curah)/Sumatera Utara/2024-12-28,10467.534783
40661,Tepung Terigu (Curah)/Sumatera Utara/2024-12-29,10469.285453
40662,Tepung Terigu (Curah)/Sumatera Utara/2024-12-30,10471.980148


In [None]:
# Output locations are derived from the trial version so runs do not overwrite each other.
submission_path = f'../submissions/submission_{trial_version}.csv'
log_path = f"../log/log_{trial_version}.json"

# Create the target folders up front so a missing directory cannot make the
# save fail after the long training run.
for out_path in (submission_path, log_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

submission.to_csv(submission_path, index=False)

# Everything recorded during the run, including the dataset label that was
# set in the data-loading cell.
log = {
    "description": description,
    "log_dataset_used": log_dataset_used,
    "log_preprocessing_method" : log_preprocessing_method,
    "log_feature_used" : log_feature_used,
    "log_modeling" : log_modeling,
    "log_after_inference_method" : log_after_inference_method
}

with open(log_path, "w") as f:
    json.dump(log, f)

: 