In [222]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from sklearn.model_selection import TimeSeriesSplit
from pandas.tseries.offsets import DateOffset
import json
from datetime import datetime
import copy
import warnings
import wandb
import os
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas/sklearn —
# consider narrowing the filter to the specific warnings that are known noise.
warnings.filterwarnings('ignore')
# Presumably read by the wandb client to silence its console output — confirm
# against the wandb documentation.
os.environ["WANDB_SILENT"] = "true"

In [223]:
%load_ext autoreload
%autoreload 2
from models.models import HoltWintersWrapper, ProphetWrapper

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [224]:
# Load the raw inputs from the notebook's working directory.
# The CSV sources are read with pandas' default settings.
cpi, fuel_df, sabor_df, currency, jse = (
    pd.read_csv(csv_path)
    for csv_path in ('cpi.csv', 'fuel.csv', 'sabor.csv', 'currency_data.csv', 'jse_indices.csv')
)
# Both workbooks take their column names from row 1 (0-based) of the sheet.
credit, cpi_weights = (
    pd.read_excel(workbook_path, header=1)
    for workbook_path in ('credit.xlsx', 'cpi_weights.xlsx')
)

In [225]:
# Presumably fuel.csv labels two rows '2021-08'; the last of them (in row order)
# is relabelled '2021-09' — TODO confirm against the source file.
august_rows = fuel_df.index[fuel_df['date'] == '2021-08']
# Only relabel while the duplicate is still present. Without this guard a
# re-run of the cell would move the single remaining '2021-08' row as well and
# the month would be lost from the series.
if len(august_rows) > 1:
    fuel_df.loc[august_rows[-1:], 'date'] = '2021-09'
# Keep one row per month (the first occurrence in row order).
fuel_df = fuel_df.drop_duplicates(subset=['date'], keep='first')

In [226]:
# Forecast targets: every column of the raw CPI frame except the first two and
# the last one.
# NOTE(review): presumably the skipped columns are row identifiers and 'date' —
# confirm against cpi.csv.
cpi_columns = cpi.columns[2:-1].tolist()

In [227]:
# Attach the fuel and SABOR columns to the CPI frame; both joins use pandas'
# default inner join on 'date', so only months present in all three remain.
cpi = cpi.merge(fuel_df, on="date").merge(sabor_df, on="date")

In [228]:
# Move every exchange-rate observation two months forward and reduce its date
# to a 'YYYY-MM' label.
shifted_dates = pd.to_datetime(currency['Date']) + pd.DateOffset(months=2)
currency['Date'] = [stamp.strftime('%Y-%m') for stamp in shifted_dates]

# Collapse to one row per month by averaging all observations with the same label.
currency = currency.groupby('Date').mean().reset_index()

In [229]:
# Move every SABOR observation one month forward and reduce its date to a
# 'YYYY-MM' label.
shifted_dates = pd.to_datetime(sabor_df['date']) + pd.DateOffset(months=1)
sabor_df['date'] = [stamp.strftime('%Y-%m') for stamp in shifted_dates]

# Collapse to one row per month by averaging all observations with the same label.
sabor_df = sabor_df.groupby('date').mean().reset_index()

In [230]:
# Display the currency frame for a visual check.
currency

Unnamed: 0,Date,USD/ZAR,GBP/ZAR,EUR/ZAR
0,2018-03,12.197131,16.812471,14.852223
1,2018-04,11.818639,16.556277,14.617775
2,2018-05,11.843666,16.529828,14.597329
3,2018-06,12.067608,16.989794,14.818127
4,2018-07,12.513433,16.881612,14.814248
...,...,...,...,...
61,2023-04,17.916716,21.611035,19.168071
62,2023-05,18.264031,22.163418,19.565940
63,2023-06,18.156630,22.598746,19.953257
64,2023-07,19.089853,23.817278,20.761558


In [235]:
# Three independent copies of the fuel series, one per lag computed afterwards.
# NOTE(review): in the visible cells only the one- and three-month copies are
# merged into merged_df; the two-month copy is computed but not used.
fuel_offeset_df, fuel_offeset_2_df, fuel_offeset_3_df = (fuel_df.copy() for _ in range(3))

In [236]:
# Lag the fuel series by one month: each price is relabelled with the month
# that follows it, formatted as 'YYYY-MM'.
lagged_dates = pd.to_datetime(fuel_offeset_df['date']) + pd.DateOffset(months=1)
fuel_offeset_df['date'] = [stamp.strftime('%Y-%m') for stamp in lagged_dates]
# Discard rows that contain missing values.
fuel_offeset_df = fuel_offeset_df.dropna()

In [237]:
# Lag the fuel series by two months: each price is relabelled with the month
# two months after it, formatted as 'YYYY-MM'.
lagged_dates = pd.to_datetime(fuel_offeset_2_df['date']) + pd.DateOffset(months=2)
fuel_offeset_2_df['date'] = [stamp.strftime('%Y-%m') for stamp in lagged_dates]
# Discard rows that contain missing values.
fuel_offeset_2_df = fuel_offeset_2_df.dropna()

In [238]:
# Lag the fuel series by three months: each price is relabelled with the month
# three months after it, formatted as 'YYYY-MM'.
lagged_dates = pd.to_datetime(fuel_offeset_3_df['date']) + pd.DateOffset(months=3)
fuel_offeset_3_df['date'] = [stamp.strftime('%Y-%m') for stamp in lagged_dates]
# Discard rows that contain missing values.
fuel_offeset_3_df = fuel_offeset_3_df.dropna()

In [239]:
# Give the lagged fuel columns distinct names and align the currency key with
# the 'date' column used by the other frames.
fuel_offeset_df.rename(columns={'fuel_price': 'fuel_offset_1'}, inplace=True)
fuel_offeset_3_df.rename(columns={'fuel_price': 'fuel_offset_3'}, inplace=True)
currency.rename(columns={'Date': 'date'}, inplace=True)

# Left-join each regressor frame onto the fuel series, one frame at a time, so
# every fuel month is kept even when a regressor has no value for it.
merged_df = fuel_df
for regressor_frame in (sabor_df, fuel_offeset_df, fuel_offeset_3_df, currency):
    merged_df = pd.merge(merged_df, regressor_frame, on='date', how='left')

In [240]:
# Keep only the months for which every regressor column has a value.
merged_df = merged_df.dropna()

In [242]:
# Candidate models compared per CPI category: two Holt-Winters wrappers that
# differ in seasonal period, followed by three Prophet wrappers that share the
# regressor frame and differ in their changepoint settings.
holt_winters_candidates = [
    HoltWintersWrapper(seasonal_periods=period) for period in (3, 12)
]
prophet_settings = (
    {'changepoint_prior_scale': 10},
    {'changepoint_prior_scale': 5, 'n_changepoints': 20},
    {'changepoint_prior_scale': 1},
)
prophet_candidates = [
    ProphetWrapper(
        extra_data=merged_df,
        name_postfix="_all_data",
        changepoint_range=0.8,
        **settings,
    )
    for settings in prophet_settings
]
models = holt_winters_candidates + prophet_candidates

In [262]:
# Months from TEST_START onwards are held out; everything earlier is available
# for training and validation.
TEST_START = '2023-04'
train_val_set = cpi.loc[cpi['date'] < TEST_START]
test_set = cpi.loc[cpi['date'] >= TEST_START]
# Ten splits, each validating on a single row.
tscv = TimeSeriesSplit(test_size=1, n_splits=10)

In [263]:
# Cross-validate every candidate model on every CPI category.
# `results` maps a model name to a list of mean squared errors, one per entry
# of `cpi_columns` (same order).
results = {}
for model in models:
    model_results = []

    for category in tqdm(cpi_columns):
        # The split indices are positions within train_val_set, so the same
        # frame must be used for slicing; indexing `cpi` instead only works
        # while the training rows happen to be the leading rows of `cpi`.
        category_frame = train_val_set[['date', category]]

        intermediate_results = {'test': [], 'pred': []}
        for train_index, test_index in tscv.split(category_frame):
            model.fit(model.getExtraData(), category_frame.iloc[train_index])
            # extend() keeps working if the validation window grows beyond one row.
            intermediate_results['test'].extend(category_frame[category].iloc[test_index].values)
            intermediate_results['pred'].extend(model.predict(len(test_index)))

        # Plain MSE; the `squared` keyword is omitted because newer scikit-learn
        # releases no longer accept it (MSE is the function's result either way).
        result = mean_squared_error(intermediate_results['test'], intermediate_results['pred'])
        model_results.append(result)

    results[model.getModelName()] = model_results

100%|██████████| 13/13 [00:15<00:00,  1.22s/it]
100%|██████████| 13/13 [00:16<00:00,  1.26s/it]
100%|██████████| 13/13 [00:40<00:00,  3.14s/it]
100%|██████████| 13/13 [00:48<00:00,  3.73s/it]
100%|██████████| 13/13 [00:41<00:00,  3.17s/it]


In [264]:
# Cross-validation scores as a table: one row per CPI category, one column per model.
results_df = pd.DataFrame(data=results, index=cpi_columns)

In [265]:
# For each category (row), pick the model (column) with the lowest
# cross-validation error; the result maps category -> model name.
best_model_table = results_df.idxmin(axis=1).to_dict()
best_model_table

{'Food and non-alcoholic beverages': 'HoltWinters_mul_mul_3',
 'Alcoholic beverages and tobacco': 'HoltWinters_mul_mul_12',
 'Clothing and footwear': 'HoltWinters_mul_mul_12',
 'Housing and utilities': 'HoltWinters_mul_mul_12',
 'Household contents and services': 'HoltWinters_mul_mul_12',
 'Health': 'HoltWinters_mul_mul_12',
 'Transport': 'Prophet_0.8_20_1_all_data',
 'Communication': 'HoltWinters_mul_mul_12',
 'Recreation and culture': 'HoltWinters_mul_mul_3',
 'Education': 'HoltWinters_mul_mul_12',
 'Restaurants and hotels': 'HoltWinters_mul_mul_3',
 'Miscellaneous goods and services': 'HoltWinters_mul_mul_12',
 'headline CPI': 'HoltWinters_mul_mul_12'}

In [266]:
# Look-up from model name to an independent deep copy of that model, so later
# fitting does not touch the instances held in `models`.
model_table = {
    candidate.getModelName(): copy.deepcopy(candidate) for candidate in models
}

In [267]:
# Walk forward over the held-out months: for each month, refit every category's
# selected model on all earlier data, forecast one step, and print the error
# across all categories for that month.
results_table = {}
for date in test_set['date']:
    input_data = cpi[cpi['date'] < date]
    for entry in best_model_table:
        selected_model = model_table[best_model_table[entry]]
        # Use the selected model's own extra data rather than whatever `model`
        # was left bound to by an earlier loop in the notebook.
        selected_model.fit(selected_model.getExtraData(), input_data[['date', str(entry)]])
        results_table[entry] = selected_model.predict(1)[0]

    actual = cpi[cpi['date'] == date][list(results_table.keys())]
    predicted = pd.DataFrame(results_table, index=[0])
    # This is the mean squared error (not its root); take the square root of the
    # value if an RMSE is required. The `squared` keyword is omitted because
    # newer scikit-learn releases no longer accept it.
    mse = mean_squared_error(actual, predicted)
    print(f"{date} {mse}")

2023-04 0.35092155807303566
2023-05 0.32389634087640845
2023-06 0.12839952347355083
2023-07 0.2159328066234497


In [268]:
# Produce the final forecast: refit every category's selected model on all data
# before the target month and predict one step ahead.
# `results_table` maps category -> forecast value.
results_table = {}
for date in ['2023-08']:
    input_data = cpi[cpi['date'] < date]
    for entry in best_model_table:
        selected_model = model_table[best_model_table[entry]]
        # Use the selected model's own extra data rather than whatever `model`
        # was left bound to by an earlier loop in the notebook.
        selected_model.fit(selected_model.getExtraData(), input_data[['date', str(entry)]])
        results_table[entry] = selected_model.predict(1)[0]

In [269]:
# Display the category -> forecast mapping.
results_table

{'Food and non-alcoholic beverages': 119.22656711494491,
 'Alcoholic beverages and tobacco': 112.03677886969798,
 'Clothing and footwear': 104.82534385021496,
 'Housing and utilities': 108.51651669094672,
 'Household contents and services': 108.30808272553672,
 'Health': 110.7138412271973,
 'Transport': 113.85694696366238,
 'Communication': 99.44978987827471,
 'Recreation and culture': 105.85196499754676,
 'Education': 110.36316804610668,
 'Restaurants and hotels': 110.582794359208,
 'Miscellaneous goods and services': 109.92233291474076,
 'headline CPI': 111.04380921645371}

In [270]:
# Show the merged CPI row(s) for June 2023.
cpi.loc[cpi['date'] == '2023-06']

Unnamed: 0.1,Unnamed: 0,index,Food and non-alcoholic beverages,Alcoholic beverages and tobacco,Clothing and footwear,Housing and utilities,Household contents and services,Health,Transport,Communication,Recreation and culture,Education,Restaurants and hotels,Miscellaneous goods and services,headline CPI,date,fuel_price,sabor
77,77,cpi_M202306,118.3,110.9,104.3,105.4,107.7,110.8,112.3,99.6,105.3,110.4,110.0,109.6,109.8,2023-06,21.58,8.218091


In [250]:
# Only the last expression of a cell is rendered, so the first line's result
# (the actual 2023-06 values for the forecast categories) is computed and discarded.
cpi[cpi['date'] == '2023-06'][list(results_table.keys())]
# One-row frame holding the forecasts currently stored in results_table.
pd.DataFrame(results_table, index=[0])

Unnamed: 0,Food and non-alcoholic beverages,Alcoholic beverages and tobacco,Clothing and footwear,Housing and utilities,Household contents and services,Health,Transport,Communication,Recreation and culture,Education,Restaurants and hotels,Miscellaneous goods and services,headline CPI
0,118.959332,111.400766,104.355187,107.322153,107.843473,110.795098,112.750539,99.491498,105.650763,110.359932,110.158978,110.059098,110.82407


In [251]:
# Mean squared error between the actual 2023-06 values and the forecasts
# currently stored in results_table. The `squared` keyword is omitted because
# newer scikit-learn releases no longer accept it; the value is the MSE either way.
mean_squared_error(
    cpi[cpi['date'] == '2023-06'][list(results_table.keys())],
    pd.DataFrame(results_table, index=[0]),
)

0.4636901147023273

In [252]:
# Display the SABOR frame for a visual check.
sabor_df

Unnamed: 0,date,sabor
0,2007-09,9.358478
1,2007-10,9.596100
2,2007-11,9.914913
3,2007-12,10.120636
4,2008-01,10.508286
...,...,...
187,2023-04,7.223304
188,2023-05,7.713450
189,2023-06,7.804391
190,2023-07,8.218091


In [253]:
# Mean squared error of whatever is currently held in intermediate_results,
# i.e. the values left over from the last model/category pair processed by the
# cross-validation loop. The `squared` keyword is omitted because newer
# scikit-learn releases no longer accept it; the value is the MSE either way.
mean_squared_error(intermediate_results['test'], intermediate_results['pred'])

0.27713192409482845

In [254]:
# List the held-out months, one per line.
for held_out_month in test_set['date']:
    print(held_out_month)

2023-04
2023-05
2023-06
2023-07


In [255]:
# Display the combined regressor frame for a visual check.
merged_df

Unnamed: 0,date,fuel_price,sabor,fuel_offset_1,fuel_offset_3,USD/ZAR,GBP/ZAR,EUR/ZAR
0,2023-08,21.71,8.207190,21.34,22.29,18.799594,23.672070,20.292954
1,2023-07,21.34,8.218091,21.58,21.92,19.089853,23.817278,20.761558
2,2023-06,21.58,7.804391,22.29,22.00,18.156630,22.598746,19.953257
3,2023-05,22.29,7.713450,21.92,20.73,18.264031,22.163418,19.565940
4,2023-04,21.92,7.223304,22.00,20.45,17.916716,21.611035,19.168071
...,...,...,...,...,...,...,...,...
61,2018-07,15.43,6.477952,15.20,13.89,12.513433,16.881612,14.814248
62,2018-06,15.20,6.495217,14.38,13.27,12.067608,16.989794,14.818127
63,2018-05,14.38,6.504286,13.89,13.63,11.843666,16.529828,14.597329
64,2018-04,13.89,6.732000,13.27,13.93,11.818639,16.556277,14.617775


In [232]:
# Display the currency frame for a visual check.
currency

Unnamed: 0,Date,USD/ZAR,GBP/ZAR,EUR/ZAR
0,2018-03,12.197131,16.812471,14.852223
1,2018-04,11.818639,16.556277,14.617775
2,2018-05,11.843666,16.529828,14.597329
3,2018-06,12.067608,16.989794,14.818127
4,2018-07,12.513433,16.881612,14.814248
...,...,...,...,...
61,2023-04,17.916716,21.611035,19.168071
62,2023-05,18.264031,22.163418,19.565940
63,2023-06,18.156630,22.598746,19.953257
64,2023-07,19.089853,23.817278,20.761558


In [233]:
# Display the SABOR frame for a visual check.
sabor_df

Unnamed: 0,date,sabor
0,2007-09,9.358478
1,2007-10,9.596100
2,2007-11,9.914913
3,2007-12,10.120636
4,2008-01,10.508286
...,...,...
187,2023-04,7.223304
188,2023-05,7.713450
189,2023-06,7.804391
190,2023-07,8.218091


In [234]:
# Display the fuel price frame for a visual check.
fuel_df

Unnamed: 0,date,fuel_price
0,2023-08,21.71
1,2023-07,21.34
2,2023-06,21.58
3,2023-05,22.29
4,2023-04,21.92
...,...,...
181,2008-05,9.29
182,2008-04,8.74
183,2008-03,8.07
184,2008-02,7.46


In [337]:
# Turn the category -> forecast mapping into a one-column frame: categories
# form the index and the forecasts go in 'Value'.
prediction_df = pd.Series(results_table, name='Value').to_frame()

In [338]:
# Month name used as the prefix of the submission IDs and in the output file name.
pred_month = 'August'

In [339]:
# Build the list of submission IDs for Zindi: "<month>_<category>", one per CPI
# category. Despite its name, this is a list, and its order matters because the
# IDs are attached to the forecasts by position.
pred_category_dict = [
    pred_month + "_" + category_label
    for category_label in (
        "food and non-alcoholic beverages",
        "alcoholic beverages and tobacco",
        "clothing and footwear",
        "housing and utilities",
        "household contents and services",
        "health",
        "transport",
        "communication",
        "recreation and culture",
        "education",
        "restaurants and hotels",
        "miscellaneous goods and services",
        "headline CPI",
    )
]

In [340]:
# The IDs are attached by position, so first confirm that each one names the
# category sitting on the same row (case-insensitive). Otherwise a change in
# category order would silently label forecasts with the wrong ID.
for category, submission_id in zip(prediction_df.index, pred_category_dict):
    if not submission_id.lower().endswith(str(category).lower()):
        raise ValueError(
            f"Submission ID {submission_id!r} does not match category {category!r}"
        )
prediction_df['ID'] = pred_category_dict

In [341]:
# Write the submission file containing only the ID and Value columns (no index column).
submission_path = f'submissions/multi_model_{pred_month}.csv'
# to_csv does not create missing folders, so make sure the target directory
# exists before writing.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
prediction_df[['ID', 'Value']].to_csv(submission_path, index=False)

In [336]:
# Display the submission frame for a visual check.
prediction_df

Unnamed: 0_level_0,Value,ID
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
Food and non-alcoholic beverages,119.226567,August_food and non-alcoholic beverages
Alcoholic beverages and tobacco,112.036779,August_alcoholic beverages and tobacco
Clothing and footwear,104.825344,August_clothing and footwear
Housing and utilities,108.516517,August_housing and utilities
Household contents and services,108.308083,August_household contents and services
Health,110.713841,August_health
Transport,113.856947,August_transport
Communication,99.44979,August_communication
Recreation and culture,105.851965,August_recreation and culture
Education,110.363168,August_education


In [304]:
# Display the category -> forecast mapping.
results_table

{'Food and non-alcoholic beverages': 119.22656711494491,
 'Alcoholic beverages and tobacco': 112.03677886969798,
 'Clothing and footwear': 104.82534385021496,
 'Housing and utilities': 108.51651669094672,
 'Household contents and services': 108.30808272553672,
 'Health': 110.7138412271973,
 'Transport': 113.85694696366238,
 'Communication': 99.44978987827471,
 'Recreation and culture': 105.85196499754676,
 'Education': 110.36316804610668,
 'Restaurants and hotels': 110.582794359208,
 'Miscellaneous goods and services': 109.92233291474076,
 'headline CPI': 111.04380921645371}