In [217]:
import pandas as pd
import plotnine as pn
from plotnine import aes, ggplot
import pathlib
from datetime import datetime
from prophet import Prophet

# Import Datasets
The following datasets are used:
- cpi file as supplied by zindi
- currency as supplied by zindi
- jse data as supplied by zindi
-  Monthly credit detail excel released by the reserve bank (https://www.resbank.co.za/en/home/what-we-do/statistics/releases/selected-statistics)
- Money and banking detail excel released by the reserve bank (https://www.resbank.co.za/en/home/what-we-do/statistics/releases/selected-statistics)
- fuel prices pulled from open price engine

In [218]:
# Project root: two levels above the directory the notebook is run from.
path = str(pathlib.Path.cwd().parent.parent.resolve())

In [219]:
# Load every raw input from <project root>/data.
# The CPI workbook is read with dtype="object", so placeholder entries such as '..'
# are kept as-is until get_montly_cpi cleans them.
cpi_raw = pd.read_excel(path + '/data/EXCEL - CPI(5 and 8 digit) from Jan 2017 (202306).xlsx', dtype="object")
currency = pd.read_csv(path + '/data/currency_data.csv')
# header=1: the column names are taken from the second row of the sheet.
credit = pd.read_excel(path + '/data/credit.xlsx', header=1)
jse = pd.read_csv(path + '/data/jse_indices.csv')
# 'Unnamed: 0' is dropped - presumably an index column written when the CSV was saved (verify).
fuel = pd.read_csv(path + '/data/fuel_df.csv').drop(['Unnamed: 0'], axis=1)
sabor = pd.read_csv(path + '/data/sabor.csv')
# header=1: the column names are taken from the second row of the sheet.
money = pd.read_excel(path + '/data/money.xlsx', header=1)

There are 12 main categories that are used to calculate the Headline CPI. Each category has a number of sub categories. This data set contains the CPI on sub-category level.

Create a dataset where the CPI is calculated on the category level. These values will then be used in the models to predict the CPI for the different categories as well as the Headline CPI.

In terms of cleaning the data, I will do the following:
- Remove unnecessary columns.
- Change column headers to make them more explanatory.
- Replace all `..` entries with a `0`. There are products which were included in the CPI calculations at a later stage and some products that were removed from the CPI calculation. Categories with no value at the time have a `..`; I will replace them with a `0`. The type of the column can then be updated to float.
- Combine the `Super maize` and `Special maize` categories into a single `Maize meal` category, to correspond with the current use of maize meal.
- Create a function to calculate the CPI values for each month using the weights provided in the file.

In [220]:
def get_montly_cpi(raw_df):
    """Aggregate the product-level CPI sheet into one weighted CPI per main category per month.

    For every month column the function computes sum(weight * index) / sum(weight)
    within each two-character main category code, plus a "headline" row computed the
    same way over all retained rows, rounded to one decimal.

    Arguments:
    ----------
    raw_df: pandas dataframe
            raw product-level CPI data. Must contain the columns H01-H07 and
            "Weight (All urban)". After H01, H02, H05, H06 and H07 are dropped, the
            month columns are expected to start at column position 3, with names whose
            text after the last "M" is the year followed by the month
            (e.g. something like "M201701" - assumption, confirm against the workbook).

    Return:
    -------
    df_cpi: pandas dataframe
            one row per month, one column per main category code ("01", "02", ...)
            plus "headline", and a "date" column holding the month as a 'YYYY-MM' string

    Notes:
    ------
    - ".." entries are replaced with 0 before weighting, so such an entry contributes
      0 to the weighted sum while its weight still counts in the denominator.
    - The maize-meal merge addresses rows by fixed position (15, 17, 18); it silently
      targets other products if the row order of the source sheet changes.
    """

    # 1. remove unnecessary columns and rename the ones that are kept
    list_cols_to_drop = ["H01", "H02", "H05", "H06", "H07"]
    cat_cpi_df = raw_df.copy().drop(list_cols_to_drop, axis=1).copy()

    cat_cpi_df.rename(
        columns={
            "H03": "category_codes",
            "H04": "category_descr",
            "Weight (All urban)": "weights_urban",
        },
        inplace=True,
    )

    # 2. replace the ".." placeholder with zeros
    cat_cpi_df.replace("..", 0, inplace=True)

    # 3. combine maize meal categories
    # Rows at positions 17 and 18 are coerced to numbers (anything non-numeric becomes NaN),
    # averaged, and the average fills the zero entries of the row at position 15.
    # Presumably 17/18 are "Super maize"/"Special maize" and 15 is "Maize meal" - verify.
    cat_cpi_df.iloc[17:19] = (
        cat_cpi_df.iloc[17:19].copy().apply(pd.to_numeric, errors="coerce")
    )
    divided_row = (cat_cpi_df.iloc[17].copy() + cat_cpi_df.iloc[18].copy()) / 2
    # NOTE(review): divided_row[i] uses an integer key on a label-indexed Series
    # (positional fallback); recent pandas versions warn about this.
    cat_cpi_df.iloc[15] = [
        divided_row[i] if value == 0 else value
        for i, value in enumerate(cat_cpi_df.iloc[15].copy())
    ]
    # The two source rows are removed once they have been folded into row 15.
    cat_cpi_df.drop([cat_cpi_df.index[17], cat_cpi_df.index[18]], inplace=True)

    # Convert the 'weights_urban' column to float
    cat_cpi_df["weights_urban"] = cat_cpi_df["weights_urban"].astype("float")

    # 4. calculate cpi
    # Assign a main category code (first two characters of the code) to each raw data row:
    #   - 8-character codes are kept only for categories "01" and "02"
    #   - 5-character codes are kept for every category
    #   - everything else is marked "no" and dropped below
    main_category = []
    for index, row in cat_cpi_df.iterrows():
        if (len(row["category_codes"]) == 8) & (
            row["category_codes"][:2] in ["01", "02"]
        ):
            main_category.append(row["category_codes"][:2])
        elif (
            len(row["category_codes"]) == 5
        ):
            main_category.append(row["category_codes"][:2])
        else:
            main_category.append("no")

    cat_cpi_df["main_category_code"] = main_category

    # Drop the rows where the main_category_code is "no". That is to prevent double counting.
    # Some categories have a sub category included in the data.
    cat_cpi_df.drop(
        cat_cpi_df[cat_cpi_df["main_category_code"] == "no"].index, inplace=True
    )

    # Sum the weights for each category
    sum_weights = cat_cpi_df.groupby("main_category_code")["weights_urban"].sum()

    # Accumulator: one "cpi_<month column>" column is appended per month
    cpi_df = pd.DataFrame()

    # For each month create the headline CPI value and the CPI value per category.
    # Positions 0-2 are codes, description and weights; the last column is
    # main_category_code, so the range covers exactly the month columns that existed
    # before the loop started (the range is evaluated once, so the weighted_index_*
    # columns added inside the loop are not revisited).
    for col in range(3, cat_cpi_df.shape[1] - 1):
        cat_cpi_df = cat_cpi_df.copy()
        column_name = cat_cpi_df.columns[col]
        cat_cpi_df["weighted_index_" + column_name] = (
            cat_cpi_df["weights_urban"] * cat_cpi_df[column_name]
        )

        sum_weighted_index = cat_cpi_df.groupby("main_category_code")[
            "weighted_index_" + column_name
        ].sum()

        # Concatenate the DataFrames horizontally
        concat_df = pd.concat([sum_weights, sum_weighted_index], axis=1)

        # Add a row that sums the values in the columns (total weight, total weighted index)
        sums_df = pd.DataFrame(
            concat_df.sum().values.reshape(1, -1), columns=concat_df.columns
        )
        sums_df = sums_df.set_index(pd.Index(["headline"]))

        # Concatenate the headline dataframe to the categories
        month_cpi_df = pd.concat([concat_df, sums_df], axis=0)

        # Calculate the CPI value: weighted sum divided by total weight, one decimal
        month_cpi_df["cpi_" + column_name] = (
            month_cpi_df["weighted_index_" + column_name]
            / month_cpi_df["weights_urban"]
        ).round(1)

        cpi_df = pd.concat(
            [cpi_df, month_cpi_df[["weights_urban", "cpi_" + column_name]]], axis=1
        )

    # Remove duplicate weights columns (one was appended per month) and reset the index
    cpi_df = cpi_df.loc[:, ~cpi_df.columns.duplicated()]
    cpi_df = cpi_df.reset_index().rename(columns={"index": "category"})

    # Dataframe with just the CPI values, transposed so that each row is a month:
    cpi_df = cpi_df.drop("weights_urban", axis=1).copy()
    transposed_cpi_df = cpi_df.set_index("category").transpose().reset_index()
    # "index" holds the "cpi_<month column>" names; keep only the text after the last "M"
    transposed_cpi_df["date"] = transposed_cpi_df["index"].apply(
        lambda x: x.split("M")[-1]
    )
    # first four characters are the year, last two the month -> "YYYY-MM"
    transposed_cpi_df["date"] = transposed_cpi_df["date"].apply(
        lambda x: x[:4] + "-" + x[-2:]
    )
    # round-trip through datetime; the stored value is a 'YYYY-MM' string
    transposed_cpi_df["date"] = pd.to_datetime(transposed_cpi_df["date"]).dt.strftime(
        "%Y-%m"
    )

    return transposed_cpi_df.drop(['index'], axis=1)

In [221]:
# Monthly CPI per main category code (plus "headline"), one row per month.
cpi = get_montly_cpi(raw_df=cpi_raw)

In [222]:
# Create a category dictionary with the category code and description
category_dict = {
    "01": "Food and non-alcoholic beverages",
    "02": "Alcoholic beverages and tobacco",
    "03": "Clothing and footwear",
    "04": "Housing and utilities",
    "05": "Household contents and services",
    "06": "Health",
    "07": "Transport",
    "08": "Communication",
    "09": "Recreation and culture",
    "10": "Education",
    "11": "Restaurants and hotels",
    "12": "Miscellaneous goods and services",
    "headline": "headline CPI",
}

In [223]:
# Swap the category codes in the column names for their descriptions.
cpi = cpi.rename(columns=category_dict)


Convert the dates in every dataset to a common `YYYY-MM` month format

In [224]:
# Give every dataset a 'Date' column holding the month as a 'YYYY-MM' string,
# so that the datasets can later be joined by month.
cpi['Date'] = pd.to_datetime(cpi['date']).dt.strftime('%Y-%m')
currency['Date'] = pd.to_datetime(currency['Date']).dt.strftime('%Y-%m')

# These frames carry their raw date in a lower-case 'date' column.
for dated_frame in (jse, fuel, sabor):
    dated_frame['Date'] = pd.to_datetime(dated_frame['date']).dt.strftime('%Y-%m')

# These frames store the month as '<abbreviated month>, <year>' text; the parsed
# timestamp is kept in 'date' and 'Date' is overwritten with the 'YYYY-MM' string.
for workbook_frame in (credit, money):
    workbook_frame['date'] = pd.to_datetime(workbook_frame['Date'], format='%b, %Y')
    workbook_frame['Date'] = pd.to_datetime(workbook_frame['date']).dt.strftime('%Y-%m')

Only keep data from January 2018 onwards

In [225]:
# Keep only the months from 2018-01 onwards. 'Date' is a 'YYYY-MM' string, so the
# lexicographic comparison with '2018' keeps '2018-01' and everything after it.
# .copy() makes each result an independent frame: later cells add and overwrite
# columns on these frames, which would otherwise raise SettingWithCopyWarning.
cpi_new = cpi[cpi['Date'] > '2018'].copy()
currency_new = currency[currency['Date'] > '2018'].copy()
jse_new = jse[jse['Date'] > '2018'].copy()
credit_new = credit[credit['Date'] > '2018'].copy()
money_new = money[money['Date'] > '2018'].copy()
fuel_new = fuel[fuel['Date'] > '2018'].copy()
sabor_new = sabor[sabor['Date'] > '2018'].copy()

In the credit and money Excel files the thousands-separator commas need to be stripped from the values and the values converted to floats

In [226]:
def remove_commas_and_convert_to_float(value):
    """Convert a value such as '1,234.5' to the float 1234.5.

    Arguments:
    ----------
    value: str or number
           text with optional thousands-separator commas, or a value that is
           already numeric (a spreadsheet column can mix both)

    Return:
    -------
    float

    Raises:
    -------
    ValueError: if the text is not a number once the commas are removed
    TypeError: if the value is neither text nor convertible to float (e.g. None)
    """
    if isinstance(value, str):
        return float(value.replace(',', ''))
    # Already numeric (int, float, NaN, numpy scalar): there are no commas to strip.
    return float(value)

In [227]:
# The percentage-share columns and 'Investments' are not used as features.
credit_unused_columns = [
    'Share of corporations as a % of total credit',
    'Share of corporations as a % of total loans & advances',
    'Share of households as a % of total credit',
    'Share of households as % of total loans & advances',
    'Investments',
]
credit_new = credit_new.drop(columns=credit_unused_columns)

In [228]:
# Credit columns whose values are converted to floats in the next step.
# The '.1' suffixed names are duplicates of an earlier header in the same sheet
# (the suffix is what pandas appends to repeated column names when reading a file).
credit_columns = ['Instalment sale credit', 'Leasing finance',
       'Mortgage advances', 'Overdrafts', 'General loans and advances',
       'Credit card advances', 'Of which: Total to households',
       'Total loans and advances : Households',
       'Claims on the domestic private sector',
       'Total loans and advances (excl. investments & bills)',
       'Bills discounted', 'Instalment sale credit.1', 'Leasing finance.1',
       'Mortgage advances.1', 'Overdrafts.1', 'General loans and advances.1',
       'Credit card advances.1', 'Of which: Total to corporations',
       'Claims on the domestic private sector.1',
       'Total loans and advances : Corporations',]

In [229]:
# Element-wise conversion of the comma-formatted credit values to floats.
credit_numeric = credit_new[credit_columns].applymap(remove_commas_and_convert_to_float)
credit_new[credit_columns] = credit_numeric


In [230]:
# Money-and-banking columns whose values are converted to floats in the next step.
money_columns = ['M0', 'M1A', 'M1', 'M2', 'Total monetary (M3) deposits',
       'M3 Seasonally adjusted',
       'Net foreign assets', 'Net claims on Government sector:',
       '-> Gross claims', '-> Government deposits',
       'Claims on the private sector', 'Net other assets', 'Change in M3',
       'Claims on the domestic private sector (seasonally adjusted)',
       'Claims on the domestic private sector', 'Investments',
       'Bills discounted',
       'Total loans and advances (excl. investments & bills)',
       '---> Instalment sales credit', '---> Leasing finance',
       '---> Mortgage advances', '---> Other loans and advances',
       'Of which: Total to households', 'Net claims on the government sector',
       'Total domestic credit extension']

In [231]:
# money_new was produced by boolean filtering; work on an explicit copy so that the
# column assignment below targets an independent frame instead of raising
# SettingWithCopyWarning.
money_new = money_new.copy()
# Element-wise conversion of the comma-formatted values to floats.
money_new[money_columns] = money_new[money_columns].applymap(remove_commas_and_convert_to_float)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Create Features

Some of the Excel sheets are not up to date. We will therefore use the value from two months earlier as the feature for now (i.e. the May value is the predictor for July).

In [232]:
# Earliest month available in each dataset.
for dataset in (jse_new, credit_new, currency_new, cpi_new, fuel_new, sabor_new, money_new):
    print(dataset['Date'].min())

2018-01
2018-01
2018-01
2018-01
2018-01
2018-01
2018-01


In [233]:
# Latest month available in each dataset.
for dataset in (jse_new, credit_new, currency_new, cpi_new, fuel_new, sabor_new, money_new):
    print(dataset['Date'].max())

2023-06
2023-06
2023-06
2023-06
2023-07
2023-06
2023-06


In [234]:
def _with_lagged_date(frame, months=2):
    """Return a new frame with a 'newDate' column: 'Date' moved forward by ``months``.

    'Date' holds 'YYYY-MM' strings and 'newDate' is written in the same format.
    Joining on 'newDate' therefore pairs a month's CPI with feature values observed
    ``months`` earlier. ``DataFrame.assign`` returns a new object, so a frame that
    was created by filtering is never modified in place (no SettingWithCopyWarning).
    """
    shifted = pd.to_datetime(frame['Date']) + pd.DateOffset(months=months)
    return frame.assign(newDate=shifted.dt.strftime('%Y-%m'))


jse_new = _with_lagged_date(jse_new)
credit_new = _with_lagged_date(credit_new)
currency_new = _with_lagged_date(currency_new)
fuel_new = _with_lagged_date(fuel_new)
sabor_new = _with_lagged_date(sabor_new)
money_new = _with_lagged_date(money_new)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [235]:
# From here on only the lagged 'newDate' is used as the join key, so the original
# date columns are removed (currency has no lower-case 'date' column to drop).
jse_new = jse_new.drop(columns=['Date', 'date'])
credit_new = credit_new.drop(columns=['Date', 'date'])
currency_new = currency_new.drop(columns=['Date'])
fuel_new = fuel_new.drop(columns=['Date', 'date'])
sabor_new = sabor_new.drop(columns=['Date', 'date'])
money_new = money_new.drop(columns=['Date', 'date'])


## Get monthly summary statistics (mean, std, min, max) for credit, jse, currency and money, and the monthly mean for fuel and sabor

In [236]:
# Group each dataset once by the lagged month and derive every statistic from that grouping.
# add_suffix is applied after reset_index, so the key column is suffixed as well
# ('newDatemean', 'newDatestd', ...).
credit_by_month = credit_new.groupby(['newDate'])
jse_by_month = jse_new.groupby(['newDate'])
currency_by_month = currency_new.groupby(['newDate'])
money_by_month = money_new.groupby(['newDate'])

# Monthly mean
credit_mean = credit_by_month.mean().reset_index().add_suffix("mean")
jse_mean = jse_by_month.mean().reset_index().add_suffix("mean")
currency_mean = currency_by_month.mean().reset_index().add_suffix("mean")
money_mean = money_by_month.mean().reset_index().add_suffix("mean")

# Monthly standard deviation
credit_std = credit_by_month.std().reset_index().add_suffix("std")
jse_std = jse_by_month.std().reset_index().add_suffix("std")
currency_std = currency_by_month.std().reset_index().add_suffix("std")
money_std = money_by_month.std().reset_index().add_suffix("std")

# Monthly minimum
credit_min = credit_by_month.min().reset_index().add_suffix("min")
jse_min = jse_by_month.min().reset_index().add_suffix("min")
currency_min = currency_by_month.min().reset_index().add_suffix("min")
money_min = money_by_month.min().reset_index().add_suffix("min")

# Monthly maximum
credit_max = credit_by_month.max().reset_index().add_suffix("max")
jse_max = jse_by_month.max().reset_index().add_suffix("max")
currency_max = currency_by_month.max().reset_index().add_suffix("max")
money_max = money_by_month.max().reset_index().add_suffix("max")

# Fuel and sabor are reduced to a plain monthly mean; their column names keep no suffix.
fuel_new = fuel_new.groupby(['newDate']).mean().reset_index()
sabor_new = sabor_new.groupby(['newDate']).mean().reset_index()


In [237]:
# Join std, max and min onto the monthly means (in that order), then restore the key name.
credit_new = credit_mean
for stat_suffix, stat_frame in (('std', credit_std), ('max', credit_max), ('min', credit_min)):
    stat_key = 'newDate' + stat_suffix
    credit_new = credit_new.merge(
        stat_frame, left_on='newDatemean', right_on=stat_key, how='left'
    ).drop([stat_key], axis=1)
credit_new = credit_new.rename(columns={"newDatemean": "newDate"})

In [238]:
# Join std, max and min onto the monthly means (in that order), then restore the key name.
money_new = money_mean
for stat_suffix, stat_frame in (('std', money_std), ('max', money_max), ('min', money_min)):
    stat_key = 'newDate' + stat_suffix
    money_new = money_new.merge(
        stat_frame, left_on='newDatemean', right_on=stat_key, how='left'
    ).drop([stat_key], axis=1)
money_new = money_new.rename(columns={"newDatemean": "newDate"})

In [239]:
# Join std, max and min onto the monthly means (in that order), then restore the key name.
jse_new = jse_mean
for stat_suffix, stat_frame in (('std', jse_std), ('max', jse_max), ('min', jse_min)):
    stat_key = 'newDate' + stat_suffix
    jse_new = jse_new.merge(
        stat_frame, left_on='newDatemean', right_on=stat_key, how='left'
    ).drop([stat_key], axis=1)
jse_new = jse_new.rename(columns={"newDatemean": "newDate"})

In [240]:
# Join std, max and min onto the monthly means (in that order), then restore the key name.
currency_new = currency_mean
for stat_suffix, stat_frame in (('std', currency_std), ('max', currency_max), ('min', currency_min)):
    stat_key = 'newDate' + stat_suffix
    currency_new = currency_new.merge(
        stat_frame, left_on='newDatemean', right_on=stat_key, how='left'
    ).drop([stat_key], axis=1)
currency_new = currency_new.rename(columns={"newDatemean": "newDate"})

# Merge data together for features

In [241]:
# Left-join every feature set onto the CPI months: the CPI 'Date' is matched with the
# lagged 'newDate' of the features. Each merge brings its own 'newDate' column, which is
# removed before the next merge; the one from the final merge stays in cpi_all.
cpi_all = cpi_new.drop(['date'], axis=1).merge(jse_new, right_on='newDate', left_on='Date', how='left')
for feature_frame in (credit_new, currency_new, fuel_new, sabor_new, money_new):
    cpi_all = cpi_all.drop(['newDate'], axis=1).merge(
        feature_frame, right_on='newDate', left_on='Date', how='left'
    )

In [242]:
# Keep the months strictly after 2018-03 ('Date' is a 'YYYY-MM' string, compared as text).
cpi_all = cpi_all.loc[cpi_all['Date'] > '2018-03']

# Model
We will model one index at a time to see what works the best

In [243]:
from models.models import HoltWintersWrapper, ProphetWrapper
from sklearn.metrics import mean_squared_error

In [246]:
def combined_model(cpi, month):
    """Predict one month of CPI for every category, using a different model per category.

    'Health' and 'Education' are forecast one step ahead with a HoltWintersWrapper
    (seasonal_periods=6) fitted on the category's own history. Every other category,
    including 'headline CPI', is forecast with a ProphetWrapper
    (n_changepoints=2, multiplicative seasonality) that is given the columns
    'ds', 'y' and 'sabor' (presumably 'sabor' acts as an extra regressor - the
    wrapper is defined in models.models, confirm there).

    NOTE: the 'sabor' value for the predicted month is looked up in the notebook-level
    frames ``sabor_new`` and ``fuel_new`` (keyed on 'newDate'), so the feature cells
    must have been run before this function is called.

    Arguments:
    ----------
    cpi: pandas dataframe
         one row per month with a 'Date' column ('YYYY-MM' strings), one column per
         CPI category and a 'sabor' column
    month: str
         month to predict, as 'YYYY-MM'; only rows with 'Date' strictly before this
         month are used for fitting

    Return:
    -------
    df_results: pandas dataframe
         columns 'cat' and 'pred'; the Holt-Winters rows come first, then the Prophet
         rows (the index is not reset, so index labels repeat)

    Raises:
    -------
    ValueError: if there is no feature row for ``month``
    """

    hw_cpi_cat = ['Health', 'Education']
    cpi_cat = ['headline CPI',
                'Food and non-alcoholic beverages',
                'Alcoholic beverages and tobacco',
                'Clothing and footwear',
                'Housing and utilities',
                'Household contents and services',
                'Transport',
                'Communication',
                'Recreation and culture',
                'Restaurants and hotels',
                'Miscellaneous goods and services']

    # Training window shared by both model families: months strictly before `month`,
    # in date order. It does not depend on the category, so it is built once.
    history = cpi[cpi['Date'] < month].sort_values(by='Date')

    # Holt-Winters: one-step-ahead forecast from the category's own series.
    hw_6_results = []
    for cat in hw_cpi_cat:
        hw_6 = HoltWintersWrapper(seasonal_periods=6)
        hw_6.fit(y=history[cat].values)
        # 'forcast' is the keyword spelling used at the original call site.
        hw_6_results.append(hw_6.predict(forcast=1)[0])

    df_hw_results = pd.DataFrame({'cat': hw_cpi_cat, 'pred': hw_6_results})

    # Feature row for the month being predicted; identical for every category.
    df_features = sabor_new.merge(fuel_new, right_on='newDate', left_on='newDate', how='left')
    df_predict = pd.DataFrame({'ds': [month]})
    df_predict['ds'] = pd.to_datetime(df_predict['ds']).dt.strftime('%Y-%m')
    df_predict = df_predict.merge(df_features, left_on='ds', right_on='newDate').drop(['newDate'], axis=1)
    if df_predict.empty:
        # Without this check the failure would surface later as an opaque error
        # from predicting on an empty frame.
        raise ValueError(f"No feature row available for month {month!r}")

    # Prophet: one model per remaining category.
    prophet_columns = ['ds', 'sabor']
    prophet_multi = []
    for cat in cpi_cat:
        df = history[['Date', cat, 'sabor']].rename(columns={'Date': 'ds', cat: 'y'})

        prophet = ProphetWrapper(n_changepoints=2, seasonality_mode="multiplicative")
        prophet.fit(df)

        prophet_multi.append(prophet.predict(df_predict[prophet_columns])[0])

    df_prophet_results = pd.DataFrame({'cat': cpi_cat, 'pred': prophet_multi})

    df_results = pd.concat([df_hw_results, df_prophet_results])

    return df_results
        

## Get April predictions

In [247]:
# Back-test: predict April 2023 using only the months before it.
df_results_apr = combined_model(cpi=cpi_all, month='2023-04')



In [251]:
# Display the April predictions (one row per category).
df_results_apr

Unnamed: 0,cat,pred
0,Health,109.248368
1,Education,110.400057
0,headline CPI,110.22344
1,Food and non-alcoholic beverages,118.78771
2,Alcoholic beverages and tobacco,110.328788
3,Clothing and footwear,103.828139
4,Housing and utilities,104.646256
5,Household contents and services,108.242326
6,Transport,120.258265
7,Communication,99.627989


In [248]:
# Actual CPI row for April 2023.
cpi_apr = cpi_all.loc[cpi_all['Date'] == '2023-04']

In [261]:
# Turn the single April row into long format: one row per category with its actual value.
cpi_apr = cpi_apr[list(category_dict.values())].T.reset_index()
cpi_apr = cpi_apr.set_axis(['Category', 'Value'], axis=1)

In [262]:
# Attach the actual April value to each prediction (matched on the category name).
df_results_apr = pd.merge(
    df_results_apr,
    cpi_apr[['Category', 'Value']],
    left_on='cat',
    right_on='Category',
)

In [263]:
# RMSE between the actual and the predicted April values. The root is taken explicitly
# because the `squared=False` argument is deprecated in recent scikit-learn releases;
# arguments are passed as (y_true, y_pred).
rmse_apr = mean_squared_error(df_results_apr['Value'], df_results_apr['pred']) ** 0.5
print('Apr RMSE prophet multi: ', rmse_apr)

Apr RMSE prophet multi:  2.138942006530779


In [None]:
# Keep the prediction columns and rename them to the submission schema.
df_results_apr = df_results_apr[['cat', 'pred']].rename(columns={'cat': 'ID', 'pred': 'Value'})

month = "April"

# Category name -> submission ID. The keys must equal the 'cat' values produced by
# combined_model exactly ('headline CPI', and 'Restaurants and hotels' without a
# trailing space); a key that does not match is silently left unreplaced.
pred_map = {'headline CPI': f'{month}_headline CPI',
'Alcoholic beverages and tobacco': f'{month}_alcoholic beverages and tobacco',
'Clothing and footwear': f'{month}_clothing and footwear',
'Communication': f'{month}_communication',
'Education': f'{month}_education',
'Food and non-alcoholic beverages': f'{month}_food and non-alcoholic beverages',
'Health': f'{month}_health',
'Household contents and services': f'{month}_household contents and services',
'Housing and utilities': f'{month}_housing and utilities',
'Miscellaneous goods and services': f'{month}_miscellaneous goods and services',
'Recreation and culture': f'{month}_recreation and culture',
'Restaurants and hotels': f'{month}_restaurants and hotels',
'Transport': f'{month}_transport'}

df_results_apr = df_results_apr.replace(pred_map)

df_results_apr.to_csv(path + '/submissions/apr_testing.csv', index=False)

## Get May predictions

In [None]:
# Back-test: predict May 2023 using only the months before it.
# combined_model's first parameter is named `cpi` (passing `cpi_all=` raises TypeError).
df_results_may = combined_model(cpi=cpi_all, month='2023-05')

In [None]:
# Actual CPI row for May 2023.
cpi_may = cpi_all.loc[cpi_all['Date'] == '2023-05']

In [None]:
# cpi_may is a single wide row (one column per category); reshape it to one row per
# category with 'Category'/'Value' columns before joining it to the predictions.
cpi_may_long = cpi_may[list(category_dict.values())].transpose().reset_index()
cpi_may_long.columns = ['Category', 'Value']

# Attach the actual May value to each prediction (matched on the category name).
df_results_may = df_results_may.merge(cpi_may_long, left_on='cat', right_on='Category')

In [None]:
# RMSE between the actual and the predicted May values. The root is taken explicitly
# because the `squared=False` argument is deprecated in recent scikit-learn releases;
# arguments are passed as (y_true, y_pred).
rmse_may = mean_squared_error(df_results_may['Value'], df_results_may['pred']) ** 0.5
print('May RMSE prophet multi: ', rmse_may)

In [None]:
# Keep the prediction columns and rename them to the submission schema.
df_results_may = df_results_may[['cat', 'pred']].rename(columns={'cat': 'ID', 'pred': 'Value'})

month = "May"

# Category name -> submission ID. The keys must equal the 'cat' values produced by
# combined_model exactly ('headline CPI', and 'Restaurants and hotels' without a
# trailing space); a key that does not match is silently left unreplaced.
pred_map = {'headline CPI': f'{month}_headline CPI',
'Alcoholic beverages and tobacco': f'{month}_alcoholic beverages and tobacco',
'Clothing and footwear': f'{month}_clothing and footwear',
'Communication': f'{month}_communication',
'Education': f'{month}_education',
'Food and non-alcoholic beverages': f'{month}_food and non-alcoholic beverages',
'Health': f'{month}_health',
'Household contents and services': f'{month}_household contents and services',
'Housing and utilities': f'{month}_housing and utilities',
'Miscellaneous goods and services': f'{month}_miscellaneous goods and services',
'Recreation and culture': f'{month}_recreation and culture',
'Restaurants and hotels': f'{month}_restaurants and hotels',
'Transport': f'{month}_transport'}

df_results_may = df_results_may.replace(pred_map)

df_results_may.to_csv(path + '/submissions/may_testing.csv', index=False)

## Get June predictions

In [None]:
# Back-test: predict June 2023 using only the months before it.
# combined_model's first parameter is named `cpi` (passing `cpi_all=` raises TypeError).
df_results_june = combined_model(cpi=cpi_all, month='2023-06')

In [None]:
# Hand-entered comparison values for June - presumably the published actuals (confirm the source).
# The list is assigned by position, so its order must match the row order of
# df_results_june: the two Holt-Winters categories first, then the Prophet categories.
df_results_june['Value'] = [110.8, 110.4, 109.8, 118.3, 110.9, 104.3, 105.4, 107.7, 112.3, 99.6, 105.3, 110.0, 109.6]

In [None]:
# RMSE between the comparison values and the predicted June values. The root is taken
# explicitly because the `squared=False` argument is deprecated in recent scikit-learn
# releases; arguments are passed as (y_true, y_pred).
rmse_June = mean_squared_error(df_results_june['Value'], df_results_june['pred']) ** 0.5
print('June RMSE prophet multi: ', rmse_June)

In [None]:
# Keep the prediction columns and rename them to the submission schema.
df_results_june = df_results_june[['cat', 'pred']].rename(columns={'cat': 'ID', 'pred': 'Value'})

month = "June"

# Category name -> submission ID. The keys must equal the 'cat' values produced by
# combined_model exactly ('headline CPI', and 'Restaurants and hotels' without a
# trailing space); a key that does not match is silently left unreplaced.
pred_map = {'headline CPI': f'{month}_headline CPI',
'Alcoholic beverages and tobacco': f'{month}_alcoholic beverages and tobacco',
'Clothing and footwear': f'{month}_clothing and footwear',
'Communication': f'{month}_communication',
'Education': f'{month}_education',
'Food and non-alcoholic beverages': f'{month}_food and non-alcoholic beverages',
'Health': f'{month}_health',
'Household contents and services': f'{month}_household contents and services',
'Housing and utilities': f'{month}_housing and utilities',
'Miscellaneous goods and services': f'{month}_miscellaneous goods and services',
'Recreation and culture': f'{month}_recreation and culture',
'Restaurants and hotels': f'{month}_restaurants and hotels',
'Transport': f'{month}_transport'}

df_results_june = df_results_june.replace(pred_map)

df_results_june.to_csv(path + '/submissions/june_testing.csv', index=False)

# Get predictions for July

In [None]:
# Predict July 2023.
# combined_model's first parameter is named `cpi` (passing `cpi_all=` raises TypeError).
df_results_july = combined_model(cpi=cpi_all, month='2023-07')

In [None]:
# Rename the two result columns to the submission schema.
df_results_july = df_results_july.set_axis(['ID', 'Value'], axis=1)

In [None]:
# Month label used as the prefix of every submission ID.
month = 'July'

In [None]:
# Category name -> submission ID. The keys must equal the 'cat' values produced by
# combined_model exactly ('headline CPI', and 'Restaurants and hotels' without a
# trailing space); a key that does not match is silently left unreplaced.
pred_map = {'headline CPI': f'{month}_headline CPI',
'Alcoholic beverages and tobacco': f'{month}_alcoholic beverages and tobacco',
'Clothing and footwear': f'{month}_clothing and footwear',
'Communication': f'{month}_communication',
'Education': f'{month}_education',
'Food and non-alcoholic beverages': f'{month}_food and non-alcoholic beverages',
'Health': f'{month}_health',
'Household contents and services': f'{month}_household contents and services',
'Housing and utilities': f'{month}_housing and utilities',
'Miscellaneous goods and services': f'{month}_miscellaneous goods and services',
'Recreation and culture': f'{month}_recreation and culture',
'Restaurants and hotels': f'{month}_restaurants and hotels',
'Transport': f'{month}_transport'}

In [None]:
# Replace each category name that has an entry in pred_map with its submission ID.
df_results_july = df_results_july.replace(pred_map)

In [None]:
# Write the July submission file (without the index column).
df_results_july.to_csv(path + '/submissions/2023-08-13_multi_july.csv', index=False)

# Get August submission

In [None]:
# Predict August 2023.
# combined_model's first parameter is named `cpi` (passing `cpi_all=` raises TypeError).
df_results_aug = combined_model(cpi=cpi_all, month='2023-08')

In [None]:
# Rename the two result columns to the submission schema.
df_results_aug = df_results_aug.set_axis(['ID', 'Value'], axis=1)

In [None]:
# Month label used as the prefix of every submission ID.
month = 'August'

In [None]:
# Category name -> submission ID. The keys must equal the 'cat' values produced by
# combined_model exactly ('headline CPI', and 'Restaurants and hotels' without a
# trailing space); a key that does not match is silently left unreplaced.
pred_map = {'headline CPI': f'{month}_headline CPI',
'Alcoholic beverages and tobacco': f'{month}_alcoholic beverages and tobacco',
'Clothing and footwear': f'{month}_clothing and footwear',
'Communication': f'{month}_communication',
'Education': f'{month}_education',
'Food and non-alcoholic beverages': f'{month}_food and non-alcoholic beverages',
'Health': f'{month}_health',
'Household contents and services': f'{month}_household contents and services',
'Housing and utilities': f'{month}_housing and utilities',
'Miscellaneous goods and services': f'{month}_miscellaneous goods and services',
'Recreation and culture': f'{month}_recreation and culture',
'Restaurants and hotels': f'{month}_restaurants and hotels',
'Transport': f'{month}_transport'}

In [None]:
# Replace each category name that has an entry in pred_map with its submission ID.
df_results_aug = df_results_aug.replace(pred_map)

In [None]:
# Write the August submission file (without the index column).
df_results_aug.to_csv(path + '/submissions/2023-08-13_multi_aug.csv', index=False)