# Data input structure

## Each row represents
- One **Brand**
- In one **Country**
- At one **Month** (Year + Month)

---

## Inputs to the model (`X`)
Each row must provide:

1. **Time features**
   - `time_idx` (int): monotonic month index (0, 1, 2, …).
   - `month_sin` (float): `sin(2π * month/12)`.
   - `month_cos` (float): `cos(2π * month/12)`.

2. **Categorical features**
   - `country_id` (int): encoded Country.
   - `brand_id` (int): encoded Brand.

3. **Controllable features (vector)**
   - Shape: `(n_controls,)`.
   - Example (if n_controls=10):
     `[promotions_index, media_invest, discount_pct, inflation, temperature_c, holiday_flag, Meaning, Difference, Salience, Premium]`
   - Values must be standardized (mean=0, std=1).
   - At **forecast time**: feed a vector of zeros.

---

## Extra data (not part of model inputs but needed in training)
- `group_id` (int): unique ID for each (Country × Date).  
  - All brands in the same Country+Date share the same `group_id`.  
  - Used in loss to apply softmax within groups.

- `y_true` (float): actual Power percentage for that row (0–100).  
  - During training: available.  
  - During forecast: unknown (to be predicted).

---

## Tensor shapes during training
- `time_idx`: `(batch, 1)`
- `month_sin`: `(batch, 1)`
- `month_cos`: `(batch, 1)`
- `country_id`: `(batch, 1)`
- `brand_id`: `(batch, 1)`
- `controls`: `(batch, n_controls)`
- `y_true`: `(batch,)`
- `group_id`: `(batch,)`

---

## Model outputs
- `logits`: `(batch, 1)` — unnormalized additive scores per row.
- `ctrl_out`: `(batch, 1)` — controllable contribution (for regularization).

---

## Loss function
- Uses `(logits, group_id)` to compute **softmax per group** (so all brands in same Country×Date sum to 1).
- Compares predicted fractions × 100 to `y_true` (MSE).
- Adds small L2 penalty on `ctrl_out`.


In [1]:
import pandas as pd
import numpy as np
import pickle as pkl

# BG Data

In [2]:
# import files for bg data
input_path = "C:/Users/40107904/OneDrive - Anheuser-Busch InBev/ABI/WORK/hackathon_power/hackathon_lt_equity/dummy_data"
bg_data = pd.read_csv(f"{input_path}/bg.csv")

bg_data = bg_data[bg_data['PERIOD_TYPE'] == 'R12M']
bg_data["year"] = bg_data["LABEL_PERIOD"].str.slice(0, 4).astype(int)
bg_data["month"] = bg_data["LABEL_PERIOD"].str.slice(4, 8).str.upper().str.strip()

month_map = {
    'JAN': 1,
    'FEB': 2,
    'MAR': 3,
    'APR': 4,
    'MAY': 5,
    'JUN': 6,
    'JUL': 7,
    'AUG': 8,
    'SEP': 9,
    'OCT': 10,
    'NOV': 11,
    'DEC': 12,
    'YTD': 12
}

bg_data["month"] = bg_data["month"].apply(lambda x: month_map[x] if x in month_map else 12)
bg_data = bg_data.drop(columns=["LABEL_PERIOD", "PERIOD_TYPE", "COHORT", "COHORT_NAME"])
bg_data = bg_data.rename(columns={"COUNTRY_CODE": "country", "BRAND_DESC": "brand"})
bg_data.columns = bg_data.columns.str.lower()
bg_data = bg_data.sort_values(by=["country", "brand", "year", "month"])
bg_data = bg_data.reset_index(drop=True)
bg_numeric_cols = []
for col in bg_data.select_dtypes(include=['float64', 'int64']).columns:
    if col not in ['year', 'month']:
        bg_numeric_cols.append(col)
bg_data = bg_data[[col for col in bg_data.columns if col not in bg_numeric_cols] + ['power']]
bg_data['country'] = 'brazil'
bg_data['brand'] = bg_data['brand'].str.lower().str.strip()
bg_data

Unnamed: 0,country,brand,year,month,power
0,brazil,amstel,2021,3,3.1
1,brazil,amstel,2021,4,3.1
2,brazil,amstel,2021,5,3.1
3,brazil,amstel,2021,6,3.1
4,brazil,amstel,2021,7,3.2
...,...,...,...,...,...
1758,brazil,tiger,2024,3,0.4
1759,brazil,tiger,2024,4,0.4
1760,brazil,tiger,2024,5,0.4
1761,brazil,tiger,2024,6,0.4


In [3]:
bg_data = bg_data[bg_data['brand'].isin(['becks', 'brahma', 'budweiser', 'corona', 'skol', 'spaten', 'stella artois'])].reset_index(drop=True)

# for each year-month, the sum of power for all brands should be 100
bg_data['power'] = bg_data['power'] / bg_data.groupby(['year', 'month'])['power'].transform('sum') * 100
# round the power to 1 decimal places
bg_data['power'] = bg_data['power'].round(1)

bg_data

Unnamed: 0,country,brand,year,month,power
0,brazil,brahma,2021,3,52.0
1,brazil,brahma,2021,4,52.2
2,brazil,brahma,2021,5,52.2
3,brazil,brahma,2021,6,52.4
4,brazil,brahma,2021,7,52.2
...,...,...,...,...,...
200,brazil,stella artois,2024,3,11.6
201,brazil,stella artois,2024,4,11.5
202,brazil,stella artois,2024,5,11.5
203,brazil,stella artois,2024,6,11.5


In [4]:
bg_data_backup = bg_data.copy()

# Weather data

In [5]:
# import files for weather data
weather_data = pd.read_csv(f"{input_path}/mroi/brazil_weather_data.csv")

weather_data['timedesc'] = pd.to_datetime(weather_data['timedesc'], format='%Y-%m-%d')
weather_data['year'] = weather_data['timedesc'].dt.year
weather_data['month'] = weather_data['timedesc'].dt.month
weather_data = weather_data.drop(columns=['timedesc'])
weather_data = weather_data[[x for x in weather_data.columns if x != 'state']].groupby(['country','year', 'month']).mean().reset_index()
weather_data = weather_data.sort_values(by=['country', 'year', 'month']).reset_index(drop=True)
weather_data

Unnamed: 0,country,year,month,avgtemp,maxtemp,mintemp,prcp
0,brazil,2007,1,79.966611,94.850000,67.770536,0.226270
1,brazil,2007,2,78.827847,93.760714,66.831250,0.288651
2,brazil,2007,3,79.015594,93.469286,66.038571,0.192972
3,brazil,2007,4,77.859795,92.122321,64.649107,0.229469
4,brazil,2007,5,75.079357,89.963393,58.626786,0.139833
...,...,...,...,...,...,...,...
214,brazil,2024,11,80.000545,96.285714,65.895000,0.162963
215,brazil,2024,12,78.036464,92.826786,65.503571,0.269948
216,brazil,2025,1,79.103295,93.280357,67.039286,0.309204
217,brazil,2025,2,79.223506,93.354464,67.253571,0.277279


# Macroeco data

In [6]:
# import files for macroeconomic data
macro_data = pd.read_csv(f"{input_path}/mroi/brazil_macroeconomics_data.csv")

macro_data['timedesc'] = pd.to_datetime(macro_data['timedesc'], format='%Y-%m-%d')
macro_data['year'] = macro_data['timedesc'].dt.year
macro_data['month'] = macro_data['timedesc'].dt.month
macro_data = macro_data.drop(columns=['timedesc'])
macro_data = macro_data.groupby(['country','year', 'month']).mean().reset_index()

macro_data

Unnamed: 0,country,year,month,inflation_rate,unemployment_rate
0,brazil,2015,1,7.137737,7.354279
1,brazil,2015,2,7.701587,7.354279
2,brazil,2015,3,8.128505,7.354279
3,brazil,2015,4,8.171487,7.953818
4,brazil,2015,5,8.472943,7.953818
...,...,...,...,...,...
118,brazil,2024,11,3.715704,7.804767
119,brazil,2024,12,3.566220,7.804767
120,brazil,2025,1,5.228843,6.383003
121,brazil,2025,2,5.310633,6.383003


# Media data

In [7]:
# import files for media data
media_data = pd.read_csv(f"{input_path}/mroi/brazil_weekly_decomps_data.csv")

media_data_backup = media_data.copy()

In [8]:
media_data = media_data_backup.copy()
cols_to_pick = ['country', 'brand', 'vehicle', 'date', 'spend', 'maco']
media_data = media_data[cols_to_pick]


media_data['date'] = pd.to_datetime(media_data['date'], format='%Y-%m-%d')
media_data['year'] = media_data['date'].dt.year
media_data['month'] = media_data['date'].dt.month
media_data = media_data.drop(columns=['date'])

non_digital = ["radio", "paytv", "ooh", "print", "cinema"]
# lable all other vehicles as digital
media_data['vehicle'] = media_data['vehicle'].apply(lambda x: x if x in non_digital else 'digital')

media_data = media_data.groupby(['country', 'brand', 'vehicle', 'year', 'month']).sum().reset_index()
# media_data = media_data.drop(columns=['vehicle'])
media_data = media_data.sort_values(by=['country', 'brand', 'vehicle', 'year', 'month']).reset_index(drop=True)

media_data = media_data.drop(columns=['maco'])
media_data_pivot = media_data.pivot_table(index=['country', 'brand', 'year', 'month'], columns='vehicle', values='spend', fill_value=0).reset_index()
media_data_pivot['spend'] = media_data_pivot[non_digital + ['digital']].sum(numeric_only=True, axis=1)

# spend shares by row
for col in non_digital + ['digital']:
    media_data_pivot[col] = media_data_pivot[col] / media_data_pivot['spend'] 

# media_data['roi'] = media_data['maco'] / media_data['spend']

media_data_pivot

vehicle,country,brand,year,month,cinema,digital,ooh,paytv,print,radio,spend
0,brazil,becks,2020,3,0.0,1.0,0.0,0.0,0.0,0.0,2291.378003
1,brazil,becks,2020,4,0.0,1.0,0.0,0.0,0.0,0.0,26.806220
2,brazil,becks,2020,6,0.0,1.0,0.0,0.0,0.0,0.0,9120.339331
3,brazil,becks,2020,7,0.0,1.0,0.0,0.0,0.0,0.0,201116.827180
4,brazil,becks,2020,8,0.0,1.0,0.0,0.0,0.0,0.0,69736.318720
...,...,...,...,...,...,...,...,...,...,...,...
574,brazil,ze,2024,8,0.0,1.0,0.0,0.0,0.0,0.0,111798.011739
575,brazil,ze,2024,9,0.0,1.0,0.0,0.0,0.0,0.0,171049.182372
576,brazil,ze,2024,10,0.0,1.0,0.0,0.0,0.0,0.0,111698.796975
577,brazil,ze,2024,11,0.0,1.0,0.0,0.0,0.0,0.0,256145.016071


In [9]:
print(media_data['brand'].value_counts())

brand
brahma           207
budweiser        163
skol             135
stella artois    130
corona           129
guarana          108
spaten            98
becks             68
ze                59
Name: count, dtype: int64


# Neilsen

In [10]:
# import files for pos data
pos_data = pd.read_csv(f"{input_path}/mroi/brazil_pos_data.csv")

pos_data = pos_data[pos_data['regiondesc'] == "ambev_total_brazil"]
pos_data['vol*wd'] = pos_data['volume_hl'] * pos_data['wd']
pos_data = pos_data.drop(columns=['regiondesc', 'channeldesc', 'product', 'wd'])
pos_data['timedesc'] = pd.to_datetime(pos_data['timedesc'], format='%Y-%m-%d')
pos_data['year'] = pos_data['timedesc'].dt.year
pos_data['month'] = pos_data['timedesc'].dt.month
pos_data = pos_data.drop(columns=['timedesc'])

pos_data = pos_data.groupby(['country', 'brand', 'year', 'month']).sum().reset_index()

pos_data['wd'] = pos_data['vol*wd'] / pos_data['volume_hl']
pos_data = pos_data.drop(columns=['vol*wd'])
pos_data['price_usd'] = pos_data['sales_usd'] / pos_data['volume_hl']

pos_data

Unnamed: 0,country,brand,year,month,volume_hl,price_usd,sales_usd,wd
0,brazil,becks,2021,4,15434.858220,333.266130,5.143915e+06,46.006752
1,brazil,becks,2021,5,17181.147640,349.999377,6.013391e+06,39.852145
2,brazil,becks,2021,6,17317.632840,362.576849,6.278973e+06,38.497249
3,brazil,becks,2021,7,18390.469200,356.069613,6.548287e+06,40.243897
4,brazil,becks,2021,8,18475.981540,356.806456,6.592349e+06,40.000063
...,...,...,...,...,...,...,...,...
352,brazil,stella artois,2025,2,161553.590000,322.455414,5.209383e+07,45.723218
353,brazil,stella artois,2025,3,181431.400000,319.028870,5.788185e+07,46.700260
354,brazil,stella artois,2025,4,157264.200000,321.187051,5.051122e+07,45.397008
355,brazil,stella artois,2025,5,158956.800000,319.539801,5.079302e+07,46.411804


# Concantenate

In [11]:
final_df = bg_data.copy()
final_df = final_df.merge(weather_data, on=['country', 'year', 'month'], how='left')
final_df = final_df.merge(macro_data, on=['country', 'year', 'month'], how='left')
final_df = final_df.merge(media_data_pivot, on=['country', 'brand', 'year', 'month'], how='left')
final_df = final_df.merge(pos_data, on=['country', 'brand', 'year', 'month'], how='left')
final_df

Unnamed: 0,country,brand,year,month,power,avgtemp,maxtemp,mintemp,prcp,inflation_rate,...,digital,ooh,paytv,print,radio,spend,volume_hl,price_usd,sales_usd,wd
0,brazil,brahma,2021,3,52.0,78.217462,92.275893,66.104464,0.285617,6.099479,...,1.000000,0.000000,0.000000,0.0,0.0,2.039287e+06,,,,
1,brazil,brahma,2021,4,52.2,77.396357,91.137500,64.177679,0.186533,6.759304,...,1.000000,0.000000,0.000000,0.0,0.0,2.585893e+06,1.839577e+06,182.123021,3.350294e+08,58.048098
2,brazil,brahma,2021,5,52.2,75.128937,89.091429,60.326429,0.141963,8.056065,...,0.918411,0.000000,0.081589,0.0,0.0,2.331956e+06,1.902111e+06,185.212284,3.522944e+08,58.210640
3,brazil,brahma,2021,6,52.4,74.388775,88.361607,59.716964,0.103733,8.347072,...,0.936590,0.006847,0.056563,0.0,0.0,1.534533e+06,1.792848e+06,185.524685,3.326176e+08,57.777443
4,brazil,brahma,2021,7,52.2,72.554099,88.608571,54.508571,0.081499,8.994823,...,0.872856,0.106817,0.020327,0.0,0.0,1.292131e+06,1.752503e+06,186.505429,3.268512e+08,58.643422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,brazil,stella artois,2024,3,11.6,80.208747,93.802143,68.008571,0.278326,4.254281,...,0.928653,0.041389,0.029958,0.0,0.0,2.176583e+05,1.180883e+05,395.237621,4.667294e+07,44.719785
201,brazil,stella artois,2024,4,11.5,79.001819,91.968750,66.563393,0.243686,3.919996,...,0.721692,0.263971,0.014337,0.0,0.0,2.347760e+06,1.162377e+05,388.451738,4.515273e+07,45.651960
202,brazil,stella artois,2024,5,11.5,78.316303,91.694643,63.684821,0.175895,3.797215,...,0.556629,0.350116,0.060601,0.0,0.0,1.524281e+06,1.210071e+05,386.609308,4.678246e+07,47.091780
203,brazil,stella artois,2024,6,11.5,75.155107,89.387857,60.156429,0.175970,4.206327,...,0.912635,0.039876,0.047489,0.0,0.0,1.341561e+06,1.198886e+05,386.514679,4.633870e+07,47.800273


In [12]:
final_df = final_df.fillna(0)
print(final_df.isna().sum())

country              0
brand                0
year                 0
month                0
power                0
avgtemp              0
maxtemp              0
mintemp              0
prcp                 0
inflation_rate       0
unemployment_rate    0
cinema               0
digital              0
ooh                  0
paytv                0
print                0
radio                0
spend                0
volume_hl            0
price_usd            0
sales_usd            0
wd                   0
dtype: int64


In [13]:
final_df

Unnamed: 0,country,brand,year,month,power,avgtemp,maxtemp,mintemp,prcp,inflation_rate,...,digital,ooh,paytv,print,radio,spend,volume_hl,price_usd,sales_usd,wd
0,brazil,brahma,2021,3,52.0,78.217462,92.275893,66.104464,0.285617,6.099479,...,1.000000,0.000000,0.000000,0.0,0.0,2.039287e+06,0.000000e+00,0.000000,0.000000e+00,0.000000
1,brazil,brahma,2021,4,52.2,77.396357,91.137500,64.177679,0.186533,6.759304,...,1.000000,0.000000,0.000000,0.0,0.0,2.585893e+06,1.839577e+06,182.123021,3.350294e+08,58.048098
2,brazil,brahma,2021,5,52.2,75.128937,89.091429,60.326429,0.141963,8.056065,...,0.918411,0.000000,0.081589,0.0,0.0,2.331956e+06,1.902111e+06,185.212284,3.522944e+08,58.210640
3,brazil,brahma,2021,6,52.4,74.388775,88.361607,59.716964,0.103733,8.347072,...,0.936590,0.006847,0.056563,0.0,0.0,1.534533e+06,1.792848e+06,185.524685,3.326176e+08,57.777443
4,brazil,brahma,2021,7,52.2,72.554099,88.608571,54.508571,0.081499,8.994823,...,0.872856,0.106817,0.020327,0.0,0.0,1.292131e+06,1.752503e+06,186.505429,3.268512e+08,58.643422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,brazil,stella artois,2024,3,11.6,80.208747,93.802143,68.008571,0.278326,4.254281,...,0.928653,0.041389,0.029958,0.0,0.0,2.176583e+05,1.180883e+05,395.237621,4.667294e+07,44.719785
201,brazil,stella artois,2024,4,11.5,79.001819,91.968750,66.563393,0.243686,3.919996,...,0.721692,0.263971,0.014337,0.0,0.0,2.347760e+06,1.162377e+05,388.451738,4.515273e+07,45.651960
202,brazil,stella artois,2024,5,11.5,78.316303,91.694643,63.684821,0.175895,3.797215,...,0.556629,0.350116,0.060601,0.0,0.0,1.524281e+06,1.210071e+05,386.609308,4.678246e+07,47.091780
203,brazil,stella artois,2024,6,11.5,75.155107,89.387857,60.156429,0.175970,4.206327,...,0.912635,0.039876,0.047489,0.0,0.0,1.341561e+06,1.198886e+05,386.514679,4.633870e+07,47.800273


# Model Input Creation

In [14]:
final_df_backup = final_df.copy()

In [18]:
df = final_df_backup.copy()

# time_idx column
df = df.sort_values(by=['country', 'year', 'month', 'brand']).reset_index(drop=True)
df

df = df.sort_values(['year', 'month'])
min_year, min_month = df['year'].min(), df['month'][df['year'] == df['year'].min()].min()

df['time_idx'] = (df['year'] - min_year) * 12 + (df['month'] - min_month)

# 3. Cyclical month encoding
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12.0).round(6)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12.0).round(6)
df

Unnamed: 0,country,brand,year,month,power,avgtemp,maxtemp,mintemp,prcp,inflation_rate,...,print,radio,spend,volume_hl,price_usd,sales_usd,wd,time_idx,month_sin,month_cos
0,brazil,brahma,2021,3,52.0,78.217462,92.275893,66.104464,0.285617,6.099479,...,0.0,0.000000,2.039287e+06,0.000000e+00,0.000000,0.000000e+00,0.000000,0,1.0,0.000000
1,brazil,budweiser,2021,3,19.8,78.217462,92.275893,66.104464,0.285617,6.099479,...,0.0,0.000000,1.851504e+05,0.000000e+00,0.000000,0.000000e+00,0.000000,0,1.0,0.000000
2,brazil,corona,2021,3,11.5,78.217462,92.275893,66.104464,0.285617,6.099479,...,0.0,0.000000,4.065109e+05,0.000000e+00,0.000000,0.000000e+00,0.000000,0,1.0,0.000000
3,brazil,spaten,2021,3,4.0,78.217462,92.275893,66.104464,0.285617,6.099479,...,0.0,0.000000,1.926752e+03,0.000000e+00,0.000000,0.000000e+00,0.000000,0,1.0,0.000000
4,brazil,stella artois,2021,3,12.7,78.217462,92.275893,66.104464,0.285617,6.099479,...,0.0,0.000000,4.283199e+04,0.000000e+00,0.000000,0.000000e+00,0.000000,0,1.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,brazil,brahma,2024,7,46.4,73.665021,89.108036,57.716071,0.077463,3.835883,...,0.0,0.000379,3.722585e+06,1.748808e+06,245.770147,4.298048e+08,49.579925,40,-0.5,-0.866025
201,brazil,budweiser,2024,7,19.0,73.665021,89.108036,57.716071,0.077463,3.835883,...,0.0,0.000000,1.410161e+06,2.878564e+05,265.411869,7.640051e+07,48.479478,40,-0.5,-0.866025
202,brazil,corona,2024,7,13.5,73.665021,89.108036,57.716071,0.077463,3.835883,...,0.0,0.000000,3.374824e+06,5.791244e+04,430.635230,2.493914e+07,68.194613,40,-0.5,-0.866025
203,brazil,spaten,2024,7,9.3,73.665021,89.108036,57.716071,0.077463,3.835883,...,0.0,0.000000,2.026831e+06,1.993254e+05,331.999014,6.617583e+07,56.403063,40,-0.5,-0.866025


In [19]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

numeric_cols = []
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    if col not in ['year', 'month', 'power', 'time_idx'] and col not in ['digital'] + non_digital:
        numeric_cols.append(col)
for col in numeric_cols:
    if col in media_data.columns:
        media_data[col] = (media_data[col] - df[col].min()) / df[col].max()
    df[col] = (df[col] - df[col].min()) / df[col].max()
    df[col] = df[col].astype(np.float32)
df

Unnamed: 0,country,brand,year,month,power,avgtemp,maxtemp,mintemp,prcp,inflation_rate,...,print,radio,spend,volume_hl,price_usd,sales_usd,wd,time_idx,month_sin,month_cos
0,brazil,brahma,2021,3,52.0,0.068728,0.045359,0.169615,0.705794,0.262406,...,0.0,0.000000,0.027797,0.000000,0.000000,0.000000,0.000000,0,2.0,1.000000
1,brazil,budweiser,2021,3,19.8,0.068728,0.045359,0.169615,0.705794,0.262406,...,0.0,0.000000,0.002524,0.000000,0.000000,0.000000,0.000000,0,2.0,1.000000
2,brazil,corona,2021,3,11.5,0.068728,0.045359,0.169615,0.705794,0.262406,...,0.0,0.000000,0.005541,0.000000,0.000000,0.000000,0.000000,0,2.0,1.000000
3,brazil,spaten,2021,3,4.0,0.068728,0.045359,0.169615,0.705794,0.262406,...,0.0,0.000000,0.000026,0.000000,0.000000,0.000000,0.000000,0,2.0,1.000000
4,brazil,stella artois,2021,3,12.7,0.068728,0.045359,0.169615,0.705794,0.262406,...,0.0,0.000000,0.000584,0.000000,0.000000,0.000000,0.000000,0,2.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,brazil,brahma,2024,7,46.4,0.013482,0.013491,0.046917,0.065781,0.075721,...,0.0,0.000379,0.050742,0.764100,0.570715,0.851189,0.644790,40,0.5,0.133975
201,brazil,budweiser,2024,7,19.0,0.013482,0.013491,0.046917,0.065781,0.075721,...,0.0,0.000000,0.019222,0.125772,0.616326,0.151304,0.630479,40,0.5,0.133975
202,brazil,corona,2024,7,13.5,0.013482,0.013491,0.046917,0.065781,0.075721,...,0.0,0.000000,0.046001,0.025303,1.000000,0.049390,0.886875,40,0.5,0.133975
203,brazil,spaten,2024,7,9.3,0.013482,0.013491,0.046917,0.065781,0.075721,...,0.0,0.000000,0.027627,0.087091,0.770952,0.131055,0.733525,40,0.5,0.133975


In [29]:
df.columns

Index(['country', 'brand', 'year', 'month', 'power', 'avgtemp', 'maxtemp',
       'mintemp', 'prcp', 'inflation_rate', 'unemployment_rate', 'cinema',
       'digital', 'ooh', 'paytv', 'print', 'radio', 'spend', 'volume_hl',
       'price_usd', 'sales_usd', 'wd', 'time_idx', 'month_sin', 'month_cos',
       'group_id'],
      dtype='object')

In [20]:
months_for_test = 3
index_for_train_test_cut = df['time_idx'].max() - months_for_test
train_cut_off = df[df['time_idx'] <= index_for_train_test_cut].index.max()
print(train_cut_off)

189


In [21]:
# group id by country-year-month
df['group_id'] = df.groupby(['country', 'year', 'month']).ngroup()

group_id = df['group_id'].values
print(group_id)

[ 0  0  0  0  0  1  1  1  1  1  2  2  2  2  2  3  3  3  3  3  4  4  4  4
  4  5  5  5  5  5  6  6  6  6  6  7  7  7  7  7  8  8  8  8  8  9  9  9
  9  9 10 10 10 10 10 11 11 11 11 11 12 12 12 12 12 13 13 13 13 13 14 14
 14 14 14 15 15 15 15 15 16 16 16 16 16 17 17 17 17 17 18 18 18 18 18 19
 19 19 19 19 20 20 20 20 20 21 21 21 21 21 22 22 22 22 22 23 23 23 23 23
 24 24 24 24 24 25 25 25 25 25 26 26 26 26 26 27 27 27 27 27 28 28 28 28
 28 29 29 29 29 29 30 30 30 30 30 31 31 31 31 31 32 32 32 32 32 33 33 33
 33 33 34 34 34 34 34 35 35 35 35 35 36 36 36 36 36 37 37 37 37 37 38 38
 38 38 38 39 39 39 39 39 40 40 40 40 40]


In [22]:
# time_idx variable
time_idx = (df['time_idx'] + 1).values
print(time_idx)

# month sin and cos
month_sin = df['month_sin'].values
month_cos = df['month_cos'].values
print(month_sin)
print(month_cos)

[ 1  1  1  1  1  2  2  2  2  2  3  3  3  3  3  4  4  4  4  4  5  5  5  5
  5  6  6  6  6  6  7  7  7  7  7  8  8  8  8  8  9  9  9  9  9 10 10 10
 10 10 11 11 11 11 11 12 12 12 12 12 13 13 13 13 13 14 14 14 14 14 15 15
 15 15 15 16 16 16 16 16 17 17 17 17 17 18 18 18 18 18 19 19 19 19 19 20
 20 20 20 20 21 21 21 21 21 22 22 22 22 22 23 23 23 23 23 24 24 24 24 24
 25 25 25 25 25 26 26 26 26 26 27 27 27 27 27 28 28 28 28 28 29 29 29 29
 29 30 30 30 30 30 31 31 31 31 31 32 32 32 32 32 33 33 33 33 33 34 34 34
 34 34 35 35 35 35 35 36 36 36 36 36 37 37 37 37 37 38 38 38 38 38 39 39
 39 39 39 40 40 40 40 40 41 41 41 41 41]
[2.       2.       2.       2.       2.       1.866025 1.866025 1.866025
 1.866025 1.866025 1.5      1.5      1.5      1.5      1.5      1.
 1.       1.       1.       1.       0.5      0.5      0.5      0.5
 0.5      0.133975 0.133975 0.133975 0.133975 0.133975 0.       0.
 0.       0.       0.       0.133975 0.133975 0.133975 0.133975 0.133975
 0.5      0.5      0.5     

In [23]:
# country and brand categorical variables
country_id = df['country'].astype('category').cat.codes.values
brand_id = df['brand'].astype('category').cat.codes.values
print(country_id)
print(brand_id)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1
 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3
 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0
 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2
 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4
 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4 0 1 2 3 4]


In [28]:
# vehicle categorical variables
vehicles = sorted(non_digital + ['digital'])
vehicle_id = pd.DataFrame({'vehicle': vehicles})['vehicle']
vehicle_id = vehicle_id.astype('category').cat.codes.values
print(vehicle_id)

[0 1 2 3 4 5]


In [48]:
# price_controls
n_price_controls = 11
price_control_cols = [x for x in df.columns if x not in ['country', 'brand', 'year', 'month', 'power', 'time_idx', 'month_sin', 'month_cos', 'group_id'] \
                and x not in ['avgtemp', 'maxtemp', 'mintemp', 'prcp', 'inflation_rate', 'unemployment_rate']
                and x not in vehicles\
                and x not in ['spend']]
n_price_controls = len(price_control_cols)

price_control_ids = df[price_control_cols]


print(n_price_controls, price_control_cols, price_control_ids)
print()
print(np.array(price_control_cols).shape)

4 ['volume_hl', 'price_usd', 'sales_usd', 'wd']      volume_hl  price_usd  sales_usd        wd
0     0.000000   0.000000   0.000000  0.000000
1     0.000000   0.000000   0.000000  0.000000
2     0.000000   0.000000   0.000000  0.000000
3     0.000000   0.000000   0.000000  0.000000
4     0.000000   0.000000   0.000000  0.000000
..         ...        ...        ...       ...
200   0.764100   0.570715   0.851189  0.644790
201   0.125772   0.616326   0.151304  0.630479
202   0.025303   1.000000   0.049390  0.886875
203   0.087091   0.770952   0.131055  0.733525
204   0.053114   0.896186   0.092909  0.633030

[205 rows x 4 columns]

(4,)


In [34]:
# vehicle controls
n_vehicle_controls = len(vehicles)
vehicle_control_ids = df[vehicles]
print(n_vehicle_controls, vehicles, vehicle_control_ids)
print()
print(np.array(vehicles).shape)

6 ['cinema', 'digital', 'ooh', 'paytv', 'print', 'radio']      cinema   digital       ooh     paytv  print     radio
0       0.0  1.000000  0.000000  0.000000    0.0  0.000000
1       0.0  1.000000  0.000000  0.000000    0.0  0.000000
2       0.0  1.000000  0.000000  0.000000    0.0  0.000000
3       0.0  1.000000  0.000000  0.000000    0.0  0.000000
4       0.0  1.000000  0.000000  0.000000    0.0  0.000000
..      ...       ...       ...       ...    ...       ...
200     0.0  0.962508  0.028077  0.009036    0.0  0.000379
201     0.0  1.000000  0.000000  0.000000    0.0  0.000000
202     0.0  0.760056  0.234428  0.005516    0.0  0.000000
203     0.0  0.649623  0.344896  0.005481    0.0  0.000000
204     0.0  0.881417  0.000000  0.118583    0.0  0.000000

[205 rows x 6 columns]

(6,)


In [36]:
# macro controls
macro_control_cols = ['avgtemp', 'maxtemp', 'mintemp', 'prcp', 'inflation_rate', 'unemployment_rate']
n_macro_controls = len(macro_control_cols)

macro_control_ids = df[macro_control_cols]
print(n_macro_controls, macro_control_cols, macro_control_ids)
print()
print(np.array(macro_control_cols).shape)

6 ['avgtemp', 'maxtemp', 'mintemp', 'prcp', 'inflation_rate', 'unemployment_rate']       avgtemp   maxtemp   mintemp      prcp  inflation_rate  unemployment_rate
0    0.068728  0.045359  0.169615  0.705794        0.262406           0.465659
1    0.068728  0.045359  0.169615  0.705794        0.262406           0.465659
2    0.068728  0.045359  0.169615  0.705794        0.262406           0.465659
3    0.068728  0.045359  0.169615  0.705794        0.262406           0.465659
4    0.068728  0.045359  0.169615  0.705794        0.262406           0.465659
..        ...       ...       ...       ...             ...                ...
200  0.013482  0.013491  0.046917  0.065781        0.075721           0.009396
201  0.013482  0.013491  0.046917  0.065781        0.075721           0.009396
202  0.013482  0.013491  0.046917  0.065781        0.075721           0.009396
203  0.013482  0.013491  0.046917  0.065781        0.075721           0.009396
204  0.013482  0.013491  0.046917  0.065781     

In [50]:
# total_spend
total_spend = df['spend'].values
print(total_spend)

[2.7796963e-02 2.5237347e-03 5.5410396e-03 2.6263031e-05 5.8383116e-04
 3.5247609e-02 9.8047685e-04 4.1724541e-03 1.2042266e-05 9.3251478e-04
 3.1786256e-02 3.2333634e-03 4.7805356e-03 0.0000000e+00 5.4668328e-03
 2.0916807e-02 9.0628034e-03 5.5561708e-03 8.8502584e-06 1.4566814e-02
 1.7612686e-02 1.9724997e-02 3.3312789e-03 1.2282558e-02 4.4567562e-03
 2.7381459e-02 9.8508308e-03 4.1539455e-03 1.6418373e-03 1.7706649e-02
 1.2071464e-02 2.8511485e-02 5.3633377e-03 3.2418493e-02 5.1871613e-03
 8.9212237e-03 1.0391875e-02 3.6167668e-03 2.0661201e-02 1.7804322e-03
 2.8724415e-02 1.0436062e-02 4.0772064e-03 4.7609238e-03 1.6873959e-03
 6.5499637e-03 2.6855767e-02 4.2145918e-03 8.8086922e-04 5.8211279e-03
 4.0906146e-02 6.1460743e-03 4.2596059e-03 2.4312163e-02 3.3897772e-03
 5.6299482e-02 1.3200964e-02 2.7442817e-03 4.2335037e-03 1.9226283e-03
 2.4783611e-02 4.7969725e-02 2.8766259e-03 8.1990304e-04 2.1251501e-03
 4.2379919e-02 1.6892731e-02 7.5678397e-03 2.2672422e-02 1.4816176e-02
 3.756

In [51]:
y_true = df['power'].values
print(y_true.shape)

(205,)


In [52]:
def fix_shape(arr, dtype=None):
    arr = np.array(arr)  # ensure ndarray
    arr = arr.reshape(arr.shape[0], -1)
    print(arr.shape)
    if arr.shape[0] < arr.shape[1]:
        arr = arr.T
    if dtype is not None:
        arr = arr.astype(dtype)
    print(arr.shape)
    print()
    return arr

time_idx   = fix_shape(time_idx, dtype="int32")
month_sin  = fix_shape(month_sin, dtype="float32")
month_cos  = fix_shape(month_cos, dtype="float32")
country_id = fix_shape(country_id, dtype="int32")
brand_id   = fix_shape(brand_id, dtype="int32")
vehicle_id = fix_shape(vehicle_id, dtype="int32")
price_controls = fix_shape(price_control_ids, dtype="float32")
vehicle_controls = fix_shape(vehicle_control_ids, dtype="float32")
macro_controls = fix_shape(macro_control_ids, dtype="float32")
total_spend = np.array(total_spend, dtype="float32").reshape(-1)   # (N,)
y_true     = np.array(y_true, dtype="float32").reshape(-1)   # (N,)
group_id   = np.array(group_id, dtype="int32").reshape(-1)   # (N,)

(205, 1)
(205, 1)

(205, 1)
(205, 1)

(205, 1)
(205, 1)

(205, 1)
(205, 1)

(205, 1)
(205, 1)

(6, 1)
(6, 1)

(205, 4)
(205, 4)

(205, 6)
(205, 6)

(205, 6)
(205, 6)



In [53]:
# train/test split
def train_test_split(arr, train_cut_off=train_cut_off):
    """Split the data into training and testing sets. 
    Input:
        arr: numpy array of (rows, cols)
        train_cut_off: index to split the data into training and testing sets

    Args:
        arr (np.ndarray): Input array to split.
        train_cut_off (int, optional): Index to split the data into training and testing sets. Defaults to train_cut_off.
    Returns:
        tuple: A tuple containing the training and testing sets for that array.
    """
    return arr[:train_cut_off+1], arr[train_cut_off+1:]

time_idx_train, time_idx_test = train_test_split(time_idx)
month_sin_train, month_sin_test = train_test_split(month_sin)
month_cos_train, month_cos_test = train_test_split(month_cos)
country_id_train, country_id_test = train_test_split(country_id)
brand_id_train, brand_id_test = train_test_split(brand_id)
vehicle_id_train, vehicle_id_test = train_test_split(vehicle_id)
price_controls_train, price_controls_test = train_test_split(price_controls)
vehicle_controls_train, vehicle_controls_test = train_test_split(vehicle_controls)
macro_controls_train, macro_controls_test = train_test_split(macro_controls)
total_spend_train, total_spend_test = train_test_split(total_spend)
y_true_train, y_true_test = train_test_split(y_true)    
group_id_train, group_id_test = train_test_split(group_id)

print(time_idx_train.shape, time_idx_test.shape)

(190, 1) (15, 1)


In [None]:
# controls_test = np.zeros_like(controls_test)

# Pickle Output

In [55]:
# dump in a pickle file
output_path = "C:/Users/40107904/OneDrive - Anheuser-Busch InBev/ABI/WORK/hackathon_power/hackathon_lt_equity/dummy_data/processed_data"
output_file = f"{output_path}/preprocessed_data_2.pkl"

input_data_dict = {
    "time_idx": time_idx_train,
    "month_sin": month_sin_train,
    "month_cos": month_cos_train,
    "country_id": country_id_train,
    "brand_id": brand_id_train,
    "vehicle_id": vehicle_id_train,
    "price_controls": price_controls_train,
    "vehicle_controls": vehicle_controls_train,
    "macro_controls": macro_controls_train,
    "total_spend": total_spend_train,
    "y_true": y_true_train,
    "group_id": group_id_train
}

output_data_dict = {
    "time_idx": time_idx_test,
    "month_sin": month_sin_test,
    "month_cos": month_cos_test,
    "country_id": country_id_test,
    "brand_id": brand_id_test,
    "vehicle_id": vehicle_id_test,
    "price_controls": price_controls_test,
    "vehicle_controls": vehicle_controls_test,
    "macro_controls": macro_controls_test,
    "total_spend": total_spend_test,
    "y_true": y_true_test,
    "group_id": group_id_test
}

final_dict = {
    "input_data": input_data_dict,
    "output_data": output_data_dict
}

with open(output_file, "wb") as f:
    pkl.dump(final_dict, f)