# BTK Datathon 2022 Competition Study

Our goal in this competition is to predict the selling price of each product: for every id in the test set, we must predict the value of the product price variable (`ürün fiyatı`).

https://www.kaggle.com/competitions/datathon2022

## Modules

In [1]:
import warnings, random, itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.api import VAR
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import kpss, adfuller, grangercausalitytests
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tools.eval_measures import rmse
from statsmodels.tools.sm_exceptions import ConvergenceWarning, ValueWarning

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer, MinMaxScaler
from sklearn.pipeline import make_pipeline

# Silence the statsmodels warning categories imported above for the whole notebook.
for _ignored_category in (ConvergenceWarning, ValueWarning):
    warnings.simplefilter('ignore', _ignored_category)

## Data Collection

In [2]:
# df_cpi source: https://evds2.tcmb.gov.tr/index.php?/evds/serieMarket
# df_currency source: https://evds2.tcmb.gov.tr/index.php?/evds/serieMarket

# Columns of train.csv / test.csv that are loaded as pandas categoricals.
_category_dtypes = {
    'ürün': 'category',
    'ürün kategorisi': 'category',
    'ürün üretim yeri': 'category',
    'market': 'category',
    'şehir': 'category',
}

# Competition data, indexed by the parsed 'tarih' date column.
df_train = pd.read_csv(
    'https://github.com/aParsecFromFuture/BTK-Akademi-DataThon-2022/blob/main/dataset/train.csv?raw=True',
    index_col=['tarih'],
    parse_dates=['tarih'],
    dtype=_category_dtypes,
)

df_test = pd.read_csv(
    'https://github.com/aParsecFromFuture/BTK-Akademi-DataThon-2022/blob/main/dataset/test.csv?raw=True',
    index_col=['tarih'],
    parse_dates=['tarih'],
    dtype=_category_dtypes,
)

# External series, indexed by the parsed 'Tarih' date column.
df_cpi = pd.read_csv(
    'https://github.com/aParsecFromFuture/BTK-Akademi-DataThon-2022/raw/main/dataset/cpi.csv?raw=True',
    index_col='Tarih',
    parse_dates=['Tarih'],
)

df_currency = pd.read_csv(
    'https://github.com/aParsecFromFuture/BTK-Akademi-DataThon-2022/blob/main/dataset/currency.csv?raw=True',
    index_col='Tarih',
    parse_dates=['Tarih'],
)

In [3]:
# Print a concise summary (columns, dtypes, non-null counts) of the training frame.
df_train.info()

In [4]:
# Print a concise summary (columns, dtypes, non-null counts) of the test frame.
df_test.info()

In [5]:
# Print a concise summary (columns, dtypes, non-null counts) of the exchange-rate frame.
df_currency.info()

In [6]:
# Print a concise summary (columns, dtypes, non-null counts) of the CPI frame.
df_cpi.info()

In [7]:
# Collect the distinct levels of each categorical column as plain Python lists.
categories, products, countries, markets, cities = (
    df_train[column].cat.categories.tolist()
    for column in ('ürün kategorisi', 'ürün', 'ürün üretim yeri', 'market', 'şehir')
)

print(categories, products, countries, markets, cities, sep='\n')

## Data Wrangling

In [8]:
# For every product, the array of categories it appears under in the training data.
groupped = df_train['ürün kategorisi'].groupby(df_train['ürün']).unique()

# Column index for the wide price table: one entry per
# (product, country, city, market) combination, prefixed with the first
# category listed for that product.
idx_col = pd.MultiIndex.from_tuples(
    [
        (groupped[product][0], product, country, city, market)
        for product, country, city, market
        in itertools.product(products, countries, cities, markets)
    ],
    names=['category', 'product', 'country', 'city', 'market'],
)

In [9]:
# Reshape the long training table into a wide one: one price column per
# (product, country, city, market) combination, rows indexed by date.
key_columns = ['ürün', 'ürün üretim yeri', 'şehir', 'market']
time_series = []

for combo in itertools.product(products, countries, cities, markets):
  # Rows that match the combination on all four key columns.
  mask = np.logical_and.reduce(
      [df_train[column] == value for column, value in zip(key_columns, combo)])

  prices = df_train.loc[mask, 'ürün fiyatı']
  time_series.append(prices.rename('{0}_{1}_{2}_{3}'.format(*combo)))

# The combinations are generated in the same order as idx_col, so the
# MultiIndex can be attached positionally before sorting both axes.
df_train = (
    pd.concat(time_series, axis=1)
      .set_axis(idx_col, axis=1)
      .sort_index(axis=0)
      .sort_index(axis=1)
)

In [10]:
# Show the first rows of the reshaped (wide) training table.
df_train.head()

## Exploratory Data Analysis

### Average Price Based on Products

In [11]:
plt.rcParams["figure.figsize"] = (16, 12)

use_products = ['Bulgur', 'hindi', 'badem', 'Armut', 'Bakla-kuru', 'Kaşar peyniri']
use_dates = pd.date_range('2016-01-01', '2020-12-01', freq='MS')

fig, axes = plt.subplots(3, 1, sharex=True)

# One panel per differencing order (title fixed: "2nd", not "2st").
for ax, title in zip(axes, ['Original Data', '1st Differencing', '2nd Differencing']):
  ax.set_title(title)
  ax.set_ylabel('Turkish Lira(₺)')
  ax.grid()

for product in use_products:
  # Average price of the product over all of its columns.
  vals = df_train.loc[use_dates, pd.IndexSlice[:, product]].mean(axis=1)
  first_diff = vals.diff()  # computed once, reused for the second difference
  axes[0].plot(vals)
  axes[1].plot(first_diff)
  axes[2].plot(first_diff.diff())

plt.legend(use_products)
plt.show()

### Average Price Based on Categories

In [12]:
plt.rcParams["figure.figsize"] = (16, 12)

use_categories = ['et', 'kuruyemiş', 'meyve', 'sebze ve bakliyat', 'süt ürünleri ve kahvaltılık', 'tahıl ve ürünleri']
use_dates = pd.date_range('2016-01-01', '2020-12-01', freq='MS')

fig, axes = plt.subplots(3, 1, sharex=True)

# One panel per differencing order (title fixed: "2nd", not "2st").
for ax, title in zip(axes, ['Original Data', '1st Differencing', '2nd Differencing']):
  ax.set_title(title)
  ax.set_ylabel('Turkish Lira(₺)')
  ax.grid()

for category in use_categories:
  # Average price over every column that belongs to the category.
  vals = df_train.loc[use_dates, pd.IndexSlice[category]].mean(axis=1)
  first_diff = vals.diff()  # computed once, reused for the second difference
  axes[0].plot(vals)
  axes[1].plot(first_diff)
  axes[2].plot(first_diff.diff())

plt.legend(use_categories)
plt.show()

### Product Price Based on Countries

In [13]:
plt.rcParams["figure.figsize"] = (12, 8)
barWidth = 0.20

# Mean price per (product, country) pair. Grouping the transposed frame by
# column level replaces the deprecated `groupby(..., axis=1)`.
groupped_prices = df_train.T.groupby(level=['product', 'country']).mean().T

use_products = ['Kereviz-baş', 'Kiraz', 'Kırmızı-pancar', 'Lahana', 'Limon', 'Makarna']
use_countries = ['Yurt içi', 'Yurt dışı']
use_date = '2020-01-01'

for i, country in enumerate(use_countries):
  vals = groupped_prices.loc[use_date, pd.IndexSlice[use_products, country]]
  # Shift each country's bars so the two bars of a product sit side by side.
  x_ticks = [x + 0.1 + i * barWidth for x in range(len(vals))]
  # Label the bar group with the country it shows; the original passed a
  # stale `product` variable left over from an earlier cell.
  plt.bar(x_ticks, vals, width=barWidth, label=country, edgecolor='black')

plt.title(f'Product Prices Based on Countries - {use_date}')
plt.ylabel('Turkish Lira(₺)')
plt.xticks([r + barWidth for r in range(len(use_products))], use_products)

plt.legend()
plt.show()

### Product Price Based on Cities

In [14]:
plt.rcParams["figure.figsize"] = (12, 8)
barWidth = 0.20

# Mean price per (product, city) pair. Grouping the transposed frame by
# column level replaces the deprecated `groupby(..., axis=1)`.
groupped_prices = df_train.T.groupby(level=['product', 'city']).mean().T

use_products = ['Kereviz-baş', 'Kiraz', 'Kırmızı-pancar', 'Lahana', 'Limon', 'Makarna']
use_cities = ['A', 'B', 'C']
use_date = '2020-01-01'

for i, city in enumerate(use_cities):
  vals = groupped_prices.loc[use_date, pd.IndexSlice[use_products, city]]
  # Shift each city's bars so the bars of a product sit side by side.
  x_ticks = [x + i * barWidth for x in range(len(vals))]
  # Label the bar group with the city it shows; the original passed a
  # stale `product` variable left over from an earlier cell.
  plt.bar(x_ticks, vals, width=barWidth, label=city, edgecolor='black')

plt.title(f'Product Prices Based on Cities - {use_date}')
plt.ylabel('Turkish Lira(₺)')
plt.xticks([r + barWidth for r in range(len(use_products))], use_products)

plt.legend()
plt.show()

### Product Price Based on Markets

In [15]:
plt.rcParams["figure.figsize"] = (12, 8)
barWidth = 0.20

# Mean price per (product, market) pair. Grouping the transposed frame by
# column level replaces the deprecated `groupby(..., axis=1)`.
groupped_prices = df_train.T.groupby(level=['product', 'market']).mean().T

use_products = ['Kereviz-baş', 'Kiraz', 'Kırmızı-pancar', 'Lahana', 'Limon', 'Makarna']
use_markets = ['B', 'C', 'M']
use_date = '2020-01-01'

for i, market in enumerate(use_markets):
  vals = groupped_prices.loc[use_date, pd.IndexSlice[use_products, market]]
  # Shift each market's bars so the bars of a product sit side by side.
  x_ticks = [x + i * barWidth for x in range(len(vals))]
  # Label the bar group with the market it shows; the original passed a
  # stale `product` variable left over from an earlier cell.
  plt.bar(x_ticks, vals, width=barWidth, label=market, edgecolor='black')

plt.title(f'Product Prices Based on Markets - {use_date}')
plt.ylabel('Turkish Lira(₺)')
plt.xticks([r + barWidth for r in range(len(use_products))], use_products)

plt.legend()
plt.show()

### Statistical Tests

#### Seasonal Decomposition

Info

* The Seasonal Decomposition procedure decomposes a series into a seasonal component, a trend component, and a residual ("error") component.

Conclusion

* No real-world seasonal price fluctuations were observed in products.
* No significant seasonal price fluctuation was observed in products.

In [16]:
plt.rcParams["figure.figsize"] = (16, 4)

# Three products for each harvest season.
use_products = [
    'İncir', 'Armut', 'Üzüm',      # september products
    'Muz', 'Nar', 'Portakal',      # winter products
    'Marul', 'Havuç', 'Ispanak',   # spring products
    'Karpuz', 'Kavun', 'Şeftali',  # summer products
]

fig, ax = plt.subplots(1, 1, sharex=True)

ax.set_title('Seasonal Effect on the Monthly Price Changes')
ax.set_ylabel('Turkish Lira(₺)')
ax.grid()

for product in use_products:
  # Average price of the product, decomposed additively with a 12-month period;
  # only the seasonal component is drawn.
  vals = df_train.loc[:, pd.IndexSlice[:, product]].mean(axis=1)
  results = seasonal_decompose(vals, model='additive', period=12)
  ax.plot(results.seasonal)

plt.legend(use_products)
plt.show()

#### KPSS test

Info

* Kwiatkowski–Phillips–Schmidt–Shin (KPSS) tests are used for testing a null hypothesis that an observable time series is stationary around a deterministic trend (i.e. trend-stationary) against the alternative of a unit root.

Conclusion

* The data has a non-stationary trend.

In [17]:
# Run the KPSS test on one randomly chosen price series.
use_series = random.choice(df_train.columns)
vals = df_train.loc[:, use_series]

# The KPSS null hypothesis is stationarity, so a small p-value rejects it.
statistic, p_value, n_lags, critical_values = kpss(vals, nlags=1)

print(f'KPSS Statistic: {statistic}')
print(f'p-value: {p_value}')
print(f'num lags: {n_lags}')
print('Critical Values:')  # header typo fixed ("Critial")

for key, value in critical_values.items():
  print(f'\t{key} : {value}')

print(f'Result: The series is {"not " if p_value < 0.05 else ""}stationary')

#### Dickey Fuller Test

Info

* Dickey–Fuller test tests the null hypothesis that a unit root is present in an autoregressive time series model.

Conclusion

* The data has a quadratic trend.

In [18]:
# Augmented Dickey-Fuller test on one random series, repeated on its first
# and second differences.
use_series = random.choice(df_train.columns)
vals = df_train.loc[:, use_series]

print(f'Adfuller Test For "{use_series[1]}"\n\n')
for order in [0, 1, 2]:
    adf_output = adfuller(vals)
    statistic, p_value, critical_values = adf_output[0], adf_output[1], adf_output[4]

    print(f'Order: {order}')
    print(f'ADF Statistic: {statistic}')
    print(f'p-value: {p_value}')
    print('Critical Values:')

    for key, value in critical_values.items():
        print(f'\t{key}: {value:.3f}')

    # The ADF null hypothesis is a unit root, so a large p-value means
    # non-stationarity cannot be rejected.
    print(f'Result: The series is {"not " if p_value > 0.05 else ""}stationary\n')

    # Difference once more for the next round.
    vals = vals.diff().dropna()

#### Granger Causality Tests

Info

* The Granger causality test is a statistical hypothesis test for determining whether one time series is useful in forecasting another.

Conclusion

* The USD exchange rate is useful for predicting product prices.

In [19]:
# Does the USD exchange rate help forecast a randomly chosen price series?
use_series = random.choice(df_train.columns)
use_currency = 'USD'
use_dates = pd.date_range('2016-01-01', '2020-12-01', freq='MS')

# Both series are differenced twice before testing.
vals = df_train.loc[use_dates, use_series].diff().diff().dropna()
curr = df_currency.loc[use_dates, use_currency].diff().diff().dropna()

# Column order matters: the test asks whether the second column
# Granger-causes the first.
granger_input = pd.concat([vals, curr], axis=1)
results = grangercausalitytests(granger_input, maxlag=3)

In [20]:
# Does the CPI help forecast a randomly chosen price series?
# Reuses `use_dates` from the previous cell.
use_series = random.choice(df_train.columns)
use_stats = 'CPI'

# Both series are differenced twice before testing.
vals = df_train.loc[use_dates, use_series].diff().diff().dropna()
stat = df_cpi.loc[use_dates, use_stats].diff().diff().dropna()

# Column order matters: the test asks whether the second column
# Granger-causes the first.
granger_input = pd.concat([vals, stat], axis=1)
results = grangercausalitytests(granger_input, maxlag=3)

#### Auto Correlation and Partial Auto Correlation

Info

* Auto Correlation is the similarity between observations as a function of the time lag between them. 

* Partial Auto Correlation gives the partial correlation of a stationary time series with its own lagged values, after regressing out the values of the time series at all shorter lags.

In [21]:
# ACF and PACF (12 lags) of one random series after second-order differencing.
use_series = random.choice(df_train.columns)
vals = df_train.loc[:, use_series].diff().diff().dropna()

fig, axes = plt.subplots(2, 1, figsize=(16, 8))
acf_ax, pacf_ax = axes

_ = plot_acf(vals, ax=acf_ax, lags=12)
_ = plot_pacf(vals, ax=pacf_ax, lags=12, method='ywm')

### Comparing Exchange Rate and Product Prices

In [22]:
# Show the first rows of the exchange-rate table.
df_currency.head()

In [23]:
plt.rcParams["figure.figsize"] = (16, 12)

use_currency = ['EUR', 'USD']
use_products = ['Bulgur', 'hindi', 'badem']
use_dates    = pd.date_range('2016-01-01', '2020-12-01', freq='MS')

fig, axes = plt.subplots(3, 1, sharex=True)

# One panel per differencing order (title fixed: "2nd", not "2st").
for ax, title in zip(axes, ['Original Data', '1st Differencing', '2nd Differencing']):
  ax.set_title(title)
  ax.set_ylabel('Turkish Lira(₺)')
  ax.grid()

# Average product prices first, then exchange rates, so the line order matches
# the legend entries below.
for product in use_products:
  vals = df_train.loc[use_dates, pd.IndexSlice[:, product]].mean(axis=1)
  first_diff = vals.diff()  # computed once, reused for the second difference
  axes[0].plot(vals)
  axes[1].plot(first_diff)
  axes[2].plot(first_diff.diff())

for currency in use_currency:
  vals = df_currency.loc[use_dates, currency]
  first_diff = vals.diff()
  axes[0].plot(vals)
  axes[1].plot(first_diff)
  axes[2].plot(first_diff.diff())

plt.legend(use_products + use_currency)
plt.show()

### Correlation Matrices between Product Prices and Exchange Rates

#### Correlation Matrix Based on Categories

In [24]:
use_currency = ['EUR', 'USD', 'CNY', 'GBP', 'JPY', 'RUB']
# Defined here under the name the rest of the cell reads. The original
# assigned `use_category` but used `use_categories`, so it only ran when an
# earlier cell had already created that variable.
use_categories = ['et', 'kuruyemiş', 'meyve', 'sebze ve bakliyat', 'süt ürünleri ve kahvaltılık', 'tahıl ve ürünleri']

# corr_matrix[i, j] = correlation between currency i and the mean price of category j.
corr_matrix = np.zeros((len(use_currency), len(use_categories)))

for i, j in itertools.product(range(len(use_currency)), range(len(use_categories))):
  currency = df_currency.loc[:, use_currency[i]]
  category_mean = df_train.loc[:, pd.IndexSlice[use_categories[j]]].mean(axis=1)
  corr_matrix[i, j] = category_mean.corr(currency)

plt.figure(figsize = (8, 8))
plt.matshow(corr_matrix, fignum=1, cmap=plt.cm.copper)

plt.xticks(range(len(use_categories)), use_categories, rotation='vertical')
plt.yticks(range(len(use_currency)), use_currency, rotation='horizontal')

# Write each coefficient into its cell (x = column, y = row).
for i, j in itertools.product(range(len(use_currency)), range(len(use_categories))):
  plt.text(j, i, f'{corr_matrix[i, j]:0.4f}', color='white', horizontalalignment='center')

plt.show()

#### Correlation Matrix Based on Products

In [25]:
use_currency = ['EUR', 'USD', 'CNY', 'GBP', 'JPY', 'RUB']
# NOTE(review): 'hindi' is listed twice, so its column appears twice in the
# matrix — confirm whether another product was intended.
use_products = ['Bulgur', 'hindi', 'badem', 'Çilek', 'Elma', 'hindi']

# corr_matrix[i, j] = correlation between currency i and the mean price of product j.
corr_matrix = np.zeros((len(use_currency), len(use_products)))

for (i, currency_name), (j, product_name) in itertools.product(
    enumerate(use_currency), enumerate(use_products)):
  currency = df_currency.loc[:, currency_name]
  product = df_train.loc[:, pd.IndexSlice[:, product_name]].mean(axis=1)
  corr_matrix[i, j] = product.corr(currency)

plt.figure(figsize = (8, 8))
plt.matshow(corr_matrix, fignum=1, cmap=plt.cm.copper)

plt.xticks(range(len(use_products)), use_products, rotation='vertical')
plt.yticks(range(len(use_currency)), use_currency, rotation='horizontal')

# Write each coefficient into its cell (x = column, y = row).
for (i, j), coefficient in np.ndenumerate(corr_matrix):
  plt.text(j, i, f'{coefficient:0.4f}', color='white', horizontalalignment='center')

plt.show()

#### Correlation Matrix Based on Countries

In [26]:
use_currency = ['EUR', 'USD', 'CNY', 'GBP', 'JPY', 'RUB']
use_countries = ['Yurt içi', 'Yurt dışı']

# corr_matrix[i, j] = correlation between currency i and the mean price of
# all products with production place j.
corr_matrix = np.zeros((len(use_currency), len(use_countries)))

for (i, currency_name), (j, country_name) in itertools.product(
    enumerate(use_currency), enumerate(use_countries)):
  currency = df_currency.loc[:, currency_name]
  product = df_train.loc[:, pd.IndexSlice[:, :, country_name]].mean(axis=1)
  corr_matrix[i, j] = product.corr(currency)

plt.figure(figsize = (8, 8))
plt.matshow(corr_matrix, fignum=1, cmap=plt.cm.copper)

plt.xticks(range(len(use_countries)), use_countries, rotation='vertical')
plt.yticks(range(len(use_currency)), use_currency, rotation='horizontal')

# Write each coefficient into its cell (x = column, y = row).
for (i, j), coefficient in np.ndenumerate(corr_matrix):
  plt.text(j, i, f'{coefficient:0.4f}', color='white', horizontalalignment='center')

plt.show()

## METHOD-1 (ARIMA)

* p: number of autoregressive terms (AR order)
* d: number of nonseasonal differences (differencing order)
* q: number of moving-average terms (MA order)

In [27]:
# Pick one random series and split it: 2016-2019 (48 months) for training,
# 2020 (12 months) for validation.
use_series = random.choice(df_train.columns)
use_train_dates = pd.date_range('2016-01-01', periods=48, freq='MS')
use_valid_dates = pd.date_range('2020-01-01', periods=12, freq='MS')
order = (3, 2, 0)  # (p, d, q)

train_x, test_x = (
    df_train.loc[dates, use_series]
    for dates in (use_train_dates, use_valid_dates)
)

In [28]:
# Fit an ARIMA model of the chosen (p, d, q) order on the training window,
# declaring a month-start frequency, and print the fit summary.
model = ARIMA(train_x, order=order, freq='MS')
result = model.fit()
print(result.summary())

In [29]:
# Forecast the 12 validation months together with a 95% confidence band.
forecast = result.get_forecast(steps=12)
fc = forecast.predicted_mean

conf = forecast.conf_int(alpha=0.05)
lower, upper = conf.iloc[:, 0], conf.iloc[:, 1]

In [30]:
plt.figure(figsize=(12,5))
plt.title(f'Product Price Forecast of "{use_series[1]}"')

# Training history, held-out year and forecast on the same axes.
for series, label in ((train_x, 'training data'), (test_x, 'test data'), (fc, 'forecast')):
  plt.plot(series, label=label)

# Shade the confidence band of the forecast.
plt.fill_between(lower.index, lower.values, upper.values, color='k', alpha=.15)

plt.ylabel('Turkish Lira(₺)')
plt.legend()
plt.grid()
plt.show()

print(f'RMSE: {rmse(test_x, fc)}')

## METHOD-2 (VAR)

* p: number of lags included in the model (VAR order)

In [31]:
# Pair one random price series with the USD exchange rate.
use_series = random.choice(df_train.columns)
use_currency = 'USD'
use_train_dates = pd.date_range('2016-01-01', periods=48, freq='MS')
use_valid_dates = pd.date_range('2020-01-01', periods=12, freq='MS')

train_X1 = df_train.loc[use_train_dates, use_series]
train_X2 = df_currency.loc[use_train_dates, use_currency]

# The VAR model is trained on the second differences of both series.
train_x = np.column_stack(
    [series.diff().diff().dropna() for series in (train_X1, train_X2)])

test_X1 = df_train.loc[use_valid_dates, use_series]
test_X2 = df_currency.loc[use_valid_dates, use_currency]

In [32]:
# Fit a VAR model on the twice-differenced (price, exchange-rate) pairs,
# passing 4 as the lag argument, and print the fit summary.
model = VAR(train_x)
result = model.fit(4)
print(result.summary())

In [33]:
# Forecast 12 steps of the second differences. The history window is sized
# from the fitted lag order rather than a hard-coded 4, so it stays correct
# if the order passed to `model.fit` changes.
fc, lower, upper = result.forecast_interval(train_x[-result.k_ar:], steps=12)

# Undo the double differencing for the product price:
# second differences -> first differences -> levels.
fc_1 = (train_X1.iloc[-1] - train_X1.iloc[-2]) + fc[:, 0].cumsum() # to 1st diff
fc_1 = train_X1.iloc[-1] + fc_1.cumsum() # to levels

# Same reconstruction for the exchange rate.
fc_2 = (train_X2.iloc[-1] - train_X2.iloc[-2]) + fc[:, 1].cumsum() # to 1st diff
fc_2 = train_X2.iloc[-1] + fc_2.cumsum() # to levels

In [34]:
fig, axes = plt.subplots(2, 1, figsize=(12,10), sharex=True)

# (title, training series, validation series, level forecast) per panel;
# the panel position is also the column index into `lower` / `upper`.
panels = [
    (f'Product Price Forecast of "{use_series[1]}"', train_X1, test_X1, fc_1),
    (f'Exchange Forecast of "{use_currency}"', train_X2, test_X2, fc_2),
]

for column, (ax, (title, train, test, level_fc)) in enumerate(zip(axes, panels)):
    ax.set_title(title)
    ax.plot(train, label='training data')
    ax.plot(test, label='test data')
    ax.plot(use_valid_dates, level_fc, label='forecast')
    # NOTE(review): `lower` / `upper` are bounds of the second-difference
    # forecast, yet they are added to the level forecast here — confirm this
    # band is what is intended.
    ax.fill_between(use_valid_dates, level_fc + lower[:, column], level_fc + upper[:, column], color='k', alpha=.15)
    ax.set_ylabel('Turkish Lira(₺)')
    ax.legend()
    ax.grid()

plt.show()

print(f'RMSE(product): {rmse(test_X1, fc_1)}')
print(f'RMSE(currency): {rmse(test_X2, fc_2)}')

## METHOD-3 (LSTM)

In [35]:
# Pair one random price series with the USD exchange rate.
use_series = random.choice(df_train.columns)
use_currency = 'USD'
use_train_dates = pd.date_range('2016-01-01', periods=48, freq='MS')
use_valid_dates = pd.date_range('2020-01-01', periods=12, freq='MS')
nsteps = 6  # length of each input window

# Two stacked LSTM layers followed by two dense layers; the network maps a
# window of `nsteps` (price, rate) pairs to a single (price, rate) pair.
model = Sequential()
model.add(LSTM(units=128, input_shape=(nsteps, 2), return_sequences=True))
model.add(LSTM(units=128))
model.add(Dense(units=64))
model.add(Dense(units=2))

scaler = MinMaxScaler(feature_range=(0, 1))
model.compile(optimizer='adam', loss='mean_squared_error')

In [36]:
train_X1 = df_train.loc[use_train_dates, use_series]
train_X2 = df_currency.loc[use_train_dates, use_currency]

test_X1 = df_train.loc[use_valid_dates, use_series]
test_X2 = df_currency.loc[use_valid_dates, use_currency]

# Stack price and exchange rate column-wise and scale both to [0, 1]
# using the training range only.
train_data = scaler.fit_transform(np.column_stack([train_X1, train_X2]))

# Sliding windows: each sample is `nsteps` consecutive rows and its target
# is the row that immediately follows the window.
window_starts = range(len(train_data) - nsteps)
train_x = np.array([train_data[start:start + nsteps] for start in window_starts])
train_y = np.array([train_data[start + nsteps] for start in window_starts])

In [37]:
# Train the network on the sliding windows: 10 epochs, mini-batches of 4.
model.fit(x=train_x, y=train_y, batch_size=4, epochs=10)

In [38]:
# Seed the recursive forecast with the LAST `nsteps` scaled observations.
# The original used `train_x[-1]`, which is the window that precedes the final
# training row, so the first "forecast" re-predicted the last training month
# and every later step was shifted one month back.
test_x = train_data[-nsteps:].reshape(1, nsteps, 2)
fc = np.zeros((12, 2))

for i in range(12):
  pred = model.predict(test_x)
  fc[i] = pred[0]
  # Slide the window: drop the oldest step and append the new prediction.
  test_x = np.concatenate([test_x[:, 1:], pred.reshape(1, 1, 2)], axis=1)

# Map the forecasts back from the [0, 1] scale to the original units.
fc = scaler.inverse_transform(fc)

In [39]:
fig, axes = plt.subplots(2, 1, figsize=(12,10), sharex=True)

# (title, training series, validation series) per panel; the panel position
# is also the column index into `fc`.
panels = [
    (f'Product Price Forecast of "{use_series[1]}"', train_X1, test_X1),
    (f'Exchange Forecast of "{use_currency}"', train_X2, test_X2),
]

for column, (ax, (title, train, test)) in enumerate(zip(axes, panels)):
    ax.set_title(title)
    ax.plot(train, label='training data')
    ax.plot(test, label='test data')
    ax.plot(use_valid_dates, fc[:, column], label='forecast')
    ax.set_ylabel('Turkish Lira(₺)')
    ax.legend()
    ax.grid()

plt.show()

print(f'RMSE(product): {rmse(test_X1, fc[:, 0])}')
print(f'RMSE(currency): {rmse(test_X2, fc[:, 1])}')

## METHOD-4 (Extrapolation with Polynomial)

In [40]:
# Pick one random series; the first 48 rows train the model, the rest validate it.
use_product = random.choice(df_train.columns)
poly_degree = 2

_selected = df_train[use_product]
train_y, test_y = _selected.iloc[:48], _selected.iloc[48:]

# Column vectors: target prices and their time-step positions 0, 1, 2, ...
y = train_y.values.reshape(-1, 1)
x = np.arange(len(y), dtype=float).reshape(-1, 1)

In [41]:
# Polynomial feature expansion of degree `poly_degree` feeding a ridge
# regression (alpha=1e-3), fitted on the time-step / price columns.
model = make_pipeline(PolynomialFeatures(poly_degree), Ridge(alpha=1e-3))
model.fit(x, y)

In [42]:
# Extrapolate the fitted polynomial to time steps 48..59, one step at a time.
forecast_steps = range(48, 60)
fc_y = pd.Series(
    data=[model.predict([[step]])[0, 0] for step in forecast_steps],
    index=test_y.index,
)

plt.figure(figsize=(12,5))

plt.title(f'Product Price Forecast of "{use_product[1]}"')
plt.ylabel('Turkish Lira(₺)')

for series, label in ((train_y, 'train data'), (test_y, 'test data'), (fc_y, 'forecast')):
  plt.plot(series, label=label)

plt.legend()
plt.grid()
plt.show()

print(f'RMSE: {rmse(fc_y, test_y)}')

## Score Boosting - Prediction of Bias

The real data has an upward trend that the forecasts underestimate, which increases the loss. We therefore multiply the predicted series by a bias factor.

In [43]:
# constant bias: the same multiplier for each of the 12 forecast months

W0, W1, W2, W3, W4, W5 = (
    np.full(12, factor) for factor in (1.00, 1.01, 1.02, 1.04, 1.08, 1.16)
)

# linear weighted bias: the multiplier grows linearly from month 1 to month 12

_month_fraction = np.arange(1, 13) / 12
W6, W7, W8 = (1.0 + slope * _month_fraction for slope in (0.16, 0.20, 0.26))

# USD exchange weighted bias: min-max scale the 2021 USD rates to [0, 1]
# and use them to scale a bias of up to 16%

usd_rates = df_currency.loc['2021-01-01': '2021-12-01', 'USD'].values
vals = (usd_rates - usd_rates.min()) / (usd_rates.max() - usd_rates.min())

W9 = 1.0 + 0.16 * vals

In [44]:
# `plt.subplots` returns a (figure, axes) pair; unpack it instead of binding
# the whole tuple to `fig`.
fig, ax = plt.subplots(figsize=(12, 4))

# Scores recorded for each bias vector W0..W9.
labels = ['W0', 'W1', 'W2', 'W3', 'W4', 'W5', 'W6', 'W7', 'W8', 'W9']
scores = [6.41, 6.06, 5.72, 5.07, 3.96, 3.47, 2.80, 2.30, 2.41, 4.10]

ax.set_title('Scores Based on Weighted Biases')
ax.bar(labels, scores)
plt.show()

## Forecasting

In [45]:
# Show the first rows of the test table.
df_test.head()

In [46]:
# Forecast dates in chronological order, so forecast step k always lands on
# the k-th date regardless of the row order of the test file.
forecast_dates = df_test.index.unique().sort_values()

# Float-initialised frame: the forecasts are floats, and writing them into an
# integer frame relies on implicit upcasting that newer pandas warns about.
df_forecast = pd.DataFrame(0.0, columns=df_train.columns, index=forecast_dates)
order = (3, 2, 0)
weight_coefs = W7
forecast_data = {}

if len(weight_coefs) != len(df_forecast):
  raise ValueError(
      f'weight_coefs has {len(weight_coefs)} entries but there are '
      f'{len(df_forecast)} forecast dates')

# Fit one ARIMA model per price column and store its bias-weighted forecast.
for col in df_forecast.columns:
  model = ARIMA(df_train[col].values, order=order)
  fitted = model.fit()
  fc = weight_coefs * fitted.forecast(len(df_forecast))
  df_forecast.loc[:, col] = fc

# Look up the forecast for each test row by its date and its
# (category, product, country, city, market) column key.
for idx, row in df_test.iterrows():
  column_key = (row['ürün kategorisi'], row['ürün'], row['ürün üretim yeri'], row['şehir'], row['market'])
  forecast_data[row['id']] = df_forecast.loc[idx, column_key]

df_result = pd.DataFrame(data    = [[idx, val] for idx, val in forecast_data.items()],
                         columns = ['id', 'ürün fiyatı'])

In [47]:
# Show the first rows of the submission table.
df_result.head()

In [48]:
# Write the submission to result.csv without the row index.
df_result.to_csv('result.csv', index=False)