In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [7]:
# Read the training and test tables from CSV.
# NOTE(review): the paths are absolute Colab locations; adjust them when running elsewhere.
data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

In [8]:


# Origin from which the integer 'id' values are counted (in days).
start_date = pd.to_datetime('2010-01-01')

# Turn 'id' (a day offset from start_date) into a real timestamp column,
# applying the same conversion to both the train and the test frame.
for frame in (data, test_data):
    frame['date'] = pd.to_datetime(frame['id'], origin=start_date, unit='D')


In [9]:
# 'id' has been converted to 'date', so remove it from the training frame and display the result.
data = data.drop(columns=['id'])
data

Unnamed: 0,price,date
0,10.383,2010-01-02
1,6.163,2010-01-03
2,8.812,2010-01-04
3,7.994,2010-01-05
4,6.910,2010-01-06
...,...,...
4995,0.683,2023-09-06
4996,0.679,2023-09-07
4997,0.697,2023-09-08
4998,0.687,2023-09-09


In [10]:
# Moving average of 'price' over a 12-row window.
# (The first 11 rows are NaN because the window is not yet full.)
data['MA12'] = data['price'].rolling(window=12).mean()

# Plotting helper used to draw the price series together with its moving average.
import plotly.express as px


def plot_line_graph(data, x_col, y_cols, template='plotly_dark'):
    """Render a Plotly Express line chart and show it immediately.

    Parameters
    ----------
    data : object passed to ``px.line`` as the data source.
    x_col : column used for the x axis.
    y_cols : column name, or list of column names, drawn on the y axis.
    template : Plotly template name; defaults to ``'plotly_dark'``.

    Returns
    -------
    None. The figure is displayed via ``show()`` and not returned.
    """
    px.line(data, x=x_col, y=y_cols, template=template).show()

In [11]:
# Plot the raw price and its 12-row moving average against the date.
plot_line_graph(data, "date", ["price","MA12"])

In [12]:
# Extract day, month and year from the 'date' column.
# The vectorized .dt accessor replaces a Python-level loop over every timestamp.
data['day'] = data['date'].dt.day
data['Month'] = data['date'].dt.month
data['Year'] = data['date'].dt.year

# Running 1-based row counter (1, 2, ..., len(data)).
data['Series'] = np.arange(1, len(data) + 1)

In [13]:
## Feature engineering
# Every feature in this cell is derived from the 'price' column alone.
# NOTE(review): the rolling statistics, RSI and change columns all include the
# same-row price, i.e. they are only known once the target itself is known.
# Confirm that this is acceptable before relying on them as exogenous inputs.
price_series = data['price']

# Rolling mean and standard deviation over 7-row and 30-row windows.
rolling_7 = price_series.rolling(window=7)
rolling_30 = price_series.rolling(window=30)
data['7_day_MA'] = rolling_7.mean()
data['30_day_MA'] = rolling_30.mean()
data['7_day_std'] = rolling_7.std()
data['30_day_std'] = rolling_30.std()

# RSI-style oscillator: split the row-to-row change into gains and losses,
# smooth each exponentially, then map their ratio onto a 0-100 scale.
delta = price_series.diff()
up = delta.clip(lower=0)        # gains only (losses become 0)
down = -delta.clip(upper=0)     # magnitude of losses only (gains become 0)

# com=13 corresponds to a smoothing factor alpha = 1 / (1 + 13) = 1/14.
smoothing = dict(com=13, adjust=False)
ema_up = up.ewm(**smoothing).mean()
ema_down = down.ewm(**smoothing).mean()
rs = ema_up.div(ema_down)

data['RSI'] = 100 - 100 / (1 + rs)

# Absolute and relative change versus the previous row.
data['price_change'] = delta
data['price_pct_change'] = price_series.pct_change()

In [14]:
# Discard every row that contains a NaN in any column, then display what is left.
# The original row labels are kept (the index is not reset).
data = data.dropna()
data

Unnamed: 0,price,date,MA12,day,Month,Year,Series,7_day_MA,30_day_MA,7_day_std,30_day_std,RSI,price_change,price_pct_change
29,4.219,2010-01-31,3.931500,31,1,2010,30,4.203571,5.241967,0.419046,1.716148,16.575332,0.300,0.076550
30,4.382,2010-02-01,4.012000,1,2,2010,31,4.345429,5.041933,0.216428,1.420523,17.794002,0.163,0.038635
31,4.134,2010-02-02,4.047083,2,2,2010,32,4.281571,4.974300,0.200711,1.413592,17.378051,-0.248,-0.056595
32,4.099,2010-02-03,4.106667,3,2,2010,33,4.234286,4.817200,0.198897,1.221175,17.316530,-0.035,-0.008466
33,3.947,2010-02-04,4.132250,4,2,2010,34,4.163286,4.682300,0.200260,1.072639,17.034485,-0.152,-0.037082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.683,2023-09-06,0.682750,6,9,2023,4996,0.682000,0.607467,0.006481,0.084962,60.105973,0.008,0.011852
4996,0.679,2023-09-07,0.682000,7,9,2023,4997,0.681286,0.613067,0.006499,0.083914,59.043226,-0.004,-0.005857
4997,0.697,2023-09-08,0.684750,8,9,2023,4998,0.682571,0.619467,0.008600,0.082701,62.275670,0.018,0.026510
4998,0.687,2023-09-09,0.685417,9,9,2023,4999,0.684857,0.625567,0.006986,0.080613,59.467662,-0.010,-0.014347


In [15]:
# Remove the 'MA12' and 'Year' columns before modelling.
data = data.drop(columns=['MA12', 'Year'])

In [16]:
# Display the current state of the modelling frame.
data

Unnamed: 0,price,date,day,Month,Series,7_day_MA,30_day_MA,7_day_std,30_day_std,RSI,price_change,price_pct_change
29,4.219,2010-01-31,31,1,30,4.203571,5.241967,0.419046,1.716148,16.575332,0.300,0.076550
30,4.382,2010-02-01,1,2,31,4.345429,5.041933,0.216428,1.420523,17.794002,0.163,0.038635
31,4.134,2010-02-02,2,2,32,4.281571,4.974300,0.200711,1.413592,17.378051,-0.248,-0.056595
32,4.099,2010-02-03,3,2,33,4.234286,4.817200,0.198897,1.221175,17.316530,-0.035,-0.008466
33,3.947,2010-02-04,4,2,34,4.163286,4.682300,0.200260,1.072639,17.034485,-0.152,-0.037082
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.683,2023-09-06,6,9,4996,0.682000,0.607467,0.006481,0.084962,60.105973,0.008,0.011852
4996,0.679,2023-09-07,7,9,4997,0.681286,0.613067,0.006499,0.083914,59.043226,-0.004,-0.005857
4997,0.697,2023-09-08,8,9,4998,0.682571,0.619467,0.008600,0.082701,62.275670,0.018,0.026510
4998,0.687,2023-09-09,9,9,4999,0.684857,0.625567,0.006986,0.080613,59.467662,-0.010,-0.014347


In [17]:
# Pull the PyCaret time-series functional API into the namespace and
# initialise the experiment.
# NOTE(review): the star import is what makes setup(), compare_models(),
# predict_model(), etc. available to the cells below.
from pycaret.time_series import *

s = setup(
    data,
    target='price',   # column to forecast
    fh=365,           # forecast horizon passed to setup
    session_id=512,   # fixed seed for reproducibility
)

Unnamed: 0,Description,Value
0,session_id,512
1,Target,price
2,Approach,Univariate
3,Exogenous Variables,Present
4,Original data shape,"(4971, 12)"
5,Transformed data shape,"(4971, 12)"
6,Transformed train set shape,"(4606, 12)"
7,Transformed test set shape,"(365, 12)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


In [18]:
# Run PyCaret's summary statistics and statistical tests on the experiment data.
check_stats()

Unnamed: 0,Test,Test Name,Data,Property,Setting,Value
0,Summary,Statistics,Transformed,Length,,4971.0
1,Summary,Statistics,Transformed,# Missing Values,,0.0
2,Summary,Statistics,Transformed,Mean,,2.078318
3,Summary,Statistics,Transformed,Median,,1.997
4,Summary,Statistics,Transformed,Standard Deviation,,1.251783
5,Summary,Statistics,Transformed,Variance,,1.566961
6,Summary,Statistics,Transformed,Kurtosis,,0.207098
7,Summary,Statistics,Transformed,Skewness,,0.804191
8,Summary,Statistics,Transformed,# Distinct Values,,2790.0
9,White Noise,Ljung-Box,Transformed,Test Statictic,"{'alpha': 0.05, 'K': 24}",111398.279472


In [19]:
# List the forecasting models available in this PyCaret experiment (ID, name, reference).
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
naive,Naive Forecaster,sktime.forecasting.naive.NaiveForecaster,True
grand_means,Grand Means Forecaster,sktime.forecasting.naive.NaiveForecaster,True
snaive,Seasonal Naive Forecaster,sktime.forecasting.naive.NaiveForecaster,True
arima,ARIMA,sktime.forecasting.arima.ARIMA,True
auto_arima,Auto ARIMA,sktime.forecasting.arima.AutoARIMA,True
stlf,STLF,sktime.forecasting.trend.STLForecaster,True
croston,Croston,sktime.forecasting.croston.Croston,True
lr_cds_dt,Linear w/ Cond. Deseasonalize & Detrending,pycaret.containers.models.time_series.BaseCdsD...,True
en_cds_dt,Elastic Net w/ Cond. Deseasonalize & Detrending,pycaret.containers.models.time_series.BaseCdsD...,True
ridge_cds_dt,Ridge w/ Cond. Deseasonalize & Detrending,pycaret.containers.models.time_series.BaseCdsD...,True


In [21]:
# Evaluate the seasonal-naive baseline and keep the best-scoring model.
# Each model ID must appear only once in `include`; add further IDs
# (e.g. from models()) to compare more candidates.
best = compare_models(include=['snaive'])

<pandas.io.formats.style.Styler at 0x7f0af06951b0>

Processing:   0%|          | 0/13 [00:00<?, ?it/s]

KeyError: ignored

KeyError: ignored

In [22]:
# Plot the forecast of `best` for 1317 steps ahead.
# NOTE(review): 1317 = 365 + 952, presumably the 365-step holdout configured in
# setup() plus 952 further days — confirm.
plot_model(best, plot = 'forecast', data_kwargs = {'fh' : 1317})

In [23]:
# Predict on the hold-out split created by setup() and report the error metrics.
holdout_pred = predict_model(best)

Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,R2
0,Seasonal Naive Forecaster,0.3354,0.3115,0.151,0.203,0.2256,0.1831,-1.0693


In [24]:
# Plot the forecast of `best` for 1317 steps ahead (same call as the earlier forecast plot).
# NOTE(review): 1317 = 365 + 952, presumably the 365-step holdout configured in
# setup() plus 952 further days — confirm.
plot_model(best, plot = 'forecast', data_kwargs = {'fh' : 1317})

In [27]:
# Generate a 1317-step forecast with the selected model.
# NOTE(review): no finalize_model() call is visible, so `best` is presumably
# fitted on the training split only and the forecast starts at the hold-out
# period rather than after the last observed row — confirm.
result = predict_model(best, fh = 1317)

In [28]:
# Persist the pipeline and model to 'my_tuned_pipeline.pkl'.
# NOTE(review): the file name says "tuned", but no tuning step is visible in this notebook — confirm the name.
save_model(best, 'my_tuned_pipeline')

Transformation Pipeline and Model Successfully Saved


(ForecastingPipeline(steps=[('forecaster',
                             TransformedTargetForecaster(steps=[('model',
                                                                 NaiveForecaster(sp=60))]))]),
 'my_tuned_pipeline.pkl')

In [29]:
# Reload the pipeline that was just saved and display it.
loaded_best_pipeline = load_model('my_tuned_pipeline')
loaded_best_pipeline

Transformation Pipeline and Model Successfully Loaded


In [30]:
# Display the forecast frame produced by predict_model(best, fh = 1317).
result

Unnamed: 0,y_pred
4635,1.045
4636,1.059
4637,1.058
4638,1.069
4639,1.046
...,...
5947,0.930
5948,0.916
5949,0.901
5950,0.888


In [31]:
# Write the forecast to CSV. index=False omits the forecast index, so the file
# contains only the 'y_pred' column.
# NOTE(review): confirm that dropping the index is intended for the consumer of this file.
result.to_csv('result_snaive_no-tuned.csv',index = False)