<h3>Time Series Algorithms</h3>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math  # for rounding up values
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Apply seaborn's default plotting theme
sns.set()
# Allow up to 100 rows when a frame is displayed
pd.options.display.max_rows = 100

# Read the historical data, parse POD as timestamps and use it as the index
raw_history = pd.read_csv('CR_FC_PREP_0522.csv')
df = raw_history.assign(POD=pd.to_datetime(raw_history['POD'])).set_index('POD')

# First/last rows, dtypes and overall size of the loaded frame
df.head()
df.tail()
df.info()
print(f'Dataset size: {df.shape}')

In [None]:
# Later data (the "23" file) used to check how accurate the forecast is.
# POD is parsed to timestamps but left as a regular column in this cell;
# no index is set here.
df_after = pd.read_csv('CR_FC_PREP_23.csv').assign(
    POD=lambda frame: pd.to_datetime(frame['POD'])
)

# First/last rows, dtypes and overall size of the loaded frame
df_after.head()
df_after.tail()
df_after.info()
print(f'Dataset size: {df_after.shape}')

In [None]:
# Line plot of every column of df against its POD index
df.plot()

In [None]:
from scipy.stats import skew
# Sample skewness of the RATE column
skewness = skew(df['RATE'])
print("Skewness of RATE column: ", skewness)

# Classify the sign of the skewness; anything that is neither > 0 nor < 0
# (including NaN) falls through to "symmetric", as before
if skewness > 0:
    skew_label = "positively skewed"
elif skewness < 0:
    skew_label = "negatively skewed"
else:
    skew_label = "symmetric"
print(f"The RATE column is {skew_label}")

In [None]:
# Check the stationarity of RATE with the Augmented Dickey-Fuller test.
# result[0] is the test statistic, result[1] the p-value and result[4] the
# dict of critical values.
result = adfuller(df['RATE'])
print('ADF Statistic: %f' % result[0])
print('Critical Values:')
for key, value in result[4].items():
    print(f'\t{key}: {value}')
print('p-value: %f' % result[1])
if result[1] > 0.05:
    print('The data is non-stationary. Applying first-order differencing.')
    # Guard on the column so re-running this cell does not difference again
    # and silently drop one more leading row each time.
    if 'diff' not in df.columns:
        # Only the first row (which has no predecessor) is removed: restrict
        # dropna to 'diff' so NaNs in unrelated columns do not punch holes
        # in the series.
        df = df.assign(diff=df['RATE'].diff()).dropna(subset=['diff'])
else:
    print('The data is stationary.')

In [None]:

# Step 3: Determine the appropriate ARIMA parameters
# The 'diff' column only exists when the stationarity check found RATE
# non-stationary; if RATE was already stationary, inspect RATE itself instead
# of failing with a KeyError.
was_differenced = 'diff' in df.columns
acf_series = (df['diff'] if was_differenced else df['RATE']).dropna()

# plot ACF and PACF
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 6))
plot_acf(acf_series, ax=ax1, lags=20)
plot_pacf(acf_series, ax=ax2, lags=20)
plt.show()

# Step 4: Determine the values of p, d, and q
p = 1  # number of AR terms (lags) from PACF plot
d = 1 if was_differenced else 0  # order of differencing actually applied above
q = 1  # number of MA terms (lags) from ACF plot

print(f'p: {p}, d: {d}, q: {q}')


In [None]:
# Decompose the time series into its trend, seasonal, and residual components
# using an additive model with a seasonal period of 365 observations.
# NOTE(review): period=365 presumes one row per day (yearly seasonality) — confirm against the data.
# The result is only stored in `decomposition`; this cell does not plot or inspect it.
decomposition = seasonal_decompose(df['RATE'], model='additive', period=365)

<h4>Transformation of the target variable</h4>

In [None]:
def applyLogTrans(df):
    """Return the natural logarithm of the ``RATE`` column.

    The input frame is not modified; the result is a new Series that keeps
    the index of ``df``.
    """
    rate = df['RATE']
    return np.log(rate)

In [None]:
# Store the log-transformed target as a new RATE_LOG column; RATE itself is kept as-is
df['RATE_LOG'] = applyLogTrans(df)
df.head()

<h4>Start Performing Time Series Algorithm</h4>

In [153]:
from statsmodels.tsa.arima.model import ARIMA
import pmdarima as pm

def find_AutoARIMA(df):
    """Run pmdarima's stepwise auto-ARIMA search on ``df['RATE_LOG']``.

    The search is non-seasonal, uses the ADF test to choose the differencing
    order, traces each candidate model, and ignores candidates that fail to
    fit. The summary of the selected model is printed before returning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``RATE_LOG`` column.

    Returns
    -------
    The model object returned by ``pm.auto_arima``.
    """
    model = pm.auto_arima(
        df['RATE_LOG'],
        seasonal=False,
        stepwise=True,
        test='adf',
        trace=True,
        suppress_warnings=True,
        error_action="ignore",
    )
    # A bare `model.summary()` inside a function is evaluated and discarded,
    # so nothing was ever shown; print it explicitly.
    print(model.summary())

    return model

In [None]:
# Auto ARIMA: run the order search on df['RATE_LOG'] and keep the returned model
arimaModel = find_AutoARIMA(df)

<h4>Creating an empty frame for the 52 days after the last observed date</h4>

In [None]:
from datetime import timedelta

# start_date = df.index[-1] + timedelta(days=1)
# print(start_date)
# Forecast horizon: number of daily steps generated after the last date in df
n_periods = 52

def newDateFrame(df, n_periods):
    """Build an empty frame covering the ``n_periods`` days after ``df`` ends.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like index; its last index value is taken as
        the end of the observed history and its columns are copied.
        NOTE(review): this uses ``df.index[-1]``, so it assumes the index is
        sorted ascending — confirm.
    n_periods : int
        Number of consecutive daily timestamps to generate.

    Returns
    -------
    pandas.DataFrame
        Frame with the same columns as ``df``, no data (every cell is NaN),
        indexed by ``n_periods`` daily timestamps starting the day after
        ``df.index[-1]``.
    """
    # Generate one extra period starting at the last observed date, then drop
    # that first entry so the frame begins on the following day.
    index_of_fc = pd.date_range(start=df.index[-1], periods=n_periods + 1, freq='D')
    return pd.DataFrame(index=index_of_fc[1:], columns=df.columns)

In [None]:
# Empty frame (same columns as df) that will hold the forecast for the n_periods days after df ends
arima_df = newDateFrame(df, n_periods)
arima_df.head()
arima_df.tail()
arima_df.info()

<h4>Forecasting values</h4>

In [None]:
# Forecast the next n_periods steps with the fitted ARIMA model (log scale)
forecast_arima_log = arimaModel.predict(n_periods=n_periods)

# Transform the forecasted values back to original scale
forecast_arima = np.exp(forecast_arima_log)

print(forecast_arima)
print(len(forecast_arima))

# Assign by position, not by label: the forecast may come back as a Series
# whose index does not match arima_df's dates, in which case a direct
# assignment would align on index and fill RATE with NaN.
arima_df['RATE'] = np.round(np.asarray(forecast_arima, dtype=float), 2)

# Remove RATE_LOG column to return back to original.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
arima_df = arima_df.drop(columns='RATE_LOG', errors='ignore')
arima_df.head()

<h4>Measure accuracy by comparing with the actual cost rate</h4>

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error
def findACC(forecast_df, actual_df):
    """Print the MAE and R-squared of forecast ``RATE`` against actual ``RATE``.

    Both columns are passed straight to the sklearn metric functions, with
    the actual values as the reference. Nothing is returned.

    Parameters
    ----------
    forecast_df : pandas.DataFrame
        Frame with the forecast values in a ``RATE`` column.
    actual_df : pandas.DataFrame
        Frame with the observed values in a ``RATE`` column.
    """
    observed = actual_df['RATE']
    predicted = forecast_df['RATE']

    # Same evaluation order as before: MAE first, then R-squared
    mae, r2 = (
        metric(observed, predicted)
        for metric in (mean_absolute_error, r2_score)
    )

    print(f"Mean absolute error: {mae:.3f}")
    print(f"R-squared value: {r2:.3f}")

In [None]:
# Score the ARIMA forecast against the later actuals.
# The two RATE columns are handed to the metric functions as-is (df_after still has POD as a column here).
# NOTE(review): this presumably requires df_after to hold exactly one row per forecast day, in the same order — confirm.
findACC(arima_df, df_after)

In [None]:
# Index the later actuals by POD so they share a time axis with the other frames.
# Guarded so that re-running this cell is a no-op instead of raising
# KeyError once POD has already been moved into the index.
if 'POD' in df_after.columns:
    df_after = df_after.set_index('POD')

In [None]:
# Plot the forecast together with the historical and the later actual rates.
# Only the RATE column of each frame is drawn: plotting the whole frames would
# also put helper columns such as 'diff' and 'RATE_LOG' on the same axis.
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(df.index, df['RATE'], label='Historical RATE')
ax.plot(arima_df.index, arima_df['RATE'], color='darkgreen', label='ARIMA forecast')
ax.plot(df_after.index, df_after['RATE'], color='darkred', label='Actual RATE')
ax.set_title("Final Forecast of COST_RATE")
ax.set_xlabel('POD')
ax.set_ylabel('RATE')
ax.legend()
plt.show()