<h1>Iterative Nested Forecasting</h1>

In [65]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from scipy import stats
from sklearn.model_selection import train_test_split
from IPython.core.interactiveshell import InteractiveShell

# For LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, LSTM, Dense, Bidirectional
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# For Arima
import pmdarima as pm

InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
sns.set()
pd.options.display.max_rows = 100
warnings.filterwarnings('ignore')


<h4>Importing Datasets</h4>

In [66]:
# Import dataset and clean, ready as a dataframe for creating keys
def createDF(datasets):
    df = pd.read_csv(datasets, converters={
                     'PARTY_ID': str, 'COM_ID': str, 'CNTR_SIZE': str})

    # Formating to type and remove NaN values
    df['POD'] = pd.to_datetime(df['POD'])
    df['ENCODED_TYPE'] = df['ENCODED_TYPE'].fillna(-1).astype(int)
    df = df.dropna(subset=['ENCODED_TYPE'])
    df['RATE'] = df['RATE'].fillna(-1).astype(float)
    df = df.dropna(subset=['RATE'])
    df['ENCODED_TYPE'] = df['ENCODED_TYPE'].astype(int)
    df_clean = df.dropna().reset_index(drop=True)

    # Selecting and rearranging columns
    sel_col = ['CSL_ID', 'CNTR_ID', 'POD_ID', 'ETD_POL_D', 'PARTY_ID',
               'PARTY_NAME', 'POD', 'CNTR_SIZE', 'CNTR_TYPE', 'RATE']
    df_fc = df_clean[sel_col]

    # Removing years we do not want to process in our models
    df_filtered = df_fc[df_fc['POD'].dt.year != 2002]

    # Sorting the dates
    df_filtered = df_filtered.sort_values(by='POD').reset_index(drop=True)

    return df_filtered


In [67]:
# Create Dataframes for old and new
old_data = '.\Datasets\CR_COST_FC.csv'
df1 = createDF(old_data)
df1.head()

new_data = '.\Datasets\CR_COST_FC_new.csv'
df2 = createDF(new_data)
df2.head()


Unnamed: 0,CSL_ID,CNTR_ID,POD_ID,ETD_POL_D,PARTY_ID,PARTY_NAME,POD,CNTR_SIZE,CNTR_TYPE,RATE
0,ECS01050600224,CTNR010050700354,ROTTERDAM,NLRTM,1008827,YANG MING (SINGAPORE) PTE. LTD.,2005-07-15,40,HC NOR,1620.0
1,ECS01050600610,CTNR010050700353,FELIXSTOWE,GBFXT,1002303,GLOBELINK FALLOW LIMITED,2005-07-15,40,GP,1800.0
2,ECS01050600041,CTNR010050700351,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-15,40,HC NOR,2170.0
3,ECS01050600163,CTNR010050700370,PASIR GUDANG,MYPGU,1002767,GLOBELINK CONTAINER LINES (JB) S/B,2005-07-15,40,HC,280.0
4,ECS01050600456,CTNR010050700450,KARACHI (KICT),PKKHI,1002783,GLOBELINK PAKISTAN (PVT) LTD,2005-07-16,40,HC,1625.0


Unnamed: 0,CSL_ID,CNTR_ID,POD_ID,ETD_POL_D,PARTY_ID,PARTY_NAME,POD,CNTR_SIZE,CNTR_TYPE,RATE
0,ECS01050600224,CTNR010050700354,ROTTERDAM,NLRTM,1008827,YANG MING (SINGAPORE) PTE. LTD.,2005-07-15,40,HC NOR,1620.0
1,ECS01050600610,CTNR010050700353,FELIXSTOWE,GBFXT,1002303,GLOBELINK FALLOW LIMITED,2005-07-15,40,GP,1800.0
2,ECS01050600041,CTNR010050700351,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-15,40,HC NOR,2170.0
3,ECS01050600163,CTNR010050700370,PASIR GUDANG,MYPGU,1002767,GLOBELINK CONTAINER LINES (JB) S/B,2005-07-15,40,HC,280.0
4,ECS01050600456,CTNR010050700450,KARACHI (KICT),PKKHI,1002783,GLOBELINK PAKISTAN (PVT) LTD,2005-07-16,40,HC,1625.0


<h4>Creating Dictionary Keys</h4>

In [68]:
# This function is to filter and create keys
def filter_dataframe(df):
    filtered_dataframes = {}

    for (port, size, ctype, party_id), group in df.groupby(['POD_ID', 'CNTR_SIZE', 'CNTR_TYPE', 'PARTY_ID']):
        group = group.reset_index(drop=True).sort_values(by='POD')
        df_id = f"Port_{port}_Size_{size}_Type_{ctype}_PartyID_{party_id}"
        filtered_dataframes[df_id] = group

    return filtered_dataframes


In [69]:
# Creating keys from data
print("Old Data keys:")
filtered_dataframe1 = filter_dataframe(df1)
df_ids1 = list(filtered_dataframe1.keys())
print(list(df_ids1))
print(len(list(df_ids1)))

print("\nNew Data keys:")
filtered_dataframe2 = filter_dataframe(df2)
df_ids2 = list(filtered_dataframe2.keys())
print(list(df_ids2))
print(len(list(df_ids2)))

# Removing Keys that have less then 500 rows as it is not enough data points for LSTM
print("\nRemoving keys that has less then 500 entries:")
# Old data keys
filtered_dataframe1_large = {key: df for key,
                             df in filtered_dataframe1.items() if len(df) >= 1000}
large_df_ids1 = list(filtered_dataframe1_large.keys())
print(list(large_df_ids1))
print(len(list(large_df_ids1)))
print("\n")

# New data keys
filtered_dataframe2_large = {key: df for key,
                             df in filtered_dataframe2.items() if len(df) >= 1000}
large_df_ids2 = list(filtered_dataframe2_large.keys())
print(list(large_df_ids2))
print(len(list(large_df_ids2)))


Old Data keys:
['Port_(CONSTANZA)_Size_40_Type_GP_PartyID_010007816', 'Port_(CONSTANZA)_Size_40_Type_HC_PartyID_010007816', 'Port_AARHUS_Size_20_Type_GP_PartyID_01000043', 'Port_AARHUS_Size_20_Type_GP_PartyID_0100027830', 'Port_AARHUS_Size_20_Type_GP_PartyID_010006666', 'Port_AARHUS_Size_40_Type_GP_PartyID_01000043', 'Port_AARHUS_Size_40_Type_HC_PartyID_01000043', 'Port_AARHUS_Size_40_Type_HC_PartyID_0100027830', 'Port_AARHUS_Size_40_Type_HC_PartyID_0100028193', 'Port_AARHUS_Size_40_Type_HC_PartyID_010006666', 'Port_ABIDJAN_Size_20_Type_GP_PartyID_0100027878', 'Port_ABIDJAN_Size_40_Type_HC_PartyID_010021097', 'Port_ADELAIDE_Size_20_Type_GP_PartyID_01002775', 'Port_ADELAIDE_Size_40_Type_GP_PartyID_01002775', 'Port_ADELAIDE_Size_40_Type_HC_PartyID_01002775', 'Port_ADELAIDE_Size_40_Type_HC_PartyID_01005078', 'Port_ADELAIDE_Size_40_Type_HC NOR_PartyID_01002775', 'Port_ADELAIDE(AU)_Size_20_Type_GP_PartyID_01002775', 'Port_ADELAIDE(AU)_Size_40_Type_HC_PartyID_01002775', 'Port_ADELAIDE(AU)_Si

<h2>Data Preprocessing</h2>

<h4>Interpolate old_df missing values and group missing entries by weeks</h4>

In [79]:
def interpolate_and_aggregate(df, key):

    # Initialise RobustScaler(handle outliers)
    scaler = RobustScaler()
    df[['RATE']] = scaler.fit_transform(df[['RATE']])

    # Check if the DataFrame has any NaN values
    if df.isna().any().any():
        print(f"{key} DataFrame contains NaN values.")
    else:
        print(f"{key} DataFrame does not contain NaN values.")

    # Drop duplicates
    sel_df = df.drop_duplicates(subset=['POD', 'RATE']).reset_index(drop=True)

    # Extract POD_ID and PARTY_ID from the first row
    pod_id = df['POD_ID'].iloc[0]
    party_id = df['PARTY_ID'].iloc[0]

    # Create a new dataframe with a date range from min to max date in your dataframe
    new_df = pd.DataFrame()
    new_df['POD'] = pd.date_range(
        start=sel_df['POD'].min(), end=sel_df['POD'].max())

    # Merge the original dataframe with the new one
    df_interpolated = pd.merge(
        new_df, sel_df[['POD', 'RATE']], on='POD', how='left')

    # Perform spline interpolation
    df_interpolated['RATE'] = df_interpolated['RATE'].interpolate(
        method='polynomial', order=1)

    # Now we need to inverse the scaling
    df_interpolated['RATE'] = scaler.inverse_transform(
        df_interpolated[['RATE']])
    df_interpolated['RATE'] = df_interpolated['RATE'].round(2)

    # Create YearMonthWeek directly from the 'POD'
    df_interpolated['YearMonthWeek'] = df_interpolated['POD'] - \
        pd.to_timedelta(df_interpolated['POD'].dt.dayofweek, unit='D')

    # Create a new dataframe with every week in the range
    all_weeks = pd.date_range(start=df_interpolated['POD'].min(
    ), end=df_interpolated['POD'].max(), freq='W')
    all_weeks_df = pd.DataFrame(all_weeks, columns=['POD'])

    # Create YearMonthWeek in all_weeks_df
    all_weeks_df['YearMonthWeek'] = all_weeks_df['POD'] - \
        pd.to_timedelta(all_weeks_df['POD'].dt.dayofweek, unit='D')

    # Merge this with your original dataframe
    merged_df = pd.merge(all_weeks_df, df_interpolated,
                         on=['YearMonthWeek'], how='left')

    # Group by YearMonthWeek and compute your rate
    grouped = merged_df.groupby(['YearMonthWeek'])

    agg_df = pd.DataFrame(
        columns=['YearMonthWeek', 'Rate', 'POD_ID', 'PARTY_ID'])

    for group_name, group_df in grouped:
        year_month_week = group_name

        # Calculate skewness of RATE values
        rate_skew = group_df['RATE'].skew()

        # Calculate trimmed mean of RATE values
        # trimming 10% from each end
        rate_metric = stats.trim_mean(group_df['RATE'].dropna().values, 0.1)

        new_row = {'YearMonthWeek': year_month_week,
                   'Rate': rate_metric, 'POD_ID': pod_id, 'PARTY_ID': party_id}

        # Append row to aggregated dataframe
        agg_df = agg_df.append(new_row, ignore_index=True)

    agg_df = agg_df.sort_values(by='YearMonthWeek').reset_index(drop=True)

    # Show dataframe preview:
    print(agg_df)

    # Plot and visualise interpolated results on actual
    # plt.figure(figsize=(20, 10))
    # plt.plot(filtered_dataframe1_large[key]['POD'],
    #          filtered_dataframe1_large[key]['RATE'], color='blue', label="Actual Data")
    # plt.plot(agg_df['YearMonthWeek'], agg_df['Rate'],
    #          color='red', label="Aggregated Data(weeks)")

    # plt.xlabel('Date(Year Month Week)')
    # plt.ylabel('Cost Rate(USD)')
    # plt.title(key)
    # plt.legend()
    # plt.show()

    return agg_df


# dictionary to store the results
processed_dfs = {}

# loop over all keys in the original dictionary
for key in filtered_dataframe1_large.keys():
    processed_dfs[key] = interpolate_and_aggregate(
        filtered_dataframe1_large[key], key)

# Preview dictionary
print()


Port_AUCKLAND_Size_40_Type_HC_PartyID_01005136 DataFrame does not contain NaN values.
    YearMonthWeek      Rate    POD_ID  PARTY_ID
0      2005-07-18  1.266667  AUCKLAND  01005136
1      2005-07-25  1.420000  AUCKLAND  01005136
2      2005-08-01  1.274286  AUCKLAND  01005136
3      2005-08-08  1.204286  AUCKLAND  01005136
4      2005-08-15  1.264286  AUCKLAND  01005136
..            ...       ...       ...       ...
913    2023-01-16  1.815714  AUCKLAND  01005136
914    2023-01-23  2.512857  AUCKLAND  01005136
915    2023-01-30  0.528571  AUCKLAND  01005136
916    2023-02-06  0.410000  AUCKLAND  01005136
917    2023-02-13  0.410000  AUCKLAND  01005136

[918 rows x 4 columns]
Port_BUSAN_Size_40_Type_HC_PartyID_010004286 DataFrame does not contain NaN values.
    YearMonthWeek      Rate POD_ID   PARTY_ID
0      2005-10-03  2.780000  BUSAN  010004286
1      2005-10-10  2.780000  BUSAN  010004286
2      2005-10-17  2.780000  BUSAN  010004286
3      2005-10-24  2.780000  BUSAN  010004286


<h4>Sorting and getting key arrays</h4>

In [71]:
def getPortKeys(keybunch):
    keybunch_pouch = []
    # Create a dictionary with corresponding dataframes
    keybunch_subset = {}

    # Get a dictionary with key and number of rows for each dataframe in filtered_dataframes
    key_row_counts = {key: len(keybunch[key]) for key in keybunch}

    # Sort the key_row_counts dictionary by value (number of rows) in descending order
    sorted_key_row_counts = sorted(
        key_row_counts.items(), key=lambda item: item[1], reverse=True)

    for key, row_count in sorted_key_row_counts:
        keybunch_subset[key] = keybunch[key]
        print(f"Number of rows in {key}: {row_count}")
        keybunch_pouch.append(key)

    # Return array of keys
    return keybunch_pouch


In [72]:
# This is changing it to an array

print('Processed Old Dataset Keybunch:')
process_old_df = getPortKeys(processed_dfs)
print(len(process_old_df))
print('\n')

print('Old Dataset Keybunch:')
old_df = getPortKeys(filtered_dataframe1_large)
print(len(old_df))
print('\n')

print('New Dataset Keybunch:')
new_df = getPortKeys(filtered_dataframe2_large)
print(len(new_df))
print('\n')


Processed Old Dataset Keybunch:
Number of rows in Port_AUCKLAND_Size_40_Type_HC_PartyID_01005136: 918
Number of rows in Port_DUBAI (JEBEL ALI)_Size_40_Type_HC_PartyID_01002788: 918
Number of rows in Port_MELBOURNE_Size_40_Type_HC_PartyID_01002778: 918
Number of rows in Port_SYDNEY_Size_40_Type_HC_PartyID_01002779: 915
Number of rows in Port_BUSAN_Size_40_Type_HC_PartyID_010004286: 908
5


Old Dataset Keybunch:
Number of rows in Port_BUSAN_Size_40_Type_HC_PartyID_010004286: 1689
Number of rows in Port_AUCKLAND_Size_40_Type_HC_PartyID_01005136: 1688
Number of rows in Port_MELBOURNE_Size_40_Type_HC_PartyID_01002778: 1325
Number of rows in Port_SYDNEY_Size_40_Type_HC_PartyID_01002779: 1300
Number of rows in Port_DUBAI (JEBEL ALI)_Size_40_Type_HC_PartyID_01002788: 1052
5


New Dataset Keybunch:
Number of rows in Port_BUSAN_Size_40_Type_HC_PartyID_010004286: 1712
Number of rows in Port_AUCKLAND_Size_40_Type_HC_PartyID_01005136: 1704
Number of rows in Port_MELBOURNE_Size_40_Type_HC_PartyID_01

In [73]:
# Global variable selector
sel_country = old_df[1]
print(sel_country)

# Getting the latest data from new vs old as accuracy measure
sel_process_old_df = processed_dfs[sel_country]
sel_process_old_df.head(3)
sel_process_old_df.info()
print("\n")

sel_old_df = filtered_dataframe1_large[sel_country]
sel_old_df.head(3)
sel_old_df.info()
print("\n")

sel_new_df = filtered_dataframe2_large[sel_country]
sel_new_df.head(3)
sel_new_df.info()


Port_AUCKLAND_Size_40_Type_HC_PartyID_01005136


Unnamed: 0,YearMonthWeek,Rate,POD_ID,PARTY_ID
0,2005-07-18,1.266667,AUCKLAND,1005136
1,2005-07-25,1.42,AUCKLAND,1005136
2,2005-08-01,1.274286,AUCKLAND,1005136


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   YearMonthWeek  918 non-null    datetime64[ns]
 1   Rate           918 non-null    float64       
 2   POD_ID         918 non-null    object        
 3   PARTY_ID       918 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 28.8+ KB




Unnamed: 0,CSL_ID,CNTR_ID,POD_ID,ETD_POL_D,PARTY_ID,PARTY_NAME,POD,CNTR_SIZE,CNTR_TYPE,RATE
0,ECS01050600140,CTNR010050700496,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-19,40,HC,1.204545
1,ECS01050600140,CTNR010050700495,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-19,40,HC,1.204545
2,ECS010050700037,CTNR010050700657,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-21,40,HC,1.318182


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1688 entries, 0 to 1687
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   CSL_ID      1688 non-null   object        
 1   CNTR_ID     1688 non-null   object        
 2   POD_ID      1688 non-null   object        
 3   ETD_POL_D   1688 non-null   object        
 4   PARTY_ID    1688 non-null   object        
 5   PARTY_NAME  1688 non-null   object        
 6   POD         1688 non-null   datetime64[ns]
 7   CNTR_SIZE   1688 non-null   object        
 8   CNTR_TYPE   1688 non-null   object        
 9   RATE        1688 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(8)
memory usage: 145.1+ KB




Unnamed: 0,CSL_ID,CNTR_ID,POD_ID,ETD_POL_D,PARTY_ID,PARTY_NAME,POD,CNTR_SIZE,CNTR_TYPE,RATE
0,ECS01050600140,CTNR010050700496,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-19,40,HC,2200.0
1,ECS01050600140,CTNR010050700495,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-19,40,HC,2200.0
2,ECS010050700037,CTNR010050700657,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-21,40,HC,2300.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1704 entries, 0 to 1703
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   CSL_ID      1704 non-null   object        
 1   CNTR_ID     1704 non-null   object        
 2   POD_ID      1704 non-null   object        
 3   ETD_POL_D   1704 non-null   object        
 4   PARTY_ID    1704 non-null   object        
 5   PARTY_NAME  1704 non-null   object        
 6   POD         1704 non-null   datetime64[ns]
 7   CNTR_SIZE   1704 non-null   object        
 8   CNTR_TYPE   1704 non-null   object        
 9   RATE        1704 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(8)
memory usage: 146.4+ KB


<h2>Iterative Nested Forecasting</h2>

<h4>Time series pipeline for best forecasting model to each dataframe.</h4>

In [74]:
# General Functions

# Mean Square Error Function:
def calculate_RMSE(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


def plot_train_val_loss(history):
    plt.figure(figsize=(10, 6))
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model loss progress during training and validation')
    plt.xlabel('Epoch')
    plt.ylabel('Training and Validation Loss')
    plt.legend()
    plt.show()


<h4>LSTM</h4>

In [75]:
# Update create_dataset to handle multi-feature dataset
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return np.array(dataX), np.array(dataY)


def create_LSTM_model(trainX, trainY, testX, testY, epochs, lstm_layers):
    model = Sequential()

    # input layer
    model.add(Bidirectional(LSTM(lstm_layers[0], return_sequences=True),
                            input_shape=(trainX.shape[1], trainX.shape[2])))
    model.add(Dropout(0.2))

    # hidden layers
    for i in range(1, len(lstm_layers)):
        model.add(Bidirectional(
            LSTM(lstm_layers[i], return_sequences=(i != (len(lstm_layers)-1)))))

    # output layer
    model.add(Dense(1))

    model.compile(loss='mean_squared_error', optimizer='adam')

    history = model.fit(trainX, trainY, epochs=epochs, validation_data=(testX, testY),
                        callbacks=[EarlyStopping(
                            monitor='val_loss', patience=10)],
                        verbose=2, shuffle=False)

    return model, history


def LSTM_Execute(trainX, testY):
    # Reshape into X=t and Y=t+1, timestep  look_back
    look_back = 5
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)

    # Reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], 1, look_back))
    testX = np.reshape(testX, (testX.shape[0], 1, look_back))

    epochs_list = [100]

    lstm_layers_list = [
        [64, 64, 32, 32, 16, 16, 8, 8, 4, 4, 2, 2],
        [32, 32, 16, 16, 8, 8, 4, 4, 2, 2],
        [16, 16, 8, 8, 4, 4, 2, 2],
        [8, 8, 4, 4, 2, 2],
        [4, 4, 2, 2],
        [2, 2]
    ]

    rmse_results = {}

    for epochs in epochs_list:
        print(f'Training for {epochs} epochs...')

        for lstm_layers in lstm_layers_list:
            print(f'Training with LSTM layers: {lstm_layers}')
            model, history = create_LSTM_model(
                trainX, trainY, testX, testY, epochs, lstm_layers)

            # Add the loss for this model to the plot
            plt.plot(
                history.history['loss'], label=f'Train Loss - {epochs} epochs, layers: {lstm_layers}')
            plt.plot(
                history.history['val_loss'], label=f'Validation Loss - {epochs} epochs, layers: {lstm_layers}')

            # Evalute LSTM Model
            trainPredict = model.predict(trainX)
            testPredict = model.predict(testX)

            # inverse_transform
            trainPredict = scaler.inverse_transform(trainPredict)
            trainY_orig = scaler.inverse_transform([trainY])
            testPredict = scaler.inverse_transform(testPredict)
            testY_orig = scaler.inverse_transform([testY])

            # Calculate mean squared error
            trainScore = calculate_RMSE(trainY_orig[0], trainPredict[:, 0])
            print(f'Train Score: {trainScore:.2f} RMSE for {epochs} epochs')
            testScore = calculate_RMSE(testY_orig[0], testPredict[:, 0])
            print(f'Test Score: {testScore:.2f} RMSE for {epochs} epochs')

            rmse_results[f'{epochs} epochs, {lstm_layers} layers'] = {
                'Train RMSE': trainScore, 'Test RMSE': testScore}

    # Configure and show the plot
    plt.title('Model loss progress during training and validation')
    plt.xlabel('Epoch')
    plt.ylabel('Training and Validation Loss')
    plt.legend()
    plt.show()

    # Convert the dictionary to a DataFrame for easy display
    rmse_df = pd.DataFrame(rmse_results).T
    print(rmse_df)
    return model, trainScore, testScore


<h4>Arima</h4>

In [76]:
# Function to execute Auto ARIMA model
def ARIMA_Execute(train, test):
    # Fit an auto_arima model
    arima_model = pm.auto_arima(train, start_p=1, start_q=1,
                                max_p=5, max_q=5, m=12,
                                start_P=0, seasonal=False,
                                d=0, D=0, trace=True,
                                error_action='ignore',
                                suppress_warnings=True,
                                stepwise=True)  # set to stepwise

    # Print the summary of the model
    print(arima_model.summary())

    # Forecast
    train_forecast = arima_model.predict_in_sample()
    test_forecast = arima_model.predict(n_periods=len(test))

    # Calculate the RMSE
    RMSE_ARIMA_train = np.sqrt(mean_squared_error(train, train_forecast))
    print("Train RMSE: %.3f" % RMSE_ARIMA_train)
    RMSE_ARIMA_test = np.sqrt(mean_squared_error(test, test_forecast))
    print("Test RMSE: %.3f" % RMSE_ARIMA_test)

    return arima_model, RMSE_ARIMA_train, RMSE_ARIMA_test


In [78]:
# Normalize dataset for LSTM
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit and transform the train dataset
train_size = int(len(sel_process_old_df) * 0.7)
train_data = sel_process_old_df['Rate'].values[:train_size].reshape(-1, 1)
test_data = sel_process_old_df['Rate'].values[train_size:].reshape(-1, 1)
train = scaler.fit_transform(train_data)

# Only transform the test dataset
test = scaler.transform(test_data)

# Initialize a dictionary for the current key
results_dict = {}

print(f"Key: {sel_country}")
# Run LSTM model
model, LSTM_train_rmse, LSTM_test_rmse = LSTM_Execute(train, test)
results_dict[sel_country] = {'LSTM': {'model': model, 'Train RMSE': LSTM_train_rmse, 'Test RMSE': LSTM_test_rmse}}

# Run ARIMA model
model, ARIMA_train_rmse, ARIMA_test_rmse = ARIMA_Execute(train_data, test_data)
results_dict[sel_country]['ARIMA'] = {'model': model, 'Train RMSE': ARIMA_train_rmse, 'Test RMSE': ARIMA_test_rmse}

# Now you can directly fetch your results using sel_country as the key
results = results_dict[sel_country]
best_model_name, best_model_results = min(results.items(), key=lambda x: x[1]['Test RMSE'])
print(f"Key: {sel_country}, Best Model: {best_model_name}, Train RMSE: {best_model_results['Train RMSE']}, Test RMSE: {best_model_results['Test RMSE']}")


Key: Port_AUCKLAND_Size_40_Type_HC_PartyID_01005136
Training for 100 epochs...
Training with LSTM layers: [64, 64, 32, 32, 16, 16, 8, 8, 4, 4, 2, 2]
Epoch 1/100
20/20 - 46s - loss: 0.1966 - val_loss: 3.4698 - 46s/epoch - 2s/step
Epoch 2/100
20/20 - 0s - loss: 0.1741 - val_loss: 3.3916 - 357ms/epoch - 18ms/step
Epoch 3/100
20/20 - 0s - loss: 0.1519 - val_loss: 3.3066 - 349ms/epoch - 17ms/step
Epoch 4/100
20/20 - 0s - loss: 0.1294 - val_loss: 3.2097 - 357ms/epoch - 18ms/step
Epoch 5/100
20/20 - 0s - loss: 0.1060 - val_loss: 3.0862 - 353ms/epoch - 18ms/step
Epoch 6/100
20/20 - 0s - loss: 0.0809 - val_loss: 2.8987 - 358ms/epoch - 18ms/step
Epoch 7/100
20/20 - 0s - loss: 0.0584 - val_loss: 2.7174 - 353ms/epoch - 18ms/step
Epoch 8/100
20/20 - 0s - loss: 0.0478 - val_loss: 2.6685 - 360ms/epoch - 18ms/step
Epoch 9/100
20/20 - 0s - loss: 0.0453 - val_loss: 2.6640 - 359ms/epoch - 18ms/step
Epoch 10/100
20/20 - 0s - loss: 0.0444 - val_loss: 2.6608 - 420ms/epoch - 21ms/step
Epoch 11/100
20/20 - 0s

KeyboardInterrupt: 

<h2>Forecasting with best model</h2>

In [None]:
# Add check for 'RATE_actual' values to avoid division by zero
def compute_accuracy(row):
    if row['RATE_actual'] == 0:
        return np.nan
    else:
        error = abs(row['RATE_actual'] - row['RATE_forecasted'])
        error_proportion = error / row['RATE_actual']
        return (1 - error_proportion) * 100


def forecast_next_weeks(model, look_back, scaler, last_values, n_weeks):
    forecast = []
    for _ in range(n_weeks):
        # Reshape last_values to 2D array with one feature
        last_values_2d = np.array(last_values[-look_back:]).reshape(-1, 1)

        # Scale the last_values_2d to be between 0 and 1
        input_values_scaled = scaler.transform(last_values_2d)

        # Reshape to [samples, time steps, features]
        input_values_scaled = input_values_scaled.reshape((1, look_back, 1))

        # Predict the next value
        prediction = model.predict(input_values_scaled)

        # Rescale the prediction back to the original scale
        prediction_rescaled = scaler.inverse_transform(prediction)

        # Append the predicted value to the forecast list
        forecast.append(prediction_rescaled[0][0])

        # Append the predicted value to the last_values list to be used as input for the next prediction
        last_values.append(prediction_rescaled[0][0])
        # Drop the first value in the last_values list
        last_values.pop(0)

    return forecast


In [None]:
# Check if the best model is LSTM or ARIMA and perform the forecasting
if best_model_name == 'LSTM':
    # The best model for this key is the LSTM
    # Fetch the model
    best_model = results['LSTM']['model']

    # Use the model to make forecasts
    last_values = list(sel_process_old_df['Rate'].values[-look_back:])
    forecasted_values = forecast_next_weeks(
        best_model, look_back, scaler, last_values, weeks)

elif best_model_name == 'ARIMA':
    # The best model for this key is the ARIMA
    # Fetch the model
    best_model = results['ARIMA']['model']

    # Use the model to make forecasts
    forecasted_values = best_model.predict(n_periods=weeks)

else:
    # Unknown model
    print(f"Unknown model: {best_model_name}")


weeks = 12

# Ensure that 'YearMonthWeek' is a datetime object
df['YearMonthWeek'] = pd.to_datetime(df['YearMonthWeek'])
last_date = df['YearMonthWeek'].iloc[-1]

forecasted_dates = pd.date_range(
    start=last_date, periods=weeks+1, freq='W')[1:]

df_forecasted = pd.DataFrame({
    'POD': forecasted_dates,
    'RATE': forecasted_values
})

df_forecasted["RATE"] = df_forecasted["RATE"].round(2)
df_forecasted.head(5)
df_forecasted.tail(5)
df_forecasted.info()

<h4>Forecasting</h4>