<h1>Iterative Nested Forecasting</h1>

In [None]:
import warnings
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

# For machine learning
from pmdarima import auto_arima
from prophet import Prophet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from xgboost import XGBRegressor

# Echo the value of every top-level expression in a cell, not just the last
# one; later cells rely on this to show several `.head()` results at once.
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Apply seaborn's default plot theme.
sns.set()
# Show up to 100 rows when a DataFrame is displayed.
pd.options.display.max_rows = 100
# NOTE(review): this silences every warning for the whole session, including
# deprecation and convergence warnings from the modelling libraries.
warnings.filterwarnings('ignore')

<h4>Importing Datasets</h4>

In [None]:
# Import dataset and clean, ready as a dataframe for creating keys
def createDF(datasets):
    """Load a rate CSV and return a cleaned frame sorted by ``POD``.

    Parameters
    ----------
    datasets : str, path-like or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Only the columns listed in ``sel_col``, with rows containing any
        missing value removed, rows whose ``POD`` year is 2002 removed,
        sorted ascending by ``POD`` and re-indexed from 0.
    """
    df = pd.read_csv(datasets, converters={
                     'PARTY_ID': str, 'COM_ID': str, 'CNTR_SIZE': str})

    df['POD'] = pd.to_datetime(df['POD'])

    # ENCODED_TYPE needs a placeholder for missing values so the column can
    # be cast to int; -1 marks "unknown" and the row is kept.
    df['ENCODED_TYPE'] = df['ENCODED_TYPE'].fillna(-1).astype(int)

    # RATE is the value being forecast. Missing rates are left as NaN so the
    # dropna() below really removes those rows; filling them with -1 (as was
    # done before) turned the drop into a no-op and fed -1 into the models.
    df['RATE'] = df['RATE'].astype(float)

    # Remove every row that still has a missing value in any column.
    df_clean = df.dropna().reset_index(drop=True)

    # Selecting and rearranging columns
    sel_col = ['CSL_ID', 'CNTR_ID', 'POD_ID', 'ETD_POL_D', 'PARTY_ID',
               'PARTY_NAME', 'POD', 'CNTR_SIZE', 'CNTR_TYPE', 'RATE']
    df_fc = df_clean[sel_col]

    # Removing years we do not want to process in our models
    df_filtered = df_fc[df_fc['POD'].dt.year != 2002]

    # Sorting the dates
    df_filtered = df_filtered.sort_values(by='POD').reset_index(drop=True)

    return df_filtered


In [None]:
# Create Dataframes for old and new.
# Forward slashes are used so the relative paths resolve on every platform
# and contain no backslash escape sequences.
old_data = './Datasets/CR_COST_FC.csv'
df1 = createDF(old_data)
df1.head()

new_data = './Datasets/CR_COST_FC_new.csv'
df2 = createDF(new_data)
df2.head()


<h4>Creating Dictionary Keys</h4>

In [None]:
# This function is to filter and create keys
def filter_dataframe(df):
    """Split ``df`` into one frame per port / size / type / party combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``POD_ID``, ``CNTR_SIZE``, ``CNTR_TYPE``,
        ``PARTY_ID`` and ``POD``.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Maps ``"Port_<POD_ID>_Size_<CNTR_SIZE>_Type_<CNTR_TYPE>_PartyID_<PARTY_ID>"``
        to that group's rows, sorted ascending by ``POD`` with an index
        running 0..n-1.
    """
    group_cols = ['POD_ID', 'CNTR_SIZE', 'CNTR_TYPE', 'PARTY_ID']
    filtered_dataframes = {}

    for (port, size, ctype, party_id), group in df.groupby(group_cols):
        df_id = f"Port_{port}_Size_{size}_Type_{ctype}_PartyID_{party_id}"
        # Sort first, then renumber: doing it the other way round leaves the
        # index permuted. A stable sort keeps rows that share a POD in their
        # incoming order.
        filtered_dataframes[df_id] = (
            group.sort_values(by='POD', kind='stable').reset_index(drop=True))

    return filtered_dataframes

In [None]:
# Creating keys from data
print("Old Data keys:")
filtered_dataframe1 = filter_dataframe(df1)
df_ids1 = list(filtered_dataframe1.keys())
print(df_ids1)
print(len(df_ids1))

print("\nNew Data keys:")
filtered_dataframe2 = filter_dataframe(df2)
df_ids2 = list(filtered_dataframe2.keys())
print(df_ids2)
print(len(df_ids2))

# Keys with too few rows are discarded as not having enough data points to
# model. The threshold lives in one place so the filter and the printed
# message cannot disagree (the message used to say 500 while the filter
# used 1000).
MIN_ROWS_PER_KEY = 1000

print(f"\nRemoving keys that have fewer than {MIN_ROWS_PER_KEY} entries:")
# Old data keys
filtered_dataframe1_large = {key: df for key, df in filtered_dataframe1.items()
                             if len(df) >= MIN_ROWS_PER_KEY}
large_df_ids1 = list(filtered_dataframe1_large.keys())
print(large_df_ids1)
print(len(large_df_ids1))
print("\n")

# New data keys
filtered_dataframe2_large = {key: df for key, df in filtered_dataframe2.items()
                             if len(df) >= MIN_ROWS_PER_KEY}
large_df_ids2 = list(filtered_dataframe2_large.keys())
print(large_df_ids2)
print(len(large_df_ids2))

<h2>Data Preprocessing</h2>

<h4>Interpolate missing values in the old dataset and aggregate entries by week</h4>

In [None]:
def interpolate_and_aggregate(df, key):
    """Scale, daily-interpolate and weekly-aggregate the rates of one key.

    Steps:
      1. Scale ``RATE`` with ``RobustScaler`` and report whether the frame
         contains NaNs.
      2. Drop duplicate (``POD``, ``RATE``) pairs and lay the rates on a
         daily calendar from the earliest to the latest ``POD``.
      3. Fill the missing days with first-order polynomial interpolation,
         rounded to 2 decimals.
      4. Bucket the days by the Monday of their week and take the 10 %
         trimmed mean of each week.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single key; needs ``POD``, ``RATE``, ``POD_ID`` and
        ``PARTY_ID``.
        NOTE(review): ``RATE`` is overwritten with the scaled values in the
        caller's frame. This side effect is kept because later cells may
        rely on it - confirm whether it is intended.
    key : str
        Label used only in the printed NaN report.

    Returns
    -------
    pandas.DataFrame
        Columns ``YearMonthWeek`` (Monday of the week), ``Rate``,
        ``POD_ID`` and ``PARTY_ID``, sorted by ``YearMonthWeek``.
    """
    # Initialise RobustScaler (handles outliers); writes into `df` itself.
    scaler = RobustScaler()
    df[['RATE']] = scaler.fit_transform(df[['RATE']])

    # Check if the DataFrame has any NaN values
    if df.isna().any().any():
        print(f"{key} DataFrame contains NaN values.")
    else:
        print(f"{key} DataFrame does not contain NaN values.")

    # Drop duplicates
    sel_df = df.drop_duplicates(subset=['POD', 'RATE']).reset_index(drop=True)

    # POD_ID and PARTY_ID are taken from the first row of the group
    pod_id = df['POD_ID'].iloc[0]
    party_id = df['PARTY_ID'].iloc[0]

    # Daily calendar spanning the observed date range
    new_df = pd.DataFrame({'POD': pd.date_range(
        start=sel_df['POD'].min(), end=sel_df['POD'].max())})

    # Attach the observed rates; days without an observation become NaN
    df_interpolated = pd.merge(
        new_df, sel_df[['POD', 'RATE']], on='POD', how='left')

    # Fill the gaps with a first-order polynomial
    df_interpolated['RATE'] = df_interpolated['RATE'].interpolate(
        method='polynomial', order=1)
    df_interpolated['RATE'] = df_interpolated['RATE'].round(2)

    # YearMonthWeek = the Monday of the week each day falls in
    df_interpolated['YearMonthWeek'] = df_interpolated['POD'] - \
        pd.to_timedelta(df_interpolated['POD'].dt.dayofweek, unit='D')

    # One row per week in the range, tagged with the same Monday label
    all_weeks = pd.date_range(start=df_interpolated['POD'].min(),
                              end=df_interpolated['POD'].max(), freq='W')
    all_weeks_df = pd.DataFrame(all_weeks, columns=['POD'])
    all_weeks_df['YearMonthWeek'] = all_weeks_df['POD'] - \
        pd.to_timedelta(all_weeks_df['POD'].dt.dayofweek, unit='D')

    # Left-merge so every week of the calendar is represented
    merged_df = pd.merge(all_weeks_df, df_interpolated,
                         on=['YearMonthWeek'], how='left')

    # Group on a scalar key (not a one-element list) so each group name is
    # the timestamp itself rather than a 1-tuple. Rows are collected in a
    # list and the frame is built once; DataFrame.append no longer exists
    # in pandas 2.x.
    records = [
        {'YearMonthWeek': week_start,
         # trimmed mean, cutting 10% from each end
         'Rate': stats.trim_mean(week_df['RATE'].dropna().values, 0.1),
         'POD_ID': pod_id,
         'PARTY_ID': party_id}
        for week_start, week_df in merged_df.groupby('YearMonthWeek')
    ]
    agg_df = pd.DataFrame(
        records, columns=['YearMonthWeek', 'Rate', 'POD_ID', 'PARTY_ID'])

    agg_df = agg_df.sort_values(by='YearMonthWeek').reset_index(drop=True)

    return agg_df


# Run the interpolation / weekly aggregation for every retained old-data key
# and collect the results under the same keys.
processed_dfs = {
    key: interpolate_and_aggregate(frame, key)
    for key, frame in filtered_dataframe1_large.items()
}

# Preview dictionary
print()


<h4>Sorting and getting key arrays</h4>

In [None]:
def getPortKeys(keybunch):
    """Return the keys of ``keybunch`` ordered by descending value length.

    Prints ``"Number of rows in <key>: <n>"`` for each key in that order.

    Parameters
    ----------
    keybunch : dict
        Maps a key to a sized object (a DataFrame in this notebook).

    Returns
    -------
    list
        The keys, largest value first. Keys with equal lengths keep their
        original dictionary order.
    """
    # Pair each key with its row count, then rank largest first.
    key_row_counts = {key: len(value) for key, value in keybunch.items()}
    sorted_key_row_counts = sorted(
        key_row_counts.items(), key=lambda item: item[1], reverse=True)

    keybunch_pouch = []
    for key, row_count in sorted_key_row_counts:
        print(f"Number of rows in {key}: {row_count}")
        keybunch_pouch.append(key)

    return keybunch_pouch

In [None]:
# Turn each dictionary into a list of its keys, largest dataframe first.
# Each count is followed by the same blank lines as before.

print('Processed Old Dataset Keybunch:')
process_old_df = getPortKeys(processed_dfs)
print(len(process_old_df), end='\n\n\n')

print('Old Dataset Keybunch:')
old_df = getPortKeys(filtered_dataframe1_large)
print(len(old_df), end='\n\n\n')

print('New Dataset Keybunch:')
new_df = getPortKeys(filtered_dataframe2_large)
print(len(new_df), end='\n\n\n')


<h2>Iterative Nested Forecasting</h2>

<h4>Time-series pipeline to find the best forecasting model for each dataframe</h4>

In [None]:
def create_lagged_features(df, lags):
    """Return a copy of ``df`` with ``Rate_lag_1`` .. ``Rate_lag_<lags>`` added.

    Each ``Rate_lag_<i>`` column is the ``Rate`` column shifted down by
    ``i`` rows, so its first ``i`` entries are NaN. The input frame is not
    modified.
    """
    lag_columns = {
        f'Rate_lag_{step}': df['Rate'].shift(step)
        for step in range(1, lags + 1)
    }
    return df.assign(**lag_columns)


def train_auto_arima(train, test):
    """Fit an auto-selected ARIMA on ``train['Rate']`` and forecast ``len(test)`` steps.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Consecutive slices of the same series; only ``train['Rate']``,
        ``len(test)`` and ``test.index`` are used.

    Returns
    -------
    pandas.Series
        The forecast values, indexed like ``test``.
    """
    # auto_arima returns the selected model already fitted on the series it
    # searched over, so no second fit() call is needed.
    # NOTE(review): seasonal=True is passed without a seasonal period `m`;
    # confirm against the pmdarima docs whether a seasonal model is searched.
    model = auto_arima(train['Rate'], seasonal=True, trace=True,
                       error_action='ignore', suppress_warnings=True)

    forecast = model.predict(n_periods=len(test))

    # The forecast may come back as a Series carrying its own index. Strip
    # it to plain values so they are assigned to test.index by position;
    # otherwise pandas realigns by label and yields NaN wherever the labels
    # differ.
    pred_test = pd.Series(np.asarray(forecast), index=test.index)
    return pred_test


def train_prophet(train, test):
    """Fit Prophet on ``train`` and return its ``yhat`` predictions for ``test``.

    Both frames have ``YearMonthWeek`` renamed to ``ds`` and ``Rate``
    renamed to ``y`` before being handed to Prophet.
    """
    prophet_names = {'YearMonthWeek': 'ds', 'Rate': 'y'}
    history = train.rename(columns=prophet_names)
    future = test.rename(columns=prophet_names)

    model = Prophet()
    model.fit(history)

    return model.predict(future)['yhat']


def train_tree_based_model(model_class, train, test, lags):
    """Fit a scikit-learn-style regressor on lagged rates and predict ``test``.

    Parameters
    ----------
    model_class : type
        Estimator class that can be built with no arguments and exposes
        ``get_params`` / ``set_params`` (the notebook passes
        RandomForestRegressor, SVR and XGBRegressor).
    train, test : pandas.DataFrame
        Must contain ``Rate`` and the ``Rate_lag_1`` .. ``Rate_lag_<lags>``
        columns.
    lags : int
        Number of lag columns used as features.

    Returns
    -------
    numpy.ndarray
        Predictions for the rows of ``test``.
    """
    # Not every estimator takes `random_state` (SVR does not, and passing it
    # to the constructor raises TypeError), so seed only the ones that
    # expose the parameter.
    model = model_class()
    if 'random_state' in model.get_params():
        model.set_params(random_state=42)

    # Impute missing lag values (the first rows of the series) before fitting.
    pipeline = Pipeline(steps=[
        ('imputation', SimpleImputer()),
        ('model', model)
    ])

    features = [f'Rate_lag_{i}' for i in range(1, lags + 1)]
    pipeline.fit(train[features], train['Rate'])
    pred_test = pipeline.predict(test[features])

    return pred_test


def train_ets(train, test):
    """Fit exponential smoothing on ``train['Rate']`` and predict the test span.

    Predictions cover positions ``len(train)`` through
    ``len(train) + len(test) - 1`` of the fitted series.
    """
    first_step = len(train)
    last_step = first_step + len(test) - 1

    fitted = ExponentialSmoothing(endog=train['Rate']).fit()
    return fitted.predict(start=first_step, end=last_step)


# Number of lagged Rate columns fed to the feature-based regressors
lags = 1

# Candidate models. Entries with a "model" class are trained through
# train_function(model_class, train, test, lags); the others through
# train_function(train, test).
models = [
    {"name": "auto_arima", "train_function": train_auto_arima},
    {"name": "prophet", "train_function": train_prophet},
    {"name": "ets", "train_function": train_ets},
    {"name": "RandomForest", "train_function": train_tree_based_model,
        "model": RandomForestRegressor},
    {"name": "SVM", "train_function": train_tree_based_model, "model": SVR},
    {"name": "XGBoost", "train_function": train_tree_based_model, "model": XGBRegressor}
]

# Accumulators must exist before the loop: without them the first append
# raises NameError, and the one inside the except handler is not caught.
# Re-creating them here also makes the cell safe to re-run.
model_results = []
errors = []

for key in processed_dfs.keys():
    df = processed_dfs[key]

    # Create lagged features
    df = create_lagged_features(df, lags)

    # Drop any rows with NaN values in the original Rate column
    df = df.dropna(subset=['Rate'])

    # Chronological 80/20 split into training and testing sets
    train_size = int(len(df) * 0.8)
    train, test = df[:train_size], df[train_size:]

    for model_info in models:
        # Bound outside the try so the except handler can always name the model
        model_name = model_info["name"]
        train_function = model_info["train_function"]
        try:
            if "model" in model_info:
                model = model_info["model"]
                pred_test = train_function(model, train, test, lags)
            else:
                pred_test = train_function(train, test)

            test_rmse = sqrt(mean_squared_error(test['Rate'], pred_test))
            print(f"Model: {model_name}, Key: {key}, Test RMSE: {test_rmse}")
            # The key is stored too, so the best model can be picked per dataframe
            model_results.append(
                {"key": key, "model": model_name, "test_rmse": test_rmse})

        except Exception as e:
            # Best effort: one failing model must not stop the comparison
            error_message = f"Model: {model_name}, Key: {key} failed due to {str(e)}"
            print(error_message)
            errors.append(error_message)


<h2>Forecasting with best model</h2>

<h4>Selecting Dataset(country)</h4>

In [None]:
# Global variable selector.
# `old_df` is the key list returned by getPortKeys for the old dataset,
# ordered by descending row count, so index 1 picks the second-largest key.
sel_country = old_df[1]
print(sel_country)

# Getting the latest data from new vs old as accuracy measure.
# Weekly aggregated (and scaled) series for the selected key.
sel_process_old_df = processed_dfs[sel_country]
sel_process_old_df.head(3)
sel_process_old_df.info()
print("\n")


# Row-level old data for the same key.
# NOTE(review): interpolate_and_aggregate writes the scaled RATE back into
# these frames, so RATE here is no longer in its original units.
sel_old_df = filtered_dataframe1_large[sel_country]
sel_old_df.head(3)
sel_old_df.info()
print("\n")

# Row-level new data for the same key.
# NOTE(review): raises KeyError if the key is not among the new dataset's
# retained keys - confirm it always is.
sel_new_df = filtered_dataframe2_large[sel_country]
sel_new_df.head(3)
sel_new_df.info()

<h4>Forecasting</h4>