In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dropout, LSTM, Dense
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from IPython.core.interactiveshell import InteractiveShell
from sklearn.model_selection import KFold
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
sns.set()
pd.options.display.max_rows = 100


<h4>Importing Datasets</h4>

In [16]:
# Import dataset and clean, ready as a dataframe for creating keys
def createDF(datasets):
    df = pd.read_csv(datasets, converters={
                     'PARTY_ID': str, 'COM_ID': str, 'CNTR_SIZE': str})

    # Formating to type and remove NaN values
    df['POD'] = pd.to_datetime(df['POD'])
    df['ENCODED_TYPE'] = df['ENCODED_TYPE'].fillna(-1).astype(int)
    df = df.dropna(subset=['ENCODED_TYPE'])
    df['RATE'] = df['RATE'].fillna(-1).astype(float)
    df = df.dropna(subset=['RATE'])
    df['ENCODED_TYPE'] = df['ENCODED_TYPE'].astype(int)
    df_clean = df.dropna().reset_index(drop=True)

    # Selecting and rearranging columns
    sel_col = ['CSL_ID', 'CNTR_ID', 'POD_ID', 'ETD_POL_D', 'PARTY_ID',
               'PARTY_NAME', 'POD', 'CNTR_SIZE', 'CNTR_TYPE', 'RATE']
    df_fc = df_clean[sel_col]

    # Removing years we do not want to process in our models
    df_filtered = df_fc[df_fc['POD'].dt.year != 2002]

    # Sorting the dates
    df_filtered = df_filtered.sort_values(by='POD').reset_index(drop=True)

    return df_filtered


In [17]:
# Create Dataframes for old and new
old_data = '.\Datasets\CR_COST_FC.csv'
df1 = createDF(old_data)
df1.head()

new_data = '.\Datasets\CR_COST_FC_new.csv'
df2 = createDF(new_data)
df2.head()


Unnamed: 0,CSL_ID,CNTR_ID,POD_ID,ETD_POL_D,PARTY_ID,PARTY_NAME,POD,CNTR_SIZE,CNTR_TYPE,RATE
0,ECS01050600224,CTNR010050700354,ROTTERDAM,NLRTM,1008827,YANG MING (SINGAPORE) PTE. LTD.,2005-07-15,40,HC NOR,1620.0
1,ECS01050600610,CTNR010050700353,FELIXSTOWE,GBFXT,1002303,GLOBELINK FALLOW LIMITED,2005-07-15,40,GP,1800.0
2,ECS01050600041,CTNR010050700351,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-15,40,HC NOR,2170.0
3,ECS01050600163,CTNR010050700370,PASIR GUDANG,MYPGU,1002767,GLOBELINK CONTAINER LINES (JB) S/B,2005-07-15,40,HC,280.0
4,ECS01050600456,CTNR010050700450,KARACHI (KICT),PKKHI,1002783,GLOBELINK PAKISTAN (PVT) LTD,2005-07-16,40,HC,1625.0


Unnamed: 0,CSL_ID,CNTR_ID,POD_ID,ETD_POL_D,PARTY_ID,PARTY_NAME,POD,CNTR_SIZE,CNTR_TYPE,RATE
0,ECS01050600224,CTNR010050700354,ROTTERDAM,NLRTM,1008827,YANG MING (SINGAPORE) PTE. LTD.,2005-07-15,40,HC NOR,1620.0
1,ECS01050600610,CTNR010050700353,FELIXSTOWE,GBFXT,1002303,GLOBELINK FALLOW LIMITED,2005-07-15,40,GP,1800.0
2,ECS01050600041,CTNR010050700351,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-15,40,HC NOR,2170.0
3,ECS01050600163,CTNR010050700370,PASIR GUDANG,MYPGU,1002767,GLOBELINK CONTAINER LINES (JB) S/B,2005-07-15,40,HC,280.0
4,ECS01050600456,CTNR010050700450,KARACHI (KICT),PKKHI,1002783,GLOBELINK PAKISTAN (PVT) LTD,2005-07-16,40,HC,1625.0


<h4>Creating Dictionary Keys</h4>

In [18]:
def filter_dataframe(df):
    filtered_dataframes = {}

    for (port, size, ctype, party_id), group in df.groupby(['POD_ID', 'CNTR_SIZE', 'CNTR_TYPE', 'PARTY_ID']):
        group = group.reset_index(drop=True).sort_values(by='POD')
        df_id = f"Port_{port}_Size_{size}_Type_{ctype}_PartyID_{party_id}"
        filtered_dataframes[df_id] = group

    return filtered_dataframes


In [19]:
# Creating keys from data
print("Old Data keys:")
filtered_dataframe1 = filter_dataframe(df1)
df_ids1 = list(filtered_dataframe1.keys())
print(list(df_ids1))
print(len(list(df_ids1)))

print("\nNew Data keys:")
filtered_dataframe2 = filter_dataframe(df2)
df_ids2 = list(filtered_dataframe2.keys())
print(list(df_ids2))
print(len(list(df_ids2)))


Old Data keys:
['Port_(CONSTANZA)_Size_40_Type_GP_PartyID_010007816', 'Port_(CONSTANZA)_Size_40_Type_HC_PartyID_010007816', 'Port_AARHUS_Size_20_Type_GP_PartyID_01000043', 'Port_AARHUS_Size_20_Type_GP_PartyID_0100027830', 'Port_AARHUS_Size_20_Type_GP_PartyID_010006666', 'Port_AARHUS_Size_40_Type_GP_PartyID_01000043', 'Port_AARHUS_Size_40_Type_HC_PartyID_01000043', 'Port_AARHUS_Size_40_Type_HC_PartyID_0100027830', 'Port_AARHUS_Size_40_Type_HC_PartyID_0100028193', 'Port_AARHUS_Size_40_Type_HC_PartyID_010006666', 'Port_ABIDJAN_Size_20_Type_GP_PartyID_0100027878', 'Port_ABIDJAN_Size_40_Type_HC_PartyID_010021097', 'Port_ADELAIDE_Size_20_Type_GP_PartyID_01002775', 'Port_ADELAIDE_Size_40_Type_GP_PartyID_01002775', 'Port_ADELAIDE_Size_40_Type_HC_PartyID_01002775', 'Port_ADELAIDE_Size_40_Type_HC_PartyID_01005078', 'Port_ADELAIDE_Size_40_Type_HC NOR_PartyID_01002775', 'Port_ADELAIDE(AU)_Size_20_Type_GP_PartyID_01002775', 'Port_ADELAIDE(AU)_Size_40_Type_HC_PartyID_01002775', 'Port_ADELAIDE(AU)_Si

<h4>Removing data keys that has less then 500 rows as they have not enough points for LSTM</h4>

In [20]:
# Old data keys
filtered_dataframe1_large = {key: df for key, df in filtered_dataframe1.items() if len(df) >= 500}
large_df_ids1 = list(filtered_dataframe1_large.keys())
print(list(large_df_ids1))
print(len(list(large_df_ids1)))

# New data keys
filtered_dataframe2_large = {key: df for key, df in filtered_dataframe2.items() if len(df) >= 500}
large_df_ids2 = list(filtered_dataframe2_large.keys())
print(list(large_df_ids2))
print(len(list(large_df_ids2)))

['Port_AUCKLAND_Size_40_Type_HC_PartyID_01005136', 'Port_AUCKLAND_Size_40_Type_HC NOR_PartyID_01005136', 'Port_BANGKOK_Size_40_Type_HC_PartyID_01002799', 'Port_BRISBANE_Size_40_Type_HC_PartyID_01002776', 'Port_BUSAN_Size_40_Type_HC_PartyID_010004286', 'Port_DUBAI (JEBEL ALI)_Size_40_Type_HC_PartyID_01002788', 'Port_FELIXSTOWE_Size_40_Type_HC_PartyID_01002303', 'Port_FREMANTLE_Size_40_Type_HC_PartyID_01002777', 'Port_HAIPHONG_Size_40_Type_HC_PartyID_010005255', 'Port_HOCHIMINH CAT LAI,VIETNAM_Size_40_Type_HC_PartyID_010005256', 'Port_LAEM CHABANG_Size_40_Type_HC_PartyID_01002799', 'Port_LYTTELTON_Size_40_Type_HC_PartyID_01005136', 'Port_MADRAS/CHENNAI_Size_40_Type_HC_PartyID_010006979', 'Port_MANILA (NORTH HARBOUR)_Size_40_Type_HC_PartyID_010021727', 'Port_MANILA (NORTH HARBOUR)_Size_40_Type_HC_PartyID_01004969', 'Port_MELBOURNE_Size_40_Type_HC_PartyID_01002778', 'Port_NHAVA SHEVA_Size_40_Type_HC_PartyID_01002787', 'Port_PIRAEUS_Size_40_Type_HC_PartyID_010006369', 'Port_PORT LOUIS_Size_

<h4>Interpolate missing values in between dates and group each key to weeks</h4>

In [22]:
from scipy.interpolate import UnivariateSpline

def interpolate_and_group(groups):
    all_agg_dfs = {}

    for df_id, group in groups.items():
        group = group.reset_index(drop=True)  # Reset index after sorting
        x = group.index
        y = group['RATE']

        # Spline interpolation of order 1
        spl = UnivariateSpline(x, y, k=1, s=0)
        group['interpolated_rate'] = spl(x)

        # Create YearMonthWeek directly from the 'POD'
        group['YearMonthWeek'] = group['POD'] - pd.to_timedelta(group['POD'].dt.dayofweek, unit='D')

        # Create a new dataframe with every week in the range
        all_weeks = pd.date_range(start=group['POD'].min(), end=group['POD'].max(), freq='W')
        all_weeks_df = pd.DataFrame(all_weeks, columns=['POD'])

        # Create YearMonthWeek in all_weeks_df
        all_weeks_df['YearMonthWeek'] = all_weeks_df['POD'] - pd.to_timedelta(all_weeks_df['POD'].dt.dayofweek, unit='D')

        # Merge this with your original dataframe
        merged_df = pd.merge(all_weeks_df, group, on=['YearMonthWeek'], how='left')

        # Now you can group by YearMonthWeek and compute your rate
        grouped = merged_df.groupby(['YearMonthWeek'])

        agg_df = pd.DataFrame(columns=['YearMonthWeek', 'interpolated_rate'])

        for group_name, group_df in grouped:
            year_month_week = group_name

            # Calculate sum and skewness of interpolated_rate values
            rate_sum = group_df['interpolated_rate'].sum()
            rate_skew = group_df['interpolated_rate'].skew()

            # Calculate mean or median based on skewness
            if rate_skew < 0:
                rate_metric = group_df['interpolated_rate'].mean()
            else:
                rate_metric = group_df['interpolated_rate'].median()

            # Get the first non-null POD_ID and PARTY_ID in the group
            pod_id = group_df['POD_ID'].dropna().iloc[0]
            party_id = group_df['PARTY_ID'].dropna().iloc[0]

            new_row = {
                'YearMonthWeek': year_month_week,
                'interpolated_rate': rate_metric,
                'POD_ID': pod_id,
                'PARTY_ID': party_id
            }

            # Append row to aggregated dataframe
            agg_df = agg_df.append(new_row, ignore_index=True)

        agg_df = agg_df.sort_values(by='YearMonthWeek').reset_index(drop=True)

        all_agg_dfs[df_id] = agg_df

    return all_agg_dfs

interpolated_and_grouped = interpolate_and_group(filtered_dataframe1_large)

  for group_name, group_df in grouped:
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  agg_df = agg_df.append(new_row, ignore_index=T

TypeError: float() argument must be a string or a number, not 'NaTType'

In [None]:
def getPortKeys(keybunch):
    keybunch_pouch = []
    # Create a dictionary with corresponding dataframes
    keybunch_subset = {}
    
    # Get a dictionary with key and number of rows for each dataframe in filtered_dataframes
    key_row_counts = {key: len(keybunch[key]) for key in keybunch}

    # Sort the key_row_counts dictionary by value (number of rows) in descending order
    sorted_key_row_counts = sorted(
        key_row_counts.items(), key=lambda item: item[1], reverse=True)

    for key, row_count in sorted_key_row_counts:
            keybunch_subset[key] = keybunch[key]
            print(f"Number of rows in {key}: {row_count}")
            keybunch_pouch.append(key)

    # Return array of keys
    return keybunch_pouch

In [None]:
print('Old Dataset Keybunch (interpolated and grouped weekly):')
processed_df = getPortKeys(interpolated_and_grouped)
print(len(processed_df))
print('\n')

print('Old Dataset Keybunch:')
old_df = getPortKeys(filtered_dataframe1_large)
print(len(old_df))
print('\n')

print('New Dataset Keybunch:')
new_df = getPortKeys(filtered_dataframe2_large)
print(len(new_df))

In [None]:
# Global variable selector
sel_country = old_df[1]
print(sel_country)

In [None]:
# Getting the latest data from new vs old as accuracy measure
sel_old_df = filtered_dataframe1_large[sel_country]
sel_old_df.head(3)
sel_old_df.info()
print("\n")

sel_new_df = filtered_dataframe2_large[sel_country]
sel_new_df.head(3)
sel_new_df.info()

sel_processed_df = interpolated_and_grouped[sel_country]
sel_processed_df.head(3)
sel_processed_df.info()

In [None]:
# Select columns and casting float type
sel_col = ['YearMonthWeek','interpolated_rate']
sel_processed_df = sel_processed_df[sel_col]

<h4>Latest datapoints from Latest dataframe for comparing after forecasting (Measure accuracy)</h4>

In [None]:
max_date_in_old = sel_df['POD'].max()

# Create a new dataframe that only includes rows from the latest dataframe where the date is greater than the maximum date in the old dataframe
new_dates_df = latest_sel_df[latest_sel_df['POD']
                             > max_date_in_old].reset_index(drop=True)

# Print the new dataframe
new_dates_df.head(3)
new_dates_df.tail(3)
new_dates_df.info()

In [None]:
plt.figure(figsize=(20, 10))
plt.plot(sel_df['POD'], sel_df['RATE'], color='blue', label="Actual Data")
plt.plot(agg_df['YearMonthWeek'], agg_df['Rate'],
         color='red', label="Aggregated Data(weeks)")

plt.xlabel('Date(Year Month Week)')
plt.ylabel('Cost Rate(USD)')
plt.title(old_df[1])
plt.legend()
plt.show()


<h4>Mean Square Error Evaluation Function</h4>

In [None]:
# Mean Square Error Function:
def calculate_RMSE(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


<h4>LSTM Regression<h4>

In [None]:
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from keras.regularizers import l1_l2
from keras.wrappers.scikit_learn import KerasRegressor
from bayes_opt import BayesianOptimization
from keras.models import load_model
from keras.layers import LSTM, Dense
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Define the bounds of the parameters for the Bayesian Optimization
pbounds = {
    'look_back': (1, 10),
    'neurons': (32, 128),
    'dropout': (0.1, 0.5),
    'lr': (0.0001, 0.01),
    'epochs': (10, 100),
    'batch_size': (32, 256)
}

# Restructure time series data for LSTM model
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - 1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return np.array(dataX), np.array(dataY)

# add l1_l2 regularization
def create_model(dropout_rate=0.0, l1_factor=0.0, l2_factor=0.0, learning_rate=0.001, neurons=64, look_back=5):
    model = Sequential()
    model.add(LSTM(neurons, return_sequences=True, input_shape=(trainX.shape[1], trainX.shape[2]), activation='tanh', recurrent_activation='hard_sigmoid', kernel_regularizer=l1_l2(l1_factor, l2_factor)))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(neurons//2, activation='tanh'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate))
    return model

def train_LSTM_model(look_back=5, neurons=64, dropout=0.3, lr=0.001, epochs=50, batch_size=64):
    look_back = int(look_back)
    neurons = int(neurons)
    epochs = int(epochs)
    batch_size = int(batch_size)

    model = create_model(dropout, 0.0, 0.0, lr, neurons, look_back)

    # Fit the model and store the history
    history = model.fit(trainX, trainY, epochs=epochs, batch_size=batch_size, verbose=2, validation_data=(testX, testY))

    return model, history

def evaluate_model(look_back=5, neurons=64, dropout=0.3, lr=0.001, epochs=50, batch_size=64):
    _, history = train_LSTM_model(look_back, neurons, dropout, lr, epochs, batch_size)

    # Return the validation loss of the last epoch
    return -history.history['val_loss'][-1]

# Ensemble prediction
def ensemble_predictions(members, testX):
    # make predictions
    yhats = [model.predict(testX) for model in members]
    yhats = np.array(yhats)
    # sum across ensemble members
    summed = np.sum(yhats, axis=0)
    # argmax across classes
    result = summed / len(members)
    return result, testY[-result.shape[0]:]


In [None]:
# Normalize dataset for LSTM
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(agg_df['Rate'].values.reshape(-1,1))

# Split into train and test sets
train_size = int(len(dataset) * 0.70)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]

# Reshape into X=t and Y=t+1, timestep  look_back
look_back = 5
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)

# Reshape input to be [samples, time steps, features]
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

# Initialize the optimizer
optimizer = BayesianOptimization(
    f=evaluate_model,
    pbounds=pbounds,
    random_state=1,
)

# Optimize the model
optimizer.maximize(init_points=10, n_iter=50)

# Extract the best parameters
params = optimizer.max['params']

# Train the model with the optimized hyperparameters
print('Training with optimized parameters...')
model, history = train_LSTM_model(look_back=params['look_back'], neurons=params['neurons'], dropout=params['dropout'], lr=params['lr'], epochs=params['epochs'], batch_size=params['batch_size'])


In [None]:
# add early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# save ensemble models
saved_models = []
for i in range(n_members):
    model.fit(trainX, trainY, validation_data=(testX, testY), epochs=100, verbose=0, callbacks=[early_stopping])
    filename = 'model_' + str(i + 1) + '.h5'
    model.save(filename)
    saved_models.append(filename)

# load models
members = [load_model(model) for model in saved_models]

# Get ensemble predictions
ensemble_predictions = np.zeros((testX.shape[0], 1))
for model in members:
    predictions = model.predict(testX)
    ensemble_predictions += predictions
ensemble_predictions /= len(members)

# Reshape ensemble predictions and apply inverse transformation
ensemble_predictions = scaler.inverse_transform(ensemble_predictions)
ensemble_testY = scaler.inverse_transform(testY.reshape(-1, 1))

# Get ensemble RMSE
ensemble_rmse = calculate_RMSE(ensemble_testY, ensemble_predictions)
print(f'Ensemble RMSE: {ensemble_rmse:.2f}')

<h4>Forecast the results</h4>

In [None]:
def forecast_next_week(model, look_back, scaler, last_values, n_weeks):
    forecast = []
    input_values = np.array(last_values)
    for _ in range(n_weeks):
        input_values_scaled = scaler.transform(input_values[-look_back:].reshape(-1, 1))
        input_values_scaled = input_values_scaled.reshape((1, 1, look_back))
        prediction = model.predict(input_values_scaled)
        prediction = scaler.inverse_transform(prediction)
        forecast.append(prediction[0, 0])
        input_values = np.append(input_values, prediction)
    return forecast

In [None]:
# How many weeks you want to forecast
weeks = 12

# Get the last "look_back" values
last_values = list(agg_df['Rate'].values[-look_back:])

# Forecast the next weeks using the individual models
forecasted_values = []
for model in members:
    forecast = forecast_next_week(
        model, look_back, scaler, last_values, weeks)
    forecasted_values.append(forecast)

# Calculate the ensemble forecast by taking the average of individual forecasts
ensemble_forecast = np.mean(forecasted_values, axis=0)

# Get the last date from 'YearMonthWeek'
last_date = agg_df['YearMonthWeek'].iloc[-1]

# Create new dates
new_dates = pd.date_range(
    last_date + pd.DateOffset(weeks=1), periods=weeks, freq='W')

# Create a new DataFrame for the forecasted values
df_forecasted = pd.DataFrame(
    data={'POD': new_dates, 'RATE': ensemble_forecast})

# Rounding of the rate nearest 2 decimal points
df_forecasted["RATE"] = df_forecasted["RATE"].round(2)

df_forecasted.head(5)
df_forecasted.tail(5)
df_forecasted.info()


<h4>Comparing with actual updated against forecasted</h4>

In [None]:
# Create an empty DataFrame to store comparison data
comparison_df = pd.DataFrame(columns=[
                             'WeekStart', 'WeekEnd', 'POD_actual', 'RATE_forecasted', 'RATE_actual', 'error', 'accuracy'])
df_forecasted['WeekEnd'] = df_forecasted['POD'] + pd.to_timedelta(7, unit='d')

# Loop over df_forecasted
for _, row in df_forecasted.iterrows():
    # Find the actual dates within the week of the forecasted date
    # Add a 'WeekEnd' column to df_forecasted
    mask = (new_dates_df['POD'] >= row['POD']) & (
        new_dates_df['POD'] < row['WeekEnd'])
    actual_dates_within_week = new_dates_df[mask]

    # Calculate the absolute error and accuracy for each actual date within the week
    for _, actual_row in actual_dates_within_week.iterrows():
        error = abs(actual_row['RATE'] - row['RATE'])
        error_proportion = error / actual_row['RATE']
        accuracy = (1 - error_proportion) * 100

        # Append the data to comparison_df
        comparison_df = comparison_df.append({
            'WeekStart': row['POD'],
            'WeekEnd': row['WeekEnd'],
            'POD_actual': actual_row['POD'],
            'RATE_forecasted': row['RATE'],
            'RATE_actual': actual_row['RATE'],
            'error': error,
            'accuracy': accuracy
        }, ignore_index=True)

# Remove duplicates based on 'POD_actual', 'RATE_forecasted', and 'RATE_actual'
comparison_df = comparison_df.drop_duplicates(
    subset=['POD_actual', 'RATE_forecasted', 'RATE_actual']).reset_index(drop=True)

# Display the comparison dataframe
comparison_df
comparison_df.info()

total_mean_accuracy = comparison_df['accuracy'].mean()
print(f'The mean accuracy is {total_mean_accuracy:.2f}%')


In [None]:
# # Identify rows where accuracy is negative
# negative_accuracy_df = comparison_df[comparison_df['accuracy'] < 0]

# # Display these rows
# negative_accuracy_df

# # Check if error_proportion is greater than 1
# large_error_df = comparison_df[comparison_df['error'] /
#                                comparison_df['RATE_actual'] > 1]

# # Display these rows
# large_error_df


<h4>Visualise all, Conclusion</h4>

In [None]:
# plt.figure(figsize=(20, 10))
# plt.plot(sel_df['POD'], sel_df['RATE'], color='blue', label="Actual Data")
# plt.plot(new_dates_df['POD'], new_dates_df['RATE'],
#          color='blue', label="Actual Data (Updated)")

# plt.plot(df_interpolated['POD'], df_interpolated['RATE'],
#          color='green', label="Aggregated Data")
# plt.plot(df_forecasted['POD'], df_forecasted['RATE'],
#          color='red', label="Forecasted Data")

# plt.xlabel('Date(Year Month)')
# plt.ylabel('Cost Rate(USD)')
# plt.title(old_df[1])
# plt.legend()
# plt.show()
