In [61]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dropout, LSTM, Dense
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from IPython.core.interactiveshell import InteractiveShell
from sklearn.model_selection import KFold
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
sns.set()
pd.options.display.max_rows = 100


<h4>Importing Datasets</h4>

In [62]:
# Import dataset and clean, ready as a dataframe for creating keys
def createDF(datasets):
    df = pd.read_csv(datasets, converters={
                     'PARTY_ID': str, 'COM_ID': str, 'CNTR_SIZE': str})

    # Formating to type and remove NaN values
    df['POD'] = pd.to_datetime(df['POD'])
    df['ENCODED_TYPE'] = df['ENCODED_TYPE'].fillna(-1).astype(int)
    df = df.dropna(subset=['ENCODED_TYPE'])
    df['RATE'] = df['RATE'].fillna(-1).astype(float)
    df = df.dropna(subset=['RATE'])
    df['ENCODED_TYPE'] = df['ENCODED_TYPE'].astype(int)
    df_clean = df.dropna().reset_index(drop=True)

    # Selecting and rearranging columns
    sel_col = ['CSL_ID', 'CNTR_ID', 'POD_ID', 'ETD_POL_D', 'PARTY_ID',
               'PARTY_NAME', 'POD', 'CNTR_SIZE', 'CNTR_TYPE', 'RATE']
    df_fc = df_clean[sel_col]

    # Removing years we do not want to process in our models
    df_filtered = df_fc[df_fc['POD'].dt.year != 2002]

    # Sorting the dates
    df_filtered = df_filtered.sort_values(by='POD').reset_index(drop=True)

    return df_filtered


In [63]:
# Create Dataframes for old and new
old_data = '.\Datasets\CR_COST_FC.csv'
df1 = createDF(old_data)
df1.head()

new_data = '.\Datasets\CR_COST_FC_new.csv'
df2 = createDF(new_data)
df2.head()


Unnamed: 0,CSL_ID,CNTR_ID,POD_ID,ETD_POL_D,PARTY_ID,PARTY_NAME,POD,CNTR_SIZE,CNTR_TYPE,RATE
0,ECS01050600224,CTNR010050700354,ROTTERDAM,NLRTM,1008827,YANG MING (SINGAPORE) PTE. LTD.,2005-07-15,40,HC NOR,1620.0
1,ECS01050600610,CTNR010050700353,FELIXSTOWE,GBFXT,1002303,GLOBELINK FALLOW LIMITED,2005-07-15,40,GP,1800.0
2,ECS01050600041,CTNR010050700351,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-15,40,HC NOR,2170.0
3,ECS01050600163,CTNR010050700370,PASIR GUDANG,MYPGU,1002767,GLOBELINK CONTAINER LINES (JB) S/B,2005-07-15,40,HC,280.0
4,ECS01050600456,CTNR010050700450,KARACHI (KICT),PKKHI,1002783,GLOBELINK PAKISTAN (PVT) LTD,2005-07-16,40,HC,1625.0


Unnamed: 0,CSL_ID,CNTR_ID,POD_ID,ETD_POL_D,PARTY_ID,PARTY_NAME,POD,CNTR_SIZE,CNTR_TYPE,RATE
0,ECS01050600224,CTNR010050700354,ROTTERDAM,NLRTM,1008827,YANG MING (SINGAPORE) PTE. LTD.,2005-07-15,40,HC NOR,1620.0
1,ECS01050600610,CTNR010050700353,FELIXSTOWE,GBFXT,1002303,GLOBELINK FALLOW LIMITED,2005-07-15,40,GP,1800.0
2,ECS01050600041,CTNR010050700351,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-15,40,HC NOR,2170.0
3,ECS01050600163,CTNR010050700370,PASIR GUDANG,MYPGU,1002767,GLOBELINK CONTAINER LINES (JB) S/B,2005-07-15,40,HC,280.0
4,ECS01050600456,CTNR010050700450,KARACHI (KICT),PKKHI,1002783,GLOBELINK PAKISTAN (PVT) LTD,2005-07-16,40,HC,1625.0


<h4>Creating Dictionary Keys</h4>

In [64]:
# This function is to filter and create keys
def filter_dataframe(df):
    filtered_dataframes = {}

    for (port, size, ctype, party_id), group in df.groupby(['POD_ID', 'CNTR_SIZE', 'CNTR_TYPE', 'PARTY_ID']):
        group = group.reset_index(drop=True).sort_values(by='POD')
        df_id = f"Port_{port}_Size_{size}_Type_{ctype}_PartyID_{party_id}"
        filtered_dataframes[df_id] = group

    return filtered_dataframes


In [65]:
# Creating keys from data
print("Old Data keys:")
filtered_dataframe1 = filter_dataframe(df1)
df_ids1 = list(filtered_dataframe1.keys())
print(list(df_ids1))
print(len(list(df_ids1)))

print("\nNew Data keys:")
filtered_dataframe2 = filter_dataframe(df2)
df_ids2 = list(filtered_dataframe2.keys())
print(list(df_ids2))
print(len(list(df_ids2)))

# Removing Keys that have less then 500 rows as it is not enough data points for LSTM
print("\nRemoving keys that has less then 500 entries:")
# Old data keys
filtered_dataframe1_large = {key: df for key, df in filtered_dataframe1.items() if len(df) >= 500}
large_df_ids1 = list(filtered_dataframe1_large.keys())
print(list(large_df_ids1))
print(len(list(large_df_ids1)))
print("\n")

# New data keys
filtered_dataframe2_large = {key: df for key, df in filtered_dataframe2.items() if len(df) >= 500}
large_df_ids2 = list(filtered_dataframe2_large.keys())
print(list(large_df_ids2))
print(len(list(large_df_ids2)))

Old Data keys:
['Port_(CONSTANZA)_Size_40_Type_GP_PartyID_010007816', 'Port_(CONSTANZA)_Size_40_Type_HC_PartyID_010007816', 'Port_AARHUS_Size_20_Type_GP_PartyID_01000043', 'Port_AARHUS_Size_20_Type_GP_PartyID_0100027830', 'Port_AARHUS_Size_20_Type_GP_PartyID_010006666', 'Port_AARHUS_Size_40_Type_GP_PartyID_01000043', 'Port_AARHUS_Size_40_Type_HC_PartyID_01000043', 'Port_AARHUS_Size_40_Type_HC_PartyID_0100027830', 'Port_AARHUS_Size_40_Type_HC_PartyID_0100028193', 'Port_AARHUS_Size_40_Type_HC_PartyID_010006666', 'Port_ABIDJAN_Size_20_Type_GP_PartyID_0100027878', 'Port_ABIDJAN_Size_40_Type_HC_PartyID_010021097', 'Port_ADELAIDE_Size_20_Type_GP_PartyID_01002775', 'Port_ADELAIDE_Size_40_Type_GP_PartyID_01002775', 'Port_ADELAIDE_Size_40_Type_HC_PartyID_01002775', 'Port_ADELAIDE_Size_40_Type_HC_PartyID_01005078', 'Port_ADELAIDE_Size_40_Type_HC NOR_PartyID_01002775', 'Port_ADELAIDE(AU)_Size_20_Type_GP_PartyID_01002775', 'Port_ADELAIDE(AU)_Size_40_Type_HC_PartyID_01002775', 'Port_ADELAIDE(AU)_Si

<h4>Interpolate old_df missing values and group missing entries by weeks</h4>

In [66]:
sel_old_df = filtered_dataframe1_large[old_df[1]]
sel_old_df.head(3)
sel_old_df.info()

Unnamed: 0,CSL_ID,CNTR_ID,POD_ID,ETD_POL_D,PARTY_ID,PARTY_NAME,POD,CNTR_SIZE,CNTR_TYPE,RATE
0,ECS01050600140,CTNR010050700496,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-19,40,HC,2200.0
1,ECS01050600140,CTNR010050700495,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-19,40,HC,2200.0
2,ECS010050700037,CTNR010050700657,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-21,40,HC,2300.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1688 entries, 0 to 1687
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   CSL_ID      1688 non-null   object        
 1   CNTR_ID     1688 non-null   object        
 2   POD_ID      1688 non-null   object        
 3   ETD_POL_D   1688 non-null   object        
 4   PARTY_ID    1688 non-null   object        
 5   PARTY_NAME  1688 non-null   object        
 6   POD         1688 non-null   datetime64[ns]
 7   CNTR_SIZE   1688 non-null   object        
 8   CNTR_TYPE   1688 non-null   object        
 9   RATE        1688 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(8)
memory usage: 145.1+ KB


In [67]:
def interpolate_and_aggregate(df):
    # Drop duplicates
    sel_df = df.drop_duplicates(subset=['POD', 'RATE']).reset_index(drop=True)
    
    # Extract POD_ID and PARTY_ID from the first row
    pod_id = df['POD_ID'].iloc[0]
    party_id = df['PARTY_ID'].iloc[0]

    # Create a new dataframe with a date range from min to max date in your dataframe
    new_df = pd.DataFrame()
    new_df['POD'] = pd.date_range(start=sel_df['POD'].min(), end=sel_df['POD'].max())

    # Merge the original dataframe with the new one
    df_interpolated = pd.merge(new_df, sel_df[['POD', 'RATE']], on='POD', how='left')

    # Perform spline interpolation
    df_interpolated['RATE'] = df_interpolated['RATE'].interpolate(method='spline', order=1)
    df_interpolated['RATE'] = df_interpolated['RATE'].round(2)

    # Create YearMonthWeek directly from the 'POD'
    df_interpolated['YearMonthWeek'] = df_interpolated['POD'] - pd.to_timedelta(df_interpolated['POD'].dt.dayofweek, unit='D')

    # Create a new dataframe with every week in the range
    all_weeks = pd.date_range(start=df_interpolated['POD'].min(), end=df_interpolated['POD'].max(), freq='W')
    all_weeks_df = pd.DataFrame(all_weeks, columns=['POD'])

    # Create YearMonthWeek in all_weeks_df
    all_weeks_df['YearMonthWeek'] = all_weeks_df['POD'] - pd.to_timedelta(all_weeks_df['POD'].dt.dayofweek, unit='D')

    # Merge this with your original dataframe
    merged_df = pd.merge(all_weeks_df, df_interpolated, on=['YearMonthWeek'], how='left')

    # Group by YearMonthWeek and compute your rate
    grouped = merged_df.groupby(['YearMonthWeek'])

    agg_df = pd.DataFrame(columns=['YearMonthWeek', 'Rate','POD_ID','PARTY_ID'])

    for group_name, group_df in grouped:
        year_month_week = group_name

        # Calculate skewness of RATE values
        rate_skew = group_df['RATE'].skew()

        # Calculate mean or median based on skewness
        if rate_skew < 0:
            rate_metric = group_df['RATE'].mean()
        else:
            rate_metric = group_df['RATE'].median()

        new_row = {'YearMonthWeek': year_month_week, 'Rate': rate_metric, 'POD_ID': pod_id, 'PARTY_ID': party_id}

        # Append row to aggregated dataframe
        agg_df = agg_df.append(new_row, ignore_index=True)

    agg_df = agg_df.sort_values(by='YearMonthWeek').reset_index(drop=True)

    return agg_df


# dictionary to store the results
processed_dfs = {}

# loop over all keys in the original dictionary
for key in filtered_dataframe1_large.keys():
    processed_dfs[key] = interpolate_and_aggregate(filtered_dataframe1_large[key])

# Preview dictionary
print()

  for group_name, group_df in grouped:
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_d




  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index=True)
  agg_df = agg_df.append(new_row, ignore_index

<h4>Sorting and getting key arrays</h4>

In [68]:
def getPortKeys(keybunch):
    keybunch_pouch = []
    # Create a dictionary with corresponding dataframes
    keybunch_subset = {}
    
    # Get a dictionary with key and number of rows for each dataframe in filtered_dataframes
    key_row_counts = {key: len(keybunch[key]) for key in keybunch}

    # Sort the key_row_counts dictionary by value (number of rows) in descending order
    sorted_key_row_counts = sorted(
        key_row_counts.items(), key=lambda item: item[1], reverse=True)

    for key, row_count in sorted_key_row_counts:
            keybunch_subset[key] = keybunch[key]
            print(f"Number of rows in {key}: {row_count}")
            keybunch_pouch.append(key)

    # Return array of keys
    return keybunch_pouch

In [69]:
# This is changing it to an array

print('Processed Old Dataset Keybunch:')
process_old_df= getPortKeys(processed_dfs)
print(len(process_old_df))
print('\n')

print('Old Dataset Keybunch:')
old_df= getPortKeys(filtered_dataframe1_large)
print(len(old_df))
print('\n')

print('New Dataset Keybunch:')
new_df= getPortKeys(filtered_dataframe2_large)
print(len(new_df))
print('\n')


Processed Old Dataset Keybunch:
Number of rows in Port_AUCKLAND_Size_40_Type_HC_PartyID_01005136: 918
Number of rows in Port_DUBAI (JEBEL ALI)_Size_40_Type_HC_PartyID_01002788: 918
Number of rows in Port_FREMANTLE_Size_40_Type_HC_PartyID_01002777: 918
Number of rows in Port_MELBOURNE_Size_40_Type_HC_PartyID_01002778: 918
Number of rows in Port_FELIXSTOWE_Size_40_Type_HC_PartyID_01002303: 917
Number of rows in Port_AUCKLAND_Size_40_Type_HC NOR_PartyID_01005136: 916
Number of rows in Port_SYDNEY_Size_40_Type_HC_PartyID_01002779: 915
Number of rows in Port_BRISBANE_Size_40_Type_HC_PartyID_01002776: 909
Number of rows in Port_BUSAN_Size_40_Type_HC_PartyID_010004286: 908
Number of rows in Port_LYTTELTON_Size_40_Type_HC_PartyID_01005136: 863
Number of rows in Port_VALPARAISO_Size_40_Type_HC NOR_PartyID_010006350: 854
Number of rows in Port_PIRAEUS_Size_40_Type_HC_PartyID_010006369: 837
Number of rows in Port_HAIPHONG_Size_40_Type_HC_PartyID_010005255: 575
Number of rows in Port_NHAVA SHEVA_S

In [71]:
# Global variable selector
sel_country = old_df[1]
print(sel_country)

# Getting the latest data from new vs old as accuracy measure
sel_process_old_df = processed_dfs[sel_country]
sel_process_old_df.head(3)
sel_process_old_df.info()
print("\n")


sel_old_df = filtered_dataframe1_large[sel_country]
sel_old_df.head(3)
sel_old_df.info()
print("\n")

sel_new_df = filtered_dataframe2_large[sel_country]
sel_new_df.head(3)
sel_new_df.info()

Port_AUCKLAND_Size_40_Type_HC_PartyID_01005136


Unnamed: 0,YearMonthWeek,Rate,POD_ID,PARTY_ID
0,2005-07-18,2255.001667,AUCKLAND,1005136
1,2005-07-25,2388.568571,AUCKLAND,1005136
2,2005-08-01,2261.425714,AUCKLAND,1005136


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   YearMonthWeek  918 non-null    datetime64[ns]
 1   Rate           918 non-null    float64       
 2   POD_ID         918 non-null    object        
 3   PARTY_ID       918 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 28.8+ KB




Unnamed: 0,CSL_ID,CNTR_ID,POD_ID,ETD_POL_D,PARTY_ID,PARTY_NAME,POD,CNTR_SIZE,CNTR_TYPE,RATE
0,ECS01050600140,CTNR010050700496,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-19,40,HC,2200.0
1,ECS01050600140,CTNR010050700495,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-19,40,HC,2200.0
2,ECS010050700037,CTNR010050700657,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-21,40,HC,2300.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1688 entries, 0 to 1687
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   CSL_ID      1688 non-null   object        
 1   CNTR_ID     1688 non-null   object        
 2   POD_ID      1688 non-null   object        
 3   ETD_POL_D   1688 non-null   object        
 4   PARTY_ID    1688 non-null   object        
 5   PARTY_NAME  1688 non-null   object        
 6   POD         1688 non-null   datetime64[ns]
 7   CNTR_SIZE   1688 non-null   object        
 8   CNTR_TYPE   1688 non-null   object        
 9   RATE        1688 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(8)
memory usage: 145.1+ KB




Unnamed: 0,CSL_ID,CNTR_ID,POD_ID,ETD_POL_D,PARTY_ID,PARTY_NAME,POD,CNTR_SIZE,CNTR_TYPE,RATE
0,ECS01050600140,CTNR010050700496,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-19,40,HC,2200.0
1,ECS01050600140,CTNR010050700495,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-19,40,HC,2200.0
2,ECS010050700037,CTNR010050700657,AUCKLAND,NZAKL,1005136,MONDIALE FREIGHT SERVICES LIMITED-AUCKLAND,2005-07-21,40,HC,2300.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1704 entries, 0 to 1703
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   CSL_ID      1704 non-null   object        
 1   CNTR_ID     1704 non-null   object        
 2   POD_ID      1704 non-null   object        
 3   ETD_POL_D   1704 non-null   object        
 4   PARTY_ID    1704 non-null   object        
 5   PARTY_NAME  1704 non-null   object        
 6   POD         1704 non-null   datetime64[ns]
 7   CNTR_SIZE   1704 non-null   object        
 8   CNTR_TYPE   1704 non-null   object        
 9   RATE        1704 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(8)
memory usage: 146.4+ KB
