In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import cross_val_score
import joblib
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
# Notebook-wide display configuration.
# "all" makes IPython echo every bare expression in a cell, not only the last one;
# later cells rely on this to show both `.head()` and `.info()` results.
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default plot theme.
sns.set()
# Allow up to 100 rows when a DataFrame is displayed.
pd.options.display.max_rows = 100

# Load the raw cost data. PARTY_ID, COM_ID and CNTR_SIZE are parsed as strings
# (presumably so identifier formatting such as leading zeros is kept - TODO confirm).
df = pd.read_csv('CR_COST_FC.csv', converters={'PARTY_ID': str, 'COM_ID': str, 'CNTR_SIZE': str})
df['POD'] = pd.to_datetime(df['POD'])

# Drop rows with a missing ENCODED_TYPE or RATE *before* the integer cast.
# Filling the gaps with -1 first would turn the dropna into a no-op and let the
# -1 sentinel through as if it were a genuine rate / type code.
df = (
    df.dropna(subset=['ENCODED_TYPE', 'RATE'])
      .astype({'ENCODED_TYPE': int, 'RATE': int})
)

# Remove rows that still have a missing value in any other column.
df_clean = df.dropna().reset_index(drop=True)

df_clean.head()
df_clean.info()
print(f'Dataset size: {df_clean.shape}')

In [None]:
# Cast the numeric columns to fixed-width integer dtypes in one pass.
df_clean = df_clean.astype({
    'CNTR_SIZE': np.int32,
    'RATE': np.int32,
    'PARTY_ID_EN': np.int32,
    'POD_ID_EN': np.int64,
    'ETA_ETD_NO': np.int32,
})
df_clean.info()
df_clean.head()

In [None]:
# Columns carried forward into the modelling frame, in their output order.
sel_col = [
    'COM_ID', 'CSL_ID', 'CNTR_ID',
    'ENCODED_TERM', 'COST_TERM',
    'POD_ID', 'ETD_POL_D',
    'PARTY_ID', 'PARTY_ID_EN', 'PARTY_NAME',
    'POD_ID_EN', 'ETA_ETD_NO', 'POD',
    'CNTR_SIZE', 'ENCODED_TYPE', 'CNTR_TYPE',
    'RATE',
]

# Restrict the cleaned frame to those columns and inspect the result.
df_fc = df_clean[sel_col]
df_fc.head()
df_fc.info()

In [None]:
# Keep every row whose POD date does not fall in the year 2002.
df_filtered = df_fc.loc[df_fc['POD'].dt.year.ne(2002)]
df_filtered.head()
df_filtered.info()

# Sanity check: 2002 must no longer appear among the remaining years.
df_filtered['POD'].dt.year.unique()

In [None]:
# Order the rows by POD date and renumber the index from 0.
df_filtered = df_filtered.sort_values('POD', ignore_index=True)
df_filtered.head()
df_filtered['POD'].dt.year.unique()

<h4>Filtering the data into per-group DataFrames stored in a dictionary</h4>

In [None]:
def filter_dataframe(df):
    """Split ``df`` into one DataFrame per port / size / type / party combination.

    Rows are grouped on ``POD_ID``, ``CNTR_SIZE``, ``CNTR_TYPE`` and ``PARTY_ID``.
    Each group has its index moved into a regular ``index`` column (so the row's
    previous index label is kept) and is then sorted by ``POD``; the index is not
    renumbered after the sort.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the four grouping columns and ``POD``.

    Returns
    -------
    dict
        Maps ``"Port_<POD_ID>_Size_<CNTR_SIZE>_Type_<CNTR_TYPE>_PartyID_<PARTY_ID>"``
        to the corresponding group frame, in groupby iteration order.
    """
    group_cols = ['POD_ID', 'CNTR_SIZE', 'CNTR_TYPE', 'PARTY_ID']
    return {
        f"Port_{port}_Size_{size}_Type_{ctype}_PartyID_{party_id}":
            subset.reset_index().sort_values(by='POD')
        for (port, size, ctype, party_id), subset in df.groupby(group_cols)
    }

In [None]:
# Build the dictionary of per-group DataFrames (one entry per
# POD_ID / CNTR_SIZE / CNTR_TYPE / PARTY_ID combination).
filtered_dataframes = filter_dataframe(df_filtered)

In [None]:
# Collect the group keys once, then show them followed by how many there are.
df_ids = list(filtered_dataframes)
print(df_ids)
print(len(df_ids))

Testing key

In [None]:
# grouped_df = filtered_dataframes['Port_AUCKLAND_Size_40_Type_HC_PartyID_01005136']
# grouped_df.head()
# grouped_df.info()

<h4>Training Isolation Forest with all historical data for anomaly detection</h4>

In [55]:
def _mean_anomaly_score(estimator, X, y=None):
    """Label-free scorer: mean ``score_samples`` of ``estimator`` on a held-out fold."""
    return float(np.mean(estimator.score_samples(X)))


def train_and_evaluate_models(filtered_dataframes, cv=5):
    """Cross-validate an IsolationForest per group and return the best-scoring one, fitted.

    For every group an ``IsolationForest`` is cross-validated on the ``RATE``
    column. Because the model is unsupervised there are no labels, so a
    label-based metric such as ``f1_macro`` cannot be computed; each fold is
    instead scored by the mean ``score_samples`` value on the held-out rows
    (higher means the held-out data looks less anomalous to the model). The
    per-fold scores are averaged into one number per group so that groups can
    be compared.

    NOTE(review): "highest mean held-out score" is only a heuristic for picking
    a single model across groups - confirm it matches the intended selection rule.

    Parameters
    ----------
    filtered_dataframes : dict
        Maps a group key to a DataFrame that has a ``RATE`` column.
    cv : int, default 5
        Number of cross-validation folds. Groups with fewer than ``cv`` rows
        cannot be split and are skipped.

    Returns
    -------
    IsolationForest
        Model fitted on the ``RATE`` column of the best-scoring group.

    Raises
    ------
    ValueError
        If ``filtered_dataframes`` is empty, or if no group could be scored.
    """
    if not filtered_dataframes:
        raise ValueError("filtered_dataframes is empty: no groups to train on")

    # One scalar score per group (mean over folds) so the groups are comparable.
    scores = {}
    for key, group in filtered_dataframes.items():
        features = group[['RATE']]

        # K-fold splitting needs at least one row per fold.
        if len(features) < cv:
            continue

        model = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
        fold_scores = cross_val_score(model, features, cv=cv, scoring=_mean_anomaly_score)
        mean_score = float(np.mean(fold_scores))

        # A failed fold is reported as NaN; NaN would make the max() below order-dependent.
        if np.isnan(mean_score):
            continue
        scores[key] = mean_score

    if not scores:
        raise ValueError(
            f"no group could be scored: each group needs at least {cv} rows for cross-validation"
        )

    # Select the group with the highest evaluation score.
    best_model_key = max(scores, key=scores.get)

    # Only the winner is returned, so only the winner is fitted on its full data.
    # The fixed random_state keeps this identical to fitting inside the loop.
    best_model = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
    best_model.fit(filtered_dataframes[best_model_key][['RATE']])

    return best_model

Cross-validation can be used for evaluating the performance of an Isolation Forest algorithm for anomaly detection, but it's not always the best option.

Isolation Forest is an unsupervised learning algorithm, which means that there are no labels available to evaluate its performance in a traditional supervised learning sense; label-based metrics such as F1 cannot be computed without ground-truth labels. In this case, cross-validation can still be used to evaluate the stability of the algorithm and its ability to detect anomalies in new, unseen data.

However, cross-validation assumes that the data is independently and identically distributed (i.i.d.), which may not always be the case in practice, especially when dealing with time series data. In such cases, a more appropriate evaluation method may be to use a sliding window approach, where the model is trained on a fixed window of past data and evaluated on a sliding window of future data.

Another evaluation metric that can be used for Isolation Forest is the area under the Receiver Operating Characteristic (ROC) curve (AUC-ROC). AUC-ROC is a measure of how well the model's anomaly scores distinguish between normal and anomalous data points, so it requires at least a small set of labelled anomalies; when such labels are available, it can be useful for evaluating the performance of anomaly detection algorithms.

In summary, while cross-validation can be useful for evaluating the performance of an Isolation Forest algorithm, it's not always the best option, and other evaluation metrics and methods should also be considered depending on the specific use case and data characteristics.

In [56]:
# Call the training function by the name it is defined under in this notebook;
# `train_isolation_forest` is not defined in the visible cells and would raise
# NameError on a fresh kernel.
trained_model = train_and_evaluate_models(filtered_dataframes)