In [2]:
# Cell 1: Import libraries (the data is loaded in the next cell)
import pandas as pd
import numpy as np

In [3]:
# Read the three parquet inputs used throughout the notebook.
df_kiel = pd.read_parquet('kiel_anomalies_labeled_2.parquet')
df_bremerhaven = pd.read_parquet('bremerhaven_anomalies_partlabeled.parquet')
df_cleaned = pd.read_parquet('clean_data_no_labels.parquet')

# Row-count summary; the last line is the combined size of the Kiel and
# Bremerhaven frames so it can be eyeballed against the cleaned frame.
print(
    "Data loaded:",
    f"  Kiel: {len(df_kiel):,}  rows",
    f"  Bremerhaven: {len(df_bremerhaven):,} rows",
    f"  Cleaned: {len(df_cleaned):,} rows",
    f"Total two files: {len(df_kiel) + len(df_bremerhaven):,} rows",
    sep="\n",
)

Data loaded:
  Kiel: 535,273  rows
  Bremerhaven: 378,322 rows
  Cleaned: 913,595 rows
Total two files: 913,595 rows


In [4]:
def fix_types(df):
    """Normalise the timestamp and categorical column dtypes of ``df`` in place.

    ``start_time``, ``end_time`` and ``time_stamp`` are parsed with
    ``pd.to_datetime(..., utc=True)``. ``start_port``, ``end_port`` and
    ``destination`` are cast to ``string`` and then to ``category``;
    ``ship_type`` is cast directly to ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding all seven columns named above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object that was passed in (not a copy).
    """
    # time_stamp appears to carry timezone info (+01:00); utc=True puts all
    # three time columns on a common UTC basis.
    for time_col in ('start_time', 'end_time', 'time_stamp'):
        df[time_col] = pd.to_datetime(df[time_col], utc=True)

    # (column, cast through 'string' first?) -- listed in conversion order.
    category_plan = (
        ('start_port', True),
        ('end_port', True),
        ('ship_type', False),
        ('destination', True),
    )
    for cat_col, via_string in category_plan:
        values = df[cat_col]
        if via_string:
            values = values.astype('string')
        df[cat_col] = values.astype('category')

    return df

In [5]:
# Apply the shared dtype normalisation to every loaded frame.
df_kiel, df_bremerhaven, df_cleaned = [
    fix_types(frame) for frame in (df_kiel, df_bremerhaven, df_cleaned)
]

In [6]:
def fix_nan(df):
    """Return a copy of ``df`` with missing ship types/destinations filled.

    ``ship_type`` is cast to ``float`` and ``destination`` to ``string``; in
    both columns missing entries are then replaced by the literal
    ``'Unknown'``. Note that ``ship_type`` therefore ends up holding floats
    alongside that string placeholder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``ship_type`` and ``destination`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the two columns replaced; all other columns unchanged.
    """
    placeholder = 'Unknown'
    ship_type_filled = df['ship_type'].astype('float').fillna(placeholder)
    destination_filled = df['destination'].astype('string').fillna(placeholder)
    return df.assign(ship_type=ship_type_filled, destination=destination_filled)

In [7]:
# Normalise the Bremerhaven anomaly flags to the nullable 'boolean' dtype.
# The values are cast to 'string' before mapping so that both the textual labels
# ('True'/'False') and real booleans are recognised. This keeps the cell
# idempotent: re-running it on the already converted column used to send every
# label through the string-only mapping and turn all of them into <NA>.
# Missing or unrecognised values still end up as <NA>.
df_bremerhaven['is_anomaly'] = (
    df_bremerhaven['is_anomaly']
    .astype('string')
    .map({'True': True, 'False': False})
    .astype('boolean')
)

In [8]:
# Order the Kiel rows by trip and then by timestamp (in place), and write the
# type-fixed, sorted frame to its own parquet file.
df_kiel.sort_values(by=['trip_id', 'time_stamp'], inplace=True)
df_kiel.to_parquet(path='kiel_anomalies_labeled_2_fixed.parquet')

In [86]:
# Stack the Kiel and Bremerhaven frames and order the result by trip, then by
# timestamp. The original row labels are kept (no ignore_index).
df_all = (
    pd.concat([df_kiel, df_bremerhaven])
    .sort_values(by=['trip_id', 'time_stamp'])
)

In [87]:
# Inspect the column dtypes of the combined frame.
# NOTE(review): the saved output lists start_port/end_port/destination as string
# and ship_type as Float64, not the 'category' dtype fix_types assigns -- the
# output may stem from an earlier run; TODO confirm with Restart & Run All.
df_all.dtypes

trip_id                             Int64
start_latitude                    float64
start_longitude                   float64
start_time            datetime64[ns, UTC]
end_latitude                      float64
end_longitude                     float64
end_time              datetime64[ns, UTC]
start_port                 string[python]
end_port                   string[python]
time_stamp            datetime64[ns, UTC]
ship_type                         Float64
length                            float64
breadth                           float64
draught                           float64
latitude                          float64
longitude                         float64
speed_over_ground                 float64
course_over_ground                float64
true_heading                      float64
destination                string[python]
is_anomaly                        boolean
dtype: object

In [88]:
# Persist the combined Kiel + Bremerhaven frame (sorted by trip_id, time_stamp).
df_all.to_parquet('connected_labeled_anomalies.parquet')

## Checking for differences in the cleaned data

In [8]:
# Columns left out of the comparison between the cleaned and the labelled data.
to_drop = ['length', 'breadth', 'draught', 'is_anomaly']

# Bring both frames into the same shape: fill missing values, order rows by
# trip and timestamp, remove the excluded columns and renumber rows from 0 so
# the two frames line up positionally.
df_cleaned_reset = (
    fix_nan(df_cleaned)
    .sort_values(['trip_id', 'time_stamp'])
    .drop(columns=to_drop)
    .reset_index(drop=True)
)
df_all_reset = (
    fix_nan(df_all)
    .sort_values(['trip_id', 'time_stamp'])
    .drop(columns=to_drop)
    .reset_index(drop=True)
)

In [9]:
# Cell-by-cell differences between the cleaned data and the combined labelled
# data; both frames were sorted and re-indexed above so rows align by position.
df_diff = df_cleaned_reset.compare(df_all_reset)