In [None]:
import os
import pandas as pd

from data_cleaning.processing_utils import (
    get_percentage_missing,
    get_entries_with_missing_values,
    plot_missing
)

In [None]:
file_path = '../data/prepared.parquet'
output_file_path = '../data/fix_noise.parquet'

if not os.path.exists(file_path):
    print(f"File not found: {file_path}")

df = pd.read_parquet(file_path)
# df.dtypes

In [None]:
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")
df.drop(columns=['AisSourcen'], inplace=True) #NOTE can be changed

df = df.drop_duplicates()
df = df.reset_index(drop=True)
len(df)

**1. Converting impossible values to NaN:**

- TH: should only have values 0-360, but has 500
**Heading (degrees) of the vessel's hull. A value of 511 indicates there is no heading data.**
[link](https://api.vtexplorer.com/docs/response-ais.html#:~:text=Heading%20(degrees)%20of%20the%20vessel's,there%20is%20no%20heading%20data.&text=Dimension%20(meters)%20from%20AIS%20GPS%20antenna%20to%20the%20Stern%20of,Vessel%20Length%20=%20A%20+%20B)
- COG: should only have values 0-360
--------------------------------
- Latitude one really low value
- SOG: 3 examples with superspeed ships
-------STATIC---------
- Length and Breadth should be positive
- Draught 0 and really high values


--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

 Yes, a draft of 0.1 is certainly possible. The term "draft" refers to the vertical distance between the waterline and the lowest part of a ship's hull. This means that even a very small draft of 0.1 is possible, as it simply means the ship is sitting low in the water, possibly due to being lightly loaded or having a design that allows for minimal depth.



In [None]:
import numpy as np

df['Length'] = df['Length'].apply(lambda x: x if x > 0 else  np.nan)
df['Breadth'] = df['Breadth'].apply(lambda x: x if x > 0 else  np.nan)
df['Draught'] = df['Draught'].apply(lambda x: x if 0 < x else  np.nan)
df['shiptype'] = df['shiptype'].apply(lambda x: x if 0 < x else  np.nan)
df['Destination'] = df['Destination'].fillna(pd.NA)

df['TH'] = df['TH'].apply(lambda x: x if 0 <= x <= 360 or x == 511 else None)
df['COG'] = df['COG'].apply(lambda x: x if 0 <= x <= 360 else None)

# df['SOG'] = df['SOG'].apply(lambda x: x if x <= 60 else None)    #NOTE CAN BE CHANGED
df = df[df['SOG'] < 30] # Drop rows where SOG is greater than or equal to 30

In [None]:
# plot_missing(df)

In [None]:
get_percentage_missing(df)

| Parameter     | Missing % |
|---------------|-----------|
| Length        | 1.122565  |
| Breadth       | 1.122565  |
| Draught       | 1.784436  |
| COG           | 0.000109  |
| Destination   | 0.575840  |
| COG           | 0.000109  |
| shiptype      | 0.717820  |


After 3

| Parameter     | Missing % |
|---------------|-----------|
| Length        | 0.471434  |
| Breadth       | 0.471434  |
| Draught       | 1.779008  |
| Destination   | 0.007443  |
| shiptype      | 0.152912  |

After 4

| Parameter     | Missing % |
|---------------|-----------|
| Destination   | 0.007443  |
| shiptype      | 0.152912  |


## 2. Destination

See if we can just fill in missing values with the nearest value within the same TripID

In [73]:
# get_entries_with_missing_values(df, 'Destination')

df.loc[df['TripID'] == 355595]  # Check the first trip with missing Destination
Not really but we can fill missing values with the nearest value within the same TripID and more matching coordinates.
Find the first nan value above and below it will fill it with the nearest value within the same TripID and more matching coordinates.

In [None]:
from data_cleaning.processing_utils import fill_missing_destinations_by_proximity

df_dest = fill_missing_destinations_by_proximity(df)
print(get_percentage_missing(df_dest, 'Destination'))

**No Destination at All**

In [None]:
# get_entries_with_missing_values(df_dest, 'Destination')

### Functions

In [None]:
has_none = df.groupby("TripID")['Length'].apply(lambda x: x.isna().any())
has_none

In [None]:
def get_inconsistent_trip_ids(df, column):
    nunique_values = df.groupby("TripID")[column].nunique()

    has_nan_values = df.groupby("TripID")[column].apply(lambda x: x.isna().any())
    inconsistent_by_nunique = nunique_values > 1
    inconsistent_by_nan_and_value = has_nan_values & (nunique_values > 0)
    inconsistent_trip_ids = (inconsistent_by_nunique | inconsistent_by_nan_and_value)

    return inconsistent_trip_ids.loc[lambda x: x].index

def make_inconsistent_mode(dataf, column):
    inconsistent_trip_ids = get_inconsistent_trip_ids(dataf, column)

    for trip_id in inconsistent_trip_ids:
        trip_mask = dataf["TripID"] == trip_id

        mode_values = dataf.loc[trip_mask, column].mode(dropna=True)
        print(f"Calculated mode values (Series):\n{mode_values.to_list()}")

        if not mode_values.empty:
            chosen_mode = mode_values[0]
            print(f"First mode chosen: {chosen_mode}")
            # Replace ALL values (including nulls) with the first mode
            dataf.loc[trip_mask, column] = chosen_mode
        else:
            print(f"No mode found for TripID: {trip_id} after dropping NaNs.")

    return dataf # assuming you want to return the modified DataFrame

def all_fill_with_mode(dataf, column):
    print("-"* 40)
    print(" "*20,f"Before filling {column}")
    print(f"percentage of missing:" , get_percentage_missing(df_lbs, column))
    print("Inconsistent Trip IDs:\n", get_inconsistent_trip_ids(dataf, column))
    make_inconsistent_mode(dataf, column)
    print(" "*20, f"After filling {column}")
    print(f"percentage of missing:" , get_percentage_missing(dataf, column))
    print("Inconsistent Trip IDs:\n", get_inconsistent_trip_ids(dataf, column))
    print("-"* 40)




### 3 Length && Breadth && Shiptype

In [67]:
df_lbs = df_dest.copy()

In [68]:
all_fill_with_mode(df_lbs, 'Length')

----------------------------------------
                     Before filling Length
percentage of missing: 1.1225968590055122
Inconsistent Trip IDs:
 Index([   5944,   19585,   23834,   28257,   29139,   29152,   29165,   29190,
         32680,   32692,
       ...
       1998836, 2022611, 2183472, 2183480, 2183488, 2183505, 2201111, 2258835,
       2263639, 2271342],
      dtype='int64', name='TripID', length=249)
Calculated mode values (Series):
[82.0]
First mode chosen: 82.0
Calculated mode values (Series):
[81.0]
First mode chosen: 81.0
Calculated mode values (Series):
[87.0]
First mode chosen: 87.0
Calculated mode values (Series):
[175.0]
First mode chosen: 175.0
Calculated mode values (Series):
[151.0]
First mode chosen: 151.0
Calculated mode values (Series):
[151.0]
First mode chosen: 151.0
Calculated mode values (Series):
[151.0]
First mode chosen: 151.0
Calculated mode values (Series):
[151.0]
First mode chosen: 151.0
Calculated mode values (Series):
[151.0]
First mode chosen: 

In [None]:
all_fill_with_mode(df_lbs, 'Breadth')

#### Shiptype

In [None]:
df_lbs['shiptype'] = df_lbs['shiptype'].apply(lambda x: x if 0 < x else None)

In [None]:
print(get_percentage_missing(df_lbs, 'shiptype'))

valid_trips = df_lbs.groupby('TripID')['shiptype'].transform(lambda x: x.notna().any())
# Forward-fill ONLY in trips that have at least one non-null shiptype
df_lbs['shiptype'] = (
    df_lbs.groupby('TripID')['shiptype']
    .ffill()
    .where(valid_trips)  # Keep filled values only in valid trips
    .combine_first(df_lbs['shiptype'])  # Restore original NaN where no fill occurred
)

print(get_percentage_missing(df_lbs, 'shiptype'))

In [None]:
get_entries_with_missing_values(df_lbs, 'shiptype')

In [None]:
plot_missing(df_lbs)

As if see then ship type is missing together with Length and Breadth.
The mean doesn't make sense here
Idk what to do with it, now

Clustering is an option but I will bother with it later

#### Draught

In [None]:
df_drght = df_lbs.copy()

#### Not all nulls

In [None]:
# has none but at least one proper
mixed_draught_trips = (
    df_drght.groupby("TripID")["Draught"]
    .apply(lambda x: x.notna().any() and x.isna().any())
    .loc[lambda x: x]  # Filter only True cases
    .index
    .tolist()
)
mixed_draught_trips
#After looking in the data for this we can use median of the trip to fill in the missing values or interpolate

In [None]:
print(get_percentage_missing(df_drght, 'Draught'))

trip_medians = df_drght[df_drght["TripID"].isin(mixed_draught_trips)].groupby("TripID")["Draught"].median()

df_drght_cp = df_drght.copy()
for trip_id in mixed_draught_trips:
    mask = (df_drght_cp["TripID"] == trip_id) & (df_drght_cp["Draught"].isna())
    df_drght_cp.loc[mask, "Draught"] = trip_medians[trip_id]

# df['Draught'] = df.groupby('TripID')['Draught'].interpolate()
print(get_percentage_missing(df_drght_cp, 'Draught'))

#### All nulls

In [None]:
# No values at all

trips_with_no_draught = df_drght_cp.groupby('TripID')['Draught'].apply(
    lambda x: x.isna().all()
)
no_draught_trip_ids = trips_with_no_draught[trips_with_no_draught].index.tolist()
no_draught_trip_ids

#### Cog

In [79]:
df_cog = df_drght_cp.copy()

In [80]:
# df_cog['COG'].isnull()
# df_cog.loc[df_cog['TripID'] == 1778056]
#The null is last entry id data series
df_cog['COG'] = df_cog['COG'].fillna(0)

In [81]:
get_percentage_missing(df_cog)

TripID            0.000000
StartLatitude     0.000000
StartLongitude    0.000000
StartTime         0.000000
EndLatitude       0.000000
EndLongitude      0.000000
EndTime           0.000000
StartPort         0.000000
EndPort           0.000000
time              0.000000
shiptype          0.152912
Length            0.471434
Breadth           0.471434
Draught           1.779014
Latitude          0.000000
Longitude         0.000000
SOG               0.000000
COG               0.000000
TH                0.000000
Destination       0.007443
dtype: float64

## 4. All nulls for time entry

Draught higly colerated to Length and Breadth, so we can use them to fill in the missing values

In [82]:
df_regression = df_cog.copy()

In [83]:
print(df_regression['Draught'].corr(df_regression['Length']))
print(df_regression['Draught'].corr(df_regression['Breadth']))
print(df_regression['Length'].corr(df_regression['Breadth']))

0.8653101044306464
0.8559429515375512
0.9397848338458639


Train a HistGradientBoostingRegressor model to predict missing Draught values based on Length and Breadth:

In [86]:
df_regression['Draught'].isnull().count()

np.int64(913596)

In [87]:
from sklearn.metrics import r2_score
from sklearn.ensemble import HistGradientBoostingRegressor

# Function to fill missing values safely
def fill_missing(df, target_col, feature_cols, round_values=True):
    df_copy = df.copy()  # Work on a copy to avoid modifying the original DataFrame
    known = df_copy.dropna(subset=[target_col])
    missing = df_copy[df[target_col].isna()]

    if len(missing) == 0:
        return df  # No missing values to fill

    X_train = known[feature_cols]
    y_train = known[target_col]
    X_test = missing[feature_cols]

    model = HistGradientBoostingRegressor()
    model.fit(X_train, y_train)
    predicted_values = model.predict(X_test)
    predicted_values_int = np.round(predicted_values).astype(int)
    prediction = predicted_values_int if round_values else predicted_values
    df_copy.loc[df_copy[target_col].isna(), target_col] = prediction
    print("R2 score", r2_score(y_train, model.predict(X_train)))

    return df_copy

print("Missing Draught (%):", get_percentage_missing(df_regression, 'Draught'))
print("Missing Length (%):", get_percentage_missing(df_regression, 'Length'))
print("Missing Breadth (%):", get_percentage_missing(df_regression, 'Breadth'))


# Fill Length first (using Breadth + Draught)
df_regression = fill_missing(df_regression, 'Length', ['Breadth', 'Draught'])

# Then fill Breadth (using Length + Draught)
df_regression = fill_missing(df_regression, 'Breadth', ['Length', 'Draught'])

# Fill Draught first (using Length + Breadth)
df_regression = fill_missing(df_regression, 'Draught', ['Length', 'Breadth'], round_values=False)


# Verify no missing values remain
print("Missing Draught (%):", df_regression['Draught'].isnull().sum() / len(df_regression) * 100)
print("Missing Length (%):", df_regression['Length'].isnull().sum() / len(df_regression) * 100)
print("Missing Breadth (%):", df_regression['Breadth'].isnull().sum() / len(df_regression) * 100)

Missing Draught (%): 1.7790139186248628
Missing Length (%): 0.47143376284484606
Missing Breadth (%): 0.47143376284484606
R2 score 0.9909411775492746
R2 score 0.9974288053445358
R2 score 0.9597535535043457
Missing Draught (%): 0.0
Missing Length (%): 0.0
Missing Breadth (%): 0.0


In [93]:
print(get_percentage_missing(df_regression))


TripID            0.000000
StartLatitude     0.000000
StartLongitude    0.000000
StartTime         0.000000
EndLatitude       0.000000
EndLongitude      0.000000
EndTime           0.000000
StartPort         0.000000
EndPort           0.000000
time              0.000000
shiptype          0.152912
Length            0.000000
Breadth           0.000000
Draught           0.000000
Latitude          0.000000
Longitude         0.000000
SOG               0.000000
COG               0.000000
TH                0.000000
Destination       0.007443
dtype: float64


In [None]:
# plot_missing(df_regression)

### Save

In [94]:
df_final = df_regression.copy()

# Rename columns to match the desired format

In [95]:
column_mapping = {
    'TripID':           'trip_id',
    'StartLatitude':    'start_latitude',
    'StartLongitude':   'start_longitude',
    'StartTime':        'start_time',
    'EndLatitude':      'end_latitude',
    'EndLongitude':     'end_longitude',
    'EndTime':          'end_time',
    'StartPort':        'start_port',
    'EndPort':          'end_port',
    'time':             'time_stamp',
    'shiptype':         'ship_type',
    'Length':           'length',
    'Breadth':          'breadth',
    'Draught':          'draught',
    'Latitude':         'latitude',
    'Longitude':        'longitude',
    'SOG':              'speed_over_ground',
    'COG':              'course_over_ground',
    'TH':               'true_heading',
    'Destination':      'destination',
}

# Apply renaming
df_final = df_final.rename(columns=column_mapping)

In [96]:
df_final.info

<bound method DataFrame.info of         trip_id  start_latitude  start_longitude                start_time  \
0         39131           53.57             8.53 2016-01-24 08:06:00+00:00   
1         39131           53.57             8.53 2016-01-24 08:06:00+00:00   
2         39131           53.57             8.53 2016-01-24 08:06:00+00:00   
3         39131           53.57             8.53 2016-01-24 08:06:00+00:00   
4         39131           53.57             8.53 2016-01-24 08:06:00+00:00   
...         ...             ...              ...                       ...   
913594  2204049           54.36            10.14 2017-04-03 07:54:00+00:00   
913595  2204049           54.36            10.14 2017-04-03 07:54:00+00:00   
913596  2204049           54.36            10.14 2017-04-03 07:54:00+00:00   
913597  2204049           54.36            10.14 2017-04-03 07:54:00+00:00   
913598  2204049           54.36            10.14 2017-04-03 07:54:00+00:00   

        end_latitude  end_longi

In [98]:
duplicate_count = df_final.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

df_final = df_final.drop_duplicates()
df_final = df_final.reset_index(drop=True)
len(df_final)

Number of duplicate rows: 0


913595

In [99]:
df_final['is_anomaly'] = None

In [100]:
df_final.to_parquet(output_file_path)

### Divide data set for two tips

In [109]:
print(df_final['start_port'].unique())
print(df_final['end_port'].unique())

['BREMERHAVEN', 'KIEL']
Categories (2, object): ['BREMERHAVEN', 'KIEL']
['HAMBURG', 'GDYNIA']
Categories (2, object): ['GDYNIA', 'HAMBURG']


In [110]:
df_from_KIEL = df_final[df_final['start_port'] == 'KIEL'].copy().reset_index(drop=True)
df_from_BREMERHAVEN = df_final[df_final['start_port'] == 'BREMERHAVEN'].copy().reset_index(drop=True)

In [112]:
df_from_KIEL.to_parquet('../data/manual_labeling/from_KIEL.parquet')
df_from_BREMERHAVEN.to_parquet('../data/manual_labeling/from_BREMERHAVEN.parquet')