In [None]:
import os
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt

In [None]:
# Input dataset and the path the cleaned result is written to at the end.
file_path = '../data/prepared.parquet'
output_file_path = '../data/fix_noise.parquet'

# Fail fast with a clear message instead of printing and then continuing
# into read_parquet with a path that is already known to be missing.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

df = pd.read_parquet(file_path)
# df.dtypes

In [None]:
# Drop the source column first so the duplicate count printed below is
# computed on exactly the columns that drop_duplicates() compares.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['AisSourcen'], errors='ignore')  # NOTE can be changed

duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Remove the duplicates and renumber the rows from 0.
df = df.drop_duplicates().reset_index(drop=True)
len(df)

**1. Converting impossible values to NaN:**

- TH: should only have values 0-360, but the data also contains 511
**Heading (degrees) of the vessel's hull. A value of 511 indicates there is no heading data.**
[link](https://api.vtexplorer.com/docs/response-ais.html#:~:text=Heading%20(degrees)%20of%20the%20vessel's,there%20is%20no%20heading%20data.&text=Dimension%20(meters)%20from%20AIS%20GPS%20antenna%20to%20the%20Stern%20of,Vessel%20Length%20=%20A%20+%20B)
- COG: should only have values 0-360
--------------------------------
- Latitude: one extremely low value
- SOG: 3 records with implausibly high speeds
-------STATIC---------
- Length and Breadth should be positive
- Draught: contains 0 and implausibly high values


--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Yes, a draft of 0.1 is certainly possible. The term "draft" refers to the vertical distance between the waterline and the lowest part of a ship's hull. A very small draft such as 0.1 simply means the hull extends only slightly below the waterline, i.e. the ship is riding high in the water, possibly because it is lightly loaded or has a design that requires minimal depth.



In [None]:
# Static vessel attributes must be strictly positive. Series.where keeps the
# values where the mask is True and writes NaN everywhere else (existing NaN
# stays NaN because NaN > 0 is False). Vectorized replacement for a
# per-element apply(lambda ...).
for positive_col in ['Length', 'Breadth', 'Draught', 'shiptype']:
    df[positive_col] = df[positive_col].where(df[positive_col] > 0)

# Heading: keep 0-360 degrees (inclusive) and the value 511, which the notes
# above describe as the "no heading data" marker; everything else becomes NaN.
df['TH'] = df['TH'].where(df['TH'].between(0, 360) | df['TH'].eq(511))
# Course over ground: keep 0-360 degrees (inclusive) only.
df['COG'] = df['COG'].where(df['COG'].between(0, 360))

# df['SOG'] = df['SOG'].apply(lambda x: x if x <= 60 else None)    #NOTE CAN BE CHANGED
# df.loc[df['SOG'] >= 60]

In [None]:
# Overview of missing data: one heatmap row per record, one column per
# feature, drawn from the boolean null mask with a white/black colormap.
cols = df.columns
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(df[cols].isnull(), cmap=['white', 'black'], cbar=False, ax=ax)

In [None]:
# Percentage of missing values per column (displayed as a Series).
df.isnull().sum() / len(df) * 100

| Parameter     | Missing % |
|---------------|-----------|
| Length        | 1.122565  |
| Breadth       | 1.122565  |
| Draught       | 1.784436  |
| COG           | 0.000109  |
| TH            | 1.418968  |
| Destination   | 0.575840  |
| shiptype      | 0.717820  |

## 2. Destination

See if we can just fill in missing values with the nearest value within the same TripID

In [None]:
# 1. TripIDs in which at least one row lacks a Destination
trips_with_missing_dest = df.loc[df['Destination'].isna(), 'TripID'].unique()
# 2. All rows (complete or not) belonging to those trips
df_missing_trips = df[df['TripID'].isin(trips_with_missing_dest)]

# Distinct Destination values seen in each affected trip (NaN included)
destinations_by_trip = df_missing_trips.groupby('TripID')['Destination']
result = destinations_by_trip.apply(lambda values: list(values.unique()))
result

In [None]:
# Check the first trip with missing Destination: show all of its rows
df[df['TripID'].eq(355595)]

`df.loc[df['TripID'] == 355595]  # Check the first trip with missing Destination`

Not really, but we can fill the missing values with the nearest value within the same TripID that has the most closely matching coordinates.
For each missing value, look for the first non-missing value above and below it, and fill the gap with the nearest one within the same TripID (by coordinates).

In [None]:
from data_cleaning.processing_utils import fill_missing_destinations_by_proximity

# Let the project helper fill in Destination, then report the percentage of
# rows that still have no Destination.
df = fill_missing_destinations_by_proximity(df)
remaining_missing_pct = df['Destination'].isna().sum() / len(df) * 100
print(remaining_missing_pct)

**No Destination at All**

In [None]:
# 1. TripIDs that still contain at least one missing Destination
trips_with_missing_dest = df.loc[df['Destination'].isna(), 'TripID'].unique()
# 2. All rows belonging to those trips
df_missing_trips = df[df['TripID'].isin(trips_with_missing_dest)]

# Distinct Destination values left in each of those trips (NaN included)
destinations_by_trip = df_missing_trips.groupby('TripID')['Destination']
result = destinations_by_trip.apply(lambda values: list(values.unique()))
result

### Functions

In [None]:
def get_inconsistent_trip_ids(df, column):
    """Return the TripIDs whose rows disagree on the value of ``column``.

    A trip counts as inconsistent when its rows hold more than one distinct
    non-null value in ``column``; nulls are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``TripID`` column and the column to check.
    column : str
        Name of the column to check.

    Returns
    -------
    pandas.Index
        TripIDs with more than one distinct non-null value in ``column``.
    """
    # groupby().nunique() skips NaN by default and avoids calling a Python
    # lambda once per trip.
    distinct_per_trip = df.groupby("TripID")[column].nunique()
    return distinct_per_trip[distinct_per_trip > 1].index


def make_inconsistent_mode(dataf, column):
    """Harmonise ``column`` within every trip that holds conflicting values.

    For each TripID reported by ``get_inconsistent_trip_ids``, every row of
    that trip (nulls included) is overwritten with the first mode of the
    trip's non-null values. ``dataf`` is modified in place; nothing is
    returned.
    """
    for trip_id in get_inconsistent_trip_ids(dataf, column):
        rows_of_trip = dataf["TripID"] == trip_id
        candidates = dataf.loc[rows_of_trip, column].mode(dropna=True)

        # No non-null value to propagate for this trip: leave it untouched
        if candidates.empty:
            continue

        # Overwrite the whole trip, nulls included, with the first mode
        dataf.loc[rows_of_trip, column] = candidates[0]


### 3 Length && Breadth && Shiptype

In [None]:
# Before: percentage of missing Length values and the trips whose rows
# disagree on Length
print(df['Length'].isna().sum() / len(df) * 100)
print(get_inconsistent_trip_ids(df, "Length"))

# Set every inconsistent trip to its most frequent Length (mutates df)
make_inconsistent_mode(df, "Length")

# After: the same two checks again
print(df['Length'].isna().sum() / len(df) * 100)
get_inconsistent_trip_ids(df, "Length")

In [None]:
# Before: percentage of missing Breadth values and the trips whose rows
# disagree on Breadth
print(df['Breadth'].isna().sum() / len(df) * 100)
print(get_inconsistent_trip_ids(df, "Breadth"))

# Set every inconsistent trip to its most frequent Breadth (mutates df)
make_inconsistent_mode(df, "Breadth")

# After: the same two checks again
print(df['Breadth'].isna().sum() / len(df) * 100)
get_inconsistent_trip_ids(df, "Breadth")

#### Shiptype

In [None]:
# Non-positive ship type codes are treated as missing. Vectorized mask instead
# of a per-element apply; values failing the test (and existing NaN) become NaN.
df['shiptype'] = df['shiptype'].where(df['shiptype'] > 0)

In [None]:
# Before: trips whose rows disagree on shiptype and the percentage of missing
# shiptype values
print(get_inconsistent_trip_ids(df, "shiptype"))
print(df['shiptype'].isna().sum() / len(df) * 100)

# Set every inconsistent trip to its most frequent shiptype (mutates df)
make_inconsistent_mode(df, "shiptype")

# After: missing percentage and remaining inconsistent trips
print(df['shiptype'].isna().sum() / len(df) * 100)
get_inconsistent_trip_ids(df, "shiptype")

In [None]:
# 1. TripIDs with at least one missing shiptype
#    (variable names are kept from the Destination check)
trips_with_missing_dest = df.loc[df['shiptype'].isna(), 'TripID'].unique()
# 2. Rows of those trips. The filter has to match on TripID: comparing
#    shiptype codes against TripIDs selects unrelated rows or none at all.
df_missing_trips = df[df['TripID'].isin(trips_with_missing_dest)]

# Distinct shiptype values seen in each affected trip (NaN included)
result = (
    df_missing_trips
    .groupby('TripID')['shiptype']
    .apply(lambda x: list(x.unique()))
)
result

In [None]:
# Missing-data overview after the fixes so far: one heatmap row per record,
# one column per feature, drawn from the boolean null mask.
cols = df.columns
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(df[cols].isnull(), cmap=['white', 'black'], cbar=False, ax=ax)

As we can see, ship type is missing together with Length and Breadth.
Filling with the mean doesn't make sense here.
It is not yet clear how to handle these rows.

Clustering is an option, but that is left for later.

#### Draught

#### Not all nulls

In [None]:
# Trips that contain missing Draught values but also at least one valid one
draught_by_trip = df.groupby("TripID")["Draught"]
is_mixed = draught_by_trip.apply(lambda s: s.isna().any() and s.notna().any())
mixed_draught_trips = is_mixed[is_mixed].index.tolist()  # keep only the True cases
mixed_draught_trips
# After looking at the data for these trips: the missing values can be filled
# with the trip's median, or interpolated

In [None]:
# Percentage of missing Draught values before filling
print(df['Draught'].isnull().sum() / len(df) * 100)

# Alternative kept for reference: fill with the per-trip median instead.
# trip_medians = df[df["TripID"].isin(mixed_draught_trips)].groupby("TripID")["Draught"].median()
#
# df = df.copy()
# for trip_id in mixed_draught_trips:
#     mask = (df["TripID"] == trip_id) & (df["Draught"].isna())
#     df.loc[mask, "Draught"] = trip_medians[trip_id]

# Interpolate within each trip only. transform() always returns a result
# indexed like df, so the column assignment cannot be misaligned by group keys.
# With the default settings, NaNs before a trip's first valid value are not
# filled, and trips without any valid Draught stay NaN.
df['Draught'] = df.groupby('TripID')['Draught'].transform(lambda s: s.interpolate())

# Percentage of missing Draught values after filling
print(df['Draught'].isnull().sum() / len(df) * 100)

#### All nulls

In [None]:
# Trips that have no Draught value at all

# Boolean Series indexed by TripID: True when every row of the trip is null
trips_with_no_draught = df['Draught'].isna().groupby(df['TripID']).all()
no_draught_trip_ids = trips_with_no_draught.index[trips_with_no_draught].tolist()
no_draught_trip_ids

#### Cog

In [None]:
# df['COG'].isnull()
# df.loc[df['TripID'] == 1778056]
# The null is the last entry in the data series, so it is replaced with 0
df.loc[df['COG'].isna(), 'COG'] = 0

### All nulls for time entry

### Save

In [None]:
# Write the cleaned frame to the output path defined at the top of the notebook
df.to_parquet(path=output_file_path)