In [None]:
import os
import pandas as pd
import seaborn as sns
from data_cleaning.processing_utils import fill_missing_destinations_by_proximity

from matplotlib import pyplot as plt

In [None]:
# Input parquet read in this cell; output_file_path is where the final cell
# writes the cleaned frame.
file_path = '../../data/destination_norm.parquet'
output_file_path = '../../data/fixed_dest.parquet'

# Fail fast with a clear message: previously the check only printed and then
# execution continued into read_parquet, which crashed anyway.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

df = pd.read_parquet(file_path)
df.dtypes

## ====================================
Here we can drop duplicates: since we have already normalized some fields, more duplicates will be detected than before.

Then we will analyze what data is missing and decide how to deal with it:

- do we fill it with median?
- do we drop it?
- do we consider it an anomaly?
- do we drop the whole trip?

## ====================================

In [None]:
# Number of rows before duplicates are removed (first entry of the shape tuple).
df.shape[0]

In [None]:
# Flag rows that exactly repeat an earlier row; the mask is reused for both
# the count and the filtering below.
is_duplicate = df.duplicated()
duplicate_count = is_duplicate.sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Keep only first occurrences and renumber the index from 0.
df = df.loc[~is_duplicate].reset_index(drop=True)
len(df)

In [None]:
# Heat map of missing data: one cell per (row, column) of the frame, drawn
# with a two-colour map (white = value present, black = value missing).
cols = df.columns
missing_mask = df[cols].isnull()
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(missing_mask, cmap=['white', 'black'], cbar=False, ax=ax)

In [None]:
# Percentage of missing values per column, rounded to one decimal place.
# The previous trailing ", 1" built a (Series, 1) tuple instead of rounding.
(df.isnull().mean() * 100).round(1)

 Draught          1.778963
 Destination       0.573213
 AisSourcen        1.082286

This roughly matches the previous analysis.

We can impute Draught with the median, as a missing value there is unlikely to indicate an anomaly. Since it is a numerical column, we will handle it later together with the other numerical features.

### What to do with missing Destination values?

See if we can just fill in missing values with the nearest value within the same TripID

In [None]:

# 1. TripIDs that have at least one row with a missing Destination
missing_dest_mask = df['Destination'].isna()
trips_with_missing_dest = df.loc[missing_dest_mask, 'TripID'].unique()

# 2. Every row (missing or not) that belongs to one of those trips
df_missing_trips = df.loc[df['TripID'].isin(trips_with_missing_dest)]


def _distinct_destinations(destinations):
    """Return the distinct Destination values of one trip as a list."""
    return list(destinations.unique())


# 3. Per-trip list of distinct Destination values, to see what each trip
#    could be filled with
result = df_missing_trips.groupby('TripID')['Destination'].apply(_distinct_destinations)
result

In [None]:
# Check the first trip with missing Destination (TripID is hard-coded)
df[df['TripID'].eq(355595)]

Not really, but we can fill each missing value with the nearest known value within the same TripID, preferring the one with more closely matching coordinates.
For each missing value, find the first non-missing Destination above and below it within the same TripID, and fill the gap with whichever is nearest and has the more closely matching coordinates.

In [None]:
# Fill missing Destination values with the project helper. Its exact rule lives
# in data_cleaning.processing_utils and is not visible in this notebook.
df = fill_missing_destinations_by_proximity(df)

# Re-check the percentage of missing values per column, rounded to one decimal.
# The previous trailing ", 1" built a (Series, 1) tuple instead of rounding.
(df.isnull().mean() * 100).round(1)

### No Destination at All

Trips with no Destination at all might be anomalies.

In [None]:
# Get TripIDs where ALL 'Destination' values are missing
trips_with_no_destination = df.groupby('TripID')['Destination'].apply(
    lambda x: x.isna().all()
)
no_dest_trip_ids = trips_with_no_destination[trips_with_no_destination].index.tolist()

print(f"Number of trips with NO destination: {len(no_dest_trip_ids)}")
print("Example TripIDs:", no_dest_trip_ids[:5])  # Show first 5 for reference

In [None]:
# Persist the de-duplicated, destination-filled frame as a new Parquet file
df.to_parquet(path=output_file_path)