In [1]:
import os
import pandas as pd
import re
from data_cleaning.utils.normalization_utils import (
    clean_destination
)
from tqdm.notebook import tqdm # For progress bars in notebook

##  Remark
Destinations are being processed so only official ports are left and nothing else
##

In [2]:
tqdm.pandas()

file_path = '../data/1_merged_typed_data.parquet'
output_path = '../data/2_destination_norm.parquet'

if not os.path.exists(file_path):
    print(f"File not found: {file_path}")

df = pd.read_parquet(file_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1060708 entries, 0 to 1060707
Data columns (total 20 columns):
 #   Column          Non-Null Count    Dtype              
---  ------          --------------    -----              
 0   TripID          1060708 non-null  int64              
 1   StartLatitude   1060708 non-null  float64            
 2   StartLongitude  1060708 non-null  float64            
 3   StartTime       1060708 non-null  datetime64[ns, UTC]
 4   EndLatitude     1060708 non-null  float64            
 5   EndLongitude    1060708 non-null  float64            
 6   EndTime         1060708 non-null  datetime64[ns, UTC]
 7   StartPort       1060708 non-null  string             
 8   EndPort         1060708 non-null  string             
 9   time            1060708 non-null  datetime64[ns, UTC]
 10  shiptype        1060708 non-null  int64              
 11  Length          1060708 non-null  int64              
 12  Breadth         1060708 non-null  int64              
 1

## --- Step 1
Deal with Inconsistent Records (We can have different formats representing the same thing) if they are there.
And convert columns to categorical where appropriate.

In [3]:
def check_unique_values(df):
    """Check unique values in each column of the DataFrame."""
    col_un = {}
    for col in df.columns:
        clean_series = df[col].dropna()
        nunique = clean_series.nunique()
        col_un[col] = nunique
    return col_un

unique_values_before = check_unique_values(df)
unique_values_before

{'TripID': 1126,
 'StartLatitude': 28,
 'StartLongitude': 34,
 'StartTime': 953,
 'EndLatitude': 29,
 'EndLongitude': 47,
 'EndTime': 943,
 'StartPort': 2,
 'EndPort': 2,
 'time': 414193,
 'shiptype': 11,
 'Length': 107,
 'Breadth': 36,
 'Draught': 238,
 'Latitude': 273,
 'Longitude': 1285,
 'SOG': 227,
 'COG': 3602,
 'TH': 361,
 'Destination': 139}

Column: StartPort, Unique values: 2 - seems correct
Column: EndPort, Unique values: 2 - seems correct

Column: Destination, Unique values: 139 - weird

In [4]:
# Case normalization
text_columns = df.select_dtypes(include=['string']).columns
for col in text_columns:
    df[col] = df[col].str.upper()  # Ensure string type and uppercase

unique_values_after = check_unique_values(df)

changed_columns = list(filter(lambda col: unique_values_after[col] != unique_values_before[col], df.columns))
df[changed_columns].dropna().nunique() # Check how many unique values are there after case normalization

Destination    138
dtype: int64

Case normalization has changed the unique values in the following columns: Destination
It seems there are still inconsistencies in the Destination column, so we will need to clean it further.

In [5]:
df['Destination'].unique() #See some examples of the Destination column

<StringArray>
[   'GDYNIA.VIA.NOK',                <NA>,           'HAMBURG',
            'GDYNIA',            'GDANSK', 'GDYNIA.VIA.NOK.:)',
       'KLJ.VIA.NOK',             'PLGDY',     'PLGDN.VIA.NOK',
         'GDYNIA.VI',
 ...
       'GDANSK...AS',       'KALININGRAD',       'HAMBURG....',
             'SEHAD',        'FINKENWERD',         'NORDENHAM',
     'GDANSK.VIANOK',         'GDYNIA.PL',            'GDANKS',
        'HAMBURG???']
Length: 139, dtype: string

In [6]:
# Ensure 'Destination' has at least one alphabetic character and is not just a country code
df['Destination'] = df['Destination'].apply(
    lambda x: pd.NA if not re.search(r'[A-Za-z]', str(x)) or re.match(r'^[A-Z]{2}$', str(x)) else x
)

def find_values_with_special_chars(df):
    """Find values with special characters in the 'Destination' column."""
    return [
        value for value in df['Destination'].unique()
        if re.search(r'[^A-Za-z0-9]', str(value))
    ]


In [7]:
dest_before = find_values_with_special_chars(df)
print(len(dest_before), "unique values before cleaning with special characters")
dest_before

100 unique values before cleaning with special characters


['GDYNIA.VIA.NOK',
 <NA>,
 'GDYNIA.VIA.NOK.:)',
 'KLJ.VIA.NOK',
 'PLGDN.VIA.NOK',
 'GDYNIA.VI',
 'GDYNIA.VIA',
 'GDYNIA.VIBIA.NOK',
 'GDYNIA.VIAE',
 "'GDYNIA.VIK?0\\\\BPO?_'",
 'GDYNIA.VIE',
 'HAMBURG.:)',
 'GDYNIAVIA)NOK',
 'BREMERHAVEN.VIA.NOK',
 'ELBE.RC',
 'ELBE.PS',
 'HHLO.PS',
 'DE.HAM',
 'DE.HAMBIVER.ELBE',
 'GDANSK.VIA.NOK',
 'GDANSK.VICEL',
 'GDYNIA...TH',
 'HAMBURG.DE',
 'BRV.PS',
 'GDANSK.PILOT',
 'GDANSKVIA.NOK)',
 'GDYNIA.VIA.K.CANAL',
 'GDANSK....=SWIN',
 'GDANSK.VIA.KIEL.K',
 'KLAIPEDA.VIA.NOK',
 'BLEXEN.ROAD',
 'GDYNIA...!RSBURG',
 'KLAIPEDA..=SWIN',
 'KLAIPEDA.VIA.NOC',
 'PL.GDY.VIA.NOK',
 'PL.GDY',
 'DEBRV.>.DEHAM',
 'HAMBURG/.AIRBUS',
 'HH.FINKENWERDER',
 'GDYNIA....>.BALTIC.2',
 'GDYNIA....ROJECT',
 'DEHAM.ELBE',
 'DEHAM.ELBE.PLT',
 'GDYNIA...!RATION',
 'GDYNIA....>.HANKO',
 'GDYNIA...DING.AREA',
 'DEHAM.CTT',
 'SEAHU.>.DEBRV',
 'DEBRT.>.DEHAM',
 'GDYNIA...!E',
 'GDYNIA...!STELLE',
 'GDYNIA.VIA.KIEL',
 'PL.GDN',
 'GDANSK....NOK',
 'DEHAM.>.PLGDY',
 'DEHAM.>.PLGDY.NO

## --- Step 2
We can see that we have different formats representing the same thing, like 'HAMBURG' and 'DEHAM' ext.
The data is incredibly messy, we need to handle country codes, special characters, and different formats.

1. Some have country codes
2. Some contain starting port too
3. Some contain type of facilities (e.g., 'ELBE.RC', 'BREMERHAVEN.VIA.NOK')

start > destination
##

In [8]:
# Clean all data
df['Destination'] = df['Destination'].progress_apply(clean_destination)
df['Destination'].unique()

  0%|          | 0/1060708 [00:00<?, ?it/s]

array(['GDYNIA.VIA.NOK', None, 'HAMBURG', 'GDYNIA', 'GDANSK',
       'KLJ.VIA.NOK', 'PLGDY', 'PLGDN.VIA.NOK', 'GDYNIA.VI', 'GDYNIA.VIA',
       'GDYNIA.VIBIA.NOK', 'GDYNIA.VIAE', 'GDYNIA.VIK0.BPO', 'GDYNIA.VIE',
       'GYDINIA', 'GDYNIAVIANOK', 'BREMERHAVEN.VIA.NOK', 'DEHAM', 'DEBRV',
       'ELBE.RC', 'ELBE.PS', 'HHLO.PS', 'DEBRE', 'DE.HAM',
       'DE.HAMBIVER.ELBE', 'GDANSK.VIA.NOK', 'GDANSK.VICEL', 'GDYNIA.TH',
       'HAMBURG.DE', 'BRV.PS', 'GDANSK.PILOT', 'GDANSKVIA.NOK',
       'COPENHAGEN', 'KLAIPEDA', 'GDYNIA.VIA.K.CANAL', 'GDANSK.SWIN',
       'GDANSK.VIA.KIEL.K', 'KLAIPEDA.VIA.NOK', 'BREMENHAVEN',
       'BLEXEN.ROAD', 'GDANK', 'GDYNIA.RSBURG', 'KLAIPEDA.SWIN',
       'KLAIPEDA.VIA.NOC', 'PL.GDY.VIA.NOK', 'PL.GDY', 'DEBRV.>.DEHAM',
       'HAMBURG.AIRBUS', 'FINKENWERDER', 'HH.FINKENWERDER',
       'GDYNIA.>.BALTIC.2', 'GDYNIA.ROJECT', 'DEHAM.ELBE',
       'DEHAM.ELBE.PLT', 'GDYNIA.RATION', 'GDYNIA.>.HANKO',
       'GDYNIA.DING.AREA', 'DEHAM.CTT', 'SEAHU.>.DEBRV', 'DEBRT.>.D

In [9]:
# Create mask for rows containing '>'
mask = df['Destination'].str.contains('>', na=False)

# Initialize columns (if not already done)
# df['start_fr_dest'] = None
# df['cleaned_destination'] = df['Destination'].copy()
# Split and assign values safely
# df.loc[mask, 'start_fr_dest'] = df.loc[mask, 'Destination'].str.split('>').str[0] #NOTE for now we wont bother

df.loc[mask, 'Destination'] = df.loc[mask, 'Destination'].str.split('>').str[1]
df['Destination'].unique()

array(['GDYNIA.VIA.NOK', None, 'HAMBURG', 'GDYNIA', 'GDANSK',
       'KLJ.VIA.NOK', 'PLGDY', 'PLGDN.VIA.NOK', 'GDYNIA.VI', 'GDYNIA.VIA',
       'GDYNIA.VIBIA.NOK', 'GDYNIA.VIAE', 'GDYNIA.VIK0.BPO', 'GDYNIA.VIE',
       'GYDINIA', 'GDYNIAVIANOK', 'BREMERHAVEN.VIA.NOK', 'DEHAM', 'DEBRV',
       'ELBE.RC', 'ELBE.PS', 'HHLO.PS', 'DEBRE', 'DE.HAM',
       'DE.HAMBIVER.ELBE', 'GDANSK.VIA.NOK', 'GDANSK.VICEL', 'GDYNIA.TH',
       'HAMBURG.DE', 'BRV.PS', 'GDANSK.PILOT', 'GDANSKVIA.NOK',
       'COPENHAGEN', 'KLAIPEDA', 'GDYNIA.VIA.K.CANAL', 'GDANSK.SWIN',
       'GDANSK.VIA.KIEL.K', 'KLAIPEDA.VIA.NOK', 'BREMENHAVEN',
       'BLEXEN.ROAD', 'GDANK', 'GDYNIA.RSBURG', 'KLAIPEDA.SWIN',
       'KLAIPEDA.VIA.NOC', 'PL.GDY.VIA.NOK', 'PL.GDY', '.DEHAM',
       'HAMBURG.AIRBUS', 'FINKENWERDER', 'HH.FINKENWERDER', '.BALTIC.2',
       'GDYNIA.ROJECT', 'DEHAM.ELBE', 'DEHAM.ELBE.PLT', 'GDYNIA.RATION',
       '.HANKO', 'GDYNIA.DING.AREA', 'DEHAM.CTT', '.DEBRV', 'GDYNYA',
       'GDYNIA.E', 'GDYNIA.STELLE', '

#### **I CREATED CUSTOM FILE WITH FUNCTION WE WILL USE DOWN**

df_recombined['Destination'].unique()
Now we have somewhat cleaned Destination column, but we still have some inconsistencies. That the same ports have different names.
I didn't find quicker way to do it, rather than manually checking the names and creating a list of names that represent the same port.
We will only a bit automize this process, by using fuzzy matching to find similar names.
And extracting all ports from UpdatedPub150.csv file, which contains ports and their countries.
[link](https://msi.nga.mil/Publications/WPI)

In [10]:
# from fuzzywuzzy import fuzz
#
# def find_fuzzy_matches(destinations, threshold=80, scorer=fuzz.token_set_ratio, show_progress=False):
#     """
#     Find fuzzy matches among destination names.
#
#     Parameters:
#         destinations (list): List of destination strings to compare
#         threshold (int): Minimum similarity score to consider a match (0-100)
#         scorer: Fuzzy matching function (default: token_set_ratio)
#         show_progress (bool): Whether to print progress during processing
#
#     Returns:
#         dict: Dictionary where keys are original names and values are lists of matches
#               with their scores in format [(matched_name, score), ...]
#     """
#     matches = {}
#     total = len(destinations)
#
#     for i, dest in enumerate(destinations, 1):
#         # Skip NAN/empty values
#         if not dest or str(dest).strip().upper() in ('NAN', 'NULL', ''):
#             continue
#
#         if show_progress:
#             print(f"Processing {i}/{total}: {dest[:30]}...", end='\r')
#
#         # Find matches above threshold (excluding self)
#         potential_matches = process.extract(
#             dest,
#             destinations,
#             scorer=scorer,
#             limit=None
#         )
#
#         # Filter matches
#         good_matches = [
#             (match, score)
#             for match, score in potential_matches
#             if score >= threshold and match != dest
#         ]
#
#         if good_matches:
#             matches[dest] = good_matches
#
#     if show_progress:
#         print("\n" + "=" * 50)
#
#     return matches
# def print_fuzzy_matches(matches, min_score=0, group_similar=False):
#     """
#     Print fuzzy matching results in a readable format.
#
#     Parameters:
#         matches (dict): Output from find_fuzzy_matches
#         min_score (int): Minimum score to display
#         group_similar (bool): Whether to group similar matches together
#     """
#     if not matches:
#         print("No matches found")
#         return
#
#     print(f"\nFuzzy matches (score ≥ {min_score}):")
#     print("=" * 60)
#
#     if group_similar:
#         # Group similar matches to avoid duplicates
#         already_matched = set()
#         for dest in sorted(matches.keys()):
#             if dest in already_matched:
#                 continue
#
#             print(f"\nGroup: {dest}")
#             print("-" * 50)
#
#             # Include the original in the group
#             all_in_group = {dest}
#
#             for match, score in matches[dest]:
#                 if score >= min_score:
#                     print(f"  → {match} (score: {score})")
#                     all_in_group.add(match)
#
#                     # Also include matches of matches
#                     if match in matches:
#                         for submatch, subscore in matches[match]:
#                             if subscore >= min_score and submatch not in all_in_group:
#                                 print(f"    → {submatch} (score: {subscore})")
#                                 all_in_group.add(submatch)
#
#             already_matched.update(all_in_group)
#     else:
#         # Simple listing
#         for dest in sorted(matches.keys()):
#             print(f"\n{dest} matches:")
#             print("-" * 50)
#             for match, score in matches[dest]:
#                 if score >= min_score:
#                     print(f"  → {match} (score: {score})")

In [11]:
# unique_dests = df['Destination'].unique().tolist()
#
# # Find matches with threshold of 85
# matches = find_fuzzy_matches(unique_dests, threshold=75, show_progress=True)
#
# # Print results grouped by similarity
# print_fuzzy_matches(matches, min_score=75, group_similar=True)


From this I will manually create a list of names that represent the same port, that do not have 100 match.
As they can be incorrectly matched, and it would be better to do it manually.


In [12]:
from data_cleaning.utils.normalization_utils import match_names
df['Destination'] = df['Destination'].progress_apply(lambda x: match_names(x))

  0%|          | 0/1060708 [00:00<?, ?it/s]

In [13]:
# df = replace_with_key(df, 'start_fr_dest', full_dict)
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

Unnamed: 0,index,Destination
0,0,PL.GDY
2862,2862,
3717,3717,DE.HAM
5983,5983,PL.GDN
11015,11015,LT.KLJ
28816,28816,DE.BRV
37668,37668,DE.BRE
91662,91662,DK.KOB
138803,138803,DE.KEL
232201,232201,FI.HKO


In [14]:
# df[['start_fr_dest']].reset_index().drop_duplicates(subset=['start_fr_dest'])

In [18]:
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

df = df.drop_duplicates()
df = df.reset_index(drop=True)
len(df)

Number of duplicate rows: 0


913599

In [19]:
df.to_parquet(output_path)