In [1]:
import os
import pandas as pd
import re
from data_cleaning.processing_utils import (
    clean_destination,
    match_names
)

##  Remark
Destinations are processed so that only official ports remain and everything else is discarded (this behaviour can be changed; look for the `NOTE` comment).
##

In [2]:

file_path = '../../data/type_norm.parquet'
output_path = '../../data/prepared.parquet'

# Fail fast with an explicit error: merely printing a message would let
# pd.read_parquet run on the same missing path right afterwards.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

df = pd.read_parquet(file_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1060708 entries, 0 to 1060707
Data columns (total 21 columns):
 #   Column          Non-Null Count    Dtype              
---  ------          --------------    -----              
 0   TripID          1060708 non-null  int64              
 1   StartLatitude   1060708 non-null  float64            
 2   StartLongitude  1060708 non-null  float64            
 3   StartTime       1060708 non-null  datetime64[ns, UTC]
 4   EndLatitude     1060708 non-null  float64            
 5   EndLongitude    1060708 non-null  float64            
 6   EndTime         1060708 non-null  datetime64[ns, UTC]
 7   StartPort       1060708 non-null  category           
 8   EndPort         1060708 non-null  category           
 9   time            1060708 non-null  datetime64[ns, UTC]
 10  shiptype        1060708 non-null  int64              
 11  Length          1060708 non-null  int64              
 12  Breadth         1060708 non-null  int64              
 1

## Step 1
Deal with inconsistent records, if there are any (the same thing can be written in several different formats).
Also convert columns to categorical where appropriate.

In [3]:
def check_unique_values(df):
    """Return a dict mapping each column name to its number of distinct non-null values."""
    return {
        name: df[name].dropna().nunique()
        for name in df.columns
    }

# Snapshot the per-column distinct counts before any cleaning; the case-normalization
# cell compares its own counts against this dict.
unique_values_before = check_unique_values(df)
unique_values_before

{'TripID': 1126,
 'StartLatitude': 28,
 'StartLongitude': 34,
 'StartTime': 953,
 'EndLatitude': 29,
 'EndLongitude': 47,
 'EndTime': 943,
 'StartPort': 2,
 'EndPort': 2,
 'time': 414193,
 'shiptype': 11,
 'Length': 107,
 'Breadth': 36,
 'Draught': 238,
 'Latitude': 273,
 'Longitude': 1285,
 'SOG': 227,
 'COG': 3602,
 'TH': 361,
 'Destination': 140,
 'AisSourcen': 224}

Column: StartPort, Unique values: 2 - seems correct
Column: EndPort, Unique values: 2 - seems correct

Column: Destination, Unique values: 140 - weird
Column: AisSourcen, Unique values: 224 - should check if it is correct

In [4]:
# Case normalization
# Text columns may be stored as object, pandas "string" or category dtype.
# Selecting only 'string' skips the other two, so a categorical column such as
# Destination would never be upper-cased.
text_columns = df.select_dtypes(include=['object', 'string', 'category']).columns
for col in text_columns:
    is_categorical = isinstance(df[col].dtype, pd.CategoricalDtype)
    # For categoricals the categories already hold every distinct value, so inspect only those.
    sample = df[col].cat.categories if is_categorical else df[col]
    if pd.api.types.infer_dtype(sample, skipna=True) not in ('string', 'empty'):
        continue  # leave non-text (or mixed-type) columns untouched
    if is_categorical:
        # Upper-casing can merge categories (e.g. 'Hamburg' / 'HAMBURG'), so go through
        # object dtype and rebuild the categories afterwards. Category order is not preserved.
        df[col] = df[col].astype(object).str.upper().astype('category')
    else:
        df[col] = df[col].str.upper()

unique_values_after = check_unique_values(df)

# Columns whose distinct-value count changed because of the normalization
changed_columns = [col for col in df.columns if unique_values_after[col] != unique_values_before[col]]
df[changed_columns].dropna().nunique() # Check how many unique values are there after case normalization

Series([], dtype: float64)

Case normalization has changed the unique values in the following columns: Destination
It seems there are still inconsistencies in the Destination column, so we will need to clean it further.

In [5]:
df['Destination'].unique()  # Show the distinct Destination values to see which spellings/formats occur

['HAMBURG', 'DEHAM', 'DEBRE', 'DEBRV', 'ELBE.RC', ..., 'SZCZECIN', 'SEHAD', 'GDANSK.VIANOK', 'GDYNIA.PL', 'GDANKS']
Length: 140
Categories (140, object): [''GDYNIA.VIK?0\\BPO?_'', '>4_?', 'BE.ANR.>>.PL.GDN', 'BLEXEN.ROAD', ..., 'SEHAD', 'STADE', 'SZCZECIN', 'nan']

In [7]:
# Ensure 'Destination' has at least one alphabetic character and is not just a country code
def _blank_out_invalid_destination(value):
    """Return "NAN" for values with no letter at all or that are only a two-capital-letter code."""
    text = str(value)
    has_letter = re.search(r'[A-Za-z]', text) is not None
    is_bare_country_code = re.match(r'^[A-Z]{2}$', text) is not None
    if has_letter and not is_bare_country_code:
        return value
    return "NAN"


df['Destination'] = df['Destination'].apply(_blank_out_invalid_destination)

def find_values_with_special_chars(df):
    """List the distinct 'Destination' values that contain any non-alphanumeric character."""
    special_char = re.compile(r'[^A-Za-z0-9]')
    flagged = []
    for candidate in df['Destination'].unique():
        # Values are stringified first, so non-string entries are checked via their str() form.
        if special_char.search(str(candidate)):
            flagged.append(candidate)
    return flagged


In [8]:
# Collect the distinct Destination values that contain non-alphanumeric characters
# (before clean_destination is applied) and report how many there are.
dest_before = find_values_with_special_chars(df)
print(len(dest_before), "unique values before cleaning with special characters")
dest_before

99 unique values before cleaning with special characters


['ELBE.RC',
 'HAMBURG.:)',
 'BREMERHAVEN.VIA.NOK',
 'DE.HAM',
 'DE.HAMBIVER.ELBE',
 'ELBE.PS',
 'HHLO.PS',
 'HAMBURG.DE',
 'HAMBURG/.AIRBUS',
 'BLEXEN.ROAD',
 'BRV.PS',
 'HH.FINKENWERDER',
 'DEBRV.>.DEHAM',
 'DEHAM.ELBE.PLT',
 'DEHAM.ELBE',
 'DEHAM.CTT',
 'SEAHU.>.DEBRV',
 'DEBRT.>.DEHAM',
 'DE.BRV>DEHAM',
 'DE.BRV>DE.HAM',
 'HAMBURG..DE',
 'DEBRV>DEHAM',
 'DEHAM.EGH',
 'DEHAM.CTA',
 'DEHAM.EG',
 'DE.HAM.............',
 'DE.WVN.>.DE.HAM',
 'DEBHV.NOK',
 'DEBRV.EGH',
 'DEBRV---->DEHAM',
 'HAMBURG/EUR',
 'DEBRV.>.DEIM',
 'HH.PS',
 'HH.CTB',
 'HAM.PS',
 'HAMBURG....',
 'HAMBURG???',
 'GDYNIA.VIA.NOK',
 'GDYNIA.VIA.NOK.:)',
 'GDYNIAVIA)NOK',
 'KLJ.VIA.NOK',
 'PLGDN.VIA.NOK',
 'GDYNIA.VIA',
 'GDYNIA.VIAE',
 'GDYNIA.VIBIA.NOK',
 'GDYNIA.VI',
 "'GDYNIA.VIK?0\\\\BPO?_'",
 'GDYNIA.VIE',
 'GDANSK.VIA.NOK',
 'GDYNIA...TH',
 'GDANSK.VICEL',
 'KLAIPEDA.VIA.NOK',
 'KLAIPEDA..=SWIN',
 'GDANSK....=SWIN',
 'GDANSK.VIA.KIEL.K',
 'PL.GDY',
 'PL.GDY.VIA.NOK',
 'GDYNIA...!RSBURG',
 'KLAIPEDA.VIA.NOC',
 'GD

## ==================================== Step 2
We can see that different formats are used for the same thing, e.g. 'HAMBURG' and 'DEHAM'.
The data is incredibly messy: we need to handle country codes, special characters, and different formats.

1. Some have country codes
2. Some contain starting port too
3. Some contain type of facilities (e.g., 'ELBE.RC', 'BREMERHAVEN.VIA.NOK')

start > destination
## ====================================

In [9]:
# Clean all data
# clean_destination is imported from data_cleaning.processing_utils; its exact rules
# are not visible in this notebook.
df['Destination'] = df['Destination'].apply(clean_destination)
df['Destination'].unique()

array(['HAMBURG', 'DEHAM', 'DEBRE', 'DEBRV', 'ELBE.RC',
       'BREMERHAVEN.VIA.NOK', 'DE.HAM', 'DE.HAMBIVER.ELBE', 'ELBE.PS',
       'HHLO.PS', 'HAMBURG.DE', 'HAMBURG.AIRBUS', 'BREMENHAVEN',
       'FINKENWERDER', 'BLEXEN.ROAD', 'BRV.PS', 'HH.FINKENWERDER',
       'DEBRV>DEHAM', 'COPENHAGEN', 'DEHAM.ELBE.PLT', 'DEHAM.ELBE',
       'DEHAM.CTT', 'SEAHU>DEBRV', 'DEBRT>DEHAM', 'STADE', 'DE.BRV>DEHAM',
       'DE.BRV>DE.HAM', 'DEHAMCTA', 'BREMERHAVEN', 'DEHAM.EGH',
       'DEHAM.CTA', 'DEHAM.EG', 'HAMBUG', 'DEBHV', 'DE.WVN>DE.HAM',
       'DEBHV.NOK', 'NAN', 'DEBRV.EGH', 'HAMBURG.EUR', 'DEBRV>DEIM',
       'GDANSK', 'HH.PS', 'HH.CTB', 'HAM.PS', 'FINKENWERD', 'NORDENHAM',
       'GDYNIA.VIA.NOK', 'GDYNIA', 'GYDINIA', 'GDYNIAVIANOK',
       'KLJ.VIA.NOK', 'PLGDY', 'PLGDN.VIA.NOK', 'GDYNIA.VIA',
       'GDYNIA.VIAE', 'GDYNIA.VIBIA.NOK', 'GDYNIA.VI', 'GDYNIA.VIK0.BPO',
       'GDYNIA.VIE', 'GDANSK.VIA.NOK', 'GDYNIA.TH', 'GDANSK.VICEL',
       'KLAIPEDA.VIA.NOK', 'KLAIPEDA', 'KLAIPEDA.SWIN', 'G

In [10]:
# Rows whose Destination contains '>' (written as "start > destination")
mask = df['Destination'].str.contains('>', na=False)

# Initialize columns (if not already done)
# df['start_fr_dest'] = None
# df['cleaned_destination'] = df['Destination'].copy()
# Split and assign values safely
# df.loc[mask, 'start_fr_dest'] = df.loc[mask, 'Destination'].str.split('>').str[0] #NOTE for now we wont bother

# Keep only the piece at index 1 of the split, i.e. the text right after the first '>'
split_parts = df.loc[mask, 'Destination'].str.split('>')
df.loc[mask, 'Destination'] = split_parts.str.get(1)
df['Destination'].unique()

array(['HAMBURG', 'DEHAM', 'DEBRE', 'DEBRV', 'ELBE.RC',
       'BREMERHAVEN.VIA.NOK', 'DE.HAM', 'DE.HAMBIVER.ELBE', 'ELBE.PS',
       'HHLO.PS', 'HAMBURG.DE', 'HAMBURG.AIRBUS', 'BREMENHAVEN',
       'FINKENWERDER', 'BLEXEN.ROAD', 'BRV.PS', 'HH.FINKENWERDER',
       'COPENHAGEN', 'DEHAM.ELBE.PLT', 'DEHAM.ELBE', 'DEHAM.CTT', 'STADE',
       'DEHAMCTA', 'BREMERHAVEN', 'DEHAM.EGH', 'DEHAM.CTA', 'DEHAM.EG',
       'HAMBUG', 'DEBHV', 'DEBHV.NOK', 'NAN', 'DEBRV.EGH', 'HAMBURG.EUR',
       'DEIM', 'GDANSK', 'HH.PS', 'HH.CTB', 'HAM.PS', 'FINKENWERD',
       'NORDENHAM', 'GDYNIA.VIA.NOK', 'GDYNIA', 'GYDINIA', 'GDYNIAVIANOK',
       'KLJ.VIA.NOK', 'PLGDY', 'PLGDN.VIA.NOK', 'GDYNIA.VIA',
       'GDYNIA.VIAE', 'GDYNIA.VIBIA.NOK', 'GDYNIA.VI', 'GDYNIA.VIK0.BPO',
       'GDYNIA.VIE', 'GDANSK.VIA.NOK', 'GDYNIA.TH', 'GDANSK.VICEL',
       'KLAIPEDA.VIA.NOK', 'KLAIPEDA', 'KLAIPEDA.SWIN', 'GDANSK.SWIN',
       'GDANK', 'GDANSK.VIA.KIEL.K', 'PL.GDY', 'PL.GDY.VIA.NOK',
       'GDYNIA.RSBURG', 'KLAIPEDA.VIA

#### **I CREATED CUSTOM FILE WITH FUNCTION WE WILL USE DOWN**

df_recombined['Destination'].unique()
Now the Destination column is somewhat cleaned, but some inconsistencies remain: the same port still appears under different names.
I didn't find a quicker way to resolve this than manually checking the names and creating a list of names that represent the same port.
We will automate this process only slightly, by using fuzzy matching to find similar names
and by extracting all ports from the UpdatedPub150.csv file, which contains ports and their countries.
[link](https://msi.nga.mil/Publications/WPI)

In [11]:
# from fuzzywuzzy import fuzz, process
#
# def find_fuzzy_matches(destinations, threshold=80, scorer=fuzz.token_set_ratio, show_progress=False):
#     """
#     Find fuzzy matches among destination names.
#
#     Parameters:
#         destinations (list): List of destination strings to compare
#         threshold (int): Minimum similarity score to consider a match (0-100)
#         scorer: Fuzzy matching function (default: token_set_ratio)
#         show_progress (bool): Whether to print progress during processing
#
#     Returns:
#         dict: Dictionary where keys are original names and values are lists of matches
#               with their scores in format [(matched_name, score), ...]
#     """
#     matches = {}
#     total = len(destinations)
#
#     for i, dest in enumerate(destinations, 1):
#         # Skip NAN/empty values
#         if not dest or str(dest).strip().upper() in ('NAN', 'NULL', ''):
#             continue
#
#         if show_progress:
#             print(f"Processing {i}/{total}: {dest[:30]}...", end='\r')
#
#         # Find matches above threshold (excluding self)
#         potential_matches = process.extract(
#             dest,
#             destinations,
#             scorer=scorer,
#             limit=None
#         )
#
#         # Filter matches
#         good_matches = [
#             (match, score)
#             for match, score in potential_matches
#             if score >= threshold and match != dest
#         ]
#
#         if good_matches:
#             matches[dest] = good_matches
#
#     if show_progress:
#         print("\n" + "=" * 50)
#
#     return matches
# def print_fuzzy_matches(matches, min_score=0, group_similar=False):
#     """
#     Print fuzzy matching results in a readable format.
#
#     Parameters:
#         matches (dict): Output from find_fuzzy_matches
#         min_score (int): Minimum score to display
#         group_similar (bool): Whether to group similar matches together
#     """
#     if not matches:
#         print("No matches found")
#         return
#
#     print(f"\nFuzzy matches (score ≥ {min_score}):")
#     print("=" * 60)
#
#     if group_similar:
#         # Group similar matches to avoid duplicates
#         already_matched = set()
#         for dest in sorted(matches.keys()):
#             if dest in already_matched:
#                 continue
#
#             print(f"\nGroup: {dest}")
#             print("-" * 50)
#
#             # Include the original in the group
#             all_in_group = {dest}
#
#             for match, score in matches[dest]:
#                 if score >= min_score:
#                     print(f"  → {match} (score: {score})")
#                     all_in_group.add(match)
#
#                     # Also include matches of matches
#                     if match in matches:
#                         for submatch, subscore in matches[match]:
#                             if subscore >= min_score and submatch not in all_in_group:
#                                 print(f"    → {submatch} (score: {subscore})")
#                                 all_in_group.add(submatch)
#
#             already_matched.update(all_in_group)
#     else:
#         # Simple listing
#         for dest in sorted(matches.keys()):
#             print(f"\n{dest} matches:")
#             print("-" * 50)
#             for match, score in matches[dest]:
#                 if score >= min_score:
#                     print(f"  → {match} (score: {score})")

In [12]:
# unique_dests = df['Destination'].unique().tolist()
#
# # Find matches with threshold of 75
# matches = find_fuzzy_matches(unique_dests, threshold=75, show_progress=True)
#
# # Print results grouped by similarity
# print_fuzzy_matches(matches, min_score=75, group_similar=True)


From this I will manually create a list of the names that represent the same port but are not a 100% match.
They can be matched incorrectly by the fuzzy matcher, so it is safer to do this manually.

https://www.marinetraffic.com/en/ais/details/ports/347?name=HALMSTAD&country=Sweden


In [13]:
from data_cleaning.processing_utils import match_names

def replace_with_key(df, column):
    """Pass every value of ``column`` through ``match_names``, store the result back in ``df`` and return ``df``."""
    mapped = df[column].apply(match_names)
    df[column] = mapped
    return df

In [14]:
# Replace every Destination with the value returned by match_names, then list the
# first row (with its original index) of each distinct result.
df = replace_with_key(df, 'Destination')
# df = replace_with_key(df, 'start_fr_dest', full_dict)
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

Unnamed: 0,index,Destination
0,0,DE.HAM
2894,2894,DE.BRE
5160,5160,DE.BRV
76207,76207,DK.KOB
159264,159264,DE.STA
243332,243332,
303836,303836,PL.GDN
479678,479678,PL.GDY
486420,486420,LT.KLJ
532599,532599,DE.KEL


In [15]:
# df[['start_fr_dest']].reset_index().drop_duplicates(subset=['start_fr_dest'])

In [16]:
# drop_duplicates returns a new frame and leaves `df` untouched; assign the result
# so the duplicate rows are actually gone before the frame is written out.
df = df.drop_duplicates()
df

Unnamed: 0,TripID,StartLatitude,StartLongitude,StartTime,EndLatitude,EndLongitude,EndTime,StartPort,EndPort,time,...,Length,Breadth,Draught,Latitude,Longitude,SOG,COG,TH,Destination,AisSourcen
0,39131,53.57,8.53,2016-01-24 08:06:00+00:00,53.53,9.90,2016-01-24 16:44:00+00:00,BREMERHAVEN,HAMBURG,2016-01-24 08:07:00+00:00,...,277,42,11.54,53.57,8.53,0.7,331.2,143,DE.HAM,DAIS1.81B.90B.71.71A
1,39131,53.57,8.53,2016-01-24 08:06:00+00:00,53.53,9.90,2016-01-24 16:44:00+00:00,BREMERHAVEN,HAMBURG,2016-01-24 08:10:00+00:00,...,277,42,11.54,53.57,8.53,1.6,315.3,117,DE.HAM,DAIS1.81B.90B.71.71A
2,39131,53.57,8.53,2016-01-24 08:06:00+00:00,53.53,9.90,2016-01-24 16:44:00+00:00,BREMERHAVEN,HAMBURG,2016-01-24 08:10:00+00:00,...,277,42,11.54,53.57,8.53,2.8,322.6,100,DE.HAM,DAIS1.81B.90B.71.71A
3,39131,53.57,8.53,2016-01-24 08:06:00+00:00,53.53,9.90,2016-01-24 16:44:00+00:00,BREMERHAVEN,HAMBURG,2016-01-24 08:12:00+00:00,...,277,42,11.54,53.57,8.53,2.8,286.3,74,DE.HAM,DAIS1.81B.90B.71.71A
4,39131,53.57,8.53,2016-01-24 08:06:00+00:00,53.53,9.90,2016-01-24 16:44:00+00:00,BREMERHAVEN,HAMBURG,2016-01-24 08:16:00+00:00,...,277,42,11.54,53.57,8.53,4.3,333.1,333,DE.HAM,DAIS1.81B.90B.71.71A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1060703,2204049,54.36,10.14,2017-04-03 07:54:00+00:00,54.38,18.66,2017-04-04 15:28:00+00:00,KIEL,GDYNIA,2017-04-04 13:57:00+00:00,...,89,13,4.00,54.51,18.75,7.2,221.0,215,PL.GDN,H7001
1060704,2204049,54.36,10.14,2017-04-03 07:54:00+00:00,54.38,18.66,2017-04-04 15:28:00+00:00,KIEL,GDYNIA,2017-04-04 13:56:00+00:00,...,89,13,4.00,54.51,18.75,7.2,221.9,215,PL.GDN,H7001
1060705,2204049,54.36,10.14,2017-04-03 07:54:00+00:00,54.38,18.66,2017-04-04 15:28:00+00:00,KIEL,GDYNIA,2017-04-04 13:55:00+00:00,...,89,13,4.00,54.51,18.75,7.2,222.1,215,PL.GDN,H7001
1060706,2204049,54.36,10.14,2017-04-03 07:54:00+00:00,54.38,18.66,2017-04-04 15:28:00+00:00,KIEL,GDYNIA,2017-04-04 13:54:00+00:00,...,89,13,4.00,54.51,18.76,7.2,221.2,215,PL.GDN,H7001


In [17]:
# Persist the prepared frame to output_path (defined in the loading cell)
df.to_parquet(output_path)