In [1]:
from data_cleaning.utils.normalization_utils import clean_destination, match_names
import pandas as pd

In [2]:
input_files = ["../../data/raw_data/b-h.csv", "../../data/raw_data/k-g.csv"]
output_file = "../data/2_destination_norm.parquet"

In [3]:
dfs = []
dfs_clean = []
for input_f in input_files:
    dfs.append(pd.read_csv(input_f,
                           engine='python',
                           on_bad_lines='skip',
                           na_values=['', '?']
    ))

### Step 1: Type Normalization

In [4]:
COLUMNS_TO_DROP = ['ID', 'Name', 'Callsign', 'MMSI', 'AisSourcen']

for df in dfs:
    df_clean = df.drop(columns=COLUMNS_TO_DROP, errors='ignore')
    df_clean['StartTime'] = pd.to_datetime(df_clean['StartTime'], utc=True)
    df_clean['EndTime'] = pd.to_datetime(df_clean['EndTime'], utc=True)
    df_clean['time'] = pd.to_datetime(df_clean['time'], utc=True)  # Appears to have timezone info (+01:00)

    df_clean['StartPort'] = df_clean['StartPort'].astype('string').astype('category')
    df_clean['EndPort'] = df_clean['EndPort'].astype('string').astype('category')
    df_clean['shiptype'] = df_clean['shiptype'].astype('category')
    df_clean['Destination'] = df_clean['Destination'].astype('string').astype('category')

    dfs_clean.append(df_clean)

In [5]:
def check_mixed_types_and_examples(df):
    mixed = []
    for column in df.columns:
        unique_types = df[column].apply(type).unique()
        if len(unique_types) > 1:
            print(f"Column '{column}' has mixed types: {unique_types}")
            # Display examples of string and float values
            string_values = df[column][df[column].apply(type) == str].head()
            float_values = df[column][df[column].apply(type) == float].head()
            print(f"Examples of string values in '{column}':\n{string_values}")
            print(f"Examples of float values in '{column}':\n{float_values}")
            mixed.append(column)
    return mixed

check_mixed_types_and_examples(dfs_clean[0])

Column 'Destination' has mixed types: [<class 'str'> nan]
Examples of string values in 'Destination':
0    HAMBURG
1    HAMBURG
2    HAMBURG
3    HAMBURG
4    HAMBURG
Name: Destination, dtype: category
Categories (53, string): [BLEXEN.ROAD, BREMENHAVEN, BREMERHAVEN, BREMERHAVEN.VIA.NOK, ..., HHLO.PS, NORDENHAM, SEAHU.>.DEBRV, STADE]
Examples of float values in 'Destination':
Series([], Name: Destination, dtype: category
Categories (53, string): [BLEXEN.ROAD, BREMENHAVEN, BREMERHAVEN, BREMERHAVEN.VIA.NOK, ..., HHLO.PS, NORDENHAM, SEAHU.>.DEBRV, STADE])


['Destination']

In [6]:
df_norm = pd.concat(dfs_clean, ignore_index=True)
df_norm.sort_values('time').groupby(['TripID', 'StartPort'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f35126fbd30>

In [7]:
df_norm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1060708 entries, 0 to 1060707
Data columns (total 20 columns):
 #   Column          Non-Null Count    Dtype              
---  ------          --------------    -----              
 0   TripID          1060708 non-null  int64              
 1   StartLatitude   1060708 non-null  float64            
 2   StartLongitude  1060708 non-null  float64            
 3   StartTime       1060708 non-null  datetime64[ns, UTC]
 4   EndLatitude     1060708 non-null  float64            
 5   EndLongitude    1060708 non-null  float64            
 6   EndTime         1060708 non-null  datetime64[ns, UTC]
 7   StartPort       1060708 non-null  string             
 8   EndPort         1060708 non-null  string             
 9   time            1060708 non-null  datetime64[ns, UTC]
 10  shiptype        1060708 non-null  int64              
 11  Length          1060708 non-null  int64              
 12  Breadth         1060708 non-null  int64              
 1

### Step 2: Clean Destination Names

In [8]:
df = df_norm.copy()

In [9]:
# Ensure 'Destination' has at least one alphabetic character and is not just a country code
text_columns = df.select_dtypes(include=['string']).columns
for col in text_columns:
    df[col] = df[col].str.upper()  # Ensure string type and uppercase

df['Destination'] = df['Destination'].apply(clean_destination)
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

Unnamed: 0,index,Destination
0,0,HAMBURG
517,517,DEHAM
2894,2894,DEBRE
5160,5160,DEBRV
12382,12382,ELBE.RC
...,...,...
998470,998470,SZCZECIN
1004254,1004254,SEHAD
1039679,1039679,GDANSK.VIANOK
1049962,1049962,GDYNIA.PL


In [11]:
# Create mask for rows containing '>'
mask = df['Destination'].str.contains('>', na=False)
df.loc[mask, 'Destination'] = df.loc[mask, 'Destination'].str.split('>').str[1]
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

Unnamed: 0,index,Destination
0,0,HAMBURG
517,517,DEHAM
2894,2894,DEBRE
5160,5160,DEBRV
12382,12382,ELBE.RC
...,...,...
998470,998470,SZCZECIN
1004254,1004254,SEHAD
1039679,1039679,GDANSK.VIANOK
1049962,1049962,GDYNIA.PL


In [13]:
def replace_with_key(df, column):
    df[column] = df[column].apply(lambda x: match_names(x))
    return df

df = replace_with_key(df, 'Destination')
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

Unnamed: 0,index,Destination
0,0,DE.HAM
2894,2894,DE.BRE
5160,5160,DE.BRV
76207,76207,DK.KOB
159264,159264,DE.STA
243332,243332,
303836,303836,PL.GDN
479678,479678,PL.GDY
486420,486420,LT.KLJ
532599,532599,DE.KEL


In [17]:
from data_cleaning.utils.fill_missing_utils import get_entries_with_missing_values
get_entries_with_missing_values(df, 'Destination')


TripID
5944       [PL.GDY]
19002            []
19585      [PL.GDY]
23834      [PL.GDN]
28257      [PL.GDN]
             ...   
2183505    [PL.GDN]
2200956          []
2258835    [PL.GDY]
2263639    [PL.GDN]
2271342    [PL.GDN]
Name: Destination, Length: 248, dtype: object

In [18]:
print(len(df))
df = df.drop_duplicates()
len(df)

1060708


913599

In [19]:
df.to_parquet(output_file)