In [1]:
import os
from data_cleaning.processing_utils import clean_destination, match_names
import pandas as pd

In [2]:
# Input CSV and output Parquet locations, relative to the notebook's working directory.
file_path = '../data/cleaned_atr.csv'
output_path = '../data/prepared.parquet'

# Stop here with a clear error if the input is missing, instead of printing a
# message and continuing into pd.read_csv.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

df = pd.read_csv(file_path)
df.head()

Unnamed: 0,TripID,StartLatitude,StartLongitude,StartTime,EndLatitude,EndLongitude,EndTime,StartPort,EndPort,time,...,Length,Breadth,Draught,Latitude,Longitude,SOG,COG,TH,Destination,AisSourcen
0,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:07:00+01:00,...,277,42,11.54,53.57,8.53,0.7,331.2,143,HAMBURG,DAIS1.81b.90b.71.71a
1,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:10:00+01:00,...,277,42,11.54,53.57,8.53,1.6,315.3,117,HAMBURG,DAIS1.81b.90b.71.71a
2,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:10:00+01:00,...,277,42,11.54,53.57,8.53,2.8,322.6,100,HAMBURG,DAIS1.81b.90b.71.71a
3,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:12:00+01:00,...,277,42,11.54,53.57,8.53,2.8,286.3,74,HAMBURG,DAIS1.81b.90b.71.71a
4,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:16:00+01:00,...,277,42,11.54,53.57,8.53,4.3,333.1,333,HAMBURG,DAIS1.81b.90b.71.71a


### Step 1: Type Normalization

In [3]:
# Free-text columns: replace missing values with the literal text "nan",
# then store them with pandas' nullable `string` dtype.
for text_col in ('Destination', 'AisSourcen'):
    df[text_col] = df[text_col].fillna("nan").astype('string')

# Timestamps: parse and normalise to UTC.
# `time` appears to carry timezone info (+01:00) in the raw file.
for timestamp_col in ('StartTime', 'EndTime', 'time'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], utc=True)

df['shiptype'] = df['shiptype'].astype('category')

# Port-like columns go through `string` first and end up as `category`.
# 'Destination' is already `string` from the loop above; the extra cast is a no-op for it.
for port_col in ('StartPort', 'EndPort', 'Destination'):
    df[port_col] = df[port_col].astype('string').astype('category')

df.dtypes

TripID                          int64
StartLatitude                 float64
StartLongitude                float64
StartTime         datetime64[ns, UTC]
EndLatitude                   float64
EndLongitude                  float64
EndTime           datetime64[ns, UTC]
StartPort                    category
EndPort                      category
time              datetime64[ns, UTC]
shiptype                     category
Length                          int64
Breadth                         int64
Draught                       float64
Latitude                      float64
Longitude                     float64
SOG                           float64
COG                           float64
TH                              int64
Destination                  category
AisSourcen             string[python]
dtype: object

### Step 2: Clean Destination Names

In [4]:
# Upper-case every column that currently has pandas' `string` dtype.
# NOTE(review): 'Destination' was cast to `category` in the type-normalization
# cell, so this selection does not include it — confirm that is intended.
text_columns = df.select_dtypes(include=['string']).columns
df = df.assign(**{name: df[name].str.upper() for name in text_columns})

# Run the project's clean_destination helper over each destination value
# (its rules live in data_cleaning.processing_utils and are not visible here).
cleaned_destination = df['Destination'].apply(clean_destination)
df['Destination'] = cleaned_destination

# Show the first row at which each distinct destination value appears.
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

Unnamed: 0,index,Destination
0,0,HAMBURG
517,517,DEHAM
2894,2894,DEBRE
5160,5160,DEBRV
12382,12382,ELBE.RC
...,...,...
998470,998470,SZCZECIN
1004254,1004254,SEHAD
1039679,1039679,GDANSK.VIANOK
1049962,1049962,GDYNIA.PL


In [5]:
# Rows whose destination contains '>' (presumably a route written as
# "origin>destination" — verify against the raw data).
mask = df['Destination'].str.contains('>', na=False)

# Keep only the text after the LAST '>'. Splitting from the right means that
# doubled separators ("A>>B") and multi-leg routes ("A>B>C") resolve to the
# final segment instead of an empty string or an intermediate stop; for the
# plain "A>B" case the result is unchanged.
df.loc[mask, 'Destination'] = (
    df.loc[mask, 'Destination'].str.rsplit('>', n=1).str[-1]
)
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

Unnamed: 0,index,Destination
0,0,HAMBURG
517,517,DEHAM
2894,2894,DEBRE
5160,5160,DEBRV
12382,12382,ELBE.RC
...,...,...
998470,998470,SZCZECIN
1004254,1004254,SEHAD
1039679,1039679,GDANSK.VIANOK
1049962,1049962,GDYNIA.PL


In [6]:
def replace_with_key(df, column):
    """Replace each value of ``df[column]`` with the result of ``match_names``.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned so the call can be used in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to rewrite; it is modified in place.
    column : hashable
        Label of the column whose values are passed to ``match_names``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    mapped_values = df[column].apply(match_names)
    df[column] = mapped_values
    return df

# Pass every destination through match_names, then list the distinct results
# together with the first row at which each one appears.
df = replace_with_key(df, column='Destination')
distinct_destinations = df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])
distinct_destinations

Unnamed: 0,index,Destination
0,0,DE.HAM
2894,2894,DE.BRE
5160,5160,DE.BRV
76207,76207,DK.KOB
159264,159264,DE.STA
243332,243332,
303836,303836,PL.GDN
479678,479678,PL.GDY
486420,486420,LT.KLJ
532599,532599,DE.KEL


In [7]:
# Report the row count, drop rows that are identical in every column,
# then show the row count that remains.
row_count_before = len(df)
print(row_count_before)
df = df.drop_duplicates()
len(df)

1060708


913622

### Step 3: Missing Values

In [8]:
# Note: missing values are now dealt with in noise handling; the check below is kept for reference only.
# (df.isnull().sum() / len(df) * 100, 1)
#

In [9]:
# df.drop(columns=['AisSourcen'], inplace=True)  # Drop the 'AisSourcen' column; it is not needed
#
# df['Length'] = df['Length'].replace(0, np.nan)
# df['Breadth'] = df['Breadth'].replace(0, np.nan)
# df['Draught'] = df['Draught'].replace(0, np.nan)
#
# # Step 2: Calculate mean by shiptype (automatically ignores NaN)
# length_means = df.groupby('shiptype')['Length'].transform('mean')
# breadth_means = df.groupby('shiptype')['Breadth'].transform('mean')
# draught_means = df.groupby('shiptype')['Draught'].transform('mean')
#
# # Step 3: Fill NaN with shiptype means
# df['Length'] = df['Length'].fillna(length_means)
# df['Breadth'] = df['Breadth'].fillna(breadth_means)
# df['Draught'] = df['Draught'].fillna(draught_means)
#
# df.isnull().sum() / len(df) * 100, 1

In [10]:
# print(len(df))
# df = fill_missing_destinations_by_proximity(df)
# df = df.drop_duplicates()
# print(len(df))
#
# df.isnull().sum() / len(df) * 100, 1

In [11]:
# Persist the prepared frame to the Parquet file named by `output_path`.
df.to_parquet(path=output_path)