In [41]:
import pandas as pd
import os

In [43]:
file_path = '../models/cleaned_atr.csv'
output_path = '../data/prepared_data.parquet'

if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
    # Load the file if it exists
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,TripID,StartLatitude,StartLongitude,StartTime,EndLatitude,EndLongitude,EndTime,StartPort,EndPort,time,...,Length,Breadth,Draught,Latitude,Longitude,SOG,COG,TH,Destination,AisSourcen
0,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:07:00+01:00,...,277,42,11.54,53.57,8.53,0.7,331.2,143,HAMBURG,DAIS1.81b.90b.71.71a
1,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:10:00+01:00,...,277,42,11.54,53.57,8.53,1.6,315.3,117,HAMBURG,DAIS1.81b.90b.71.71a
2,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:10:00+01:00,...,277,42,11.54,53.57,8.53,2.8,322.6,100,HAMBURG,DAIS1.81b.90b.71.71a
3,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:12:00+01:00,...,277,42,11.54,53.57,8.53,2.8,286.3,74,HAMBURG,DAIS1.81b.90b.71.71a
4,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:16:00+01:00,...,277,42,11.54,53.57,8.53,4.3,333.1,333,HAMBURG,DAIS1.81b.90b.71.71a


# Step 1
   Assign proper data types to columns.

In [44]:
df['Destination'] = df['Destination'].fillna("nan").astype('string')
df['AisSourcen'] = df['AisSourcen'].fillna("nan").astype('string')

# Convert time columns to datetime with timezone awareness
df['StartTime'] = pd.to_datetime(df['StartTime'], utc=True)
df['EndTime'] = pd.to_datetime(df['EndTime'], utc=True)
df['time'] = pd.to_datetime(df['time'], utc=True)  # Appears to have timezone info (+01:00)

df['StartPort'] = df['StartPort'].astype('string').astype('category')
df['EndPort'] = df['EndPort'].astype('string').astype('category')
df.dtypes

TripID                          int64
StartLatitude                 float64
StartLongitude                float64
StartTime         datetime64[ns, UTC]
EndLatitude                   float64
EndLongitude                  float64
EndTime           datetime64[ns, UTC]
StartPort                    category
EndPort                      category
time              datetime64[ns, UTC]
shiptype                        int64
Length                          int64
Breadth                         int64
Draught                       float64
Latitude                      float64
Longitude                     float64
SOG                           float64
COG                           float64
TH                              int64
Destination            string[python]
AisSourcen             string[python]
dtype: object

# Step 2
   Normalize the columns

In [45]:
from dest_normalization import *

In [46]:
text_columns = df.select_dtypes(include=['string']).columns
for col in text_columns:
    df[col] = df[col].str.upper()  # Ensure string type and uppercase

df['Destination'] = df['Destination'].where(
        df['Destination'].str.contains(r'[A-Za-z]', na=False),
        "NAN"
    ) # Atle ast one alphabetic

df['Destination'] = df['Destination'].apply(
    lambda x: "NAN" if re.match(r'^[A-Z]{2}$', str(x)) else x
) #only country code

df['Destination'] = df['Destination'].apply(clean_destination)

mask = df['Destination'].str.contains('>', na=False)
df.loc[mask, 'Destination'] = df.loc[mask, 'Destination'].str.split('>').str[1]

In [47]:
name_german = {
    'DEHAM': ["HAMBURG", "HAMBUG", "HH", "HAM"],
    'DEBRV': ["BREMERHAVEN", "BREMENHAVEN", "BRV", "DEBHV", "BHV"],
    'DEBRE': ["BREMEN", "BRE"],
    'DEKEL': ["KIEL", "KEL"],
    'DEHAM.FINKENWERDER': ["FINKENWERDER", "FINKENWERD"],
    'DEHAM.BLEXEN': ["BLEXEN"],
    'DESTA': ["STADE", "STAD", "STA"],
    'DEBRB' : ["BRUNSBUETTEL", "BRUNSBUETT", "BRB"],
    'DEHAM.ELBE': ["ELBE"],
    'DEVTT': ["HIDDENSEE", "VTT"],
    'DEWVN': ["WILHELMSHAVEN", "WVN"],
    'country': ["DE"]
}

name_poland = {
    'PLGDN': ["GDANSK", "GDANK", "GDN"],
    'PLGDY': ["GDYNIA", "GYDNIA", "GYDINIA", "GDY", "GDYNA"],
    'SZCZECIN': ["SZCZECIN", "SZCZECIN", "SZZ"],
     'country': ["PL"]
}

name_lythuania = {
    'LTKLJ': ["KLAIPEDA", "KLJ"],
    'country': ["LT"]
}

name_sweden = {
    'SEHAD': ["HALMSTAD", "HAD"],
    'SENOK': ["NOK"],
    'SEAHU': ["AHUS", "AHU"],
    'country': ["SE"]
}

name_russia = {
        'RUKGD': ["KALININGRAD", "KALININGRAD", "KAL"],
        'country': ["RU"]
}

name_denmark = {
    'DKKOB': ["KOBENHAVN", "COPENHAGEN", "COPENHAGUE", "CPH"],
    'country': ["DK"]
}

name_finland = {
    'FIHKO': ["HANKO", "HKO"],
    'country': ["FI"]
}

name_belgium = {
    "BEANR": ["BEANR", "ANR"],
    'country': ["BE"]
}
full_dict = [name_german, name_poland, name_lythuania, name_sweden, name_russia, name_finland, name_belgium]

#NOTE FOR NOW MATCH NAME REPLACES ALL NAME
def replace_with_key(df, column, name_variants):
    df[column] = df[column].apply(lambda x: match_names(x, name_variants))
    return df

In [48]:
df = replace_with_key(df, 'Destination', full_dict)
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

Unnamed: 0,index,Destination
0,0,DEHAM
2894,2894,DEBRE
5160,5160,DEBRV
12382,12382,DEHAM.ELBE.RC
13229,13229,DEBRV.VIA.NOK
...,...,...
982201,982201,RUKGD
998470,998470,SZCZECIN
1004254,1004254,SEHAD
1039679,1039679,PLGDN.VIANOK


In [49]:
mask = df['Destination'].str.contains('.', na=False)
df.loc[mask, 'Destination'] = df.loc[mask, 'Destination'].str.split('.').str[0]
# NOTE Remove everything after dot and /
mask = df['Destination'].str.contains('/', na=False)
df.loc[mask, 'Destination'] = df.loc[mask, 'Destination'].str.split('/').str[0]

df['Destination'] = df['Destination'].astype('category')
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])


Unnamed: 0,index,Destination
0,0,DEHAM
2894,2894,DEBRE
5160,5160,DEBRV
76207,76207,COPENHAGEN
159264,159264,DESTA
243332,243332,NAN
295939,295939,DEIM
303836,303836,PLGDN
387562,387562,NORDEN
479678,479678,PLGDY


# Step 3
   Missing values & duplicates

In [50]:
df = df.drop_duplicates()
df.drop(columns=['AisSourcen'], inplace=True)  # Drop the 'AisSource' column as it is not needed really
df['Draught'] = df['Draught'].fillna(df['Draught'].median())
df['Destination'] = df.groupby('TripID')['Destination'].transform(lambda x: x.ffill().bfill())

In [51]:
df.to_parquet(output_path)