In [None]:
import os
from data_cleaning.utils.normalization_utils import clean_destination, match_names
import pandas as pd

In [None]:
# Input CSV and output Parquet locations, relative to the notebook's working
# directory (both live two directory levels up, under data/).
file_path = '../../data/cleaned_atr.csv'
output_path = '../../data/2_destination_norm.parquet'

# Fail fast with the resolved path instead of printing a message and then
# letting pd.read_csv fail on the same missing file a line later.
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f"File not found: {file_path} (resolved to {os.path.abspath(file_path)})"
    )

df = pd.read_csv(file_path)
df.head()

### Step 1: Type Normalization

In [None]:
# Free-text columns: missing values are replaced by the literal string "nan"
# (not a real null) before casting to the pandas 'string' dtype.
for col in ('Destination', 'AisSourcen'):
    df[col] = df[col].fillna("nan").astype('string')

# Timestamps: parse and convert everything to timezone-aware UTC.
# 'time' appears to carry an explicit offset (+01:00) in the raw data.
for col in ('StartTime', 'EndTime', 'time'):
    df[col] = pd.to_datetime(df[col], utc=True)

df['shiptype'] = df['shiptype'].astype('category')

# Port-like columns are stored as categoricals over string categories.
# Note that 'Destination' ends up as category here, not 'string'.
for col in ('StartPort', 'EndPort', 'Destination'):
    df[col] = df[col].astype('string').astype('category')
df.dtypes

### Step 2: Clean Destination Names

In [None]:
# Upper-case every column whose dtype is pandas 'string'.
# NOTE(review): 'Destination' was cast to category in Step 1, so it is NOT
# selected by this dtype filter and reaches clean_destination in its original
# case -- confirm that is intended.
text_columns = df.select_dtypes(include=['string']).columns
for col in text_columns:
    upper_cased = df[col].str.upper()
    df[col] = upper_cased

# clean_destination is a project helper; per the original note it is expected
# to reject values without any alphabetic character or that are only a country
# code -- TODO confirm against its implementation.
cleaned_destination = df['Destination'].apply(clean_destination)
df['Destination'] = cleaned_destination

# Show one row per distinct cleaned destination (with its first row label).
df[['Destination']].reset_index().drop_duplicates(subset='Destination')

In [None]:
# Values containing '>' are treated as "<before>><after>" and only the segment
# right after the first '>' is kept (so 'A>B>C' becomes 'B').
#
# 'Destination' was cast to category earlier and may still be categorical at
# this point. Writing a value that is not an existing category into a
# categorical column raises TypeError, so do the replacement on an object-dtype
# copy and assign the whole column back.
destination = df['Destination'].astype(object)

# Literal (non-regex) match; missing values never match.
mask = destination.str.contains('>', regex=False, na=False)
after_separator = destination.str.split('>').str[1]

# Keep the original value everywhere except on the rows flagged by the mask.
df['Destination'] = destination.where(~mask, after_separator)

# Show one row per distinct destination (with its first row label).
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

In [None]:
def replace_with_key(df, column):
    """Replace each value of ``df[column]`` with ``match_names(value)``.

    The column is overwritten on the DataFrame that was passed in (no copy is
    made), and that same DataFrame object is returned.
    """
    matched = df[column].apply(match_names)
    df[column] = matched
    return df

# Map every destination through match_names (via replace_with_key, which
# overwrites the column on df and returns the same frame).
df = replace_with_key(df=df, column='Destination')

# Show one row per distinct matched destination (with its first row label).
df[['Destination']].reset_index().drop_duplicates(subset='Destination')

In [None]:
# Row count before de-duplication is printed; the count after dropping fully
# identical rows is the cell's output.
rows_before = len(df)
print(rows_before)
df = df.drop_duplicates()
len(df)

In [None]:
# Persist the normalized frame to output_path as Parquet.
# NOTE(review): the index is not reset after drop_duplicates, so the original
# row labels are written along with the data under to_parquet's default index
# handling -- confirm downstream readers expect that.
df.to_parquet(output_path)