In [None]:
import os
from data_cleaning.processing_utils import *
import re

In [None]:
# Input and output locations; both are relative paths that go one directory up
# from the current working directory.
file_path = '../models/cleaned_atr.csv'
output_path = '../data/prepared.parquet'

# Stop here with a clear error when the input is missing, instead of merely
# printing a message and letting pd.read_csv fail on the next statement.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

df = pd.read_csv(file_path)
df.head()

### Step 1: Type Normalization

In [None]:
# Free-text columns: replace missing values with the literal text "nan" and
# store them with pandas' dedicated string dtype.
for col in ('Destination', 'AisSourcen'):
    df[col] = df[col].fillna("nan").astype('string')

# Timestamp columns: parse as timezone-aware values converted to UTC.
# ('time' appears to carry an explicit +01:00 offset in the raw data.)
for col in ('StartTime', 'EndTime', 'time'):
    df[col] = pd.to_datetime(df[col], utc=True)

# Port-like columns: go through string first, then store as categoricals.
for col in ('StartPort', 'EndPort', 'Destination'):
    df[col] = df[col].astype('string').astype('category')

df.dtypes

### Step 2: Clean Destination Names

In [None]:
# Upper-case every column stored with the pandas "string" dtype.
text_columns = df.select_dtypes(include=['string']).columns
for col in text_columns:
    df[col] = df[col].str.upper()

# 'Destination' is categorical at this point (see the type-normalization cell),
# so select_dtypes(include=['string']) does not return it and the loop above
# never touches it. Convert it back to string and upper-case it explicitly so
# the later upper-case-only checks (two-letter codes, the "NAN" sentinel) see
# consistent input.
# NOTE(review): clean_destination is not defined in this notebook (presumably
# from the processing_utils star import) - confirm it accepts upper-case text.
df['Destination'] = (
    df['Destination']
    .astype('string')
    .str.upper()
    .apply(clean_destination)
)
df['Destination'].unique()

In [None]:
# Rows whose destination contains '>' are split on that character and only the
# second piece (the text right after the first '>') is kept.
mask = df['Destination'].str.contains('>', na=False)
arrow_parts = df.loc[mask, 'Destination'].str.split('>')
df.loc[mask, 'Destination'] = arrow_parts.str.get(1)

# Preview: one row per distinct destination value, with its original index.
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

In [None]:
def replace_with_key(df, column, name_variants):
    """Rewrite one column by passing each value through ``match_names``.

    Args:
        df: DataFrame to update; it is modified in place.
        column: Name of the column whose values are rewritten.
        name_variants: Second positional argument forwarded unchanged to
            ``match_names`` for every value.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Series.apply forwards the extra positional argument after each value,
    # i.e. it calls match_names(value, name_variants).
    matched = df[column].apply(match_names, args=(name_variants,))
    df[column] = matched
    return df

# Pass every destination through match_names with the full_dict lookup.
# NOTE(review): match_names / full_dict are not defined in this notebook;
# presumably they come from the processing_utils star import.
df = replace_with_key(df, 'Destination', full_dict)

# Keep only the text before the first literal dot.
# regex=False matters here: as a regular expression '.' matches ANY character,
# so the mask would select every non-empty value instead of only the values
# that actually contain a dot.
mask = df['Destination'].str.contains('.', regex=False, na=False)
df.loc[mask, 'Destination'] = df.loc[mask, 'Destination'].str.split('.').str[0]

# Replace unusable values with the "NAN" sentinel: values without a single
# ASCII letter, and values that are exactly two capital letters (a bare
# country code rather than a port name).
df['Destination'] = df['Destination'].apply(
    lambda x: "NAN" if not re.search(r'[A-Za-z]', str(x)) or re.match(r'^[A-Z]{2}$', str(x)) else x
)

# Preview: one row per distinct destination value, with its original index.
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

In [None]:
# Row count before de-duplication is printed; the count afterwards is shown as
# the cell's output.
rows_before = len(df)
print(rows_before)
df = df.drop_duplicates()
len(df)

### Step 3: Missing Values

In [None]:
# Percentage of missing values per column, rounded to one decimal place.
# (Writing `(expr, 1)` would build a tuple instead of rounding.)
(df.isnull().sum() / len(df) * 100).round(1)


In [None]:
# The 'AisSourcen' column is not needed for the rest of the preparation.
df = df.drop(columns=['AisSourcen'])

# For each physical dimension:
#   1. treat a recorded 0 as missing,
#   2. compute the per-shiptype mean (NaN values are ignored by 'mean'),
#   3. fill the missing entries with the mean of the row's shiptype.
# Rows whose shiptype has no valid value for a dimension stay NaN.
dimension_columns = ['Length', 'Breadth', 'Draught']
for col in dimension_columns:
    df[col] = df[col].replace(0, np.nan)
    shiptype_mean = df.groupby('shiptype')[col].transform('mean')
    df[col] = df[col].fillna(shiptype_mean)

# Percentage of missing values per column, rounded to one decimal place.
# (Writing `expr, 1` would build a tuple instead of rounding.)
(df.isnull().sum() / len(df) * 100).round(1)

In [None]:
# Row count before filling destinations and de-duplicating again.
print(len(df))

# NOTE(review): fill_missing_destinations_by_proximity is not defined in this
# notebook (presumably from the processing_utils star import); judging by its
# name it fills in missing destinations - confirm its exact contract.
df = fill_missing_destinations_by_proximity(df)
df = df.drop_duplicates()

# Row count after the step.
print(len(df))

# Percentage of missing values per column, rounded to one decimal place.
# (Writing `expr, 1` would build a tuple instead of rounding.)
(df.isnull().sum() / len(df) * 100).round(1)

In [None]:
# Make sure the target directory exists so the write cannot fail on a missing
# folder after the whole pipeline has already run.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Persist the prepared frame as a Parquet file.
df.to_parquet(output_path)