In [1]:
from data_cleaning.utils.normalization_utils import clean_destination, match_names
import pandas as pd
from tqdm.notebook import tqdm # For progress bars in notebook

In [2]:
# Raw CSV exports that are loaded and concatenated further down.
input_files = [
    "../../data/raw_data/b-h.csv",
    "../../data/raw_data/k-g.csv",
]
# Target of the final to_parquet call.
# NOTE(review): relative to the working directory, unlike the '../../' inputs — confirm intended.
output_file = "data/2_destination_norm.parquet"

In [None]:
# Load each raw CSV into its own DataFrame.
# - engine='python' with on_bad_lines='skip' drops rows that cannot be parsed
# - empty strings and '?' are read as missing values
dfs = [
    pd.read_csv(
        csv_path,
        engine='python',
        on_bad_lines='skip',
        na_values=['', '?'],
    )
    for csv_path in input_files
]
# Filled by the type-normalization step below.
dfs_clean = []

### Step 1: Type Normalization

In [None]:
COLUMNS_TO_DROP = ['ID', 'Name', 'Callsign', 'MMSI', 'AisSourcen']

# Start from an empty list on every run of this cell. Appending to a list that
# was created in an earlier cell would add the same frames again each time the
# cell is re-executed, duplicating rows in the later concat.
dfs_clean = []
for df_raw in dfs:
    # errors='ignore' tolerates frames that lack some of the listed columns.
    df_clean = df_raw.drop(columns=COLUMNS_TO_DROP, errors='ignore')

    # Parse the timestamp columns as timezone-aware UTC datetimes.
    # Original note: 'time' appears to carry timezone info (+01:00).
    for time_col in ('StartTime', 'EndTime', 'time'):
        df_clean[time_col] = pd.to_datetime(df_clean[time_col], utc=True)

    # Text columns go through 'string' first so the categories are strings;
    # 'shiptype' is converted to category directly, as in the original.
    for port_col in ('StartPort', 'EndPort'):
        df_clean[port_col] = df_clean[port_col].astype('string').astype('category')
    df_clean['shiptype'] = df_clean['shiptype'].astype('category')
    df_clean['Destination'] = df_clean['Destination'].astype('string').astype('category')

    dfs_clean.append(df_clean)

In [None]:
def check_mixed_types_and_examples(df):
    """Report columns whose values are of more than one Python type.

    For every column with more than one distinct ``type(value)``, prints the
    set of types found plus the first few ``str`` and ``float`` values, and
    records the column name.

    Args:
        df: DataFrame to inspect.

    Returns:
        list: Names of the columns with mixed value types, in column order.
    """
    mixed = []
    for column in df.columns:
        values = df[column]
        # Compute the per-value types once and reuse them for the uniqueness
        # check and for both example masks.
        value_types = values.apply(type)
        unique_types = value_types.unique()
        if len(unique_types) > 1:
            print(f"Column '{column}' has mixed types: {unique_types}")
            # Only str and float examples are shown; other types are listed
            # in the message above but not sampled.
            string_values = values[value_types == str].head()
            float_values = values[value_types == float].head()
            print(f"Examples of string values in '{column}':\n{string_values}")
            print(f"Examples of float values in '{column}':\n{float_values}")
            mixed.append(column)
    return mixed

# Run the mixed-type check on the first cleaned frame only; the remaining
# entries of dfs_clean are not inspected here. The returned list of column
# names is shown as the cell output.
check_mixed_types_and_examples(dfs_clean[0])

In [None]:
# Stack all cleaned frames, order the rows by trip and timestamp,
# and renumber the index from zero.
df_norm = (
    pd.concat(dfs_clean, ignore_index=True)
    .sort_values(['TripID', 'time'])
    .reset_index(drop=True)
)

In [None]:
# Print a summary of the combined frame (columns, dtypes, non-null counts)
# to verify the type normalization above.
df_norm.info()

### Step 2: Clean Destination Names

In [None]:
# Work on a copy so the destination-cleaning steps below leave df_norm unchanged.
df = df_norm.copy()

In [None]:
# Upper-case every column that currently has the pandas 'string' dtype.
string_cols = df.select_dtypes(include=['string']).columns
for name in string_cols:
    df[name] = df[name].str.upper()

# Per the original author's note, clean_destination is expected to keep only
# destinations that contain at least one letter and are not just a country
# code — the helper is not visible here, confirm against its implementation.
cleaned_destination = df['Destination'].apply(clean_destination)
df['Destination'] = cleaned_destination

# Preview: first row (with its index) for each distinct destination value.
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

In [None]:
# Rows whose destination contains a '>' (missing values count as no match).
has_separator = df['Destination'].str.contains('>', na=False)

# For those rows keep only the second '>'-separated segment, i.e. the text
# right after the first '>'.
second_segment = df.loc[has_separator, 'Destination'].str.split('>').str.get(1)
df.loc[has_separator, 'Destination'] = second_segment

# Preview: first row (with its index) for each distinct destination value.
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

In [None]:
# Register `progress_apply` on pandas objects. It only exists after
# tqdm.pandas() has been called, and nothing in this notebook calls it, so on
# a fresh kernel the line below can fail with AttributeError. Calling it again
# is harmless.
tqdm.pandas()

# Map every destination through match_names, with a progress bar.
# NOTE(review): match_names is imported from normalization_utils and not
# visible here — presumably it maps a raw name to a canonical one; confirm.
df['Destination'] = df['Destination'].progress_apply(match_names)

# Preview: first row (with its index) for each distinct destination value.
df[['Destination']].reset_index().drop_duplicates(subset=['Destination'])

In [None]:
# NOTE(review): this import sits in the middle of the notebook; consider moving
# it to the first cell with the other imports so dependencies are visible up front.
from data_cleaning.utils.fill_missing_utils import get_entries_with_missing_values
# Show the helper's result for the 'Destination' column as the cell output.
# Presumably this lists rows whose destination is missing — helper not visible here, confirm.
get_entries_with_missing_values(df, 'Destination')


In [None]:
# Row count before de-duplication (printed), then after (cell output).
rows_before = len(df)
print(rows_before)

# Remove rows that are identical across all columns.
df = df.drop_duplicates()
len(df)

In [None]:
# Write the cleaned frame to the Parquet file configured in output_file.
# NOTE(review): output_file is a relative path; presumably its parent directory
# must already exist relative to the working directory — confirm.
df.to_parquet(output_file)