In [6]:
import pandas as pd
import os
import numpy as np

In [7]:
file_path = "../../data/normalized_destinations.parquet"

# Fail fast with an explicit error: merely printing and then falling through
# to read_parquet would make the existence check pointless.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

df = pd.read_parquet(file_path)
# Collapse the textual/NaN/None missing-value markers into a single pd.NA marker.
df = df.replace(["absent", "nan", "NAN", "NaN", "", np.nan, None], pd.NA)

In [8]:
def check_categorical_columns(df, sample_size=10, threshold=0.1):
    """
    Identify object/string columns suitable for categorical conversion.

    A column is recommended when the ratio of its distinct non-null values
    to the total number of rows in ``df`` is strictly below ``threshold``.
    A per-column report is printed; for recommended columns it includes a
    sample of values and a memory comparison against the ``category`` dtype.

    Parameters:
        df (DataFrame): Input dataframe to analyze
        sample_size (int): Number of unique values to display (default: 10)
        threshold (float): Maximum unique/total ratio for categorical (default: 0.1)

    Returns:
        list: Names of the recommended columns, in dataframe column order.
            Empty when there are no object/string columns or no rows.
    """
    object_cols = df.select_dtypes(include=['object', 'string']).columns

    # Check emptiness explicitly: Index.any() would test the truthiness of the
    # column labels themselves, so a falsy label (e.g. "") looks like "no columns".
    if object_cols.empty:
        print("No object-type columns found")
        return []

    # Loop-invariant; also guards the ratio below against division by zero.
    total_rows = len(df)
    if total_rows == 0:
        print("DataFrame has no rows")
        return []

    print("Categorical column analysis:")
    print("-" * 40)
    recommended_cols = []

    for col in object_cols:
        # Missing values are excluded from the distinct count and the sample,
        # but the ratio is still taken against the full row count.
        clean_series = df[col].dropna()
        nunique = clean_series.nunique()
        ratio = nunique / total_rows

        if ratio >= threshold:
            print(f"[Not recommended] Column: {col} - too many unique values ({nunique}, {ratio:.1%})")
            continue

        print(f"[Recommended] Column: {col}")
        print(f"Unique values: {nunique} (of {total_rows} total, {ratio:.1%})")
        print(f"Sample values: {clean_series.unique()[:sample_size].tolist()}")

        # Memory analysis (deep=True accounts for the Python string objects).
        orig_mem = df[col].memory_usage(deep=True) / 1024
        cat_mem = df[col].astype('category').memory_usage(deep=True) / 1024
        print(f"Memory usage: {orig_mem:.1f} KB -> {cat_mem:.1f} KB")
        print(f"Reduction: {1 - (cat_mem/orig_mem):.0%}")
        print("-" * 20)
        recommended_cols.append(col)

    return recommended_cols

# Print the per-column report; as the cell's last expression, the returned
# list of recommended column names is also displayed.
check_categorical_columns(df)

Categorical column analysis:
----------------------------------------
[Recommended] Column: Draught
Unique values: 238 (of 1060708 total, 0.0%)
Sample values: [11.54, 6.6, 8.99, 6.99, 8.71, 5.4, 5.56, 7.2, 6.5, 7.33]
Memory usage: 33528.6 KB -> 2081.8 KB
Reduction: 94%
--------------------
[Recommended] Column: Destination
Unique values: 27 (of 1060708 total, 0.0%)
Sample values: ['DEHAM', 'DEBRE', 'DEBRV', 'DEHAM/AIRBUS', 'COPENHAGEN', 'DESTA', 'DEHAM/EUR', 'DEIM', 'PLGDN', 'NORDEN']
Memory usage: 64206.9 KB -> 1038.7 KB
Reduction: 98%
--------------------
[Recommended] Column: AisSourcen
Unique values: 223 (of 1060708 total, 0.0%)
Sample values: ['DAIS1.81B.90B.71.71A', '51.DAIS1.81B.90B.71.71A', 'DAIS1.81B.71.71A', 'H7001.DAIS1.81B.71.71A', '51.DAIS1.81B.71.71A', '81B.71.71A', '51.81B.71.71A', '51.81B.90B.71.71A', 'DAIS2.81B.90B.71A', 'DAIS2.81B.71A']
Memory usage: 71341.9 KB -> 2095.8 KB
Reduction: 97%
--------------------


['Draught', 'Destination', 'AisSourcen']

In [9]:
# Convert only Destination to the category dtype; the report printed earlier
# also lists Draught and AisSourcen, which are left unchanged here.
df = df.astype({'Destination': 'category'})

In [10]:
# Write the dataframe to a separate "_cat" parquet file so the input file
# read at the top of the notebook is not overwritten.
save_path = '../../data/normalized_destinations_cat.parquet'
df.to_parquet(save_path)