In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
# Input parquet file to load and the path the processed dataframe is written to.
file_path = "../data/2_destination_norm.parquet"
output_path = "../data/2_1_norm_full.parquet"

# Fail immediately with a clear message instead of printing and then letting
# read_parquet fail on the same missing file a line later.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

df = pd.read_parquet(file_path)
# Map placeholder strings and the various missing-value markers to pd.NA.
# NOTE(review): replacing np.nan with pd.NA may change the dtype of numeric
# columns (e.g. float -> object) — confirm this is intended.
df = df.replace(["absent", "nan", "NAN", "NaN", "", np.nan, None], pd.NA)

In [None]:
def check_categorical_columns(df, sample_size=10, threshold=0.5):
    """
    Identify object/string columns suitable for categorical conversion.

    For every column selected by ``select_dtypes(include=['object', 'string'])``
    the ratio of distinct non-null values to the total number of rows is
    computed. Columns whose ratio is strictly below ``threshold`` are reported
    as recommended, together with sample values and the memory usage before
    and after an ``astype('category')`` conversion. A report is printed for
    every analyzed column; the dataframe itself is not modified.

    Parameters:
        df (DataFrame): Input dataframe to analyze
        sample_size (int): Number of unique values to display (default: 10)
        threshold (float): Exclusive upper bound on the unique/total ratio
            for a column to be recommended (default: 0.5)

    Returns:
        list or None: Names of the recommended columns, in dataframe column
        order. ``None`` when the dataframe has no object/string columns; an
        empty list when it has such columns but no rows.
    """
    object_cols = df.select_dtypes(include=['object', 'string']).columns

    # Test emptiness explicitly: Index.any() evaluates the truthiness of the
    # labels themselves, so a column named "" or 0 would be mistaken for
    # "no columns at all".
    if object_cols.empty:
        print("No object-type columns found")
        return

    print("Categorical column analysis:")
    print("-" * 40)
    recommended_cols = []

    # The row count is the same for every column, so compute it once.
    total_rows = len(df)
    if total_rows == 0:
        # The unique/total ratio is undefined without rows (division by zero).
        print("DataFrame has no rows - nothing to analyze")
        return recommended_cols

    for col in object_cols:
        # Missing values are not counted as a category.
        clean_series = df[col].dropna()
        nunique = clean_series.nunique()
        ratio = nunique / total_rows

        if ratio >= threshold:
            print(f"[Not recommended] Column: {col} - too many unique values ({nunique}, {ratio:.1%})")
            continue

        print(f"[Recommended] Column: {col}")
        print(f"Unique values: {nunique} (of {total_rows} total, {ratio:.1%})")
        print(f"Sample values: {clean_series.unique()[:sample_size].tolist()}")

        # Memory analysis: compare the current footprint with what the column
        # would occupy as a category (both in KB, deep=True so string payloads
        # are included).
        orig_mem = df[col].memory_usage(deep=True) / 1024
        cat_mem = df[col].astype('category').memory_usage(deep=True) / 1024
        print(f"Memory usage: {orig_mem:.1f} KB -> {cat_mem:.1f} KB")
        print(f"Reduction: {1 - (cat_mem/orig_mem):.0%}")
        print("-" * 20)
        recommended_cols.append(col)

    return recommended_cols

# Run the categorical analysis on the loaded dataframe with the default
# sample_size and threshold; the returned list is not stored.
check_categorical_columns(df)

In [None]:
# Convert the 'Destination' column to pandas' category dtype.
df['Destination'] = df['Destination'].astype('category')

In [None]:
# Write the dataframe to output_path in parquet format.
df.to_parquet(output_path)