In [None]:
import pandas as pd
import os
from pandas.api.types import infer_dtype

# Step 1
Load the CSV file with cleaned attributes.
Check the beginning of the file to ensure it is loaded correctly.

In [None]:
# Input CSV and output Parquet locations, relative to the notebook's working directory
# (both paths go up two directory levels)
file_path = '../../models/cleaned_atr.csv'
output_path = '../../data/type_norm.parquet'

# Fail fast with a clear message instead of printing a warning and then
# crashing inside read_csv anyway
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

# Load the file and preview the first rows to confirm it parsed correctly
df = pd.read_csv(file_path)
df.head()

In [None]:
# Number of rows in the loaded frame
df.shape[0]

# Step 2
   Assign proper data types to columns.

In [None]:
# Inspect the dtype pandas assigned to each column when reading the CSV
df.dtypes

In [None]:
# Show only the columns whose dtype is the generic 'object'
df.dtypes.loc[df.dtypes == 'object']

Some columns are labeled 'object' simply because pandas falls back to that generic dtype when it does not assign a more specific one,
even though all values may actually be the same type (e.g., all strings).
In other cases, the column truly has mixed types—like a mix of strings and floats—which can cause issues later.
So we need to check for mixed types in these columns and convert them to a consistent type.

In [None]:
def check_mixed_types_and_examples(df):
    """Report columns whose values are of more than one Python type.

    For each such column, prints the distinct types found and, for every
    type actually present, the first few values of that type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    list
        Names of the columns that contain more than one Python type,
        in column order. Empty when no column is mixed.
    """
    # Keep the wording "string" for str values; other types use their class name
    type_labels = {str: "string"}

    mixed = []
    for column in df.columns:
        # Compute the per-value type once and reuse it for every mask below
        value_types = df[column].apply(type)
        unique_types = value_types.unique()
        if len(unique_types) > 1:
            print(f"Column '{column}' has mixed types: {unique_types}")
            # Show examples for each type that is really present, so columns
            # mixing e.g. int and str are reported as clearly as str and float
            for value_type in unique_types:
                label = type_labels.get(value_type, value_type.__name__)
                examples = df[column][value_types == value_type].head()
                print(f"Examples of {label} values in '{column}':\n{examples}")
            mixed.append(column)
    return mixed

# Run the check on the loaded frame; the returned list of mixed-type columns is shown as the cell output
check_mixed_types_and_examples(df)

**So we can see that the problem was with NaN values that are interpreted as float. We will convert the columns to string type to avoid mixed types issues.**


In [None]:
# Replace missing entries with the literal text "nan", then store the column as pandas 'string'
# NOTE(review): after this step a missing value cannot be told apart from a real "nan" string - confirm this is intended
for text_column in ('Destination', 'AisSourcen'):
    df[text_column] = df[text_column].fillna("nan").astype('string')

# Re-run the check to confirm the columns are no longer reported as mixed
check_mixed_types_and_examples(df)

Now let's check the 'object' type columns that have no mixed-type issue.

In [None]:
# Iterate through columns with 'object' but not mixed
def check_object_and_examples(df):
    """Print one sample value for each object-dtype column that is not mixed.

    Columns whose inferred dtype contains 'mixed' are reported as skipped.
    For the others, the first non-null value is printed, or "No data" when
    the column has no non-null values. Nothing is returned.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    for col in object_columns:
        if 'mixed' in infer_dtype(df[col]):
            print(f"Skipping '{col}' (has mixed types)")
            continue

        # Drop nulls once and reuse the result for both the emptiness test and the sample
        present_values = df[col].dropna()
        if present_values.empty:
            example_value = "No data"
        else:
            example_value = present_values.iloc[0]
        print(f"Column: {col}, Example: {example_value}")

# Print one example value per non-mixed 'object' column to decide on its target dtype
check_object_and_examples(df)

In [None]:
# Parse the timestamp columns into timezone-aware datetimes converted to UTC
# ('time' appears to carry timezone info such as +01:00)
for time_column in ('StartTime', 'EndTime', 'time'):
    df[time_column] = pd.to_datetime(df[time_column], utc=True)

# Cast the port and destination columns to 'string' first, then to categorical
# NOTE: 'Destination' was discovered separately late
for label_column in ('StartPort', 'EndPort', 'Destination'):
    df[label_column] = df[label_column].astype('string').astype('category')

# List whatever 'object' columns remain after the conversions
check_object_and_examples(df)

In [None]:
# Make sure the destination folder exists so the write does not fail on a fresh checkout
os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

# Persist the type-normalised frame; Parquet keeps the dtypes assigned above
df.to_parquet(output_path)