In [11]:
import pandas as pd
import os
from pandas.api.types import infer_dtype

# Step 1
Load the CSV file with cleaned attributes.
Check the beginning of the file to ensure it is loaded correctly.

In [12]:
# Input CSV with the cleaned attributes, and the Parquet file this notebook writes.
# NOTE(review): the input path climbs two directories ('../../data') while the
# output path climbs only one ('../data') — confirm both locations are intended.
file_path = '../../data/cleaned_atr.csv'
output_path = '../data/1_type_norm.parquet'

# Fail fast with an explicit error: previously the check only printed a message
# and execution continued into read_csv, which raised on the missing file anyway.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

# Load the file and show the first rows to confirm it parsed correctly
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,TripID,StartLatitude,StartLongitude,StartTime,EndLatitude,EndLongitude,EndTime,StartPort,EndPort,time,...,Length,Breadth,Draught,Latitude,Longitude,SOG,COG,TH,Destination,AisSourcen
0,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:07:00+01:00,...,277,42,11.54,53.57,8.53,0.7,331.2,143,HAMBURG,DAIS1.81b.90b.71.71a
1,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:10:00+01:00,...,277,42,11.54,53.57,8.53,1.6,315.3,117,HAMBURG,DAIS1.81b.90b.71.71a
2,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:10:00+01:00,...,277,42,11.54,53.57,8.53,2.8,322.6,100,HAMBURG,DAIS1.81b.90b.71.71a
3,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:12:00+01:00,...,277,42,11.54,53.57,8.53,2.8,286.3,74,HAMBURG,DAIS1.81b.90b.71.71a
4,39131,53.57,8.53,'2016-01-24 08:06',53.53,9.9,'2016-01-24 16:44',BREMERHAVEN,HAMBURG,2016-01-24 09:16:00+01:00,...,277,42,11.54,53.57,8.53,4.3,333.1,333,HAMBURG,DAIS1.81b.90b.71.71a


# Step 2
   Assign proper data types to columns.

In [13]:
# Show the dtype pandas assigned to each column when the CSV was loaded
df.dtypes

TripID              int64
StartLatitude     float64
StartLongitude    float64
StartTime          object
EndLatitude       float64
EndLongitude      float64
EndTime            object
StartPort          object
EndPort            object
time               object
shiptype            int64
Length              int64
Breadth             int64
Draught           float64
Latitude          float64
Longitude         float64
SOG               float64
COG               float64
TH                  int64
Destination        object
AisSourcen         object
dtype: object

In [14]:
# Keep only the columns that are stored with the generic 'object' dtype
column_dtypes = df.dtypes
column_dtypes[column_dtypes == 'object']

StartTime      object
EndTime        object
StartPort      object
EndPort        object
time           object
Destination    object
AisSourcen     object
dtype: object

When reading a CSV, pandas stores text columns under the generic 'object' dtype, so a column can be labeled that way
even though all of its values are actually the same type (e.g., all strings).
In other cases, the column truly has mixed types—like a mix of strings and floats—which can cause issues later.
So we need to check for mixed types in these columns and convert them to a consistent type.

In [15]:
def check_mixed_types_and_examples(df):
    """Report columns whose values are not all of one Python type.

    For every column the Python type of each value is collected. When more
    than one distinct type is found, the column name and the types are
    printed, followed by the first few ``str`` values and the first few
    ``float`` values of that column (missing values such as NaN are floats).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Names of the columns holding more than one value type, in column order.
    """
    mixed = []
    for column in df.columns:
        values = df[column]
        # Element-wise type lookup is a Python-level pass over the whole column,
        # so compute it once and reuse it for the uniqueness test and both masks.
        value_types = values.apply(type)
        unique_types = value_types.unique()
        if len(unique_types) > 1:
            print(f"Column '{column}' has mixed types: {unique_types}")
            # Display examples of string and float values
            string_values = values[value_types == str].head()
            float_values = values[value_types == float].head()
            print(f"Examples of string values in '{column}':\n{string_values}")
            print(f"Examples of float values in '{column}':\n{float_values}")
            mixed.append(column)
    return mixed

# Run the check on the loaded frame; the returned list of flagged columns is the cell output
check_mixed_types_and_examples(df)

Column 'Destination' has mixed types: [<class 'str'> <class 'float'>]
Examples of string values in 'Destination':
0    HAMBURG
1    HAMBURG
2    HAMBURG
3    HAMBURG
4    HAMBURG
Name: Destination, dtype: object
Examples of float values in 'Destination':
243332    NaN
243333    NaN
480142    NaN
480143    NaN
480144    NaN
Name: Destination, dtype: object
Column 'AisSourcen' has mixed types: [<class 'str'> <class 'float'>]
Examples of string values in 'AisSourcen':
0    DAIS1.81b.90b.71.71a
1    DAIS1.81b.90b.71.71a
2    DAIS1.81b.90b.71.71a
3    DAIS1.81b.90b.71.71a
4    DAIS1.81b.90b.71.71a
Name: AisSourcen, dtype: object
Examples of float values in 'AisSourcen':
1048    NaN
1049    NaN
1050    NaN
1051    NaN
1052    NaN
Name: AisSourcen, dtype: object


['Destination', 'AisSourcen']

**So we can see that the problem was with missing values: they are read as NaN, which is a float. We will replace them with the placeholder string "nan" and convert the columns to the string type to avoid mixed-type issues.**


In [16]:
# Replace missing values with the literal text "nan", then cast to the pandas string dtype.
# NOTE(review): after this step real missing values can no longer be told apart
# from the text "nan" — confirm that is acceptable for downstream steps.
for nullable_col in ('Destination', 'AisSourcen'):
    df[nullable_col] = df[nullable_col].fillna("nan").astype('string')

# Re-run the check; an empty list means no column mixes value types any more
check_mixed_types_and_examples(df)

[]

Now let's check the remaining 'object' columns, which have no mixed types.

In [17]:
# Iterate through columns with 'object' but not mixed
def check_object_and_examples(df):
    """Print one example value for each 'object' column that is not mixed.

    Columns whose inferred dtype contains 'mixed' are reported as skipped.
    For the others, the first non-null value is printed, or "No data" when
    the column has no non-null values. Nothing is returned.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    for col in object_columns:
        inferred = infer_dtype(df[col])

        if 'mixed' in inferred:
            print(f"Skipping '{col}' (has mixed types)")
            continue

        non_null = df[col].dropna()
        if non_null.empty:
            example_value = "No data"
        else:
            example_value = non_null.iloc[0]
        print(f"Column: {col}, Example: {example_value}")

# Print one example value for each remaining non-mixed 'object' column
check_object_and_examples(df)

Column: StartTime, Example: '2016-01-24 08:06'
Column: EndTime, Example: '2016-01-24 16:44'
Column: StartPort, Example: BREMERHAVEN
Column: EndPort, Example: HAMBURG
Column: time, Example: 2016-01-24 09:07:00+01:00


In [18]:
# Parse the three timestamp columns as timezone-aware datetimes normalised to UTC.
# 'time' appears to carry timezone info already (+01:00).
# NOTE(review): the StartTime/EndTime examples printed above are wrapped in literal
# single quotes — confirm they are parsed as intended.
for timestamp_col in ('StartTime', 'EndTime', 'time'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], utc=True)

# Text labels become categoricals (cast to string first).
# NOTE: the need to convert Destination was discovered separately, later on.
for label_col in ('StartPort', 'EndPort', 'Destination'):
    df[label_col] = df[label_col].astype('string').astype('category')

# shiptype is an integer code, so it is turned into a categorical directly
df['shiptype'] = df['shiptype'].astype('category')

# Print an example for any column that is still stored as 'object'
check_object_and_examples(df)

In [19]:
# Persist the type-normalised frame as Parquet
df.to_parquet(path=output_path)