In [1]:
import pandas as pd

In [2]:
input_files = ["../../data/raw_data/b-h.csv", "../../data/raw_data/k-g.csv"]
output_file = "../data/1_merged_typed_data.parquet"

In [3]:
dfs = []
dfs_clean = []
for input_f in input_files:
    dfs.append(pd.read_csv(input_f,
                           engine='python',
                           on_bad_lines='skip',
                           na_values=['', '?']
    ))

In [4]:
COLUMNS_TO_DROP = ['ID', 'Name', 'Callsign', 'MMSI', 'AisSourcen']

for df in dfs:
    df_clean = df.drop(columns=COLUMNS_TO_DROP, errors='ignore')
    df_clean['StartTime'] = pd.to_datetime(df_clean['StartTime'], utc=True)
    df_clean['EndTime'] = pd.to_datetime(df_clean['EndTime'], utc=True)
    df_clean['time'] = pd.to_datetime(df_clean['time'], utc=True)  # Appears to have timezone info (+01:00)

    df_clean['StartPort'] = df_clean['StartPort'].astype('string').astype('category')
    df_clean['EndPort'] = df_clean['EndPort'].astype('string').astype('category')
    df_clean['shiptype'] = df_clean['shiptype'].astype('category')
    df_clean['Destination'] = df_clean['Destination'].astype('string').astype('category')

    dfs_clean.append(df_clean)

In [5]:
def check_mixed_types_and_examples(df):
    mixed = []
    for column in df.columns:
        unique_types = df[column].apply(type).unique()
        if len(unique_types) > 1:
            print(f"Column '{column}' has mixed types: {unique_types}")
            # Display examples of string and float values
            string_values = df[column][df[column].apply(type) == str].head()
            float_values = df[column][df[column].apply(type) == float].head()
            print(f"Examples of string values in '{column}':\n{string_values}")
            print(f"Examples of float values in '{column}':\n{float_values}")
            mixed.append(column)
    return mixed

check_mixed_types_and_examples(dfs_clean[0])

Column 'Destination' has mixed types: [<class 'str'> nan]
Examples of string values in 'Destination':
0    HAMBURG
1    HAMBURG
2    HAMBURG
3    HAMBURG
4    HAMBURG
Name: Destination, dtype: category
Categories (53, string): [BLEXEN.ROAD, BREMENHAVEN, BREMERHAVEN, BREMERHAVEN.VIA.NOK, ..., HHLO.PS, NORDENHAM, SEAHU.>.DEBRV, STADE]
Examples of float values in 'Destination':
Series([], Name: Destination, dtype: category
Categories (53, string): [BLEXEN.ROAD, BREMENHAVEN, BREMERHAVEN, BREMERHAVEN.VIA.NOK, ..., HHLO.PS, NORDENHAM, SEAHU.>.DEBRV, STADE])


['Destination']

In [6]:
df_final = pd.concat(dfs_clean, ignore_index=True)

In [7]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1060708 entries, 0 to 1060707
Data columns (total 20 columns):
 #   Column          Non-Null Count    Dtype              
---  ------          --------------    -----              
 0   TripID          1060708 non-null  int64              
 1   StartLatitude   1060708 non-null  float64            
 2   StartLongitude  1060708 non-null  float64            
 3   StartTime       1060708 non-null  datetime64[ns, UTC]
 4   EndLatitude     1060708 non-null  float64            
 5   EndLongitude    1060708 non-null  float64            
 6   EndTime         1060708 non-null  datetime64[ns, UTC]
 7   StartPort       1060708 non-null  string             
 8   EndPort         1060708 non-null  string             
 9   time            1060708 non-null  datetime64[ns, UTC]
 10  shiptype        1060708 non-null  int64              
 11  Length          1060708 non-null  int64              
 12  Breadth         1060708 non-null  int64              
 1

In [8]:
df_final.sort_values('time').groupby(['TripID', 'StartPort'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f685123c550>

In [9]:
df_final.to_parquet(output_file)