In [15]:
#!/usr/bin/env python3

In [16]:
"""
 Initial Data Exploration
"""

# import libs
import pandas as pd
import re
import json
from datetime import datetime
from collections import Counter
import glob

In [26]:
print("="*70)
print("MARITIME DATA EXPLORATION")
print("="*70)
print()

MARITIME DATA EXPLORATION



In [30]:
input_path = "../raw_data/arrivals/arrivals_2025-01-23_2026-01-23.json"
output_path = "../raw_data/arrivals/cleaned_arrivals_2025-01-23_2026-01-23.json"

with open(input_path, "r") as fin, open(output_path, "w") as fout:
    for lineno, line in enumerate(fin, 1):
        line = line.rstrip()  # remove newline
        line = line.lstrip(",")  # remove any leading commas

        # Add trailing comma if missing
        if not line.endswith(","):
            line += ","

        fout.write(line + "\n")

In [37]:
input_path = "../raw_data/departures/departures_2025-01-23_2026-01-23.json"
output_path = "../raw_data/departures/cleaned_departures_2025-01-23_2026-01-23.json"

with open(input_path, "r") as fin, open(output_path, "w") as fout:
    for lineno, line in enumerate(fin, 1):
        line = line.rstrip()  # remove newline
        line = line.lstrip(",")  # remove any leading commas

        # Add trailing comma if missing
        if not line.endswith(","):
            line += ","

        fout.write(line + "\n")

In [31]:
print("="*70)
print("1. Arrivals Data")
print("="*70)

#load arrivals json 
arrivals_files = glob.glob('../raw_data/arrivals/cleaned_*.json')
print(f"Found {len(arrivals_files)} arrivals file(s)")

1. Arrivals Data
Found 1 arrivals file(s)


In [32]:
arrivals_data = []
for file in arrivals_files:
    with open(file, 'r') as f:
        data = json.load(f)
        arrivals_data.extend(data)
        print(f"  {file}: {len(data)} records")
print(f"\nTotal arrival records: {len(arrivals_data)}")
print()

  ../raw_data/arrivals/cleaned_arrivals_2025-01-23_2026-01-23.json: 141713 records

Total arrival records: 141713



In [33]:
# Convert to DataFrame
df_arrivals = pd.json_normalize(arrivals_data)
print("Arrivals DataFrame shape:", df_arrivals.shape)
print("\nArrivals columns:")
print(df_arrivals.columns.tolist())
print()

print("Sample arrival record:")
print(df_arrivals.head(1).T)
print()

Arrivals DataFrame shape: (141713, 7)

Arrivals columns:
['arrivedTime', 'locationFrom', 'locationTo', 'vesselParticulars.vesselName', 'vesselParticulars.callSign', 'vesselParticulars.imoNumber', 'vesselParticulars.flag']

Sample arrival record:
                                                0
arrivedTime                   2025-01-23 00:09:00
locationFrom                                 SEAS
locationTo                                   WJFP
vesselParticulars.vesselName      SUKSES ABADI 02
vesselParticulars.callSign                 YB3679
vesselParticulars.imoNumber               5630168
vesselParticulars.flag                         ID



In [34]:
# Check for IMO numbers
if 'vesselParticulars.imoNumber' in df_arrivals.columns:
    print("✓ Arrivals have IMO numbers")
    print(f"  Unique vessels: {df_arrivals['vesselParticulars.imoNumber'].nunique()}")
else:
    print("✗ No IMO numbers in arrivals data")
print()

✓ Arrivals have IMO numbers
  Unique vessels: 19273



In [38]:
print("="*70)
print("2. Departures  Data")
print("="*70)

#load departures json 
departures_files = glob.glob('../raw_data/departures/cleaned_*.json')
print(f"Found {len(departures_files)} departures file(s)")

2. Departures  Data
Found 1 departures file(s)


In [39]:
departures_data = []
for file in departures_files:
    with open(file, 'r') as f:
        data = json.load(f)
        departures_data.extend(data)
        print(f"  {file}: {len(data)} records")

print(f"\nTotal departure records: {len(departures_data)}")
print()

  ../raw_data/departures/cleaned_departures_2025-01-23_2026-01-23.json: 125468 records

Total departure records: 125468



In [40]:
df_departures = pd.json_normalize(departures_data)
print("Departures DataFrame shape:", df_departures.shape)
print("\nDepartures columns:")
print(df_departures.columns.tolist())
print()

print("Sample departure record:")
print(df_departures.head(1).T)
print()

Departures DataFrame shape: (125468, 5)

Departures columns:
['departedTime', 'vesselParticulars.vesselName', 'vesselParticulars.callSign', 'vesselParticulars.imoNumber', 'vesselParticulars.flag']

Sample departure record:
                                                0
departedTime                  2025-01-23 00:45:00
vesselParticulars.vesselName                AVILA
vesselParticulars.callSign                V7A7504
vesselParticulars.imoNumber               9492098
vesselParticulars.flag                         MH



In [44]:
print("="*70)
print("3. Vessel Position  Data")
print("="*70)

#load departures json 
positions_files = glob.glob('../raw_data/positions/snapshot_*.json')
print(f"Found {len(positions_files)} position snapshot file(s)")

3. Vessel Position  Data
Found 1 position snapshot file(s)


In [45]:
positions_data = []
for file in positions_files:
    with open(file, 'r') as f:
        data = json.load(f)
        positions_data.extend(data)
        print(f"  {file}: {len(data)} records")

print(f"\nTotal position records: {len(positions_data)}")
print()


  ../raw_data/positions/snapshot_20260123_085758.json: 1246 records

Total position records: 1246



In [46]:
df_positions = pd.json_normalize(positions_data)
print("Positions DataFrame shape:", df_positions.shape)
print("\nPositions columns:")
print(df_positions.columns.tolist())
print()

print("Sample position record:")
print(df_positions.head(1).T)
print()

Positions DataFrame shape: (1246, 23)

Positions columns:
['latitude', 'longitude', 'latitudeDegrees', 'longitudeDegrees', 'speed', 'course', 'heading', 'dimA', 'dimB', 'timeStamp', 'vesselParticulars.vesselName', 'vesselParticulars.callSign', 'vesselParticulars.imoNumber', 'vesselParticulars.flag', 'vesselParticulars.vesselLength', 'vesselParticulars.vesselBreadth', 'vesselParticulars.vesselDepth', 'vesselParticulars.vesselType', 'vesselParticulars.grossTonnage', 'vesselParticulars.netTonnage', 'vesselParticulars.deadweight', 'vesselParticulars.mmsiNumber', 'vesselParticulars.yearBuilt']

Sample position record:
                                                   0
latitude                                    0.025806
longitude                                   1.815568
latitudeDegrees                             1.478558
longitudeDegrees                          104.024369
speed                                            0.0
course                                           0.0
heading 

In [47]:
print("\n4. DATA QUALITY ANALYSIS")
print("-"*70)

print("\n--- Arrivals Data Quality ---")
print(f"Missing values:")
print(df_arrivals.isnull().sum())
print()

print("\n--- Departures Data Quality ---")
print(f"Missing values:")
print(df_departures.isnull().sum())
print()

print("\n--- Positions Data Quality ---")
print(f"Missing values:")
print(df_positions.isnull().sum())
print()


4. DATA QUALITY ANALYSIS
----------------------------------------------------------------------

--- Arrivals Data Quality ---
Missing values:
arrivedTime                     0
locationFrom                    0
locationTo                      0
vesselParticulars.vesselName    0
vesselParticulars.callSign      0
vesselParticulars.imoNumber     0
vesselParticulars.flag          0
dtype: int64


--- Departures Data Quality ---
Missing values:
departedTime                    0
vesselParticulars.vesselName    0
vesselParticulars.callSign      0
vesselParticulars.imoNumber     0
vesselParticulars.flag          0
dtype: int64


--- Positions Data Quality ---
Missing values:
latitude                           0
longitude                          0
latitudeDegrees                    0
longitudeDegrees                   0
speed                              0
course                             0
heading                            0
dimA                               0
dimB                       

In [50]:
print("\n5. VESSEL ANALYSIS")
print("-"*70)

print("\nVessel types in position snapshot:")
if 'vesselParticulars.vesselType' in df_positions.columns:
    type_counts = df_positions['vesselParticulars.vesselType'].value_counts()
    print(type_counts)
else:
    print("No vessel type data")
print()

print("\nVessels by flag:")
if 'vesselParticulars.flag' in df_positions.columns:
    flag_counts = df_positions['vesselParticulars.flag'].value_counts().head(10)
    print(flag_counts)
print()

print("\nVessel age distribution:")
if 'vesselParticulars.yearBuilt' in df_positions.columns:
    current_year = 2026
    df_positions['age'] = current_year - df_positions['vesselParticulars.yearBuilt'].astype(int)
    print(df_positions['age'].describe())
print()


5. VESSEL ANALYSIS
----------------------------------------------------------------------

Vessel types in position snapshot:
vesselParticulars.vesselType
TA    287
TU    176
BC    141
CS    128
CH     95
FB     72
SV     65
YA     59
FR     53
LP     28
LN     23
DR     19
LC     12
UT     11
CC      9
CO      8
LU      8
FV      6
PV      6
HS      6
WB      6
CL      6
WA      4
BA      4
CX      4
OT      2
RE      2
RV      2
LV      1
OR      1
SR      1
DL      1
Name: count, dtype: int64


Vessels by flag:
vesselParticulars.flag
ID    220
SG    167
PA    139
MY    137
LR    103
MH     89
HK     47
CN     40
MT     21
BS     18
Name: count, dtype: int64


Vessel age distribution:
count    1246.000000
mean      244.548957
std       639.211396
min         0.000000
25%        11.000000
50%        16.000000
75%        23.000000
max      2026.000000
Name: age, dtype: float64



In [51]:
print("\n6. LOCATION ANALYSIS")
print("-"*70)

print("\nTop origin locations (arrivals):")
if 'locationFrom' in df_arrivals.columns:
    print(df_arrivals['locationFrom'].value_counts().head(20))
print()

print("\nTop destination locations (arrivals):")
if 'locationTo' in df_arrivals.columns:
    print(df_arrivals['locationTo'].value_counts().head(20))
print()


6. LOCATION ANALYSIS
----------------------------------------------------------------------

Top origin locations (arrivals):
locationFrom
SEAE     53021
SEAW     36187
SEABA    36154
SEAS      6493
SEAKM     3659
SEATP     2070
SEABI     1890
SEA        970
SEAPG      850
SEADS      167
SEAHB      166
SEANP       51
SEAN        35
Name: count, dtype: int64


Top destination locations (arrivals):
locationTo
PEBGC    15508
PEBGA    15200
PWBGA    11580
TMFT     10486
PEBGB    10216
RFT1      5852
ACBTH     5386
RFT5      5324
PJSB      4959
RFT3      4703
PGBG      4512
RFT6      3854
RFT4      3511
RFT2      3238
AEHC      2757
PWBGB     2375
WJFP      1991
JR21      1854
J26R1     1587
RFT7      1556
Name: count, dtype: int64

