In [54]:
from matplotlib import pyplot as plt
import xarray as xr
import numpy as np
import pandas as pd
import IPython.display as display

In [55]:
xrds = xr.open_dataset('../data/raw/IBTrACS.WP.v04r01.nc')
display.display(xrds)

In [56]:
features_selected = ['sid', 'name', 'time', 'lat', 'lon']

xrds_selected = xrds[features_selected]
df = xrds_selected.to_dataframe().reset_index()
df = df[features_selected]

df.rename(columns={'sid': 'StormID', 'name': 'Name', 'time': 'Time', 'lat': 'Lat', 'lon': 'Lon'}, inplace=True)
display.display(df)

Unnamed: 0,StormID,Name,Time,Lat,Lon
0,b'1884177N17124',b'UNNAMED',1884-06-24 16:00:00.000026880,16.500000,124.000000
1,b'1884177N17124',b'UNNAMED',1884-06-24 18:00:00.000040192,16.500000,123.800003
2,b'1884177N17124',b'UNNAMED',1884-06-24 21:00:00.000040192,16.600000,123.500000
3,b'1884177N17124',b'UNNAMED',1884-06-25 00:00:00.000040192,16.700001,123.199997
4,b'1884177N17124',b'UNNAMED',1884-06-25 03:00:00.000040192,16.799999,122.900002
...,...,...,...,...,...
1505155,b'2024295N15136',b'TRAMI',NaT,,
1505156,b'2024295N15136',b'TRAMI',NaT,,
1505157,b'2024295N15136',b'TRAMI',NaT,,
1505158,b'2024295N15136',b'TRAMI',NaT,,


In [57]:
# Ensure 'Time' is in datetime format
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')

# Remove rows where 'Time' is NaN or before the year 1900
df_filtered = df.dropna(subset=['Time'])  # Drop rows where 'Time' is NaN
df_filtered = df_filtered[df_filtered['Time'].dt.year >= 1900]  # Keep only rows where 'Time' is >= 1900

# Display the filtered DataFrame
display.display(df_filtered)

Unnamed: 0,StormID,Name,Time,Lat,Lon
91800,b'1900171N15117',b'UNNAMED',1900-06-19 16:00:00.000026880,15.3,117.300003
91801,b'1900171N15117',b'UNNAMED',1900-06-19 18:00:00.000040192,15.4,117.199997
91802,b'1900171N15117',b'UNNAMED',1900-06-19 21:00:00.000040192,15.6,117.199997
91803,b'1900171N15117',b'UNNAMED',1900-06-20 00:00:00.000040192,15.9,117.000000
91804,b'1900171N15117',b'UNNAMED',1900-06-20 03:00:00.000040192,16.1,116.900002
...,...,...,...,...,...
1504808,b'2024295N15136',b'TRAMI',2024-10-21 12:00:00.000039936,13.1,129.000000
1504809,b'2024295N15136',b'TRAMI',2024-10-21 15:00:00.000039936,13.0,128.600006
1504810,b'2024295N15136',b'TRAMI',2024-10-21 18:00:00.000039936,13.1,128.199997
1504811,b'2024295N15136',b'TRAMI',2024-10-21 21:00:00.000039936,13.3,127.800003


In [58]:
# Step 1: Filter out rows where the time is not in 3-hour intervals
valid_intervals = df['Time'].dt.hour % 3 == 0

# Gather the set of IDs (or StormIDs) that do not follow the 3-hour interval pattern
invalid_ids = df.loc[~valid_intervals, 'StormID'].unique()  # Assuming 'StormID' is the identifier

# Remove rows with invalid intervals
df_cleaned = df_filtered[valid_intervals].copy()

# Step 2: Check if the remaining times are in consecutive 3-hour intervals for each StormID
# First, sort the DataFrame by 'StormID' and 'Time'
df_cleaned = df_cleaned.sort_values(by=['StormID', 'Time'])

# Check if the time differences are exactly 3 hours (timedelta of 3 hours)
df_cleaned['time_diff'] = df_cleaned.groupby('StormID')['Time'].diff()
invalid_consecutive_ids = df_cleaned[df_cleaned['time_diff'] != pd.Timedelta(hours=3)]['StormID'].unique()

# Step 3: Combine both sets of invalid IDs
all_invalid_ids = set(invalid_ids).union(set(invalid_consecutive_ids))

# Display the set of invalid IDs
print("Invalid StormIDs:", all_invalid_ids)
print("Valid Count: ", len(np.unique(all_invalid_ids)[0]))

# Optional: Display the filtered DataFrame without invalid intervals
display.display(df_cleaned)

Invalid StormIDs: {b'1952348N08129', b'2020263N21136', b'1898315N15123', b'1952359N09151', b'2020195N17125', b'1994001N05156', b'1983167N05145', b'1918245N17112', b'2012086N10116', b'1938325N08132', b'1917198N14127', b'1899266N25133', b'1975305N14125', b'1957223N08145', b'2009328N06108', b'2019210N16117', b'2009221N10201', b'2006264N13115', b'1976115N07163', b'1977276N16155', b'1949335N06138', b'2019243N06136', b'1912251N10138', b'1947306N08140', b'1939197N15131', b'2016279N19130', b'1936238N18117', b'2010240N15142', b'1933256N13147', b'1987245N17120', b'1999233N32152', b'1934213N15149', b'1994348N04175', b'2011344N12117', b'1954240N13151', b'2022180N15117', b'1923197N06140', b'2002195N11172', b'2009192N16130', b'1950126N09151', b'1993211N07161', b'1987343N05154', b'2007327N09152', b'1916308N10117', b'1923164N07143', b'1960327N09161', b'1906300N16124', b'1960282N21132', b'1954223N22138', b'1989187N15132', b'1915245N16148', b'1971274N06134', b'1903291N17126', b'1993336N05164', b'1979228

  df_cleaned = df_filtered[valid_intervals].copy()


Unnamed: 0,StormID,Name,Time,Lat,Lon,time_diff
91801,b'1900171N15117',b'UNNAMED',1900-06-19 18:00:00.000040192,15.4,117.199997,NaT
91802,b'1900171N15117',b'UNNAMED',1900-06-19 21:00:00.000040192,15.6,117.199997,0 days 03:00:00
91803,b'1900171N15117',b'UNNAMED',1900-06-20 00:00:00.000040192,15.9,117.000000,0 days 03:00:00
91804,b'1900171N15117',b'UNNAMED',1900-06-20 03:00:00.000040192,16.1,116.900002,0 days 03:00:00
91806,b'1900171N15117',b'UNNAMED',1900-06-20 06:00:00.000040192,16.4,116.599998,0 days 03:00:00
...,...,...,...,...,...,...
1504808,b'2024295N15136',b'TRAMI',2024-10-21 12:00:00.000039936,13.1,129.000000,0 days 03:00:00
1504809,b'2024295N15136',b'TRAMI',2024-10-21 15:00:00.000039936,13.0,128.600006,0 days 03:00:00
1504810,b'2024295N15136',b'TRAMI',2024-10-21 18:00:00.000039936,13.1,128.199997,0 days 03:00:00
1504811,b'2024295N15136',b'TRAMI',2024-10-21 21:00:00.000039936,13.3,127.800003,0 days 03:00:00


In [59]:
# Assuming df_cleaned contains the 'Lat' and 'Lon' columns
df_cleaned['x'] = np.cos(np.radians(df_cleaned['Lat'])) * np.cos(np.radians(df_cleaned['Lon']))
df_cleaned['y'] = np.cos(np.radians(df_cleaned['Lat'])) * np.sin(np.radians(df_cleaned['Lon']))
df_cleaned['z'] = np.sin(np.radians(df_cleaned['Lat']))

# Display the updated DataFrame with the new coordinates
display.display(df_cleaned)

Unnamed: 0,StormID,Name,Time,Lat,Lon,time_diff,x,y,z
91801,b'1900171N15117',b'UNNAMED',1900-06-19 18:00:00.000040192,15.4,117.199997,NaT,-0.440686,0.857482,0.265556
91802,b'1900171N15117',b'UNNAMED',1900-06-19 21:00:00.000040192,15.6,117.199997,0 days 03:00:00,-0.440260,0.856653,0.268920
91803,b'1900171N15117',b'UNNAMED',1900-06-20 00:00:00.000040192,15.9,117.000000,0 days 03:00:00,-0.436621,0.856918,0.273959
91804,b'1900171N15117',b'UNNAMED',1900-06-20 03:00:00.000040192,16.1,116.900002,0 days 03:00:00,-0.434690,0.856820,0.277315
91806,b'1900171N15117',b'UNNAMED',1900-06-20 06:00:00.000040192,16.4,116.599998,0 days 03:00:00,-0.429542,0.857775,0.282341
...,...,...,...,...,...,...,...,...,...
1504808,b'2024295N15136',b'TRAMI',2024-10-21 12:00:00.000039936,13.1,129.000000,0 days 03:00:00,-0.612943,0.756922,0.226651
1504809,b'2024295N15136',b'TRAMI',2024-10-21 15:00:00.000039936,13.0,128.600006,0 days 03:00:00,-0.607890,0.761490,0.224951
1504810,b'2024295N15136',b'TRAMI',2024-10-21 18:00:00.000039936,13.1,128.199997,0 days 03:00:00,-0.602315,0.765406,0.226651
1504811,b'2024295N15136',b'TRAMI',2024-10-21 21:00:00.000039936,13.3,127.800003,0 days 03:00:00,-0.596468,0.768962,0.230050


In [60]:
df_cleaned.set_index('StormID', inplace=True)
print(df_cleaned.index)

Index([b'1900171N15117', b'1900171N15117', b'1900171N15117', b'1900171N15117',
       b'1900171N15117', b'1900171N15117', b'1900171N15117', b'1900171N15117',
       b'1900171N15117', b'1900171N15117',
       ...
       b'2024295N15136', b'2024295N15136', b'2024295N15136', b'2024295N15136',
       b'2024295N15136', b'2024295N15136', b'2024295N15136', b'2024295N15136',
       b'2024295N15136', b'2024295N15136'],
      dtype='object', name='StormID', length=227056)


In [61]:
df_cleaned[df_cleaned['Name'] == b'HATO']

Unnamed: 0_level_0,Name,Time,Lat,Lon,time_diff,x,y,z
StormID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
b'2017232N19130',b'HATO',2017-08-19 12:00:00.000039936,19.1,130.100006,NaT,-0.608664,0.722812,0.327218
b'2017232N19130',b'HATO',2017-08-19 15:00:00.000039936,19.299999,129.600006,0 days 03:00:00,-0.601602,0.727211,0.330514
b'2017232N19130',b'HATO',2017-08-19 18:00:00.000039936,19.0,129.600006,0 days 03:00:00,-0.602696,0.728534,0.325568
b'2017232N19130',b'HATO',2017-08-19 21:00:00.000039936,19.1,129.199997,0 days 03:00:00,-0.597235,0.732283,0.327218
b'2017232N19130',b'HATO',2017-08-20 00:00:00.000039936,19.299999,128.800003,0 days 03:00:00,-0.591389,0.73554,0.330514
b'2017232N19130',b'HATO',2017-08-20 03:00:00.000039936,19.4,128.300003,0 days 03:00:00,-0.58459,0.740219,0.332161
b'2017232N19130',b'HATO',2017-08-20 06:00:00.000039936,19.5,127.699997,0 days 03:00:00,-0.576451,0.74584,0.333807
b'2017232N19130',b'HATO',2017-08-20 09:00:00.000039936,19.6,127.199997,0 days 03:00:00,-0.569567,0.750377,0.335452
b'2017232N19130',b'HATO',2017-08-20 12:00:00.000039936,19.6,126.699997,0 days 03:00:00,-0.562997,0.755319,0.335452
b'2017232N19130',b'HATO',2017-08-20 15:00:00.000039936,19.700001,126.300003,0 days 03:00:00,-0.557363,0.758758,0.337095


In [62]:
np.unique(df_cleaned.index).shape

(3924,)

In [63]:
# Save the cleaned dataframe to a pickle file
df_cleaned.to_pickle("../data/processed/IBTrACS.WP.v04r01.processed.nowind.pkl")