# DATA PREPROCESSING

## Data cleaning

In [1]:
import pandas as pd

In [2]:
from src.data.load_data import load_raw_data
from src.data.apply_missing_values_rules import apply_missing_values_rules
from src.data.clean_categorical import clean_categorical_values

### Load raw dataset

In [3]:
# Read the raw dataset through the project loader; the loader reports the source
# file, the row count and the column names (see the printed output of this cell).
df = load_raw_data()
# Preview the first rows. Row 0 is not an observation: it carries unit labels
# such as "Year", "degrees_north" and "kts".
df.head()

Loaded raw dataset
 File: /Users/anitarazafi/Desktop/masters/code/feature-selection/data/raw/ibtracs.last3years.list.v04r01.csv
 Rows: 23276
 Columns: ['SID', 'SEASON', 'NUMBER', 'BASIN', 'SUBBASIN', 'NAME', 'ISO_TIME', 'NATURE', 'LAT', 'LON', 'WMO_WIND', 'WMO_PRES', 'WMO_AGENCY', 'TRACK_TYPE', 'DIST2LAND', 'LANDFALL', 'IFLAG', 'USA_AGENCY', 'USA_ATCF_ID', 'USA_LAT', 'USA_LON', 'USA_RECORD', 'USA_STATUS', 'USA_WIND', 'USA_PRES', 'USA_SSHS', 'USA_R34_NE', 'USA_R34_SE', 'USA_R34_SW', 'USA_R34_NW', 'USA_R50_NE', 'USA_R50_SE', 'USA_R50_SW', 'USA_R50_NW', 'USA_R64_NE', 'USA_R64_SE', 'USA_R64_SW', 'USA_R64_NW', 'USA_POCI', 'USA_ROCI', 'USA_RMW', 'USA_EYE', 'TOKYO_LAT', 'TOKYO_LON', 'TOKYO_GRADE', 'TOKYO_WIND', 'TOKYO_PRES', 'TOKYO_R50_DIR', 'TOKYO_R50_LONG', 'TOKYO_R50_SHORT', 'TOKYO_R30_DIR', 'TOKYO_R30_LONG', 'TOKYO_R30_SHORT', 'TOKYO_LAND', 'CMA_LAT', 'CMA_LON', 'CMA_CAT', 'CMA_WIND', 'CMA_PRES', 'HKO_LAT', 'HKO_LON', 'HKO_CAT', 'HKO_WIND', 'HKO_PRES', 'KMA_LAT', 'KMA_LON', 'KMA_CAT', 'KM

Unnamed: 0,SID,SEASON,NUMBER,BASIN,SUBBASIN,NAME,ISO_TIME,NATURE,LAT,LON,...,BOM_GUST_PER,REUNION_GUST,REUNION_GUST_PER,USA_SEAHGT,USA_SEARAD_NE,USA_SEARAD_SE,USA_SEARAD_SW,USA_SEARAD_NW,STORM_SPEED,STORM_DIR
0,,Year,,,,,,,degrees_north,degrees_east,...,second,kts,second,ft,nmile,nmile,nmile,nmile,kts,degrees
1,2022008S13148,2022,1.0,SP,EA,TIFFANY,2022-01-08 00:00:00,MX,-12.6,147.7,...,45,,,,,,,,6,160
2,2022008S13148,2022,1.0,SP,EA,TIFFANY,2022-01-08 03:00:00,MX,-12.9,147.8,...,,,,,,,,,5,160
3,2022008S13148,2022,1.0,SP,EA,TIFFANY,2022-01-08 06:00:00,MX,-13.1,147.9,...,45,,,,,,,,4,160
4,2022008S13148,2022,1.0,SP,EA,TIFFANY,2022-01-08 09:00:00,MX,-13.2,147.9,...,,,,,,,,,2,165


### Remove the units row below the header

In [4]:
# The first row of the raw frame is not an observation: it holds the unit labels
# ("Year", "degrees_north", "kts", ...) that sit directly under the real header.
# Drop it only while it is still present, so that re-running this cell does not
# silently remove one genuine observation per execution.
has_units_row = (
    not df.empty
    and str(df.iloc[0]["SEASON"]).strip().casefold() == "year"
)
if has_units_row:
    df = df.iloc[1:].reset_index(drop=True)

### Normalize the missing values to NaN

In [5]:
# Run the project's missing-value rules over the frame so that the various
# "missing" markers end up as NaN.
# NOTE(review): the warning printed by this cell points at
# `df[obj_cols] = df[obj_cols].replace(generic_missing, np.nan)` inside the helper,
# which suggests the helper may assign into the frame it receives — confirm that
# `df_copy` is an independent copy of `df` and not an alias.
df_copy = df.pipe(apply_missing_values_rules)

  df[obj_cols] = df[obj_cols].replace(generic_missing, np.nan)


In [6]:
# Inspect the frame returned by the missing-value normalization step.
df_copy

Unnamed: 0,SID,SEASON,NUMBER,BASIN,SUBBASIN,NAME,ISO_TIME,NATURE,LAT,LON,...,BOM_GUST_PER,REUNION_GUST,REUNION_GUST_PER,USA_SEAHGT,USA_SEARAD_NE,USA_SEARAD_SE,USA_SEARAD_SW,USA_SEARAD_NW,STORM_SPEED,STORM_DIR
0,2022008S13148,2022,1,SP,EA,TIFFANY,2022-01-08 00:00:00,MX,-12.6,147.7,...,45,,,,,,,,6,160
1,2022008S13148,2022,1,SP,EA,TIFFANY,2022-01-08 03:00:00,MX,-12.9,147.8,...,,,,,,,,,5,160
2,2022008S13148,2022,1,SP,EA,TIFFANY,2022-01-08 06:00:00,MX,-13.1,147.9,...,45,,,,,,,,4,160
3,2022008S13148,2022,1,SP,EA,TIFFANY,2022-01-08 09:00:00,MX,-13.2,147.9,...,,,,,,,,,2,165
4,2022008S13148,2022,1,SP,EA,TIFFANY,2022-01-08 12:00:00,TS,-13.3,148.0,...,45,,,,,,,,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23270,2025339S14210,2026,105,SP,MM,UNNAMED,2025-12-05 18:00:00,NR,-17.4,-147.1,...,,,,,,,,,14,160
23271,2025339S14210,2026,105,SP,MM,UNNAMED,2025-12-05 21:00:00,NR,-18.0,-147.0,...,,,,,,,,,13,175
23272,2025339S14210,2026,105,SP,MM,UNNAMED,2025-12-06 00:00:00,NR,-18.7,-147.0,...,,,,,,,,,14,170
23273,2025339S14210,2026,105,SP,MM,UNNAMED,2025-12-06 03:00:00,NR,-19.4,-146.8,...,,,,,,,,,16,165


### Overview of missing values

In [7]:
# Build the NaN mask once and derive both statistics from it.
na_mask = df_copy.isna()
missing_counts = na_mask.sum()          # number of NaN cells per column
missing_percent = na_mask.mean() * 100  # share of NaN cells per column, in percent

# One row per column of df_copy, ordered from least to most missing.
missing_summary = pd.DataFrame(
    {
        "missing_count": missing_counts,
        "missing_percent": missing_percent,
    }
)
missing_summary = missing_summary.sort_values("missing_count")

In [8]:
# Show every row of the summary. The option is scoped to this cell with
# option_context so the unlimited row display does not leak into later cells,
# where displaying a large frame would otherwise print every one of its rows.
with pd.option_context("display.max_rows", None):
    display(missing_summary)

Unnamed: 0,missing_count,missing_percent
SID,0,0.0
STORM_SPEED,0,0.0
USA_SSHS,0,0.0
IFLAG,0,0.0
DIST2LAND,0,0.0
TRACK_TYPE,0,0.0
LON,0,0.0
LAT,0,0.0
STORM_DIR,0,0.0
ISO_TIME,0,0.0


### Save the data frame with standardized missing values

In [9]:
# Checkpoint the frame as it stands after missing-value normalization, before
# any dtype fixes. The path is relative to the notebook's working directory.
missing_standardized_path = "../data/processed/df_missing_standardized.parquet"
df_copy.to_parquet(missing_standardized_path)

### Fix data types

#### Time

In [10]:
# Parse the timestamp column into datetime values; entries that cannot be
# parsed become NaT instead of raising.
iso_time_parsed = pd.to_datetime(df_copy["ISO_TIME"], errors="coerce")
df_copy["ISO_TIME"] = iso_time_parsed

# Report the resulting dtype and non-null count, then preview the first values.
df_copy[["ISO_TIME"]].info()
df_copy["ISO_TIME"].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23275 entries, 0 to 23274
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   ISO_TIME  23275 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 182.0 KB


0   2022-01-08 00:00:00
1   2022-01-08 03:00:00
2   2022-01-08 06:00:00
3   2022-01-08 09:00:00
4   2022-01-08 12:00:00
Name: ISO_TIME, dtype: datetime64[ns]

In [11]:
# Count the missing timestamps (NaT) in ISO_TIME and report the total.
iso_time_is_missing = df_copy["ISO_TIME"].isna()
nat_count = iso_time_is_missing.sum()
print(f"Number of NaT in ISO_TIME: {nat_count}")

Number of NaT in ISO_TIME: 0


#### Numeric values

In [12]:
# Promote text (object) columns to a numeric dtype, but only when the conversion
# is lossless: every non-missing value in the column must parse as a number.
# Accepting a column because *any* value parses would let errors="coerce"
# silently replace all remaining strings of a mixed column with NaN.
numeric_cols = []
mixed_cols = []  # some, but not all, values parse as numbers; left untouched
for col in df_copy.select_dtypes(include="object").columns:
    present = df_copy[col].notna()
    parsed = pd.to_numeric(df_copy[col], errors="coerce")
    parsed_ok = parsed.notna()
    if not parsed_ok.any():
        continue  # purely textual or entirely missing column
    if parsed_ok.sum() == present.sum():
        df_copy[col] = parsed
        numeric_cols.append(col)
    else:
        mixed_cols.append(col)

# Expose the converted labels as an Index, usable like df_copy.columns.
numeric_cols = pd.Index(numeric_cols)

# Make skipped columns visible instead of dropping their text values silently.
if mixed_cols:
    print(f"Left as text (mix of numeric and non-numeric values): {mixed_cols}")

In [13]:
# Optional sanity check — uncomment to list the dtypes of the converted columns:
# df_copy[numeric_cols].dtypes

#### Categorical values

In [14]:
# Hand the dtype-corrected frame to the project's categorical cleaning helper
# and keep its result under a new name.
df_copy_2 = df_copy.pipe(clean_categorical_values)

In [15]:
# Optional sanity check — uncomment to inspect the dtypes after categorical cleaning:
# df_copy_2.dtypes

### Remove duplicate rows

In [18]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each.
df_copy_3 = df_copy_2.drop_duplicates(subset=None, keep="first")

### Sort data

In [19]:
# Order the rows by SID, then by timestamp within each SID (both ascending),
# and renumber the index from 0.
sort_keys = ["SID", "ISO_TIME"]
df_copy_3 = df_copy_3.sort_values(sort_keys, ignore_index=True)

In [21]:
# Optional — uncomment to inspect the de-duplicated, sorted frame:
# df_copy_3

### Create a time delta column

In [27]:
# Time elapsed since the previous row of the same SID. The first row of each
# SID has no predecessor, so its difference (and delta_hours) is missing.
seconds_per_hour = 3600
gap_since_previous = df_copy_3.groupby("SID")["ISO_TIME"].diff()
df_copy_3["delta_hours"] = gap_since_previous.dt.total_seconds() / seconds_per_hour

In [28]:
# Most frequent gaps between consecutive rows, in hours (top five).
delta_hours_counts = df_copy_3["delta_hours"].value_counts()
delta_hours_counts.head()

delta_hours
3.0    22573
1.0      172
2.0       35
1.5       11
2.5        7
Name: count, dtype: int64

### Encode categorical variables

In [29]:
# Every column still stored with object dtype is treated as categorical.
# NOTE(review): identifier-like columns (e.g. SID) are one-hot encoded too if they
# are still object dtype at this point, and then no longer exist as plain
# columns in the result — confirm this is intended.
categorical_cols = df_copy_3.select_dtypes(include="object").columns

# One-hot encode those columns; dummy_na=True also adds a NaN indicator column
# for each encoded column.
df_encoded = pd.get_dummies(df_copy_3, columns=categorical_cols, dummy_na=True)

  df_encoded = pd.get_dummies(


### Save cleaned dataset

In [34]:
# Write the encoded frame to disk as the final output of this notebook.
# The path is relative to the notebook's working directory.
final_dataset_path = "../data/processed/df_final.parquet"
df_encoded.to_parquet(final_dataset_path)