## DATA OVERVIEW

### Imports

In [1]:
import pandas as pd

In [2]:
from src.data.load_data import load_raw_data

### Load raw dataset

In [3]:
df = load_raw_data()
df.head()

Loaded raw dataset
 File: /Users/anitarazafi/Desktop/masters/code/feature-selection/data/raw/ibtracs.last3years.list.v04r01.csv
 Rows: 23276
 Columns: ['SID', 'SEASON', 'NUMBER', 'BASIN', 'SUBBASIN', 'NAME', 'ISO_TIME', 'NATURE', 'LAT', 'LON', 'WMO_WIND', 'WMO_PRES', 'WMO_AGENCY', 'TRACK_TYPE', 'DIST2LAND', 'LANDFALL', 'IFLAG', 'USA_AGENCY', 'USA_ATCF_ID', 'USA_LAT', 'USA_LON', 'USA_RECORD', 'USA_STATUS', 'USA_WIND', 'USA_PRES', 'USA_SSHS', 'USA_R34_NE', 'USA_R34_SE', 'USA_R34_SW', 'USA_R34_NW', 'USA_R50_NE', 'USA_R50_SE', 'USA_R50_SW', 'USA_R50_NW', 'USA_R64_NE', 'USA_R64_SE', 'USA_R64_SW', 'USA_R64_NW', 'USA_POCI', 'USA_ROCI', 'USA_RMW', 'USA_EYE', 'TOKYO_LAT', 'TOKYO_LON', 'TOKYO_GRADE', 'TOKYO_WIND', 'TOKYO_PRES', 'TOKYO_R50_DIR', 'TOKYO_R50_LONG', 'TOKYO_R50_SHORT', 'TOKYO_R30_DIR', 'TOKYO_R30_LONG', 'TOKYO_R30_SHORT', 'TOKYO_LAND', 'CMA_LAT', 'CMA_LON', 'CMA_CAT', 'CMA_WIND', 'CMA_PRES', 'HKO_LAT', 'HKO_LON', 'HKO_CAT', 'HKO_WIND', 'HKO_PRES', 'KMA_LAT', 'KMA_LON', 'KMA_CAT', 'KM

Unnamed: 0,SID,SEASON,NUMBER,BASIN,SUBBASIN,NAME,ISO_TIME,NATURE,LAT,LON,...,BOM_GUST_PER,REUNION_GUST,REUNION_GUST_PER,USA_SEAHGT,USA_SEARAD_NE,USA_SEARAD_SE,USA_SEARAD_SW,USA_SEARAD_NW,STORM_SPEED,STORM_DIR
0,,Year,,,,,,,degrees_north,degrees_east,...,second,kts,second,ft,nmile,nmile,nmile,nmile,kts,degrees
1,2022008S13148,2022,1.0,SP,EA,TIFFANY,2022-01-08 00:00:00,MX,-12.6,147.7,...,45,,,,,,,,6,160
2,2022008S13148,2022,1.0,SP,EA,TIFFANY,2022-01-08 03:00:00,MX,-12.9,147.8,...,,,,,,,,,5,160
3,2022008S13148,2022,1.0,SP,EA,TIFFANY,2022-01-08 06:00:00,MX,-13.1,147.9,...,45,,,,,,,,4,160
4,2022008S13148,2022,1.0,SP,EA,TIFFANY,2022-01-08 09:00:00,MX,-13.2,147.9,...,,,,,,,,,2,165


### Dataset characteristics

#### Dimension

In [4]:
df.shape

(23276, 174)

In [5]:
df.info

<bound method DataFrame.info of                  SID SEASON NUMBER BASIN SUBBASIN     NAME  \
0                      Year                                  
1      2022008S13148   2022      1    SP       EA  TIFFANY   
2      2022008S13148   2022      1    SP       EA  TIFFANY   
3      2022008S13148   2022      1    SP       EA  TIFFANY   
4      2022008S13148   2022      1    SP       EA  TIFFANY   
...              ...    ...    ...   ...      ...      ...   
23271  2025339S14210   2026    105    SP       MM  UNNAMED   
23272  2025339S14210   2026    105    SP       MM  UNNAMED   
23273  2025339S14210   2026    105    SP       MM  UNNAMED   
23274  2025339S14210   2026    105    SP       MM  UNNAMED   
23275  2025339S14210   2026    105    SP       MM  UNNAMED   

                  ISO_TIME NATURE            LAT           LON  ...  \
0                                  degrees_north  degrees_east  ...   
1      2022-01-08 00:00:00     MX          -12.6         147.7  ...   
2      202

#### Data types

In [6]:
pd.set_option("display.max_rows", None)
# df.dtypes
df.dtypes.head()

SID         object
SEASON      object
NUMBER      object
BASIN       object
SUBBASIN    object
dtype: object

In [7]:
# pd.reset_option("display.max_rows")

In [8]:
df["ISO_TIME"].head()

0                       
1    2022-01-08 00:00:00
2    2022-01-08 03:00:00
3    2022-01-08 06:00:00
4    2022-01-08 09:00:00
Name: ISO_TIME, dtype: object

### Data quick statistics

In [9]:
# df.describe() # Most column values are NaN so this is not relevant for now

### Check if the data frame is ordered

In [10]:
df["ISO_TIME"].is_monotonic_increasing

False

### Unique identifier

In [11]:
# Check number of unique storms
df["SID"].nunique()

411

In [12]:
# Verify combination of SID + SEASON is unique
# df.groupby(["SID", "SEASON"]).size()
df.groupby(["SID", "SEASON"]).size().head()

SID            SEASON
               Year        1
2022008S13148  2022       75
2022008S17173  2022       97
2022020S13060  2022       45
2022024S09090  2022      141
dtype: int64

### Column grouping

In [13]:
# by BASIN
df["BASIN"].value_counts()

BASIN
WP    6450
SI    5303
EP    4183
SP    1881
NI    1350
         1
Name: count, dtype: int64

In [14]:
# by SUBBASIN
df["SUBBASIN"].value_counts()

SUBBASIN
MM    14127
WA     2342
BB      942
EA      819
CP      529
CS      487
AS      408
GM      328
          1
Name: count, dtype: int64

## Number of missing values per column

In [15]:
# SID
sid_dtype = df["SID"].dtype
sid_missing_counts = df["SID"].isna().sum() + (df["SID"].isin(["", " "])).sum()
print(f"SID column dtype: {sid_dtype}, missing values: {sid_missing_counts}")

SID column dtype: object, missing values: 1


In [16]:
# SEASON 
season_dtype = df["SEASON"].dtype
season_missing_counts = df["SEASON"].isna().sum() + (df["SEASON"].isin(["", " "])).sum()
print(f"SEASON column dtype: {season_dtype}, missing values: {season_missing_counts}")

SEASON column dtype: object, missing values: 0


In [17]:
# BASIN
basin_dtype = df["BASIN"].dtype
basin_missing_counts = df["BASIN"].isna().sum() + (df["BASIN"] == "MM").sum()
print(f"BASIN column dtype: {basin_dtype}, missing values: {basin_missing_counts}")

BASIN column dtype: object, missing values: 4108


In [20]:
# SUBBASIN
subbasin_dtype = df["SUBBASIN"].dtype
subbasin_missing_counts = df["SUBBASIN"].isna().sum() + (df["SUBBASIN"] == "MM").sum()
print(f"SUBBASIN column dtype: {subbasin_dtype}, missing values: {subbasin_missing_counts}")

SUBBASIN column dtype: object, missing values: 17420


In [22]:
# NATURE
nature_dtype = df["NATURE"].dtype
valid_nature_types = ["DS", "TS", "ET", "SS", "NR", "MX"]
missing_nature = (
    df["NATURE"].isna().sum()
    + (~df["NATURE"].isin(valid_nature_types)).sum()
)

print(f"NATURE column dtype: {nature_dtype} missing/invalid values: {missing_nature}")

NATURE column dtype: object missing/invalid values: 1
