In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column layout expected in the raw tables: the year followed by twelve
# per-month (or overlapping two-month "season") values.
cols = [
    "year", "DJ", "JF", "FM", "MA", "AM", "MJ",
    "JJ", "JA", "AS", "SO", "ON", "ND"
]

def read_index(file, missing_threshold=-99.0):
    """Read a whitespace-delimited ``year + 12 values`` table as a monthly series.

    The first line of the file is skipped, rows whose first field is not a
    four-digit year are discarded (e.g. trailing footer/description lines),
    and the remaining wide table is reshaped to one row per month.

    Parameters
    ----------
    file : str or path-like
        Path to the raw text file.
    missing_threshold : float or None, default -99.0
        Values less than or equal to this are treated as missing and dropped.
        Pass ``None`` to disable sentinel filtering.
        NOTE(review): assumes the files flag missing data with large negative
        sentinels such as -99.9, -99.99 or -999 -- confirm against each source.

    Returns
    -------
    pandas.DataFrame
        Single float column ``"mei"`` (the name is kept for every index so
        existing callers keep working), indexed by ``"fecha"`` (first day of
        the month) in ascending order, without missing values.

    Raises
    ------
    ValueError
        If the parsed table has fewer than 13 columns.
    """
    index_raw = pd.read_csv(
        file,
        sep=r"\s+",
        header=None,
        skiprows=1,
        engine="python",
        on_bad_lines="skip",
    )

    if index_raw.shape[1] < len(cols):
        raise ValueError(
            f"expected at least {len(cols)} columns in {file!r}, "
            f"found {index_raw.shape[1]}"
        )

    # Keep only rows that start with a plain four-digit year. Parsing the
    # field numerically works whether the column was read as int, float or
    # text (footer lines turn it into an object column).
    years = pd.to_numeric(index_raw[0], errors="coerce")
    is_year_row = years.between(1000, 9999) & (years % 1 == 0)

    # Explicit copy so the assignments below never write through a slice.
    index_wide = index_raw.loc[is_year_row].iloc[:, :len(cols)].copy()
    index_wide.columns = cols
    index_wide["year"] = years.loc[is_year_row].astype(int)

    index_long = index_wide.melt(
        id_vars="year",
        var_name="season",
        value_name="mei",
    )

    # Each column label is mapped to a calendar month of the row's own year.
    season_to_month = {
        "DJ": 1, "JF": 2, "FM": 3, "MA": 4,
        "AM": 5, "MJ": 6, "JJ": 7, "JA": 8,
        "AS": 9, "SO": 10, "ON": 11, "ND": 12
    }
    index_long["month"] = index_long["season"].map(season_to_month)

    # Footer text can leave value columns as strings, so coerce to float
    # before looking for sentinels; otherwise "-99.99" would never compare
    # equal to a numeric sentinel and would leak into the result.
    values = pd.to_numeric(index_long["mei"], errors="coerce")
    if missing_threshold is not None:
        values = values.mask(values <= missing_threshold)
    index_long["mei"] = values

    index_long["fecha"] = pd.to_datetime(
        dict(
            year=index_long["year"],
            month=index_long["month"],
            day=1
        )
    )

    return (
        index_long
        .dropna(subset=["mei"])
        .set_index("fecha")
        .sort_index()
        [["mei"]]
    )


In [None]:
# Directory containing the raw index text files passed to read_index.
PATH_INPUT = "../data/enso/raw"
# Directory the combined CSV is written to. The value ends with "/", and the
# export cell joins it with another "/", so the final path contains "//".
PATH_OUTPUT = "../data/curated/"

In [None]:
# Load every raw index file with read_index, in a fixed order, and bind each
# resulting single-column frame to its own name.
(
    df_mei,
    df_nino_12,
    df_nino_3,
    df_nino_34,
    df_nino_4,
    df_oni,
    df_soi,
) = map(
    read_index,
    (
        f"{PATH_INPUT}/meiv2.data.txt",
        f"{PATH_INPUT}/nino12.long.anom.data.txt",
        f"{PATH_INPUT}/nino3.long.anom.data.txt",
        f"{PATH_INPUT}/nino34.long.anom.data.txt",
        f"{PATH_INPUT}/nino4.long.anom.data.txt",
        f"{PATH_INPUT}/oni.data.txt",
        f"{PATH_INPUT}/soi.long.data.txt",
    ),
)

In [4]:
# Align all index series on their shared date index (one column per index),
# then turn the date index into a regular "fecha" column.
df_enso = pd.concat(
    [df_mei, df_nino_12, df_nino_3, df_nino_34, df_nino_4, df_oni, df_soi],
    axis=1
).reset_index()

# Every input frame exposes its values under the same column name, so the
# columns are renamed positionally, in the same order as the concat list.
df_enso.columns = ["fecha", "mei", "nino12", "nino3", "nino34", "nino4", "oni", "soi"]

# Keep calendar years 1961-2024 (both inclusive). The explicit copy makes
# df_enso an independent frame, so later column assignments do not write
# through a filtered slice (SettingWithCopyWarning / silently lost writes).
df_enso = df_enso[df_enso["fecha"].dt.year.between(1961, 2024)].copy()

In [5]:
def to_numeric(df, col):
    """Convert column ``col`` of ``df`` to numeric values, modifying ``df``.

    Entries that cannot be parsed as numbers become NaN. Nothing is returned.
    """
    coerced = pd.to_numeric(df[col], errors="coerce")
    df[col] = coerced

# Make sure "fecha" is a datetime column, then coerce every column after the
# first one to numeric; values that cannot be parsed become NaN.
df_enso["fecha"] = pd.to_datetime(df_enso["fecha"])
for col in df_enso.columns[1:]:
    to_numeric(df_enso, col)

In [6]:
def fase_mensual_oni(x):
    """Label a single index value with its monthly phase.

    Returns ``pd.NA`` for a missing value, ``"Niño"`` when ``x >= 0.5``,
    ``"Niña"`` when ``x <= -0.5`` and ``"Neutral"`` for anything in between.
    """
    if pd.isna(x):
        return pd.NA
    return "Niño" if x >= 0.5 else "Niña" if x <= -0.5 else "Neutral"

# Per-row phase label derived from the "oni" column alone (no persistence
# requirement is applied at this step).
df_enso["fase_mensual"] = df_enso["oni"].apply(fase_mensual_oni)


In [7]:
def clasificacion_noaa(df, col_fase="fase_mensual", min_meses=5):
    """Classify Niño/Niña events by requiring a minimum run of identical labels.

    A row keeps its ``"Niño"`` / ``"Niña"`` label only when it belongs to a
    run of at least ``min_meses`` consecutive rows carrying that same label;
    every other row (neutral, short runs, missing labels) is ``"Neutral"``.
    The labels themselves are expected to come from a ±0.5 threshold applied
    upstream; this function only enforces the persistence rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame, assumed to be ordered chronologically with one row per
        month (rows are compared with their immediate predecessor).
    col_fase : str, default "fase_mensual"
        Column holding the per-row phase labels.
    min_meses : int, default 5
        Minimum run length for a Niño/Niña run to count as an event.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an added (or overwritten) ``"fase_noaa"``
        column. The input frame is not modified, and no other column is
        touched -- in particular a pre-existing ``"grupo"`` column survives,
        because the run bookkeeping lives in local Series rather than in a
        temporary column of the frame.
    """
    fase = df[col_fase]

    # A new run starts wherever the label differs from the previous row;
    # the cumulative sum turns those change points into run identifiers.
    # Missing labels compare as "different", so each one is its own run.
    run_id = (fase != fase.shift()).cumsum()

    # Length of the run each row belongs to, broadcast back to row level.
    run_length = run_id.groupby(run_id).transform("size")

    # All rows of a run share one label, so testing the row's own label is
    # equivalent to testing the first label of its run.
    is_event = fase.isin(["Niño", "Niña"]) & (run_length >= min_meses)

    result = df.copy()
    result["fase_noaa"] = fase.where(is_event, "Neutral")
    return result


In [8]:
# Add the persistence-filtered "fase_noaa" column (default: runs of at least
# 5 rows) and display the last 50 rows as a quick visual check.
df_enso = clasificacion_noaa(df_enso)
df_enso.tail(50)

Unnamed: 0,fecha,mei,nino12,nino3,nino34,nino4,oni,soi,fase_mensual,fase_noaa
1858,2020-11-01,-1.13,-0.87,-0.89,-1.01,-0.49,-1.27,0.85,Niña,Niña
1859,2020-12-01,-1.14,-0.76,-0.77,-0.98,-0.68,-1.19,1.76,Niña,Niña
1860,2021-01-01,-1.2,-0.58,-0.8,-1.04,-0.93,-1.05,1.64,Niña,Niña
1861,2021-02-01,-0.96,-0.74,-0.8,-0.94,-0.76,-0.93,1.02,Niña,Niña
1862,2021-03-01,-0.79,-0.5,-0.64,-0.72,-0.53,-0.84,-0.2,Niña,Niña
1863,2021-04-01,-0.95,-0.96,-0.67,-0.55,-0.27,-0.66,0.3,Niña,Niña
1864,2021-05-01,-1.07,-0.73,-0.53,-0.41,-0.15,-0.48,0.49,Neutral,Neutral
1865,2021-06-01,-1.05,0.08,-0.06,-0.06,0.02,-0.38,0.22,Neutral,Neutral
1866,2021-07-01,-1.44,-0.14,-0.17,-0.2,-0.11,-0.4,1.61,Neutral,Neutral
1867,2021-08-01,-1.29,-0.17,-0.28,-0.38,-0.19,-0.49,0.27,Neutral,Neutral


In [11]:
# Remove the intermediate per-row label; only "fase_noaa" is exported.
# Rebinding with errors="ignore" keeps this cell idempotent: running it a
# second time is a no-op instead of raising KeyError on the missing column.
df_enso = df_enso.drop(columns=["fase_mensual"], errors="ignore")

In [None]:
# Write the combined table as CSV without the row index. PATH_OUTPUT already
# ends with "/", so the resulting path contains a doubled separator.
df_enso.to_csv(f"{PATH_OUTPUT}/enso.csv", index=False)