### PRUEBA IMPUTACION CON BiLSTM

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) 

In [3]:
# Temperaturas diurnas (diarias)
df_dia = pd.read_csv("csv/ts_d_diario.csv", parse_dates=["fecha"],
    dtype={"pino": "float64",
           "estepa": "float64",
           "mes": "int64",
           "delta": "float64"
          })
df_dia.head(10)

Unnamed: 0,fecha,estepa,pino,mes,delta
0,2020-01-01,35.433453,33.666374,1,1.767079
1,2020-01-02,36.195872,31.34284,1,4.853032
2,2020-01-03,35.245773,31.284172,1,3.961601
3,2020-01-04,38.821326,31.450303,1,7.371023
4,2020-01-05,,17.944593,1,
5,2020-01-06,,,1,
6,2020-01-07,25.59098,19.523039,1,6.067942
7,2020-01-08,,,1,
8,2020-01-09,,,1,
9,2020-01-10,27.450967,22.472114,1,4.978853


In [4]:
# Temperaturas nocturnas (diarias)
df_noche = pd.read_csv("csv/ts_n_diario.csv", parse_dates=["fecha"],
    dtype={"pino": "float64",
           "estepa": "float64",
           "mes": "int64",
           "delta": "float64"
          })
df_noche.head(10)

Unnamed: 0,fecha,estepa,pino,mes,delta
0,2020-01-01,12.923966,14.703904,1,-1.779938
1,2020-01-02,12.885076,13.202792,1,-0.317715
2,2020-01-03,12.973098,13.5779,1,-0.604802
3,2020-01-04,12.445712,12.29334,1,0.152373
4,2020-01-05,,7.932499,1,
5,2020-01-06,,,1,
6,2020-01-07,4.402563,6.985357,1,-2.582794
7,2020-01-08,,,1,
8,2020-01-09,,,1,
9,2020-01-10,4.718868,4.187017,1,0.531851


## Imputación

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Bidirectional, LSTM, Dropout, Dense, TimeDistributed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

2025-12-07 10:07:26.914848: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
df = df_dia.copy() # imputa los faltantes de las temperaturas diurnas
salida = 'csv/ts_d_diario_imputado.csv'

In [7]:
#df = df_noche.copy() # imputa los faltantes de las temperaturas nocturnas
#salida = 'csv/ts_n_diario_imputado.csv'

In [8]:
cols_to_impute = ['estepa', 'pino']
data = df[cols_to_impute].copy()

#data['day_of_year'] = df['fecha'].dt.dayofyear
#data['sin_day'] = np.sin(2 * np.pi * data['day_of_year'] / 365.25)
#data['cos_day'] = np.cos(2 * np.pi * data['day_of_year'] / 365.25)
#data = data.drop('day_of_year', axis=1)

#data['semana_del_año'] = df['fecha'].dt.isocalendar().week
#data['sin_day'] = np.sin(2 * np.pi * data['semana_del_año'] / 52)
#data['cos_day'] = np.cos(2 * np.pi * data['semana_del_año'] / 52)
#data = data.drop('semana_del_año', axis=1)

data.head(10)

Unnamed: 0,estepa,pino
0,35.433453,33.666374
1,36.195872,31.34284
2,35.245773,31.284172
3,38.821326,31.450303
4,,17.944593
5,,
6,25.59098,19.523039
7,,
8,,
9,27.450967,22.472114


In [9]:
# Parámetros
TEST_SIZE_FOR_MASK = 0.15        # 15 % de valores CONOCIDOS los borramos artificialmente para validar
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [10]:
# Máscara de valores REALMENTE faltantes + máscara de validación artificial

real_missing = data.isna().values                     # True donde faltan de verdad
known_values = ~real_missing                          # True donde SÍ tenemos valor real

# Borramos artificialmente un % de los valores que sí conocemos (para validar)
mask_val = np.random.random(data.shape) < TEST_SIZE_FOR_MASK
mask_val = mask_val & known_values                    # solo borramos donde había valor real

validation_missing = mask_val.copy()                  # estos los usaremos para medir error realista
train_missing = real_missing | validation_missing     # valores que el modelo NO verá en entrenamiento

# Serie que el modelo verá (con todos los gaps: reales + artificiales)
data_train = data.copy()
data_train.values[train_missing] = np.nan

In [11]:
# Relleno provisorio para poder entrenar (solo interpolación simple)
data_filled = data_train.interpolate(method='linear', limit_direction='both')
data_filled = data_filled.fillna(method='bfill').fillna(method='ffill')

In [12]:
# Normalización
scaler = StandardScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(data_filled),
    index=data_filled.index,
    columns=data_filled.columns
)

In [13]:
# Preparamos tensores (una sola secuencia grande)
X_train = data_scaled.values.reshape(1, -1, 2)      # shape (1, n_timesteps, 2)
X_with_real_gaps = np.where(train_missing, 0, data_scaled.values)
X_with_real_gaps = X_with_real_gaps.reshape(1, -1, 2)
y_train = X_train[:, :, :2]    # solo estepa y pino

In [14]:
# Modelo BiLSTM multivariado

model = Sequential([
    Input(shape=(None, 2)),
    Bidirectional(LSTM(128, activation='tanh', return_sequences=True, recurrent_dropout=0.2)),
    Bidirectional(LSTM(64, activation='tanh', return_sequences=True, recurrent_dropout=0.2)),
    Bidirectional(LSTM(32, activation='tanh', return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(2))
])

model.compile(optimizer=Adam(learning_rate=0.0005),
              loss='mse')

model.summary()

In [15]:
early_stopping = EarlyStopping(monitor='loss', patience=15, restore_best_weights=True)

# Entrena el modelo
history = model.fit(X_train, y_train, epochs=250, batch_size=1, 
                    callbacks=[early_stopping],
                    verbose=1)


Epoch 1/250


KeyboardInterrupt: 

In [None]:
# Imputación final sobre los gaps reales + artificiales
pred_scaled = model.predict(X_with_real_gaps, verbose=0)[0]   # (n_timesteps, 2)
pred_original = scaler.inverse_transform(pred_scaled)[:, :2]   # las 2 primeras
pred_df = pd.DataFrame(pred_original, index=df.index, columns=cols_to_impute)

In [None]:
# Serie final imputada (solo sustituimos donde realmente faltaba el dato)
df_imputed = df.copy()
df_imputed[cols_to_impute] = df[cols_to_impute].fillna(pred_df[cols_to_impute])

In [None]:
# VALIDACIÓN ENMASCARADA

mae_por_columna = {}

if validation_missing.any():
    rows, cols = np.where(validation_missing)
    
    y_true = data.values[rows, cols]
    y_pred = pred_original[rows, cols]
    
    for col_idx, col_name in enumerate(cols_to_impute):
        mask_col = (cols == col_idx)
        if mask_col.any():
            true_col = y_true[mask_col]
            pred_col = y_pred[mask_col]
            mae = mean_absolute_error(true_col, pred_col)
            mae_por_columna[col_name] = mae
            print(f"  {col_name:6} MAE = {mae:.3f} °C  ({mask_col.sum()} valores)")

else:
    print("No hay valores ocultados artificialmente para validar")
    mae_por_columna = {col: np.nan for col in cols_to_impute}


In [None]:
Q1 = df[['estepa', 'pino']].quantile(0.25)
Q3 = df[['estepa', 'pino']].quantile(0.75)
IQR = Q3 - Q1

MAE_relativo_pino = mae_por_columna["pino"] / IQR["pino"] * 100
MAE_relativo_estepa = mae_por_columna["estepa"] / IQR["estepa"] * 100

print(f'\nPino  : MAE = {mae_por_columna["pino"]:.3f} °C  IQR {IQR["pino"]:.4f}  MAE relativo (%): {MAE_relativo_pino:.2f}')
print(f'Estepa: MAE = {mae_por_columna["estepa"]:.3f} °C  IQR {IQR["estepa"]:.4f}  MAE relativo (%): {MAE_relativo_estepa:.2f}\n')


Pino  : MAE = 4.398 °C  IQR 19.9034  MAE relativo (%): 22.09
Estepa: MAE = 4.933 °C  IQR 23.0537  MAE relativo (%): 21.40

Pino  : MAE = 4.060 °C  IQR 19.9034  MAE relativo (%): 20.40
Estepa: MAE = 4.112 °C  IQR 23.0537  MAE relativo (%): 17.84

### Resultado

Recomputo las columnas faltantes y exporto los resultados para usarlos en los modelos predictivos.

In [None]:
df.head(15)

In [None]:
# recalculo las columnas mes y delta
df_imputed = df_imputed.reset_index()
df_imputed['mes'] = df_imputed['fecha'].dt.month
df_imputed['delta'] = df_imputed['estepa']- df_imputed['pino']
df_imputed.head(15)

In [None]:
fig, ax = plt.subplots(2, 1, figsize=(15,8), sharex=True)
for i, col in enumerate(cols_to_impute):
    ax[i].plot(df_imputed.index, df_imputed[col], label='Imputada (BiLSTM)', linewidth=1.2)
    ax[i].scatter(df.index, df[col], c='red', s=8, label='Datos originales', zorder=5)
    ax[i].set_title(col)
    ax[i].legend()
plt.tight_layout()
plt.show()

In [None]:
# Exporta el resultado
# df_imputed[['fecha','estepa','pino','mes','delta']].to_csv(salida, index=False)