### Imputación de datos faltantes

Imputa las temperaturas faltantes por diferentes métodos:
- simple (lineal)
- por tiempo
- polinómica de grado 2
- BiLSTM

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [24]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) 

In [25]:
# después de imputar, completo las otras columnas, que dependen de las primeras.
def set_mes_delta(df):
    df['mes'] = df['fecha'].dt.month
    df['delta'] = df['estepa']- df['pino']

In [26]:
# Temperaturas diurnas (diarias)
df_dia = pd.read_csv("csv/ts_d_diario.csv", parse_dates=["fecha"],
    dtype={"pino": "float64",
           "estepa": "float64",
           "mes": "int64",
           "delta": "float64"
          })
df_dia.head(10)

Unnamed: 0,fecha,estepa,pino,mes,delta
0,2020-01-01,35.433453,33.666374,1,1.767079
1,2020-01-02,36.195872,31.34284,1,4.853032
2,2020-01-03,35.245773,31.284172,1,3.961601
3,2020-01-04,38.821326,31.450303,1,7.371023
4,2020-01-05,,17.944593,1,
5,2020-01-06,,,1,
6,2020-01-07,25.59098,19.523039,1,6.067942
7,2020-01-08,,,1,
8,2020-01-09,,,1,
9,2020-01-10,27.450967,22.472114,1,4.978853


## Imputación simple

In [27]:
df_imputed = df_dia.copy()

df_imputed= df_imputed.set_index('fecha')
df_imputed['estepa'] = df_imputed['estepa'].interpolate(method='linear')
df_imputed['pino']   = df_imputed['pino'].interpolate(method='linear')
df_imputed = df_imputed.reset_index()

set_mes_delta(df_imputed)
df_imputed.head(15)

Unnamed: 0,fecha,estepa,pino,mes,delta
0,2020-01-01,35.433453,33.666374,1,1.767079
1,2020-01-02,36.195872,31.34284,1,4.853032
2,2020-01-03,35.245773,31.284172,1,3.961601
3,2020-01-04,38.821326,31.450303,1,7.371023
4,2020-01-05,34.411211,17.944593,1,16.466618
5,2020-01-06,30.001096,18.733816,1,11.26728
6,2020-01-07,25.59098,19.523039,1,6.067942
7,2020-01-08,26.210976,20.506064,1,5.704912
8,2020-01-09,26.830971,21.489089,1,5.341882
9,2020-01-10,27.450967,22.472114,1,4.978853


In [28]:
# Exporta el resultado
salida = 'csv/ts_imputado_simple.csv'
df_imputed[['fecha','estepa','pino','mes','delta']].to_csv(salida, index=False)

## Imputación por interpolación. Método 'time'

In [29]:
df_imputed = df_dia.copy()

df_imputed= df_imputed.set_index('fecha')
df_imputed['estepa'] = df_imputed['estepa'].interpolate(method='time')
df_imputed['pino']   = df_imputed['pino'].interpolate(method='time')
df_imputed = df_imputed.reset_index()

set_mes_delta(df_imputed)
df_imputed.head(20)

Unnamed: 0,fecha,estepa,pino,mes,delta
0,2020-01-01,35.433453,33.666374,1,1.767079
1,2020-01-02,36.195872,31.34284,1,4.853032
2,2020-01-03,35.245773,31.284172,1,3.961601
3,2020-01-04,38.821326,31.450303,1,7.371023
4,2020-01-05,34.411211,17.944593,1,16.466618
5,2020-01-06,30.001096,18.733816,1,11.26728
6,2020-01-07,25.59098,19.523039,1,6.067942
7,2020-01-08,26.210976,20.506064,1,5.704912
8,2020-01-09,26.830971,21.489089,1,5.341882
9,2020-01-10,27.450967,22.472114,1,4.978853


In [30]:
# Exporta el resultado
salida = 'csv/ts_imputado_time.csv'
df_imputed[['fecha','estepa','pino','mes','delta']].to_csv(salida, index=False)

## Imputación por interpolación. Método polinómico

In [31]:
df_imputed = df_dia.copy()

df_imputed= df_imputed.set_index('fecha')
df_imputed['estepa'] = df_imputed['estepa'].interpolate(method='polynomial', order=2)
df_imputed['pino']   = df_imputed['pino'].interpolate(method='polynomial', order=2)
df_imputed = df_imputed.reset_index()

set_mes_delta(df_imputed)
df_imputed.head(15)

Unnamed: 0,fecha,estepa,pino,mes,delta
0,2020-01-01,35.433453,33.666374,1,1.767079
1,2020-01-02,36.195872,31.34284,1,4.853032
2,2020-01-03,35.245773,31.284172,1,3.961601
3,2020-01-04,38.821326,31.450303,1,7.371023
4,2020-01-05,37.634369,17.944593,1,19.689776
5,2020-01-06,31.017693,15.534118,1,15.483575
6,2020-01-07,25.59098,19.523039,1,6.067942
7,2020-01-08,22.457512,20.567543,1,1.889969
8,2020-01-09,22.104028,19.62861,1,2.475419
9,2020-01-10,27.450967,22.472114,1,4.978853


In [32]:
# Exporta el resultado
salida = 'csv/ts_imputado_polinomio.csv'
df_imputed[['fecha','estepa','pino','mes','delta']].to_csv(salida, index=False)

## Imputación con BiLSTM

In [33]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Bidirectional, LSTM, Dropout, Dense, TimeDistributed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

2025-12-08 08:06:42.204800: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [34]:
df = df_dia.copy()

In [35]:
cols_to_impute = ['estepa', 'pino']
data = df[cols_to_impute].copy()

#data['day_of_year'] = df['fecha'].dt.dayofyear
#data['sin_day'] = np.sin(2 * np.pi * data['day_of_year'] / 365.25)
#data['cos_day'] = np.cos(2 * np.pi * data['day_of_year'] / 365.25)
#data = data.drop('day_of_year', axis=1)

#data['semana_del_año'] = df['fecha'].dt.isocalendar().week
#data['sin_day'] = np.sin(2 * np.pi * data['semana_del_año'] / 52)
#data['cos_day'] = np.cos(2 * np.pi * data['semana_del_año'] / 52)
#data = data.drop('semana_del_año', axis=1)

data.head(10)

Unnamed: 0,estepa,pino
0,35.433453,33.666374
1,36.195872,31.34284
2,35.245773,31.284172
3,38.821326,31.450303
4,,17.944593
5,,
6,25.59098,19.523039
7,,
8,,
9,27.450967,22.472114


In [36]:
# Parámetros
TEST_SIZE_FOR_MASK = 0.15        # 15 % de valores CONOCIDOS los borramos artificialmente para validar
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [37]:
# Máscara de valores REALMENTE faltantes + máscara de validación artificial

real_missing = data.isna().values                     # True donde faltan de verdad
known_values = ~real_missing                          # True donde SÍ tenemos valor real

# Borramos artificialmente un % de los valores que sí conocemos (para validar)
mask_val = np.random.random(data.shape) < TEST_SIZE_FOR_MASK
mask_val = mask_val & known_values                    # solo borramos donde había valor real

validation_missing = mask_val.copy()                  # estos los usaremos para medir error realista
train_missing = real_missing | validation_missing     # valores que el modelo NO verá en entrenamiento

# Serie que el modelo verá (con todos los gaps: reales + artificiales)
data_train = data.copy()
data_train.values[train_missing] = np.nan

In [38]:
# Relleno provisorio para poder entrenar (solo interpolación simple)
data_filled = data_train.interpolate(method='linear', limit_direction='both')
data_filled = data_filled.fillna(method='bfill').fillna(method='ffill')

In [39]:
# Normalización
scaler = StandardScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(data_filled),
    index=data_filled.index,
    columns=data_filled.columns
)

In [40]:
# Preparamos tensores (una sola secuencia grande)
X_train = data_scaled.values.reshape(1, -1, 2)      # shape (1, n_timesteps, 2)
X_with_real_gaps = np.where(train_missing, 0, data_scaled.values)
X_with_real_gaps = X_with_real_gaps.reshape(1, -1, 2)
y_train = X_train[:, :, :2]    # solo estepa y pino

In [41]:
# Modelo BiLSTM multivariado

model = Sequential([
    Input(shape=(None, 2)),
    Bidirectional(LSTM(128, activation='tanh', return_sequences=True, recurrent_dropout=0.2)),
    Bidirectional(LSTM(64, activation='tanh', return_sequences=True, recurrent_dropout=0.2)),
    Bidirectional(LSTM(32, activation='tanh', return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(2))
])

model.compile(optimizer=Adam(learning_rate=0.0005),
              loss='mse')

model.summary()

In [42]:
early_stopping = EarlyStopping(monitor='loss', patience=15, restore_best_weights=True)

# Entrena el modelo
history = model.fit(X_train, y_train, epochs=250, batch_size=1, 
                    callbacks=[early_stopping],
                    verbose=1)


Epoch 1/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 68s/step - loss: 0.9210
Epoch 2/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.5427
Epoch 3/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.3870
Epoch 4/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 0.2500
Epoch 5/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.1786
Epoch 6/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.1398
Epoch 7/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.1288
Epoch 8/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.1367
Epoch 9/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 0.2010
Epoch 10/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 0.2131
Epoch 11/250
[1m1/1[0m [

In [43]:
# Imputación final sobre los gaps reales + artificiales
pred_scaled = model.predict(X_with_real_gaps, verbose=0)[0]   # (n_timesteps, 2)
pred_original = scaler.inverse_transform(pred_scaled)[:, :2]   # las 2 primeras
pred_df = pd.DataFrame(pred_original, index=df.index, columns=cols_to_impute)

In [44]:
# Serie final imputada (solo sustituimos donde realmente faltaba el dato)
df_imputed = df.copy()
df_imputed[cols_to_impute] = df[cols_to_impute].fillna(pred_df[cols_to_impute])

In [45]:
# VALIDACIÓN ENMASCARADA

mae_por_columna = {}

if validation_missing.any():
    rows, cols = np.where(validation_missing)
    
    y_true = data.values[rows, cols]
    y_pred = pred_original[rows, cols]
    
    for col_idx, col_name in enumerate(cols_to_impute):
        mask_col = (cols == col_idx)
        if mask_col.any():
            true_col = y_true[mask_col]
            pred_col = y_pred[mask_col]
            mae = mean_absolute_error(true_col, pred_col)
            mae_por_columna[col_name] = mae
            print(f"  {col_name:6} MAE = {mae:.3f} °C  ({mask_col.sum()} valores)")

else:
    print("No hay valores ocultados artificialmente para validar")
    mae_por_columna = {col: np.nan for col in cols_to_impute}


  estepa MAE = 4.593 °C  (248 valores)
  pino   MAE = 4.202 °C  (226 valores)


In [46]:
Q1 = df[['estepa', 'pino']].quantile(0.25)
Q3 = df[['estepa', 'pino']].quantile(0.75)
IQR = Q3 - Q1

MAE_relativo_pino = mae_por_columna["pino"] / IQR["pino"] * 100
MAE_relativo_estepa = mae_por_columna["estepa"] / IQR["estepa"] * 100

print(f'\nPino  : MAE = {mae_por_columna["pino"]:.3f} °C  IQR {IQR["pino"]:.4f}  MAE relativo (%): {MAE_relativo_pino:.2f}')
print(f'Estepa: MAE = {mae_por_columna["estepa"]:.3f} °C  IQR {IQR["estepa"]:.4f}  MAE relativo (%): {MAE_relativo_estepa:.2f}\n')


Pino  : MAE = 4.202 °C  IQR 19.9034  MAE relativo (%): 21.11
Estepa: MAE = 4.593 °C  IQR 23.0537  MAE relativo (%): 19.92




Pino  : MAE = 4.398 °C  IQR 19.9034  MAE relativo (%): 22.09
Estepa: MAE = 4.933 °C  IQR 23.0537  MAE relativo (%): 21.40

Pino  : MAE = 4.060 °C  IQR 19.9034  MAE relativo (%): 20.40
Estepa: MAE = 4.112 °C  IQR 23.0537  MAE relativo (%): 17.84

### Resultado

Recomputo las columnas faltantes y exporto los resultados para usarlos en los modelos predictivos.

In [47]:
# recalculo las columnas mes y delta
df_imputed = df_imputed.reset_index()
set_mes_delta(df_imputed)
df_imputed.head(15)

Unnamed: 0,index,fecha,estepa,pino,mes,delta
0,0,2020-01-01,35.433453,33.666374,1,1.767079
1,1,2020-01-02,36.195872,31.34284,1,4.853032
2,2,2020-01-03,35.245773,31.284172,1,3.961601
3,3,2020-01-04,38.821326,31.450303,1,7.371023
4,4,2020-01-05,25.380878,17.944593,1,7.436285
5,5,2020-01-06,24.355303,19.609804,1,4.745499
6,6,2020-01-07,25.59098,19.523039,1,6.067942
7,7,2020-01-08,23.469503,18.861063,1,4.60844
8,8,2020-01-09,23.904425,19.318012,1,4.586412
9,9,2020-01-10,27.450967,22.472114,1,4.978853


In [50]:
# Exporta el resultado
salida = 'csv/ts_imputado_bilstm.csv'
df_imputed[['fecha','estepa','pino','mes','delta']].to_csv(salida, index=False)