In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import datetime

def obtener_tiempo(dfr):
    #Esta funcion permite obtener una columna Datetime a partir del agno, dia, hora y minuto de los datos iniciales
    dfr['Day'] = pd.to_datetime(dfr['Day'], format='%j').dt.strftime('%m-%d')
    dfr['Hour'] = dfr['Hour'].astype(str).str.zfill(2)
    dfr['Datetime'] = pd.to_datetime(dfr[['Year','Day', 'Hour', 'Minute']]
                   .astype(str).apply(' '.join, 1), format='%Y %m-%d %H %M') 
    return dfr

def sustituir(dfr):
    #Sustitucion de los valores 9999... por NaN (sabemos que la fila 2 son todo valores nulos para cada variable)
    valores_null = dfr.iloc[2, 4:]
    for i in range(len(valores_null)):
        dfr.iloc[:,i+4] = dfr.iloc[:,i+4].replace(valores_null[i], np.nan)
    return dfr

def imputar_por_interpolacion(dfr):
    #Sustituye los valores NaN por valores aproximados mediante el metodo de interpolacion
    dfr = dfr.interpolate(method='linear', limit_direction='forward')
    return dfr

def imputar_por_KNND(dfr):
    #Este metodo es para imputar las filas que no se hayan podido imputar con interpolacion (primera fila)
    imputer = KNNImputer(n_neighbors=3, weights = 'distance')
    dfr[:] = imputer.fit_transform(dfr)
    return dfr

def normalizar_datos(dfr):
    #Dado un DataFrame, devuelve el DataFrame con valores normalizados
    x = dfr.values #returns a numpy array
    standard_scaler = preprocessing.StandardScaler()
    dfr[:] = standard_scaler.fit_transform(x)
    #dfr = pd.DataFrame(x_scaled)
    return dfr

def desnormalizar_datos(dfr):
    #Dado un DataFrame, devuelve el DataFrame con valores desnormalizados
    x = dfr.values
    standard_scaler = preprocessing.StandardScaler()
    dfr[:] = standard_scaler.inverse_transform(x)
    return dfr

def eliminar_gaps(dfr, n):
    #Eliminacion de filas donde alguna columna tiene N NaNs consecutivos
    for columna in range(dfr.shape[1]):
        mask = dfr.iloc[:,columna].notna()
        a = mask.ne(mask.shift()).cumsum()
        dfr = dfr[(a.groupby(a).transform('size') < n) | mask]
    return dfr

def NaN_consecutivos(dfr):
    #Obtencion del numero maximo de NaN consecutivos segun columna
    nans_consecutivos = []
    for columna in range(df.shape[1]):
        nan_columna = max(df.iloc[:,columna].isnull().astype(int).groupby(df.iloc[:,columna].notnull().astype(int).cumsum()).sum())
        nans_consecutivos.append(nan_columna)

    df_nans = pd.DataFrame(columns = ['Variable','Numero de nans consecutivos'])
    df_nans.iloc[:,0] = df.columns
    df_nans.iloc[:,1] = nans_consecutivos
    return df_nans


from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
def build_model_complejo(input_timesteps, output_timesteps, num_links, num_inputs):    
    model = Sequential()
    model.add(BatchNormalization(name = 'batch_norm_0', input_shape = (input_timesteps, num_inputs, 1, 1)))
    model.add(ConvLSTM2D(name ='conv_lstm_1',
                         filters = 64, kernel_size = (10, 1),                       
                         padding = 'same', 
                         return_sequences = True))
    
    model.add(Dropout(0.30, name = 'dropout_1'))
    model.add(BatchNormalization(name = 'batch_norm_1'))

    model.add(ConvLSTM2D(name ='conv_lstm_2',
                         filters = 64, kernel_size = (5, 1), 
                         padding='same',
                         return_sequences = False))
    
    model.add(Dropout(0.20, name = 'dropout_2'))
    model.add(BatchNormalization(name = 'batch_norm_2'))
    
    model.add(Flatten())
    model.add(RepeatVector(output_timesteps))
    model.add(Reshape((output_timesteps, num_inputs, 1, 64)))
    
    model.add(ConvLSTM2D(name ='conv_lstm_3',
                         filters = 64, kernel_size = (10, 1), 
                         padding='same',
                         return_sequences = True))
    
    model.add(Dropout(0.20, name = 'dropout_3'))
    model.add(BatchNormalization(name = 'batch_norm_3'))
    
    model.add(ConvLSTM2D(name ='conv_lstm_4',
                         filters = 64, kernel_size = (5, 1), 
                         padding='same',
                         return_sequences = True))
    
    model.add(TimeDistributed(Dense(units=1, name = 'dense_1', activation = 'relu')))
    model.add(Dense(units=1, name = 'dense_2', activation = 'linear'))
    
    optimizer = tf.keras.optimizers.RMSprop(lr=0.004, clipvalue=1.0)
    model.compile(loss = "mse", optimizer = optimizer, metrics = ['mae', 'mse'])
    return model

def baseline(x_val, y_val):
  num_predicciones = len(y_val)
  timesteps_futuros = len(y_val[0])
  timesteps_pasados = len(x_val[0])
  variables = len(x_val[0][0])
  predicciones = np.zeros(shape=(y_val.shape))
  for prediccion in range(num_predicciones):
      dato = x_val[prediccion][timesteps_pasados-1]
      for num_timesteps_a_predecir in range(timesteps_futuros):
          predicciones[prediccion][num_timesteps_a_predecir] = dato
  return predicciones


from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
def obtener_metricas(y_val, y_preds, nombre_modelos):
    #Aplanamos los datos para que se puedan calcular las metricas
    y_val = np.squeeze(y_val)
    y_val = y_val.ravel()
    datos_mse  = []
    datos_r2   = []
    datos_rmse = []
    datos_mae  = []
    columnas   = []
    for modelo in range(len(nombre_modelos)):
        y_pred = np.squeeze(y_preds[modelo])
        y_pred = y_pred.ravel()
        #Calculamos las metricas
        r2   = r2_score(y_val, y_pred)
        mse  = mean_squared_error(y_val, y_pred)
        mae  = mean_absolute_error(y_val, y_pred)
        rmse = mse**0.5
        #Los anadimos a las listas correspondientes
        datos_mse.append(mse)
        datos_r2.append(r2)
        datos_rmse.append(rmse)
        datos_mae.append(mae)
        #Creamos una columna por modelo
        columnas.append(nombre_modelos[modelo])

    #Mostramos los resultados en forma de df
    nombres_metricas = ['R2', 'RMSE', 'MSE', 'MAE']
    valores = [datos_r2, datos_rmse, datos_mse, datos_mae]

    metricas = pd.DataFrame(valores, columns = columnas, index = nombres_metricas)
    return metricas



def desglose_por_timestep(y_val, predicciones):
    num_predicciones = predicciones.shape[0]
    num_timesteps   = predicciones.shape[1]

    timesteps_y_val  = []
    timesteps_y_pred = []
    
    #Creamos 4 listas, una para cada timestep
    for lista_timestep in range(num_timesteps):
        timestep_valN = []
        timesteps_y_val.append(timestep_valN)
        timestep_predN = []
        timesteps_y_pred.append(timestep_predN)
    #Metemos en cada una de los 4 listas, el correspondiente timestep de cada prediccion
    for prediccion in range(num_predicciones):
        for timestep in range(num_timesteps):
            timesteps_y_pred[timestep].append(predicciones[prediccion][timestep])
            timesteps_y_val[timestep].append(y_val[prediccion][timestep])
    
    return timesteps_y_val, timesteps_y_pred

def comparacion_modelos_timestep(y_val, predicciones, nombre_modelos):
  timesteps_y_preds = []
  for modelo in range(len(nombre_modelos)):
      timesteps_y_val, timesteps_y_pred_modelo = desglose_por_timestep(y_val, predicciones[modelo])
      timesteps_y_preds.append(timesteps_y_pred_modelo)
  for timestep in range(4):
      timesteps_modelos = []
      for modelo in range(len(nombre_modelos)):
          timesteps_modelos.append(timesteps_y_preds[modelo][timestep])
      print("=========TIMESTEP: ",timestep,"=========\n",obtener_metricas(timesteps_y_val[timestep], timesteps_modelos, nombre_modelos))


def comparacion_modelos_timestep_variables(y_val, predicciones, nombre_modelos, nombre_variables):
  desglose_preds = []
  for modelo in range(len(nombre_modelos)):
      desglose_val, desglose_pred = agrupar_variables_timestep(y_val, predicciones[modelo])
      desglose_preds.append(desglose_pred)
  
  for paso in range(4):
    print("===================TIMESTEP ", paso,"===================")
    for variable in range(len(nombre_variables)):
        variableN_modelos = []
        for modelo in range(len(nombre_modelos)):
          variableN_modelos.append(desglose_preds[modelo][paso][variable])
        print("=====VARIABLE", nombre_variables[variable],"====")
        print(obtener_metricas(desglose_val[paso][variable], variableN_modelos, nombre_modelos))


def agrupar_variables_timestep(y_val, y_pred):
    num_predicciones = y_val.shape[0]
    num_timesteps    = y_val.shape[1]
    num_variables    = y_val.shape[2]
    #Creacion de listas
    desglose_val  = []
    desglose_pred = []
    for paso in range(num_timesteps):
        timestepN_val  = []
        timestepN_pred = []
        for i in range(num_variables):
            timestepN_varI_val = []
            timestepN_val.append(timestepN_varI_val)
            
            timestepN_varI_pred = []
            timestepN_pred.append(timestepN_varI_pred)
            
        desglose_val.append(timestepN_val)
        desglose_pred.append(timestepN_pred)
    
    #Desglose
    for prediccion in range(num_predicciones):
        for paso in range(num_timesteps):
            for var in range(num_variables):
                desglose_val[paso][var].append(y_val[prediccion][paso][var])
                desglose_pred[paso][var].append(y_pred[prediccion][paso][var])
    
    return desglose_val, desglose_pred




def resumen_comparativa(y_val, predicciones, nombre_modelos, variables):
    pd.set_option("display.max_rows", None, "display.max_columns", None)

    #1-Comparativa general de los modelos
    print("===================================================COMPARATIVA GENERAL DE LOS MODELOS===================================================")
    print(obtener_metricas(y_val, predicciones, nombre_modelos))

    #2-Comparativa por timestep de los modelos
    print("===================================================COMPARATIVA POR TIMESTEP DE LOS MODELOS===================================================")
    print(comparacion_modelos_timestep(y_val, predicciones, nombre_modelos))

    print("===================================================COMPARATIVA POR TIMESTEP Y VARIABLES DE LOS MODELOS===================================================")
    #3-Comparativa por timestep y variable de los modelos
    print(comparacion_modelos_timestep_variables(y_val, predicciones, nombre_modelos, variables))


def obtener_predicciones_variables(predicciones_tormentas, nombre_variables):
  num_predicciones = len(predicciones_tormentas)
  num_pasos        = len(predicciones_tormentas[0])
  num_variables    = len(predicciones_tormentas[0][0])

  lista_variables = []


  for variable in range(num_variables):
    valores_variable = []
    lista_variables.append(valores_variable)

  for predicciones in range(num_predicciones - 3744, num_predicciones, 4):
    for paso in range(num_pasos):
      for variable in range(num_variables):
        dato = np.squeeze(predicciones_tormentas[predicciones][paso][variable])
        lista_variables[variable].append(dato)

  return lista_variables


def obtener_df_predicciones(predicciones_variables, df_tormenta_real):
    df_predicciones = pd.DataFrame(prediccions_variables)
    df_predicciones = df_predicciones.T
    df_predicciones.columns = variables
    df_predicciones.set_index(df_tormenta_real.index, inplace=True)

    df_predicciones[:] = scaler.inverse_transform(df_predicciones.values)
    df_tormenta_real[:] = scaler.inverse_transform(df_tormenta_real.values)


    nombre_observado = variables+' observado'
    nombre_predicho  = variables+ ' predicho'

    df_predicciones=df_predicciones.astype(float)
    df_tormenta_real=df_tormenta_real.astype(float)

    #Grafico para ver de manera clara donde se encuentran los NaN en cada variable (discontinuidad)
    plt.rcParams.update({'legend.fontsize': 15})
    ax = df_predicciones.plot(subplots = True, figsize=(25, 25), markersize = 20)
    df_tormenta_real.plot(ax = ax, subplots = True, figsize=(25, 20), linestyle = ':', color = 'grey', sharex = False,
                            fontsize=22, markersize = 20)


#1-Montamos Drive para poder acceder a los datos y los leemos

In [2]:
#Leemos los datos
#df = pd.read_csv('datos.csv', header=0)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
df = pd.read_csv('/content/drive/MyDrive/TFG_codigo/Prediccion_general/datos.csv', header=0)

#2-Preprocesamos los datos y los normalizamos

In [3]:
df = sustituir(df)
df = imputar_por_interpolacion(df)
imputar_por_KNND(df.iloc[:3, :])
df = obtener_tiempo(df)
df = df.drop(['Year', 'Day', 'Hour', 'Minute'], axis = 1)
df = df.set_index('Datetime')
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


Unnamed: 0_level_0,IMF(nT),Bx GSM(nT),By GSM(nT),Bz GSM(nT),Flow Speed(km/s),Proton Density(n/cc),Proton Temperature(K)
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1995-01-01 00:00:00,1.37,0.130,1.17,-0.670,311.453358,18.422649,17401.558702
1995-01-01 00:05:00,1.26,0.090,1.12,-0.500,311.400000,18.460000,17347.000000
1995-01-01 00:10:00,1.46,0.037,1.30,-0.586,311.560000,18.348000,17510.600000
1995-01-01 00:15:00,1.66,-0.016,1.48,-0.672,311.720000,18.236000,17674.200000
1995-01-01 00:20:00,1.86,-0.069,1.66,-0.758,311.880000,18.124000,17837.800000
...,...,...,...,...,...,...,...
2021-02-18 23:35:00,2.87,2.130,0.03,-1.860,364.100000,4.670000,58514.000000
2021-02-18 23:40:00,2.92,2.210,0.07,-1.880,364.900000,4.400000,62883.000000
2021-02-18 23:45:00,2.90,2.450,0.05,-1.540,364.100000,4.650000,58624.000000
2021-02-18 23:50:00,2.92,2.380,0.13,-1.660,363.700000,4.810000,67353.000000


In [4]:
from sklearn.preprocessing import StandardScaler
x = df.values #returns a numpy array
scaler = StandardScaler()
scaler.fit(x)
df[:] = scaler.transform(x)

In [6]:
df

Unnamed: 0_level_0,IMF(nT),Bx GSM(nT),By GSM(nT),Bz GSM(nT),Flow Speed(km/s),Proton Density(n/cc),Proton Temperature(K)
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1995-01-01 00:00:00,-1.424353,0.032687,0.291615,-0.189644,-1.146234,2.347966,-0.802707
1995-01-01 00:05:00,-1.460589,0.021451,0.278938,-0.138110,-1.146766,2.355286,-0.803304
1995-01-01 00:10:00,-1.394705,0.006564,0.324574,-0.164180,-1.145172,2.333336,-0.801513
1995-01-01 00:15:00,-1.328821,-0.008324,0.370210,-0.190250,-1.143577,2.311387,-0.799722
1995-01-01 00:20:00,-1.262937,-0.023212,0.415845,-0.216320,-1.141983,2.289437,-0.797931
...,...,...,...,...,...,...,...
2021-02-18 23:35:00,-0.930224,0.594484,0.002589,-0.550381,-0.621617,-0.347237,-0.352641
2021-02-18 23:40:00,-0.913753,0.616956,0.012730,-0.556443,-0.613645,-0.400151,-0.304812
2021-02-18 23:45:00,-0.920341,0.684372,0.007660,-0.453376,-0.621617,-0.351156,-0.351436
2021-02-18 23:50:00,-0.913753,0.664709,0.027942,-0.489753,-0.625602,-0.319800,-0.255878


#3-Creacion de datasets

In [8]:
#===================================Parametros para obtener train_dataset y val_dataset====================================
TRAIN_SPLIT   = int(0.8 * int(df.shape[0]))
STEP          = 1
past_history  = 40
future_target = 4



def multivariate_multioutput_data(dataset, target, start_index, end_index, history_size,
                      target_size, step, single_step=False):
    data = []
    labels = []

    start_index = start_index + history_size
    if end_index is None:
        end_index = len(dataset) - target_size

    for i in range(start_index, end_index):
        indices = range(i-history_size, i, step)
        data.append(dataset[indices])

        if single_step:
            labels.append(target[i+target_size])
        else:
            labels.append(target[i:i+target_size])

    return np.array(data)[:,:,:,np.newaxis,np.newaxis], np.array(labels)[:,:,:,np.newaxis,np.newaxis]



In [8]:
dataset = df.values

x_test, y_test = multivariate_multioutput_data(dataset, dataset,
                                             TRAIN_SPLIT, None, past_history,
                                             future_target, STEP)

In [9]:
x_test.shape, y_test.shape

((549806, 40, 7, 1, 1), (549806, 4, 7, 1, 1))

**CARGAMOS EL MODELO COMPLEJO**

In [5]:
modelo_c = build_model_complejo(40, 4, 7, 7)
modelo_c.load_weights('/content/drive/MyDrive/TFG_codigo/Prediccion_general/Modelo_Complejo/modelo_complejo_150epochs_pesos.h5')

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Realizamos predicciones

In [11]:
predicciones_tormenta_c = modelo_c.predict(x_test, verbose=1)
predicciones_baseline_tormenta = baseline(x_test, y_test)



Guardamos las predicciones

In [12]:
np.save('/content/drive/MyDrive/Prediccion_Tormenta/modelo_complejo_predicciones_generales', predicciones_tormenta_c)
np.save('/content/drive/MyDrive/Prediccion_Tormenta/predicciones_baseline_generales', predicciones_baseline_tormenta)

Cargamos las predicciones

In [6]:
predicciones_tormenta_c = np.load('/content/drive/MyDrive/Prediccion_Tormenta/modelo_complejo_predicciones_generales.npy')
predicciones_baseline_tormenta = np.load('/content/drive/MyDrive/Prediccion_Tormenta/predicciones_baseline_generales.npy')

#6-Evaluación del modelo

Generamos la estructura para que con las funciones podamos obtener las métricas de cada modelo para compararlos

In [15]:
predicciones = [predicciones_tormenta_c, predicciones_baseline_tormenta]
nombre_modelos = ['Complejo', 'Baseline']
variables = df.columns

**Generamos las comparativas entre los modelos**

In [16]:
resumen_comparativa(y_test, predicciones, nombre_modelos, variables)

      Complejo  Baseline
R2    0.889016  0.881896
RMSE  0.291028  0.300219
MSE   0.084698  0.090131
MAE   0.161835  0.150322
       Complejo  Baseline
R2    0.943319  0.948452
RMSE  0.207981  0.198342
MSE   0.043256  0.039340
MAE   0.117160  0.097527
       Complejo  Baseline
R2    0.900635  0.897181
RMSE  0.275373  0.280119
MSE   0.075830  0.078466
MAE   0.153307  0.141381
       Complejo  Baseline
R2    0.868931  0.857748
RMSE  0.316268  0.329484
MSE   0.100025  0.108560
MAE   0.178792  0.170084
       Complejo  Baseline
R2    0.843178  0.824202
RMSE  0.345945  0.366278
MSE   0.119678  0.134159
MAE   0.198081  0.192297
None
=====VARIABLE IMF(nT) ====
      Complejo  Baseline
R2    0.974938  0.980506
RMSE  0.123700  0.109097
MSE   0.015302  0.011902
MAE   0.075963  0.057717
=====VARIABLE Bx GSM(nT) ====
      Complejo  Baseline
R2    0.930442  0.934999
RMSE  0.234212  0.226409
MSE   0.054855  0.051261
MAE   0.150789  0.130816
=====VARIABLE By GSM(nT) ====
      Complejo  Baseline
R2  