Adele: BART, LSTM // RF, LR

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from pybart.api import SklearnModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error

# Generate a synthetic hourly series (a sine wave spanning one year), then
# corrupt it with Gaussian noise and randomly placed missing values.
np.random.seed(42)  # fixed seed so the noise and gap positions are reproducible
time_series_length = 365 * 24  # One year of hourly data
original_data = np.sin(np.linspace(0, 10 * np.pi, time_series_length))  # Example time series

# Add noise to the dataset
noise = np.random.normal(0, 0.1, size=original_data.shape)  # Gaussian noise (mean=0, std=0.1)
data_with_noise = original_data + noise

# Introduce gaps to simulate missing values: 30% of the positions, drawn
# without replacement so that every chosen index is a distinct gap.
gaps = np.random.choice(time_series_length, size=int(time_series_length * 0.3), replace=False)
data_with_noise_and_gaps = data_with_noise.copy()
data_with_noise_and_gaps[gaps] = np.nan

# Prepare DataFrame
df_with_noise = pd.DataFrame({
    "time": np.arange(time_series_length),
    "latent_heat_flux": data_with_noise_and_gaps,
})

# Preview the data
print(df_with_noise.head())

# Prepare the working dataset used by the gap-filling code.
# The noisy, gappy array is named `data_with_noise_and_gaps`; the name
# `data_with_gaps` does not exist and raised a NameError here.
df = pd.DataFrame({
    "time": np.arange(time_series_length),
    "latent_heat_flux": data_with_noise_and_gaps,
})

# Function to prepare sequences for LSTM
def create_sequences(data, seq_length):
    """Slice *data* into overlapping windows paired with next-step targets.

    Window ``k`` is ``data[k:k + seq_length]`` and its target is
    ``data[k + seq_length]``, giving ``len(data) - seq_length`` pairs
    (none when *data* is not longer than *seq_length*).

    Args:
        data: Indexable, sliceable sequence of values.
        seq_length: Number of consecutive values in each window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays built from the windows and
        from their targets, respectively.
    """
    window_starts = range(len(data) - seq_length)
    windows = [data[start:start + seq_length] for start in window_starts]
    targets = [data[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(targets)

# Train/Test split: rows with an observed value are used for fitting, rows
# with a missing value are the ones to be filled in.
df_filled = df.copy()
observed_rows = df['latent_heat_flux'].notna()
train = df[observed_rows]
test = df[~observed_rows]

# Every regressor below maps the time index to the flux value.
X_train = train[['time']].values
y_train = train['latent_heat_flux'].values
X_gap = test[['time']].values  # features of the rows that need a prediction

# ---- 1. BART ---- #
# NOTE(review): the recorded run of this cell ends with
# "ModuleNotFoundError: No module named 'pybart'" -- confirm which package
# is meant to provide SklearnModel and that it is installed.
bart_model = SklearnModel()

# ---- 2. Random Forest ---- #
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# ---- 3. Linear Regression ---- #
lr_model = LinearRegression()

# Fit each model and store its gap-filled series in its own column.
# Writing every prediction into 'latent_heat_flux' meant each model
# overwrote the previous one, so only the last result was ever kept.
for label, model in (("bart", bart_model), ("rf", rf_model), ("lr", lr_model)):
    model.fit(X_train, y_train)
    filled_column = f"latent_heat_flux_{label}"
    df_filled[filled_column] = df['latent_heat_flux']  # observed values, NaN at gaps
    df_filled.loc[test.index, filled_column] = model.predict(X_gap)

# ---- 4. LSTM ---- #
seq_length = 24  # Use the past 24 hours to predict the next value

# Remember which positions are gaps before the imputer fills them in.
observed_series = df['latent_heat_flux'].copy()
gap_mask = observed_series.isna().to_numpy()

imputer = SimpleImputer(strategy='mean')  # Fill initial gaps for LSTM preparation
# fit_transform returns an (n, 1) array; flatten it back to a single column.
df['latent_heat_flux'] = imputer.fit_transform(df[['latent_heat_flux']]).ravel()

# NOTE: the input windows still contain mean-imputed placeholders wherever a
# gap falls inside the preceding seq_length samples.
data_filled = df['latent_heat_flux'].values
X, y = create_sequences(data_filled, seq_length)

# Reshape for LSTM: (samples, timesteps, features)
X = X.reshape((X.shape[0], seq_length, 1))

# Split into train/test by what each window predicts. Window i targets
# position i + seq_length, so the mask is shifted by seq_length to line up
# with the windows. Windows with an observed target are used for training;
# windows whose target is a gap are the ones to predict. (Slicing at
# len(train) - seq_length instead selects the last len(test) windows in time
# order, whose predictions do not correspond to the scattered gap positions.)
target_is_gap = gap_mask[seq_length:]
X_train, y_train = X[~target_is_gap], y[~target_is_gap]
X_test = X[target_is_gap]

# Build LSTM model
lstm_model = Sequential([
    LSTM(50, activation='relu', input_shape=(seq_length, 1), return_sequences=False),
    Dense(1)
])
lstm_model.compile(optimizer='adam', loss='mse')

# Train model
early_stopping = EarlyStopping(patience=5, restore_best_weights=True)
lstm_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Fill gaps: write each prediction back to the position its window targets.
lstm_predictions = lstm_model.predict(X_test).flatten()
gap_positions = np.flatnonzero(target_is_gap) + seq_length
lstm_filled = observed_series.copy()
lstm_filled.iloc[gap_positions] = lstm_predictions
# Gaps within the first seq_length samples have no complete history window
# and therefore remain NaN in this column.
df_filled['latent_heat_flux_lstm'] = lstm_filled

# ---- Final Output ---- #
print("Data filling complete. Choose the appropriate method's output for your needs.")


ModuleNotFoundError: No module named 'pybart'