# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import load_model

## Load the data
# parse_dates asks read_csv to parse the 'utctime' column while loading.
df = pd.read_csv(r"data.csv", parse_dates=['utctime'])

## Change into date time format
# Fallback conversion in case read_csv left the column as strings.
# No explicit `format` is passed: a date-only format ('%Y-%m-%d') cannot match
# timestamps that carry a time of day, and together with errors='coerce' every
# such value would silently become NaT. The rest of this script relies on
# minute-resolution timestamps (gap detection, 60-row hourly windows).
df['utctime'] = pd.to_datetime(df['utctime'], errors='coerce')
# Chronological order is required by the gap detection in create_segments.
df.sort_values(by='utctime', inplace=True)

## Normalization of features

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit on the four model features and overwrite them in place with the scaled
# values. Column order matters: 'water_level' is scaler column 0, which is what
# the inverse transforms later in this script read back.
df[['water_level', 'sea_level', 'temp', 'pump1_status']] = scaler.fit_transform(df[['water_level', 'sea_level', 'temp', 'pump1_status']])
print(df.head())
## Making segments of data to avoid blanks in the data


# Create segments to avoid gaps in data
def create_segments(df: pd.DataFrame, max_gap: pd.Timedelta = pd.Timedelta(minutes=60)) -> list:
    """Split ``df`` into runs of consecutive rows without large time gaps.

    Rows are taken in their existing order (the caller is expected to have
    sorted by ``'utctime'``). A new segment starts at every row whose
    ``'utctime'`` lies more than ``max_gap`` after the previous row's.

    Args:
        df: Frame with a datetime ``'utctime'`` column.
        max_gap: Largest time difference between neighbouring rows that still
            keeps them in the same segment. Defaults to 60 minutes.

    Returns:
        A list of DataFrames, one per segment, in order of appearance. Each
        keeps the original index labels and column dtypes. An empty input
        yields an empty list.
    """
    if len(df) == 0:
        return []

    time_diff = df['utctime'].diff()
    # A row continues the current segment only when its gap is <= max_gap.
    # Writing the break as the negation keeps undefined gaps (the first row,
    # or NaT timestamps) on the "start a new segment" side.
    starts_new_segment = ~(time_diff <= max_gap)
    # Running count of breaks = segment id of every row.
    segment_ids = starts_new_segment.cumsum().to_numpy()

    return [segment for _, segment in df.groupby(segment_ids, sort=False)]

# Split the (already time-sorted) frame into gap-free segments.
segments = create_segments(df)

print(f"Number of segments: {len(segments)}")
# Sanity check: show the first segment (raises IndexError if there are none).
print(segments[0])

# Define parameters as feature
# Order matters: create_sequences reads column 0 as the prediction target and
# column 3 as the pump status.
param = ['water_level', 'sea_level', 'temp', 'pump1_status']

# create_sequences arguments:
#   data_segments  - the gap-free segments created from the data
#   seq_inp_length - the length of each input sequence
#   seq_out_length - the number of minutes being predicted
#   param          - the feature columns
# NOTE: earlier notes also described `first_hr` (the initial part of the
# sequence that is converted to hours), `last_hour` (the total period of the
# sequence) and `step` (producing output at particular intervals instead of
# every minute); these are not parameters of create_sequences.
# Create sequences function
def create_sequences(data_segments: list, seq_inp_length: int, seq_out_length: int, param: list) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Build sliding-window model inputs and targets from gap-free segments.

    For every start row ``i`` of every segment that leaves room for a full
    input window followed by a full output window, four arrays are produced:

    * hourly input: the whole input window ``[i, i + seq_inp_length)``
      averaged over consecutive blocks of 60 rows, all ``param`` columns;
    * minute input: the trailing ``12 * 60`` rows of the input window (the
      whole window if it is shorter), all ``param`` columns;
    * future pump status: column index 3 of ``param``, sampled every 60 rows
      over the output window, shaped ``(steps, 1)``;
    * target: column index 0 of ``param``, sampled every 60 rows over the
      output window.

    Args:
        data_segments: DataFrames that each contain the ``param`` columns.
        seq_inp_length: Number of rows in the input window.
        seq_out_length: Number of rows in the output window.
        param: Feature column names; position 0 is the target and position 3
            the pump status.

    Returns:
        ``(hour_sequences, minute_sequences, pump_fut_seq, ys)`` stacked over
        all windows. Each array is empty when no segment is long enough.
    """
    rows_per_hour = 60
    minute_window = 12 * 60

    hour_sequences = []
    minute_sequences = []
    pump_fut_seq, ys = [], []

    for segment in data_segments:
        values = segment[param].values
        n_windows = len(values) - seq_inp_length - seq_out_length + 1
        for start in range(n_windows):
            input_end = start + seq_inp_length
            output_end = input_end + seq_out_length

            # Hourly averages over the full input window. The window is
            # bounded by seq_inp_length (not a fixed 24 h) so it can neither
            # run into the output window nor past the end of the segment.
            hour_data = values[start:input_end]
            hour_averaged = np.array([
                np.mean(hour_data[j:j + rows_per_hour, :], axis=0)
                for j in range(0, len(hour_data), rows_per_hour)
            ])

            # Minute-level view of the most recent part of the input window.
            minute_start = max(start, input_end - minute_window)
            minute_data = values[minute_start:input_end, :]

            # Pump status over the output window, one sample per 60 rows.
            pump1_future = values[input_end:output_end:rows_per_hour, 3].reshape(-1, 1)

            # Target over the output window, one sample per 60 rows.
            y = values[input_end:output_end:rows_per_hour, 0]

            hour_sequences.append(hour_averaged)
            minute_sequences.append(minute_data)
            pump_fut_seq.append(pump1_future)
            ys.append(y)

    return np.array(hour_sequences), np.array(minute_sequences), np.array(pump_fut_seq), np.array(ys)

### Uses `segments` created earlier in this script


print(f"Number of segments: {len(segments)}")

# Making sequences

SEQ_INP_LENGTH = 24 * 60  # 24 hours of input data in minutes
SEQ_OUT_LENGTH = 12 * 60  # 12 hours of output data in minutes


# Build the windowed arrays from every segment.
hour_sequences, minute_sequences, pump_fut_seq, ys = create_sequences(segments, SEQ_INP_LENGTH, SEQ_OUT_LENGTH, param)

## The first three arrays are the model inputs; `ys` holds the targets.
print("hour_sequences:", hour_sequences.shape)
print("minute_sequences:", minute_sequences.shape)
print("pump_fut_stat:", pump_fut_seq.shape)
print("ys:", ys.shape)
# Number of leading windows held out for validation (20 % of all windows).
val_size = int(len(hour_sequences) * 0.2)

# Cut every array at the same position: the head becomes the validation set,
# the tail the training set.
hour_val, hour_train = np.split(hour_sequences, [val_size])
minute_val, minute_train = np.split(minute_sequences, [val_size])
pump_val, pump_train = np.split(pump_fut_seq, [val_size])
y_val, y_train = np.split(ys, [val_size])

# Report the resulting shapes, train before validation for each array.
for _label, _array in (
    ("hour_train shape:", hour_train),
    ("hour_val shape:", hour_val),
    ("minute_train shape:", minute_train),
    ("minute_val shape:", minute_val),
    ("pump_train shape:", pump_train),
    ("pump_val shape:", pump_val),
    ("y_train shape:", y_train),
    ("y_val shape:", y_val),
):
    print(_label, _array.shape)
### Visualizing weights

# Load the saved model
model = load_model(r"LSTM_multi_model_steps.h5")

# Print the model summary (optional, but helpful to understand the model structure)
model.summary()

# Get the model weights
weights = model.get_weights()

# Print the weights
# NOTE(review): `i` is the position in the list returned by get_weights();
# presumably that list holds one array per weight tensor rather than one per
# layer, so the "Layer" label may be misleading — confirm.
for i, weight in enumerate(weights):
    print(f"Layer {i+1} weights: {weight}")


import matplotlib.pyplot as plt

# Assuming the first layer has weights
first_layer_weights = weights[0]

# Plot the weights as an image (one colour per weight value).
# NOTE(review): imshow needs image-like (2-D) data — confirm weights[0] is 2-D.
plt.figure(figsize=(10, 2))
plt.imshow(first_layer_weights, aspect='auto', cmap='viridis')
plt.colorbar()
plt.title("First Layer Weights")
plt.show()
# Make predictions on the training set
# The model takes three inputs in this order: hourly window, minute window,
# future pump status.
y_train_pred = model.predict([hour_train, minute_train, pump_train])

# Make predictions on the validation set
y_val_pred = model.predict([hour_val, minute_val, pump_val])

# Compare target and prediction shapes for the training set ...
print(y_train.shape, y_train_pred.shape)

# ... and for the validation set.
print(y_val.shape, y_val_pred.shape)
def _inverse_water_level(scaled: np.ndarray) -> np.ndarray:
    """Map scaled water-level values back to their original units.

    ``scaler`` was fitted on ``len(param)`` columns with the water level as
    column 0, so it can only invert full-width rows. Every value is therefore
    placed in column 0 of a zero-padded matrix, inverse-transformed, and read
    back from column 0. The padding columns are discarded.

    Args:
        scaled: Array whose first axis indexes samples; the remaining axes
            hold the forecast horizons.

    Returns:
        A ``(n_samples, n_horizons)`` array in original units.
    """
    scaled = np.asarray(scaled)
    n_samples = scaled.shape[0]
    n_horizons = int(np.prod(scaled.shape[1:]))

    padded = np.zeros((n_samples * n_horizons, len(param)))
    padded[:, 0] = scaled.reshape(-1)
    return scaler.inverse_transform(padded)[:, 0].reshape(n_samples, n_horizons)


# Targets and predictions for both splits, back in original units
# (one column per forecast hour).
y_train_inverse = _inverse_water_level(y_train)
y_train_pred_inverse = _inverse_water_level(y_train_pred)
y_val_inverse = _inverse_water_level(y_val)
y_val_pred_inverse = _inverse_water_level(y_val_pred)

print(y_train_inverse.shape, y_val_pred_inverse.shape)
print(y_train_inverse)
import matplotlib.pyplot as plt

# Draw one figure per split: row 5 of the actual values against row 5 of the
# predictions (i.e. all forecast hours of a single window).
for _title, _actual, _predicted in (
    ('Train_data', y_train_inverse, y_train_pred_inverse),
    ('val_data', y_val_inverse, y_val_pred_inverse),
):
    plt.figure(figsize=(8, 3))
    plt.plot(_actual[5], label='Actual')
    plt.plot(_predicted[5], label='Predicted')
    plt.title(_title)
    plt.xlabel('Time Steps')
    plt.ylabel('cm')
    plt.legend()
    plt.show()
### errors for each hour

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# One entry per forecast hour (column), computed on the training split.
mae = []
mse = []
mape = []
for hour in range(y_train_inverse.shape[1]):
    actual_at_hour = y_train_inverse[:, hour]
    predicted_at_hour = y_train_pred_inverse[:, hour]
    mae.append(mean_absolute_error(actual_at_hour, predicted_at_hour))
    mse.append(mean_squared_error(actual_at_hour, predicted_at_hour))
    mape.append(mean_absolute_percentage_error(actual_at_hour, predicted_at_hour))

# RMSE follows directly from the MSE values.
rmse = np.sqrt(mse)

print(mae, mse, rmse, mape)


## Collect the metrics in one table, one row per forecast hour
errors = pd.DataFrame(
    {
        'Hour': range(1, 13),
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'MAPE': mape,
    }
)

## Persist the table; the plotting code further down reads this file back
errors.to_csv(r"LSTM_conc_errors.csv")

# simple model

def predict_and_get_true_values_simple_model(y: list, time_steps_ahead: int) -> tuple[list, list]:
    """Pair every value with the value ``time_steps_ahead`` positions later.

    This is a persistence baseline: the "prediction" for a future step is
    simply the value observed now.

    Args:
        y: Sequence of observations.
        time_steps_ahead: Offset between a prediction and its true value.

    Returns:
        ``(y_predict, y_true)`` as two equally long lists. Both are empty
        when ``y`` has no more than ``time_steps_ahead`` elements.
    """
    anchor_positions = range(len(y) - time_steps_ahead)
    y_predict = [y[pos] for pos in anchor_positions]
    y_true = [y[pos + time_steps_ahead] for pos in anchor_positions]
    return y_predict, y_true


def predict_and_get_true_values_for_all_segments(time_steps_ahead: int) -> tuple[list, list]:
    """Run the simple (persistence) model over every segment.

    Reads the module-level ``segments``, ``scaler`` and ``param``. For each
    segment the scaled water level is converted back to original units and
    paired with itself ``time_steps_ahead`` rows later; pairs never cross a
    segment boundary.

    Args:
        time_steps_ahead: Forecast offset in rows.

    Returns:
        ``(y_preds, y_trues)`` concatenated over all segments.
    """
    y_preds = []
    y_trues = []
    for segment in segments:
        # 'water_level' is the column that was scaled as scaler column 0, so
        # it is the one the inverse transform below applies to.
        y = segment["water_level"].values
        # The scaler expects len(param) columns: pad with zeros and keep only
        # column 0 after inverting.
        y = scaler.inverse_transform(
            np.concatenate((y.reshape(-1, 1),
                           np.zeros((y.shape[0], len(param) - 1))), axis=1)
        )
        y = y[:, 0]
        y_pred, y_true = predict_and_get_true_values_simple_model(
            y, time_steps_ahead)
        y_preds.extend(y_pred)
        y_trues.extend(y_true)
    return y_preds, y_trues

# Error metrics of the simple model, one entry per forecast hour (1..12).
mae_simple = []
mse_simple = []
mape_simple = []

for i in range(1, 13):
    # i hours ahead = i * 60 rows ahead.
    y_preds, y_trues = predict_and_get_true_values_for_all_segments(i * 60)
    mae_simple.append(mean_absolute_error(y_trues, y_preds))
    mse_simple.append(mean_squared_error(y_trues, y_preds))
    mape_simple.append(mean_absolute_percentage_error(y_trues, y_preds))

# RMSE is derived once from the complete list of MSE values.
rmse_simple = np.sqrt(mse_simple)

print(mae_simple, mse_simple, rmse_simple, mape_simple)
## make a data frame


# Create a DataFrame
errors_simple = pd.DataFrame({
    'Hour': range(1, 12 + 1),
    'MAE': mae_simple,
    'MSE': mse_simple,
    'RMSE': rmse_simple,
    'MAPE': mape_simple
})

## save as csv
errors_simple.to_csv(r"errors_simple_model1.csv")
### Plot of the LSTM model errors only (MAE and RMSE per forecast hour)
import pandas as pd
import matplotlib.pyplot as plt

# Read back the per-hour error table written earlier in this script.
errors = pd.read_csv(r"LSTM_conc_errors.csv")


# Plotting the data with labels and styles
plt.figure(figsize=(10, 6))  # Set the figure size for better visibility

plt.plot(errors['Hour'], errors['MAE'], label='LSTM_MAE', marker='o', linestyle='-', color='b')

plt.plot(errors['Hour'], errors['RMSE'], label='LSTM_RMSE', marker='o', linestyle='-', color='g')


# Adding labels and title
plt.xlabel('Time (hours)', fontsize=12)
plt.ylabel('Model_Error (cm)', fontsize=12)
plt.title('LSTM Model error', fontsize=14)

# Adding grid lines for better readability
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
# Place the legend outside the axes, to the right.
# NOTE: this section does not call plt.show() itself.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=12)

### plot for LSTM model and simple model
import pandas as pd
import matplotlib.pyplot as plt

# Error tables written earlier in this script.
errors = pd.read_csv(r"LSTM_conc_errors.csv")
errors_simple = pd.read_csv(r"errors_simple_model1.csv")

# One entry per curve, in drawing order:
# (table, metric column, legend label, marker, line style, colour, end-of-curve note)
_curves = (
    (errors, 'MAE', 'LSTM_MAE', 'o', '-', 'b', 'MAE_LSTM Model'),
    (errors_simple, 'MAE', 'SIM_MAE', 's', '--', 'r', 'MAE_Simple Model'),
    (errors, 'RMSE', 'LSTM_RMSE', 'o', '-', 'g', 'RMSE_LSTM Model'),
    (errors_simple, 'RMSE', 'SIM_RMSE', 's', '--', 'y', 'RMSE_Simple Model'),
)

# Draw the four error curves.
plt.figure(figsize=(10, 6))
for _table, _metric, _legend, _marker, _line, _colour, _note in _curves:
    plt.plot(_table['Hour'], _table[_metric], label=_legend, marker=_marker, linestyle=_line, color=_colour)

# Axis labels, title and grid.
plt.xlabel('Time (hours)', fontsize=12)
plt.ylabel('MOdel_Error (cm)', fontsize=12)
plt.title('Comparison of Simple Model with LSTM Model', fontsize=14)
plt.grid(True, which='both', linestyle='--', linewidth=0.5)

# Label every curve next to its last point; empty tables get no label.
for _table, _metric, _legend, _marker, _line, _colour, _note in _curves:
    _last = len(_table[_metric]) - 1
    if _last < 0:
        continue
    plt.annotate(_note, (_table['Hour'][_last], _table[_metric][_last]), textcoords="offset points", xytext=(10,10), ha='center', fontsize=12, color=_colour)

# Legend outside the axes, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=12)

# Extra marker-only overlays on the two MAE curves (added after the legend).
plt.plot(errors['Hour'], errors['MAE'], 'bo')
plt.plot(errors_simple['Hour'], errors_simple['MAE'], 'rs')

plt.show()
import pandas as pd

# Training history saved as CSV; the 'loss' and 'val_loss' columns are read
# below.
history = pd.read_csv(r"LSTM_conc_model_history.csv")

# Plot the training and validation loss to check whether the model converges.
# A loss curve that decreases and eventually becomes stable indicates that the
# model is learning.
# NOTE(review): the original note also stated that the validation loss varies
# little, decreases slowly and stays below the training loss — confirm against
# the actual curves.


# Plot training and validation loss curves
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 5))
plt.plot(history['loss'], label='Training Loss')
plt.plot(history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Function to create sequences for future predictions
def create_future_sequences(df: pd.DataFrame, seq_inp_length: int, param: list, minute_window: int = 12 * 60) -> tuple[np.ndarray, np.ndarray]:
    """Build one model input (batch of size 1) from the newest rows of ``df``.

    Args:
        df: Frame containing the ``param`` columns; its last
            ``seq_inp_length`` rows form the input window.
        seq_inp_length: Number of rows in the input window.
        param: Feature column names.
        minute_window: Number of trailing rows returned at full resolution.
            Defaults to ``12 * 60``.

    Returns:
        ``(hour_data, minute_data)``: the input window averaged over
        consecutive blocks of 60 rows, shaped ``(1, n_blocks, len(param))``,
        and the last ``minute_window`` rows shaped
        ``(1, minute_window, len(param))``.

    Raises:
        ValueError: If ``df`` has fewer than ``seq_inp_length`` rows. Without
            this check the missing blocks would be averaged over empty slices
            and reach the model as NaN.
    """
    segment = df[param].values[-seq_inp_length:]
    if len(segment) < seq_inp_length:
        raise ValueError(
            f"need at least {seq_inp_length} rows to build the input window, got {len(segment)}"
        )

    hour_data = [np.mean(segment[j:j + 60, :], axis=0) for j in range(0, seq_inp_length, 60)]
    # Leading axis of size 1 = a batch holding this single window.
    hour_data = np.array(hour_data).reshape(1, -1, len(param))
    minute_data = segment[-minute_window:].reshape(1, -1, len(param))
    return hour_data, minute_data


# Prepare data for future predictions
SEQ_INP_LENGTH = 24 * 60  # 24 hours of input data in minutes
param = ['water_level', 'sea_level', 'temp', 'pump1_status']
# The newest SEQ_INP_LENGTH rows are used as they are; gaps are not checked here.
last_24_hours = df[-SEQ_INP_LENGTH:]
hour_data, minute_data = create_future_sequences(last_24_hours, SEQ_INP_LENGTH, param)
print(hour_data.shape)
print(minute_data.shape)
# Assume future pump1_pct values are known or kept constant:
# repeat the most recent pump status (column 3) for the 12 forecast hours.
future_pump1_pct = np.tile(minute_data[0, -1, 3], (12, 1)).reshape(1, -1, 1)
future_pump1_pct.shape
# Make future predictions
future_predictions = model.predict([hour_data, minute_data, future_pump1_pct])
print("Future Predictions (normalized):", future_predictions)
# Ensure future_predictions is a numpy array
future_predictions = np.array(future_predictions)
# Inverse transform the future predictions (zero-pad to the scaler's width and
# read back column 0, the water level)
future_predictions_inverse = [
    scaler.inverse_transform(
        np.concatenate((future_predictions[:, hours_ahead-1].reshape(-1, 1), np.zeros((future_predictions.shape[0], len(param) - 1))), axis=1)
    )[:, 0] for hours_ahead in range(1, 13)
]
future_predictions_inverse = np.column_stack(future_predictions_inverse)
print("Future Predictions (inverse transformed):", future_predictions_inverse)
# Combine predictions with original data
future_dates = [df['utctime'].iloc[-1] + pd.Timedelta(hours=i) for i in range(1, 13)]
# The predictions go into the same 'water_level' column as the history, so
# that they end up in the series that is inverse-transformed and plotted
# below. They are still normalized here, like df['water_level'].
future_df = pd.DataFrame({
    'utctime': future_dates,
    'water_level': future_predictions.flatten()
})
# Check the shapes of the dataframes
print("Original DataFrame shape:", df.shape)
print("Future DataFrame shape:", future_df.shape)

combined_df = pd.concat([df[['utctime', 'water_level']], future_df])

combined_df.shape

# Convert history and predictions back to original units in one go.
inverse_array = scaler.inverse_transform(np.concatenate((combined_df["water_level"].values.reshape(-1, 1),
                       np.zeros((combined_df.shape[0], len(param) - 1))), axis=1)
    )[:, 0]
# Plot the results
plt.figure(figsize=(12, 6))
plt.plot(combined_df['utctime'],
         inverse_array, label='Original + Predicted')
plt.axvline(x=df['utctime'].iloc[-1], color='r',
            linestyle='--', label='Prediction Start')
# Show at most the last 10000 rows; fall back to the full range on shorter data.
plt.xlim(combined_df.iloc[-min(10000, len(combined_df))]['utctime'], combined_df.iloc[-1]['utctime'])
plt.xlabel('Time')
plt.ylabel('water_level')
plt.title('Original Data and Future Predictions')
plt.legend()
plt.show()