In [None]:
import os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime, timedelta
from keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D,UpSampling3D, Conv3DTranspose, Flatten, Concatenate, Dense, TimeDistributed, Bidirectional, Input, Reshape  
from keras.models import Model
from sklearn.model_selection import train_test_split
from keras.layers import LSTM
from keras.callbacks import EarlyStopping
from time import sleep
import seaborn as sns

In [None]:
# Read the windowed parquet file, order the rows by their 'Key' string, and
# parse that key (layout '%Y%m%d_%H%M') into datetimes on a fresh 0..n-1 index.
parquet_df = (
    pd.read_parquet("/home/arman_abouali/Downloads/DWD/Original_files/DWD_window/X_data_window.parquet")
    .sort_values(by='Key', ascending=True)
    .assign(Key=lambda frame: pd.to_datetime(frame['Key'], format='%Y%m%d_%H%M'))
    .reset_index(drop=True)
)

In [None]:
# Put the frames on a regular 15-minute time axis and discard the 'Key' column.
# NOTE(review): the timestamps are attached by row position, not matched against
# 'Key'; set_index raises if the row count differs from the range length, and
# this presumably assumes the file has no gaps or duplicates — confirm.
parquet_idx = pd.date_range(start="2003-11-01 00:00:00", end="2017-12-31 23:45:00", freq="15min")
parquet_df = (
    parquet_df
    .reset_index(drop=True)
    .set_index(parquet_idx)
    .reindex(parquet_idx)
    .drop(columns='Key')
)

In [None]:
def flatten_list_of_arrays_and_divide(array_list, divisor=10):
    """Concatenate a sequence of arrays, scale it down, and return it flattened.

    Parameters
    ----------
    array_list : sequence of array-like
        Arrays accepted by ``np.concatenate`` (joined along the first axis).
    divisor : number, optional
        Value every element is divided by. Defaults to 10, the factor that was
        previously hard-coded, so existing single-argument calls are unchanged.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding the concatenated values divided by
        ``divisor``.
    """
    # Join the pieces first so the division runs once over a single array.
    concatenated_array = np.concatenate(array_list)
    divided_array = concatenated_array / divisor
    return divided_array.flatten()

# Replace every entry of 'Value' with its concatenated, scaled, 1-D version.
flattened_values = parquet_df['Value'].apply(flatten_list_of_arrays_and_divide)
parquet_df['Value'] = flattened_values

In [None]:
# Per-row total of the flattened 'Value' array, stored alongside it.
image_totals = parquet_df['Value'].apply(np.sum)
parquet_df['Image_Sum'] = image_totals

parquet_df

In [None]:
# Read the ';'-separated sensor table.
input_df = pd.read_csv('/home/arman_abouali/Downloads/DWD/input.csv', sep=';')

# Combine 'Datum' and 'Zeit' into one datetime column named 'Zeit'.
# NOTE(review): "24:00:00" is rewritten to "00:00:00" while 'Datum' is left
# unchanged, so such rows land at the start of the same day rather than the
# start of the next one — confirm this matches the source's convention.
clock_text = input_df['Zeit'].replace("24:00:00", "00:00:00")
input_df['Zeit'] = pd.to_datetime(input_df['Datum'] + ' ' + clock_text, format='%d.%m.%Y %H:%M:%S')

# Drop the now-redundant date column and order the rows chronologically.
input_df = (
    input_df
    .drop(columns='Datum')
    .sort_values(by='Zeit', ascending=True)
    .reset_index(drop=True)
)

# Row-wise total over the four precipitation columns (NaN in any of them propagates).
input_df['Sensor_Sum'] = (
    input_df['GranetalsperreMin15Niederschlag']
    + input_df['HahnenkleeMin15Niederschlag']
    + input_df['Niederschlag_Gosequelle']
    + input_df['Niederschlag_Abzuchtquelle']
)
input_df


In [None]:
# Attach a regular 15-minute DatetimeIndex to the sensor table.
# NOTE(review): labels are assigned by row position (set_index raises on a
# length mismatch); the 'Zeit' column itself is not consulted — confirm the
# table has exactly one row per 15-minute step.
idx = pd.date_range(start="2003-11-01 00:00:00", end="2018-06-30 23:45:00", freq="15min")
input_df = input_df.reset_index(drop=True).set_index(idx).reindex(idx)

In [None]:
# The time information now lives in the index, so the 'Zeit' column is removed.
input_df = input_df.drop(columns='Zeit')

In [None]:
# Inner join on the time index: keep only timestamps present in both tables.
merged_df = input_df.merge(parquet_df, how='inner', left_index=True, right_index=True)
merged_df

In [None]:
# Persist the merged table (including its time index) next to the notebook.
# NOTE(review): 'Value' holds NumPy arrays, which CSV can only store as text —
# presumably this file is for inspection rather than reloading; confirm.
merged_df.to_csv(path_or_buf='merged_df.csv', index=True)

In [None]:
# Stack the per-row 'Value' arrays into one feature array.
X_images = np.array(merged_df['Value'].tolist())

# Target series as a single-column 2-D array.
y = np.array(merged_df['SennhuetteMin15W'].tolist()).reshape(-1, 1)

print(f"Shape of X: {X_images.shape}")
print(f"Shape of y: {y.shape}")

In [None]:
# Min-max scale the target; the scaler is fitted on the full series, i.e.
# before any train/validation/test split is made.
scaler = MinMaxScaler()
y = scaler.fit_transform(y.reshape(-1, 1))  # reshape keeps y as one column

print(f"Shape of y_scaled: {y.shape}")

In [None]:
# Use a dedicated scaler for the image features so that `scaler` keeps
# referring to the one fitted on the target `y`; that one is needed to map
# predictions back to original units via `scaler.inverse_transform`.
image_scaler = MinMaxScaler()

# Scale with a single global min/max over all values (hence the one-column
# reshape), then restore the original array layout so the result is a drop-in
# counterpart of `X_images`.
X_images_scaled = image_scaler.fit_transform(X_images.reshape(-1, 1)).reshape(X_images.shape)
# NOTE(review): the sequence-building cell further down still reads the
# unscaled `X_images` — confirm which of the two is meant to feed the model.

In [None]:
def create_sequences(array, sequence_length):
    """Return all sliding windows of ``sequence_length`` consecutive rows.

    Window ``k`` is ``array[k:k + sequence_length]``. Start positions run from
    0 to ``len(array) - sequence_length - 1``, so the final possible window
    (the one ending on the last row) is not produced.

    Returns a NumPy array stacking the windows along a new first axis; it is
    empty when ``len(array) <= sequence_length``.
    """
    window_count = len(array) - sequence_length
    windows = [array[start:start + sequence_length] for start in range(window_count)]
    return np.array(windows)

In [None]:
# Window length in rows (the time index uses 15-minute steps, so 96 rows span 24 hours).
seq_length = 96
X_images_sequence = create_sequences(X_images, seq_length)

# Window i covers rows i .. i + seq_length - 1; its label is the target of the
# row that immediately follows the window, i.e. y[i + seq_length].
y_sequence = y[seq_length:]

# Sanity check: both arrays should have the same number of samples.
print(f"Shape of X sequence: {X_images_sequence.shape}")
print(f"Shape of y sequence: {y_sequence.shape}")

In [None]:
# Nominal 70% / 20% sample counts.
# NOTE(review): the split cell uses train_test_split with test_size=0.1 and
# then 0.2 of the remainder (about 72% / 18% / 10%), so these numbers need not
# equal len(X_train) / len(X_val).
n_sequences = X_images_sequence.shape[0]
train_len = int(n_sequences*0.7)
val_len = int(n_sequences*0.2)

In [None]:
# Hold out 10% of the sequences as the test set; shuffle=False keeps the
# samples in their original order.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_images_sequence,
    y_sequence,
    test_size=0.1,
    random_state=42,
    shuffle=False,
)

# Of the remaining samples, 20% become the validation set, again unshuffled.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)

# Report the resulting shapes.
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_val: {X_val.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_val: {y_val.shape}")
print(f"Shape of y_test: {y_test.shape}")

In [None]:
# Stacked LSTM regressor. Each sample is one window with shape
# (X_train.shape[1], X_train.shape[2]); the network emits a single value.
model = Sequential()

# Three recurrent layers; the first two return the full sequence so the next
# LSTM receives one vector per time step.
model.add(LSTM(32, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(32, return_sequences=True , activation='swish'))
model.add(LSTM(32))

# Normalise the final LSTM state before the dense head.
model.add(BatchNormalization())

# Dense head and single-unit output.
model.add(Dense(32, activation='swish'))
model.add(Dense(1, activation='swish'))

# Mean absolute error is both the training loss and the reported metric.
model.compile(loss='mae', optimizer=tf.keras.optimizers.Adam(1e-3), metrics=['mae'])

model.summary()

callbacks = [
    # restore_best_weights=True: when training stops early, the in-memory model
    # is rolled back to the epoch with the lowest val_loss. Without it, the
    # model saved and evaluated below would carry the weights of the last
    # epoch, which is up to `patience` epochs past the best one.
    tf.keras.callbacks.EarlyStopping(patience=10, monitor='val_loss', restore_best_weights=True),
    # Independently keep the best checkpoint on disk.
    tf.keras.callbacks.ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)
]

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=1024, validation_data=(X_val, y_val), callbacks=callbacks)

# Save the model
model_save_path = '/home/arman_abouali/Downloads/DWD/Original_files/DWD_window/LSTM_model.pb'  # Replace with your own path
model.save(model_save_path)

print(f"Model saved to {model_save_path}")

In [None]:
# Training vs. validation loss per epoch, on a logarithmic y-axis.
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.yscale('log')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Predict on the training windows.
y_hat_train = model.predict(X_train)

# One axes overlaying prediction and reference for output column 0.
fig, ax = plt.subplots(figsize=(7, 6))
for series, legend_text, opacity in (
    (y_hat_train[:, 0], 'Predicted 0', None),
    (y_train[:, 0], 'Reference 0', .5),
):
    ax.plot(series, label=legend_text, alpha=opacity)
ax.legend()
ax.set_title('Sennhuette_LSTM Model Prediction')

plt.show()


In [None]:
# Predict on the validation windows.
y_hat_val = model.predict(X_val)

# One axes overlaying prediction and reference for output column 0.
fig, ax = plt.subplots(figsize=(7, 6))
for series, legend_text, opacity in (
    (y_hat_val[:, 0], 'Predicted', None),
    (y_val[:, 0], 'Reference', .5),
):
    ax.plot(series, label=legend_text, alpha=opacity)
ax.legend()
ax.set_title('Sennhuette_LSTM Model Prediction')

plt.show()


In [None]:
# Predict on the held-out test windows.
y_hat_test = model.predict(X_test)

# One axes overlaying prediction and reference for output column 0.
fig, ax = plt.subplots(figsize=(7, 6))
for series, legend_text, opacity in (
    (y_hat_test[:, 0], 'Predicted 0', None),
    (y_test[:, 0], 'Reference 0', .5),
):
    ax.plot(series, label=legend_text, alpha=opacity)
ax.legend()
ax.set_title('Sennhuette_LSTM Model Prediction')

plt.show()


In [None]:
# Flatten test targets and predictions to 1-D vectors.
y_test_1d = y_test.ravel()
y_hat_test_1d = y_hat_test.ravel()

# Two-column table; a freshly built frame already carries a 0..n-1 index.
pred_df = pd.DataFrame(dict(measured=y_test_1d, predicted=y_hat_test_1d))

# Write the table without its index column.
pred_df.to_csv('Prediction.csv', index=False)


In [None]:
# Re-load the predictions from the same relative path the previous cell wrote
# to ('Prediction.csv'). The former absolute Downloads path only pointed at
# that file when the notebook happened to run from that directory; otherwise it
# read a stale copy or failed.
predict_df = pd.read_csv("Prediction.csv")
predict_df

In [None]:
# Error metrics
def mse(y_true, y_pred):
    """Mean squared error: the mean of the squared element-wise differences."""
    squared_error = np.square(y_true - y_pred)
    return squared_error.mean()

def rmse(y_true, y_pred):
    """Root mean squared error: the square root of ``mse(y_true, y_pred)``."""
    mean_squared = mse(y_true, y_pred)
    return np.sqrt(mean_squared)

def mae(y_true, y_pred):
    """Mean absolute error: the mean of the absolute element-wise differences."""
    absolute_error = np.absolute(y_true - y_pred)
    return absolute_error.mean()

def r_squared(y_true, y_pred):
    """Coefficient of determination: 1 - SS_res / SS_tot.

    SS_res is the sum of squared prediction errors; SS_tot is the sum of
    squared deviations of ``y_true`` from its own mean.
    """
    residual = y_true - y_pred
    centered = y_true - np.mean(y_true)
    return 1 - (np.sum(residual ** 2) / np.sum(centered ** 2))

# (reference, prediction) pair for each split, keyed by split name.
datasets = dict(
    train=(y_train, y_hat_train),
    test=(y_test, y_hat_test),
    val=(y_val, y_hat_val),
)
def round_metric(metric_value, decimals=6):
    """Round ``metric_value`` to ``decimals`` decimal places (6 by default)."""
    rounded = round(metric_value, decimals)
    return rounded

# Print MSE, RMSE, MAE and R-squared for every split, one block per split.
for name in datasets:
    y_true, y_pred = datasets[name]
    print(f"Metrics for {name} dataset:")
    print(f"MSE: {round_metric(mse(y_true, y_pred))}")
    print(f"RMSE: {round_metric(rmse(y_true, y_pred))}")
    print(f"MAE: {round_metric(mae(y_true, y_pred))}")
    print(f"R-squared: {round_metric(r_squared(y_true, y_pred))}") 
    print("-" * 30)


# Display name for each target column, in the column order of y_test.
# The target array is built from the single column 'SennhuetteMin15W', so index
# 0 must carry that name; the previous first entry labelled the residual plot
# with a different gauge.
column_names = ['SennhuetteMin15W']

# Residual scatter plot helper
def plot_residuals(y_true, y_pred, column_name):
    """Scatter the residuals (``y_true - y_pred``) against the predictions.

    A dashed red horizontal line marks zero residual; ``column_name`` is used
    in the title and the x-axis label. The figure is shown immediately.
    """
    prediction_error = y_true - y_pred
    plt.scatter(y_pred, prediction_error, alpha=0.5)
    plt.axhline(0, color='r', linestyle='--')
    plt.xlabel(f"Predicted Values for {column_name}")
    plt.ylabel("Residuals")
    plt.title(f"Residual Plot for {column_name}")
    plt.show()

# One residual plot per target column of the test set.
for column_index in range(y_test.shape[1]):
    plot_residuals(
        y_test[:, column_index],
        y_hat_test[:, column_index],
        column_names[column_index],
    )

In [None]:
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)
custom_index = 500     # first test sample to show
sequence_length = 8    # number of consecutive test samples to show

# Locate the first test sample on the time axis.
# The labels are y[seq_length:], and the unshuffled splits put the test block
# after the training and validation blocks, so test sample k belongs to
# merged_df row seq_length + len(y_train) + len(y_val) + k. Using the actual
# split sizes avoids the rounding drift of the nominal 70/20 lengths, and
# merged_df is the frame y was built from.
test_start_index = seq_length + len(y_train) + len(y_val)

# Timestamps of every test label
test_time_stamps = merged_df.index[test_start_index:test_start_index + len(y_test)]

# Timestamps of the selected excerpt, formatted for the x-axis
sequence_time_stamps = test_time_stamps[custom_index:custom_index + sequence_length].strftime('%Y-%m-%d %H:%M:%S')

# Actual and predicted values for the excerpt (both on the MinMax-scaled target axis)
actual_sequence = y_test[custom_index:custom_index + sequence_length]
predicted_sequence = y_hat_test[custom_index:custom_index + sequence_length]

# Plotting the selected sequence for one column
column_name = 'SennhuetteMin15W'

plt.figure(figsize=(10, 4))  # wide figure so the rotated time labels stay readable
plt.plot(sequence_time_stamps, actual_sequence, label='Actual', marker='o')
plt.plot(sequence_time_stamps, predicted_sequence, label='Predicted', marker='x')
plt.title(f"Actual vs Predicted for {column_name}")
plt.xlabel("Time stamp")
plt.ylabel(column_name)
plt.xticks(rotation=90)
plt.legend()
plt.tight_layout()
plt.show()



In [None]:
# Aliases for the test targets and the test predictions used by the export below.
y_ground_truth, y_prediction = y_test, y_hat_test

In [None]:
# Two-column result table (one row per test sample, default 0..n-1 index);
# the single-column arrays are flattened so each becomes one column.
df_result = pd.DataFrame({
    "measured": np.ravel(y_ground_truth),
    "predicted": np.ravel(y_prediction),
})

In [None]:
# Write the measured/predicted table as a ';'-separated CSV file.
# NOTE(review): `index` receives the string "0" rather than a bool; a non-empty
# string is truthy, so the index column is presumably still written — confirm
# whether index=False was intended.
df_result.to_csv("Arman_090224.csv", sep = ";", index = "0")