In [None]:
import os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import pyarrow as pa
from collections import Counter
import pyarrow.parquet as pq
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime, timedelta
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dropout, Dense 
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

# Absolute, machine-specific base directory (presumably the DWD data folder, judging by the path).
# NOTE(review): no cell visible in this notebook reads this variable — confirm it is still needed.
directory_path = "/home/arman_abouali/Downloads/DWD/"

In [None]:
# Upper bound on the GPU memory handed to TensorFlow, in MB (20000 MB, roughly 20 GB).
GPU_MEMORY_LIMIT_MB = 20000

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Create one logical device on the first physical GPU with a hard memory cap,
    # so TensorFlow allocates at most GPU_MEMORY_LIMIT_MB on it.
    try:
        tf.config.set_logical_device_configuration(
            gpus[0],
            [tf.config.LogicalDeviceConfiguration(memory_limit=GPU_MEMORY_LIMIT_MB)])
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        print(e)



In [None]:
# Load the input samples and order them by 'Key'. The index is reset AFTER sorting
# so that the positional index 0..n-1 follows the sorted order (resetting before the
# sort, as was done previously, leaves a scrambled index behind).
parquet_df = (
    pd.read_parquet("/home/arman_abouali/Downloads/X_data(1).parquet")
    .sort_values(by='Key', ascending=True)
    .reset_index(drop=True)
)
print(parquet_df)


In [None]:
values = parquet_df['Value'].values

# Dimensions of one sample image after stacking (rows x columns).
width = 71
length = 41

# Assemble one (width, length) image per sample. np.stack(..., axis=1) turns the
# i-th element of a 'Value' entry into column i of the image; assigning into the
# pre-allocated slot also enforces the expected (width, length) shape.
X_images = np.zeros(shape=(len(values), width, length))
for ind, val in enumerate(values):
    X_images[ind, :, :] = np.stack(val, axis=1)
# Negative cells are replaced by zero.
X_images = np.where(X_images < 0, 0, X_images)

# Targets: ';'-separated CSV indexed by the parsed 'Zeit' timestamp column.
y_df = pd.read_csv('test.csv', sep=';', parse_dates=['Zeit']).set_index('Zeit')

# Extract values for the desired date range
target_columns = ['Margarethenklippe_Pegel_now', 'Sennhuette_Pegel_now']
y = y_df.loc['2016-08-01':'2017-08-31'][target_columns].values

# Verify the shape of X
print(f"Shape of X: {X_images.shape}")
print(f"Shape of y: {y.shape}")

# X and y are matched purely by row position, so a length mismatch means the
# samples and targets are misaligned; fail here with a clear message rather
# than later inside the train/test split.
if X_images.shape[0] != y.shape[0]:
    raise ValueError(
        f"X has {X_images.shape[0]} samples but y has {y.shape[0]}; "
        "inputs and targets are paired by row position and must have equal length."
    )


In [None]:
# Rescale every target column with a MinMaxScaler (default settings).
# NOTE(review): the scaler is fitted on ALL rows of y, i.e. before the
# train/validation/test split done in the following cell, so the held-out
# targets influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler()
scaler.fit(y)
y = scaler.transform(y)
print(f"Shape of y_scaled: {y.shape}")

In [None]:
# Hold out 10% of the samples for testing, then 20% of the remainder for validation
# (roughly 72% / 18% / 10% train / val / test overall).
# NOTE(review): train_test_split shuffles by default, so the three sets interleave
# rows that are presumably ordered in time — confirm a random split is intended.
X_temp, X_im_test, y_temp, y_test = train_test_split(X_images, y, test_size=0.1, random_state=42)

X_im_train, X_im_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

# Append a trailing channel axis: (n, width, length) -> (n, width, length, 1), the
# layout the Conv2D input expects. Indexing with np.newaxis replaces
# np.reshape(..., newshape=...), whose `newshape` keyword is deprecated since NumPy 2.1.
X_im_train = X_im_train[..., np.newaxis]
X_im_val = X_im_val[..., np.newaxis]
X_im_test = X_im_test[..., np.newaxis]


In [None]:
# Report the shape of every split: the image tensors first, then the matching targets.
for split_label, split_array in (
    ("X_im_train", X_im_train),
    ("X_im_val", X_im_val),
    ("X_im_test", X_im_test),
    ("y_train", y_train),
    ("y_val", y_val),
    ("y_test", y_test),
):
    print(f"{split_label} shape:", split_array.shape)

In [None]:
# Small CNN regressor: two conv/pool stages, then a dense head with two linear
# outputs (one per target column).
input_shape = (X_im_train.shape[1], X_im_train.shape[2], 1)
model = Sequential([
    Conv2D(filters=64, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(2,),
])
model.summary()

# MSE loss with MAE tracked as an extra metric; Adam with a learning rate of 1e-5.
model.compile(
    loss='mean_squared_error',
    optimizer=tf.keras.optimizers.Adam(1e-5),
    metrics=['mae'],
)

# No callbacks are registered, so training always runs for the full number of epochs.
# An early-stopping callback on 'val_loss' (patience 10) was tried and is disabled:
#callbacks = [tf.keras.callbacks.EarlyStopping(patience=10, monitor='val_loss')]
callbacks = []
history = model.fit(
    X_im_train,
    y_train,
    epochs=1000,
    batch_size=256,
    validation_data=(X_im_val, y_val),
    callbacks=callbacks,
)



In [None]:
# Training vs. validation loss per epoch, drawn on a logarithmic y-axis.
loss_ax = plt.gca()
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    loss_ax.plot(history.history[history_key], label=curve_label)
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.set_yscale('log')
loss_ax.legend()
plt.show()

In [None]:
y_hat_train = model.predict(X_im_train)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One panel per target column: the prediction first, then the reference drawn
# semi-transparently on top of it.
train_titles = ('Margarethenklippe_Pegel_now', 'Sennhuette_Pegel_now')
for col, (ax, title) in enumerate(zip(axes, train_titles)):
    ax.plot(y_hat_train[:, col], label=f'Predicted {col}')
    ax.plot(y_train[:, col], label=f'Reference {col}', alpha=.5)
    ax.legend()
    ax.set_title(title)

plt.show()

In [None]:
y_hat_val = model.predict(X_im_val)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One panel per target column: the prediction first, then the reference drawn
# semi-transparently on top of it.
val_titles = ('Margarethenklippe_CNN_Validation', 'Sennhuette_CNN_Validation')
for col, (ax, title) in enumerate(zip(axes, val_titles)):
    ax.plot(y_hat_val[:, col], label=f'Predicted {col}')
    ax.plot(y_val[:, col], label=f'Reference {col}', alpha=.5)
    ax.legend()
    ax.set_title(title)

plt.show()

In [None]:
y_hat_test = model.predict(X_im_test)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One panel per target column: the prediction first, then the reference drawn
# semi-transparently on top of it.
test_titles = ('Margarethenklippe_CNN_Test', 'Sennhuette_CNN_Test')
for col, (ax, title) in enumerate(zip(axes, test_titles)):
    ax.plot(y_hat_test[:, col], label=f'Predicted {col}')
    ax.plot(y_test[:, col], label=f'Reference {col}', alpha=.5)
    ax.legend()
    ax.set_title(title)

plt.show()

In [None]:
# Define the metrics
def mse(y_true, y_pred):
    """Mean squared error: the average of the element-wise squared differences.

    Both arguments are combined with ``-`` and the result must expose ``.mean()``,
    so they are expected to be NumPy arrays (or array-likes that behave the same).
    The mean is taken over all elements, whatever the array's dimensionality.
    """
    errors = y_true - y_pred
    return (errors ** 2).mean()

def rmse(y_true, y_pred):
    """Root mean squared error: the square root of ``mse(y_true, y_pred)``."""
    mean_squared = mse(y_true, y_pred)
    return np.sqrt(mean_squared)

def mae(y_true, y_pred):
    """Mean absolute error: the average of the element-wise absolute differences.

    The mean is taken over all elements, whatever the array's dimensionality.
    """
    errors = y_true - y_pred
    return np.abs(errors).mean()

def r_squared(y_true, y_pred):
    """Coefficient of determination (R²), averaged uniformly over output columns.

    Axis 0 is treated as the sample axis. For every output column the score is
    ``1 - SS_res / SS_tot``, where ``SS_tot`` uses that column's OWN mean; the
    per-column scores are then averaged. Subtracting a single mean pooled over
    all columns (the previous behaviour) adds the between-column offset to
    ``SS_tot`` and overstates R² for multi-output targets. For 1-D input the
    result is the same as before.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,) or (n_samples, n_outputs)
        Reference values.
    y_pred : array-like, broadcastable against ``y_true``
        Predicted values.

    Returns
    -------
    numpy.float64
        Mean R² over the outputs. A column whose reference values are constant
        (``SS_tot == 0``) scores 1.0 if predicted perfectly and 0.0 otherwise,
        following the scikit-learn convention, instead of producing NaN/-inf.
    """
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.asarray(y_pred, dtype=float)

    # Reduce over the sample axis only, keeping one value per output column.
    ss_res = np.sum((y_true - y_pred) ** 2, axis=0)
    ss_tot = np.sum((y_true - np.mean(y_true, axis=0)) ** 2, axis=0)

    # Division by zero is handled explicitly right below, so silence the warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        scores = 1 - (ss_res / ss_tot)
    constant_target = ss_tot == 0
    scores = np.where(constant_target, np.where(ss_res == 0, 1.0, 0.0), scores)
    return np.mean(scores)

# (reference, prediction) pair for each split, keyed by split name; the insertion
# order (train, test, val) is the order in which metrics are reported.
datasets = dict(
    train=(y_train, y_hat_train),
    test=(y_test, y_hat_test),
    val=(y_val, y_hat_val),
)

def round_metric(metric_value, decimals=6):
    """Round ``metric_value`` to ``decimals`` decimal places (6 by default) for display."""
    rounded = round(metric_value, decimals)
    return rounded

# Print every metric for every split, followed by a 30-dash separator line.
metric_table = (
    ("MSE", mse),
    ("RMSE", rmse),
    ("MAE", mae),
    ("R-squared", r_squared),
)
for name, (y_true, y_pred) in datasets.items():
    print(f"Metrics for {name} dataset:")
    for metric_label, metric_fn in metric_table:
        print(f"{metric_label}: {round_metric(metric_fn(y_true, y_pred))}")
    print("-" * 30)


In [None]:
# Display names of the two target columns, in the same order as the columns of y;
# used below as titles and axis labels of the residual plots.
column_names = ['Margarethenklippe_Pegel_now','Sennhuette_Pegel_now']

# Function to plot residuals
def plot_residuals(y_true, y_pred, column_name):
    """Scatter the residuals (``y_true - y_pred``) against the predictions and show the figure.

    A dashed red horizontal line marks zero residual. ``column_name`` is only
    used in the title and the x-axis label.
    """
    ax = plt.gca()
    ax.scatter(y_pred, y_true - y_pred, alpha=0.5)
    ax.axhline(0, color='r', linestyle='--')
    ax.set_title(f"Residual Plot for {column_name}")
    ax.set_xlabel(f"Predicted Values for {column_name}")
    ax.set_ylabel("Residuals")
    plt.show()

# Draw one residual plot per target column of the test set.
n_targets = y_test.shape[1]
for col in range(n_targets):
    plot_residuals(y_test[:, col], y_hat_test[:, col], column_names[col])

In [None]:
np.random.seed(42)  # For reproducibility
# NOTE(review): nothing in this cell draws random numbers, so the seeding above
# appears to have no effect here.

# Window of test rows to inspect: [custom_index, custom_index + sequence_length).
custom_index = 200
sequence_length = 8
window = slice(custom_index, custom_index + sequence_length)

# Actual and predicted values for the sequence
actual_sequence = y_test[window]
predicted_sequence = y_hat_test[window]

column_names = ['Margarethenklippe_Pegel_now','Sennhuette_Pegel_now']

# One figure per target column.
# NOTE(review): y_test comes from train_test_split, which shuffles by default, so
# consecutive rows are presumably NOT consecutive in time despite the
# "Time step" axis label — confirm.
for i, target_name in enumerate(column_names):
    plt.figure(figsize=(10, 4))
    ax = plt.gca()
    ax.plot(actual_sequence[:, i], label='Actual', marker='o')
    ax.plot(predicted_sequence[:, i], label='Predicted', marker='x')
    ax.set_title(f"Actual vs Predicted for {target_name}")
    ax.set_xlabel("Time step")
    ax.set_ylabel(target_name)
    ax.legend()
    plt.show()


In [None]:
np.random.seed(42)  # For reproducibility
# NOTE(review): nothing in this cell draws random numbers, so the seeding above
# appears to have no effect here.

# Window of test rows to inspect: [custom_index, custom_index + sequence_length).
custom_index = 200
sequence_length = 24
window = slice(custom_index, custom_index + sequence_length)

# Actual and predicted values for the sequence
actual_sequence = y_test[window]
predicted_sequence = y_hat_test[window]

column_names = ['Margarethenklippe_Pegel_now','Sennhuette_Pegel_now']

# One figure per target column.
# NOTE(review): y_test comes from train_test_split, which shuffles by default, so
# consecutive rows are presumably NOT consecutive in time despite the
# "Time step" axis label — confirm.
for i, target_name in enumerate(column_names):
    plt.figure(figsize=(10, 4))
    ax = plt.gca()
    ax.plot(actual_sequence[:, i], label='Actual', marker='o')
    ax.plot(predicted_sequence[:, i], label='Predicted', marker='x')
    ax.set_title(f"Actual vs Predicted for {target_name}")
    ax.set_xlabel("Time step")
    ax.set_ylabel(target_name)
    ax.legend()
    plt.show()
