In [3]:
pip install PyWavelets

Collecting PyWavelets
  Downloading PyWavelets-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.4 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m
Installing collected packages: PyWavelets
Successfully installed PyWavelets-1.3.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
import tensorflow as tf
import tensorflow_addons as tfa

from keras import Sequential
from keras import layers
from keras.models import Model
from keras.layers import LSTM, Bidirectional, BatchNormalization, Dropout, Dense, Flatten, Conv1D
from keras.layers import MaxPooling1D, GRU, Input,Masking, Concatenate, dot
from tensorflow.keras.optimizers import Adam, SGD
from keras.losses import MeanAbsoluteError
from keras.metrics import RootMeanSquaredError
from keras.callbacks import EarlyStopping
from keras.callbacks import LearningRateScheduler
from keras.regularizers import l1, l2
from sklearn.model_selection import train_test_split
#from tensorflow.keras.optimizers import legacy
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pywt
from sklearn.model_selection import train_test_split

In [5]:
df = pd.read_excel('UpdatedDataSet.xlsx')
df.shape

(70128, 76)

In [7]:
from sklearn.preprocessing import MinMaxScaler

In [8]:
def pauta_criterion(series, threshold=3):
    """
    Identifies outliers in a series using the Pauta criterion (3-sigma rule).
    
    Parameters:
    - series: Pandas Series to analyze for outliers.
    - threshold: The number of standard deviations to use as the cutoff for outliers.
    
    Returns:
    - A boolean mask indicating which data points are outliers.
    """
    mean = series.mean()
    std_dev = series.std()
    outliers = (series - mean).abs() > threshold * std_dev
    return ~outliers  # Invert mask to keep non-outliers

# Filter data to exclude outliers based on Pauta criterion
non_outliers_mask = pauta_criterion(df['Target'])  # Apply only to 'Target' column
df_filtered = df[non_outliers_mask]

# Apply Min-Max scaling to all columns after removing outliers
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_filtered), columns=df.columns)


In [9]:
X_full = df_scaled.drop(columns=['Target'])  # All features except 'Target'
y = df_scaled['Target']  # Target variable

# Step 2: Calculate Information Gain (IG) and select top 20 features
from sklearn.feature_selection import mutual_info_regression
import numpy as np

# Calculate Information Gain scores
ig_scores = mutual_info_regression(X_full, y)

# Get indices of the top 20 features
ig_top20_indices = np.argsort(ig_scores)[-20:]  # Sort and select top 20 feature indices

# Select the top 20 features based on IG scores
X = X_full.iloc[:, ig_top20_indices]

# Display the selected features
print("\nTop 20 Features based on Information Gain (IG):")
print(X.columns)


Top 20 Features based on Information Gain (IG):
Index(['Feat 19', 'Feat 15', 'Feat 18', 'Feat 13', 'Feat 5', 'Feat 11',
       'Feat 8', 'Feat 4', 'Feat 2', 'Feat 48', 'Feat 7', 'Feat 14', 'Feat 17',
       'Feat 12', 'Feat 10', 'Feat 3', 'Feat 6', 'Feat 9', 'Feat 1',
       'Feat 16'],
      dtype='object')


In [10]:


# Assuming `X` is the DataFrame with the top 20 selected features from IG
# and `y` is the target variable

# Define a function to apply Wavelet Transformation to each feature
def wavelet_transform(X, wavelet='db1', level=1):
    """
    Apply wavelet transform to each feature in X.

    Parameters:
    - X: DataFrame of input features.
    - wavelet: Type of wavelet to use for the transform (default: 'db1').
    - level: Decomposition level (default: 1).

    Returns:
    - Transformed DataFrame where each feature has been wavelet-transformed.
    """
    transformed_data = []

    # Apply wavelet transform to each feature column
    for column in X.columns:
        coeffs = pywt.wavedec(X[column], wavelet, level=level)
        transformed_column = np.hstack(coeffs)  # Combine coefficients into a single array
        transformed_data.append(transformed_column[:len(X)])  # Match original length

    return pd.DataFrame(np.array(transformed_data).T, columns=X.columns)

# Apply Wavelet Transformation
X_wavelet = wavelet_transform(X)

# Split the transformed data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_wavelet, y, test_size=0.2, random_state=42)

# Reshape data for LSTM (samples, timesteps, features)
X_train_reshaped = np.expand_dims(X_train, axis=-1)
X_test_reshaped = np.expand_dims(X_test, axis=-1)


In [11]:
epoch = 20
batch_size = 64
steps_per_epoch = len(X_train_reshaped) // batch_size

# Define cyclic learning rate
cyclic_lr = tfa.optimizers.CyclicalLearningRate(
    initial_learning_rate=1e-04,
    maximal_learning_rate=1e-02,
    scale_fn=lambda x: 1 / (2 ** (x - 1)),
    step_size=6 * steps_per_epoch
)

# Early stopping callback
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

# Define the optimizer
optimizer = Adam(learning_rate=cyclic_lr, amsgrad=True)

# Build the LSTM model
def base_model_lstm():
    model = Sequential()
    model.add(LSTM(units=128, return_sequences=True, activation="relu", 
                   input_shape=(X_train_reshaped.shape[1], 1), recurrent_dropout=0.2, 
                   kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01)))
    model.add(Flatten())
    model.add(Dense(units=128, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(1))
    return model

# Initialize and compile the LSTM model
lstm_model = base_model_lstm()
lstm_model.compile(optimizer=optimizer, loss='mean_absolute_error')
lstm_model.summary()

# Further split training data into training and validation sets
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train_reshaped, y_train, test_size=0.125, random_state=42)

# Train the model
history = lstm_model.fit(
    X_train_final, y_train_final, 
    validation_data=(X_val, y_val), 
    epochs=epoch,
    batch_size=batch_size, 
    callbacks=[callback]
)

# Predict on test set
y_predict = lstm_model.predict(X_test_reshaped).flatten()

# Evaluation metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_predict)
meanSqErr = metrics.mean_squared_error(y_test, y_predict)
rootMeanSqErr = np.sqrt(meanSqErr)

# Define MAPE function
def MAPE(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape = MAPE(y_test, y_predict)

# Print results
print('Mean Absolute Error:', meanAbErr)
print('Mean Absolute Percentage Error:', mape)
print('Root Mean Square Error:', rootMeanSqErr)

# Create a DataFrame for actual vs predicted values
diff = pd.DataFrame({'Actual value': y_test.values, 'Predicted value': y_predict})
print(diff)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 20, 128)           66560     
                                                                 
 flatten (Flatten)           (None, 2560)              0         
                                                                 
 dense (Dense)               (None, 128)               327808    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 394,497
Trainable params: 394,497
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20


2024-11-05 20:06:01.952251: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-11-05 20:06:01.952320: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2024-11-05 20:06:01.952360: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2024-11-05 20:06:01.952395: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory
2024-11-05 20:06:01.982970: W tensorflow/stream_executor/platform/default/dso_loader.cc:64

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Mean Absolute Error: 0.043417696153482066
Mean Absolute Percentage Error: 7.860186136036605
Root Mean Square Error: 0.07916835719845637
       Actual value  Predicted value
0          0.432567         0.448031
1          0.453756         0.448004
2          0.309425         0.448187
3          0.435730         0.448072
4          0.447329         0.467005
...             ...              ...
13863      0.448684         0.448050
13864      0.432491         0.447974
13865      0.444417         0.448192
13866      0.453932         0.448168
13867      0.605493         0.447960

[13868 rows x 2 columns]
