In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn.preprocessing
import seaborn as sns
import plotly.graph_objects as go


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install openpyxl

In [None]:
import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to every figure drawn afterwards.
plt.style.use('ggplot')

In [None]:
#reading the data
# The workbook is parsed with the openpyxl engine; index_col=0 turns the first
# spreadsheet column into the DataFrame index.
# NOTE(review): later cells treat that index as timestamps — confirm the column
# is parsed as datetimes.
path = '/kaggle/input/anomaly-detection-smart-meter-data-sample/Lastgang Elektroverbruche 160101-170511.xlsx'
df = pd.read_excel(path, engine='openpyxl', index_col=0)

In [None]:
# Count missing values per column before any processing.
df.isnull().sum()

In [None]:
#renaming the columns
# Name the index 'datetime' and the data column 'energy'.
# NOTE: assigning a one-element list assumes the frame has exactly one data
# column; pandas raises a length-mismatch ValueError otherwise.
df.index.name ='datetime'
df.columns = ['energy']
df.head()

In [None]:
# Derive calendar helper columns from the index; 'time' and 'weekday' are the
# grouping keys of the pivot-table plots further down.
# NOTE(review): .date/.time/.year/.strftime need a DatetimeIndex — confirm the
# spreadsheet's first column was parsed as timestamps.
df['date'] = df.index.date
df['time'] = df.index.time
df['year'] = df.index.year
df['weekday'] = df.index.strftime("%A")  # full weekday name
df.head()

In [None]:
#Entire load curve and the daily load trends
# Whole series: one point per index value (pivot_table's default aggregation
# is the mean, which only matters if several rows share an index value).
_ = df.pivot_table(index=df.index,
                   values='energy').plot(figsize=(15,4),
                                         title='Entire Load Curve')

# Average daily profile: mean energy for each time-of-day value.
# The aggregation is named by the string 'mean' instead of passing np.mean;
# recent pandas versions emit a FutureWarning for the NumPy callable and
# recommend the string, which selects the same aggregation.
_ = df.pivot_table(index=df['time'],
                   values='energy',
                   aggfunc='mean').plot(figsize=(15,4),
                                        title='Daily Load Trends')

In [None]:
#Load distributions & daily load curve
# Histogram of every energy reading, split into 100 bins.
_ = df['energy'].plot.hist(figsize=(15, 5), bins=100, title='Load Distribution')

# Mean energy per time of day, one line per weekday name.
# The aggregation is named by the string 'mean' instead of passing np.mean;
# recent pandas versions emit a FutureWarning for the NumPy callable and
# recommend the string, which selects the same aggregation.
_ = df.pivot_table(index=df['time'],
                   columns='weekday',
                   values='energy',
                   aggfunc='mean').plot(figsize=(15,4),
                                        title='Energy Daily Load Curve Trends')

In [None]:
#normalize the energy data
def normalize_data(df):
    """Return a copy of ``df`` whose ``energy`` column is min-max scaled.

    The scaler is fitted on the whole ``energy`` column that is passed in, so
    the result depends on the minimum and maximum of that column.

    The input frame is left untouched: scaling is applied to a copy, so the
    raw readings stay available to the caller (previously the caller's
    ``energy`` column was overwritten in place).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``energy`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``; only ``energy``
        differs.
    """
    scaler = sklearn.preprocessing.MinMaxScaler()
    scaled = df.copy()
    # The scaler works on 2-D input (n_samples, 1); flatten its result back to
    # one value per row before storing it in the column.
    column_2d = scaled['energy'].to_numpy().reshape(-1, 1)
    scaled['energy'] = scaler.fit_transform(column_2d).ravel()
    return scaled

# Scale the energy readings and drop the calendar helper columns in one
# chained expression, leaving 'energy' as the only column.
df_norm = normalize_data(df).drop(columns=['date','time','year','weekday'])
df_norm.shape

In [None]:
# Preview the first rows of the normalized frame.
df_norm.head()

# Simple CNN Model, Ignoring the Anomalies

In [None]:
#data_loading
def load_data(stock, seq_len, train_size=40000):
    """Build sliding-window samples from column 0 of ``stock`` and split them.

    Every sample is ``seq_len`` consecutive values of the first column; its
    target is the value that immediately follows the window. Samples keep
    their original order: the first ``train_size`` become the training set
    and the remainder the test set.

    Parameters
    ----------
    stock : pandas.DataFrame
        Frame whose first column (position 0) holds the series to window.
    seq_len : int
        Number of consecutive values per input window; must be >= 1.
    train_size : int, optional
        Maximum number of leading windows used for training. Defaults to
        40000, the split that used to be hard-coded. If fewer windows exist,
        all of them go to the training set and the test arrays are empty
        (the fixed-size reshape previously raised in that situation).

    Returns
    -------
    list
        ``[X_train, y_train, X_test, y_test]``. The X arrays have shape
        ``(n_samples, seq_len, 1)``, the y arrays ``(n_samples,)``.

    Raises
    ------
    ValueError
        If ``seq_len`` is smaller than 1 or ``train_size`` is negative.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")
    if train_size < 0:
        raise ValueError("train_size must be non-negative")

    values = stock.iloc[:, 0].to_numpy()
    n_windows = max(len(values) - seq_len, 0)

    # Row k of window_idx holds positions k .. k+seq_len-1, so a single
    # indexing pass builds every window without a Python-level loop.
    window_idx = np.arange(n_windows)[:, None] + np.arange(seq_len)[None, :]

    # (samples, timesteps, 1): the trailing axis is the single feature channel.
    windows = values[window_idx].reshape(n_windows, seq_len, 1)

    # Copy so the returned targets never alias the DataFrame's own storage.
    targets = values[seq_len:].copy()

    X_train, X_test = windows[:train_size], windows[train_size:]
    y_train, y_test = targets[:train_size], targets[train_size:]

    return [X_train, y_train, X_test, y_test]

In [None]:
# Window length: each sample holds this many consecutive readings.
seq_len = 20

X_train, y_train, X_test, y_test = load_data(df_norm, seq_len)

# Report the shape of every array produced by the split.
for shape_label, split_array in (
    ('X_train.shape = ', X_train),
    ('y_train.shape = ', y_train),
    ('X_test.shape = ', X_test),
    ('y_test.shape = ', y_test),
):
    print(shape_label, split_array.shape)

In [None]:
from sklearn.metrics import r2_score

from keras.layers import Dense,Dropout,SimpleRNN,LSTM
from keras.models import Sequential

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import confusion_matrix


from keras.layers import Conv1D, MaxPooling1D, Flatten

# 1-D CNN regressor: two Conv1D -> MaxPooling1D -> Dropout stages, then the
# feature maps are flattened into a single-unit Dense output.
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Dropout(0.15),

    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.15),

    Flatten(),
    Dense(1),
])

cnn_model.summary()

In [None]:
# Train with the Adam optimizer on mean-squared-error loss: 10 epochs,
# 80 samples per batch. No validation data is passed to fit().
cnn_model.compile(optimizer="adam", loss="MSE")
cnn_model.fit(X_train, y_train, epochs=10, batch_size=80)

In [None]:
# Make predictions using the CNN model
# (run on the windows held out in X_test; their targets are in y_test)
cnn_predictions = cnn_model.predict(X_test)

In [None]:
# Evaluate the CNN model using R2 Score and MSE
# Both metrics compare the held-out targets with the model output; the values
# are on the normalized scale because y_test was built from df_norm.
cnn_r2 = r2_score(y_test, cnn_predictions)
cnn_mse = mean_squared_error(y_test, cnn_predictions)

print(f'CNN Model R2 Score: {cnn_r2}')
print(f'CNN Model MSE: {cnn_mse}')

def plot_predictions(test, predicted, title):
    """Draw the actual and the predicted series on one wide figure and show it.

    test      -- values drawn in blue and labelled as the actual data
    predicted -- values drawn in semi-transparent orange as the prediction
    title     -- text used as the figure title
    """
    curves = (
        (test, {'color': 'blue', 'label': 'Actual power consumption data'}),
        (predicted, {'alpha': 0.7, 'color': 'orange', 'label': 'Predicted power consumption data'}),
    )

    plt.figure(figsize=(16,4))
    for series, line_kwargs in curves:
        plt.plot(series, **line_kwargs)
    plt.title(title)
    plt.xlabel('Time')
    plt.ylabel('Normalized power consumption scale')
    plt.legend()
    plt.show()

# Plot predictions using the CNN model
plot_predictions(y_test, cnn_predictions, "Load Predictions Validation - CNN")

# Cut-off on the normalized scale that separates the two classes.
threshold_high = 0.5  # Adjust this threshold based on your problem

# Turn regression outputs and targets into 0/1 labels (1 = above the cut-off).
cnn_class_predictions = (cnn_predictions > threshold_high).astype(int)
y_test_class = (y_test > threshold_high).astype(int)

# Confusion matrix: rows are the thresholded targets, columns the thresholded
# predictions.
conf_matrix = confusion_matrix(y_test_class, cnn_class_predictions)
print("Confusion Matrix:", conf_matrix, sep="\n")

In [None]:
from sklearn.metrics import classification_report

# Same 0.5 cut-off on the normalized scale as in the previous cell.
threshold_high = 0.5
# Binarize predictions and targets: 1 where the value exceeds threshold_high,
# otherwise 0.
cnn_pred_classes = (cnn_predictions > threshold_high).astype(int)
y_test_class = (y_test > threshold_high).astype(int)

# Evaluation metrics
# R2 is computed on the continuous values, the confusion matrix on the
# thresholded labels.
cnn_r2 = r2_score(y_test, cnn_predictions)
cnn_conf_matrix = confusion_matrix(y_test_class, cnn_pred_classes)

print("CNN Model R2 Score:", cnn_r2)
print("Confusion Matrix:")
print(cnn_conf_matrix)
print("\nClassification Report:")
print(classification_report(y_test_class, cnn_pred_classes))

# Plot confusion matrix
# Heatmap annotated with the integer counts; rows are actual classes and
# columns predicted classes, matching the axis labels set below.
plt.figure(figsize=(6, 6))
sns.heatmap(cnn_conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
plt.title('Confusion Matrix - CNN Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
from sklearn.metrics import r2_score, confusion_matrix, classification_report

from keras.layers import Conv1D, MaxPooling1D, Flatten, Dropout
from keras.models import Sequential

# Function to plot predictions
def plot_predictions(test, predicted, title):
    """Draw the actual and the predicted series on one wide figure and show it.

    This cell defines the function again, replacing the definition from the
    earlier evaluation cell, which does the same thing.

    test      -- values drawn in blue and labelled as the actual data
    predicted -- values drawn in semi-transparent orange as the prediction
    title     -- text used as the figure title
    """
    curves = (
        (test, {'color': 'blue', 'label': 'Actual power consumption data'}),
        (predicted, {'alpha': 0.7, 'color': 'orange', 'label': 'Predicted power consumption data'}),
    )

    plt.figure(figsize=(16, 4))
    for series, line_kwargs in curves:
        plt.plot(series, **line_kwargs)
    plt.title(title)
    plt.xlabel('Time')
    plt.ylabel('Normalized power consumption scale')
    plt.legend()
    plt.show()

# Rebuild the 1-D CNN from scratch with the same layer stack as the earlier
# cnn_model cell. Rebinding cnn_model discards the previously trained model, so
# everything below refers to this freshly trained one.
# NOTE: Dense is not imported in this cell; it relies on the import made in the
# earlier model-building cell.
cnn_model = Sequential()

# Stage 1: convolution over windows of shape (timesteps, 1), then pooling and dropout.
cnn_model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Dropout(0.15))

# Stage 2: same convolution / pooling / dropout pattern.
cnn_model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Dropout(0.15))

# Flatten the feature maps and regress a single output value.
cnn_model.add(Flatten())
cnn_model.add(Dense(1))

cnn_model.summary()

# Adam optimizer, mean-squared-error loss, 10 epochs, 80 samples per batch.
cnn_model.compile(optimizer="adam", loss="MSE")
cnn_model.fit(X_train, y_train, epochs=10, batch_size=80)

# Make predictions using the CNN model
cnn_predictions = cnn_model.predict(X_test)

# Convert predictions and actual values to binary classes: 1 where the
# normalized value exceeds threshold_high, otherwise 0. The notebook treats
# class 1 as the anomaly class and class 0 as normal.
threshold_high = 0.5
cnn_pred_classes = (cnn_predictions > threshold_high).astype(int)
y_test_class = (y_test > threshold_high).astype(int)

# Evaluation metrics
# R2 is computed on the continuous values, the confusion matrix on the
# thresholded labels.
cnn_r2 = r2_score(y_test, cnn_predictions)
cnn_conf_matrix = confusion_matrix(y_test_class, cnn_pred_classes)

print("CNN Model R2 Score:", cnn_r2)
print("Confusion Matrix:")
print(cnn_conf_matrix)
print("\nClassification Report:")
print(classification_report(y_test_class, cnn_pred_classes))

# Plot confusion matrix
# Heatmap annotated with the integer counts; rows are actual classes and
# columns predicted classes, matching the axis labels set below.
plt.figure(figsize=(6, 6))
sns.heatmap(cnn_conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 14})
plt.title('Confusion Matrix - CNN Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
