In [None]:
import pandas as pd

def _load_indexed(csv_path):
    """Read a CSV file and return it indexed by its parsed ``timestamp`` column."""
    frame = pd.read_csv(csv_path)
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
    # Return a new frame instead of mutating in place so the cell is idempotent.
    return frame.set_index('timestamp')


def _resample_filled(frame, rule="5s"):
    """Average ``frame`` into ``rule``-wide time bins and fill the empty bins.

    Empty bins are filled forward first; the backward fill that follows can
    therefore only affect bins that precede a column's first observation.
    """
    # ``fillna(method=...)`` is deprecated since pandas 2.1; the dedicated
    # ``ffill``/``bfill`` methods are the supported equivalent.
    return frame.resample(rule).mean().ffill().bfill()


energy_data = _load_indexed("Extra//energy.csv")
resampled_energy = _resample_filled(energy_data)

env_data = _load_indexed("Extra//environment.csv")
resampled_env = _resample_filled(env_data)

# Each derived feature is the row-wise mean of its three per-phase (A, B, C) columns.
phase_columns_by_feature = {
    'reactive_power': ["Reactive Power A average [kVAr]", "Reactive Power B average [kVAr]", "Reactive Power C average [kVAr]"],
    'thdi': ["THDI A average [%]", "THDI B average [%]", "THDI C average [%]"],
    'thdu': ["THDU A average [%]", "THDU B average [%]", "THDU C average [%]"],
    'current': ["Current A average [A]", "Current B average [A]", "Current C average [A]"],
    'voltage': ["Voltage A average [V]", "Voltage B average [V]", "Voltage C average [V]"],
    'power_factor': ["Power Factor A average", "Power Factor B average", "Power Factor C average"],
}
for feature_name, phase_columns in phase_columns_by_feature.items():
    resampled_energy[feature_name] = resampled_energy[phase_columns].mean(axis=1)

# Join both sources on the shared 5-second index and keep only the columns
# that are fed to the hidden-state model.
useful_data = resampled_energy.join(resampled_env)
useful_data = useful_data[["reactive_power","power_factor","current","voltage","thdu","thdi","Xacc","yaw","pitch"]]
useful_data = useful_data.dropna()
display(useful_data)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every column of the feature table; `scaled_data` is the array
# the hidden Markov models are fitted on.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(useful_data)

In [None]:
# Find the optimal number of clusters (hidden states) for the HMM.
from hmmlearn import hmm
import numpy as np
# Seed NumPy's global random state so this cell starts from the same state on every run.
# NOTE(review): presumably hmmlearn draws from the global NumPy RNG when no
# `random_state` is passed to the model — confirm against the hmmlearn docs.
np.random.seed(33)
import matplotlib.pyplot as plt

def mdl_score(model, data):
    """Return a complexity-penalised score for a fitted model.

    The value is ``model.score(data)`` minus ``n_params**2 * log(n_rows)``,
    where ``n_params`` adds up the transition, emission and initial-state
    parameter counts derived from ``model.n_components`` and the number of
    columns of ``data``.

    Parameters
    ----------
    model : object exposing ``n_components`` and ``score(data)``.
    data : 2-D array; rows are observations, columns are features.

    Returns
    -------
    The penalised score (same numeric type as ``model.score`` produces).
    """
    feature_count = data.shape[1]
    state_count = model.n_components

    # Parameter count = transitions + emissions + initial-state terms.
    # NOTE(review): the emission term counts one value per state and feature;
    # a 'diag' Gaussian model presumably also carries a variance per state and
    # feature — confirm whether that omission is intended.
    param_count = (
        state_count * (state_count - 1)
        + state_count * feature_count
        + (state_count - 1)
    )

    fit_quality = model.score(data)
    # NOTE(review): the parameter count is squared here, unlike the textbook
    # BIC penalty — confirm this stronger penalty is deliberate.
    complexity_penalty = np.square(param_count) * np.log(data.shape[0])
    return fit_quality - complexity_penalty

n_states_range = range(4, 9)
mdl_scores = []

for n_states in n_states_range:
    # Ten independent fits per candidate size; their scores are averaged so a
    # single unlucky initialisation does not decide the comparison.
    mdl_scores_n_states = [
        mdl_score(
            hmm.GaussianHMM(n_components=n_states, covariance_type='diag').fit(scaled_data),
            scaled_data,
        )
        for _ in range(10)
    ]

    avg_mdl = np.mean(mdl_scores_n_states)
    print(n_states, avg_mdl, sep="\n")
    mdl_scores.append(avg_mdl)

# Average score per candidate number of hidden states.
score_fig, score_ax = plt.subplots(figsize=(8, 6))
score_ax.plot(n_states_range, mdl_scores, marker='o', linestyle='-', linewidth=2)
score_ax.set_xlabel("Number of hidden states")
score_ax.set_ylabel("Average MDL score")
score_ax.grid()
plt.show()

In [None]:
from hmmlearn import hmm
import numpy as np
np.random.seed(33)

# Use the candidate state count with the highest averaged score, refit a
# single model on the scaled features and decode one state id per row.
best_position = np.argmax(mdl_scores)
n_clusters = n_states_range[best_position]
model = hmm.GaussianHMM(n_components=n_clusters, covariance_type='diag').fit(scaled_data)
hidden_states = model.predict(scaled_data)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale the features with a MinMaxScaler for plotting, keeping the original
# labels and timestamps.
scaler = MinMaxScaler()
rescaled_values = scaler.fit_transform(useful_data.values)
min_maxed = pd.DataFrame(rescaled_values, columns=useful_data.columns, index=useful_data.index)

# Attach the decoded state id, an explicit timestamp column and a readable label.
min_maxed['states'] = hidden_states
min_maxed.insert(loc=0, column='Date', value=pd.to_datetime(min_maxed.index))
# NOTE(review): only state ids 0-4 are labelled; presumably the selected model
# has five states — any other id maps to NaN here. Confirm after refitting.
state_labels = {0:'Off', 1: 'Moving', 2: 'Mode-1', 3: 'On', 4:'Mode-2'}
min_maxed['colors'] = min_maxed['states'].map(state_labels)
color_map = {"Off": "black", "On": "white", "Mode-1" : "cyan", "Mode-2": "yellow", "Moving": "green"}

In [None]:
import plotly.express as px
import plotly.graph_objects as go

new_df = min_maxed.loc['2022-11-08 14:00:00':'2022-11-08 16:00:00']

fig = px.line(new_df, x='Date', y='current')
fig.update_traces(line=dict(color='black'))
fig.update_layout(
    xaxis_title="Time",
    yaxis_title="Current average [A]",
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
)

# Shade the background: one translucent band per run of consecutive rows that
# share the same state label, coloured through `color_map`.
first_row = new_df.iloc[0]
start_mode = str(first_row["colors"])
start_date = str(first_row["Date"])

for stamp, current_mode in zip(new_df["Date"], new_df["colors"]):
    if current_mode == start_mode:
        continue
    # The label changed: close the running band at this row and open a new one.
    fig.add_vrect(x0=start_date, x1=str(stamp), fillcolor=color_map[start_mode], opacity=0.5)
    start_mode, start_date = current_mode, str(stamp)

# Close the band that is still open at the last row of the window.
fig.add_vrect(x0=start_date, x1=str(new_df.iloc[-1]["Date"]), fillcolor=color_map[start_mode], opacity=0.5)
fig.show()

In [None]:
# Assemble the table used for anomaly detection: five columns selected from
# the joined energy/environment frame plus the decoded state of every timestamp.
anomaly_columns = ['voc', 'pm1.0', 'pm2.5', 'pm10', 'CO2']
anomaly_data = resampled_energy.join(resampled_env)[anomaly_columns]

# From here on `hidden_states` is the timestamp-indexed Series, so it can be
# aligned to the anomaly table by index.
hidden_states = min_maxed["states"]
hidden_states_aligned = hidden_states.reindex(anomaly_data.index)

# Rows lacking any reading or a state are dropped.
anomaly_data = anomaly_data.assign(states=hidden_states_aligned).dropna()
anomaly_data

In [None]:
# Split the anomaly table by state id. For every state keep both its rows and
# the integer positions of those rows inside `anomaly_data`, so results
# computed per state can be mapped back to the full table.
state_values = anomaly_data['states'].values
clusters_data = [anomaly_data[anomaly_data['states'] == i] for i in range(n_clusters)]
clusters_indices = [np.flatnonzero(state_values == i) for i in range(n_clusters)]

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, RepeatVector, TimeDistributed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Bidirectional

def create_autoencoder(timesteps, n_features):
    """Build and compile a bidirectional-LSTM sequence autoencoder.

    Parameters
    ----------
    timesteps : length of each input window.
    n_features : number of values per time step.

    Returns
    -------
    A compiled ``Sequential`` model that takes ``(timesteps, n_features)``
    windows and ends in a per-step ``Dense(n_features)`` layer, trained with
    Adam (learning rate 0.001) on mean-squared error.
    """
    # Encoder: the second layer returns only its final output, i.e. one
    # vector per window.
    encoder_layers = [
        Bidirectional(LSTM(64, activation='tanh', return_sequences=True), input_shape=(timesteps, n_features)),
        Bidirectional(LSTM(32, activation='tanh', return_sequences=False)),
    ]
    # Repeat that vector once per time step so the decoder sees a sequence again.
    bridge_layers = [RepeatVector(timesteps)]
    # Decoder: mirror of the encoder, followed by a per-step linear projection.
    decoder_layers = [
        Bidirectional(LSTM(32, activation='tanh', return_sequences=True)),
        Bidirectional(LSTM(64, activation='tanh', return_sequences=True)),
        TimeDistributed(Dense(n_features)),
    ]

    autoencoder = Sequential(encoder_layers + bridge_layers + decoder_layers)
    autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return autoencoder

In [None]:
# Train one autoencoder per state and flag the (row, feature) cells it
# reconstructs worst. `all_anomalies[i]` holds, for state i, a pair
# [row positions inside `anomaly_data`, feature column positions].
timesteps = 10
all_anomalies = []

standard_scaler = StandardScaler()

for i in range(n_clusters):
    cluster_df = clusters_data[i].drop(columns=['states'])

    # Keep the longest prefix that splits into whole windows of `timesteps`
    # rows. Computing the usable length directly avoids both the `[:-0]`
    # empty-slice trap and discarding an extra, perfectly usable window.
    n_windows = cluster_df.shape[0] // timesteps
    if n_windows == 0:
        # Not even one full window: record an empty result so that
        # all_anomalies[i] still corresponds to state i, then move on.
        all_anomalies.append([[], np.array([], dtype=int)])
        continue
    cluster_df = cluster_df.iloc[:n_windows * timesteps]

    # The scaler is refitted for every state, so each state is standardised
    # against its own statistics.
    scaled_cluster = standard_scaler.fit_transform(cluster_df.values)
    n_features = scaled_cluster.shape[1]
    reshaped_scaled_cluster = scaled_cluster.reshape(n_windows, timesteps, n_features)

    autoencoder = create_autoencoder(timesteps, n_features)
    autoencoder.fit(reshaped_scaled_cluster, reshaped_scaled_cluster, epochs=15, batch_size=32, verbose=1)

    # Flatten the reconstructed windows back to one row per time step.
    predictions = autoencoder.predict(reshaped_scaled_cluster)
    predictions = predictions.reshape(-1, n_features)

    # Squared reconstruction error of every (row, feature) cell; cells above
    # the 99.99th percentile are reported as anomalies.
    squared_error = np.square(np.subtract(scaled_cluster, predictions))
    threshold = np.percentile(squared_error, 99.99)

    anomalies = np.where(squared_error > threshold)
    # Translate row positions inside this state's table into positions inside
    # `anomaly_data` (valid because rows were only trimmed from the end).
    original_datetime_indices = [clusters_indices[i][x] for x in anomalies[0]]
    original_indices = [original_datetime_indices, anomalies[1]]
    all_anomalies.append(original_indices)

display(all_anomalies)

In [None]:
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import MinMaxScaler
def _shade_state_background(fig, frame, color_map):
    """Add one translucent band to ``fig`` per run of equal ``colors`` values in ``frame``.

    ``frame`` needs a ``Date`` and a ``colors`` column; each band is filled
    with ``color_map[label]`` of the run it covers.
    """
    start_mode = str(frame.iloc[0]["colors"])
    start_date = str(frame.iloc[0]["Date"])

    for _, row in frame.iterrows():
        if row["colors"] != start_mode:
            # The label changed: close the running band here and open a new one.
            fig.add_vrect(x0=start_date, x1=str(row["Date"]), fillcolor=color_map[start_mode], opacity=0.5)
            start_mode = row["colors"]
            start_date = str(row["Date"])

    # Close the band that is still open at the last row.
    fig.add_vrect(x0=start_date, x1=str(frame.iloc[-1]["Date"]), fillcolor=color_map[start_mode], opacity=0.5)


scaler = MinMaxScaler()
plot_df = pd.DataFrame(scaler.fit_transform(anomaly_data.values), columns=anomaly_data.columns, index=anomaly_data.index)

# NOTE(review): this id -> label map differs from the one used when
# `min_maxed['colors']` was first built. HMM state ids are arbitrary, so
# confirm which labelling matches the currently fitted model.
state_labels = {0:'On', 1: 'Mode-2', 2: 'Off', 3: 'Moving', 4:'Mode-1'}
color_map = {"Off": "black", "On": "white", "Mode-1" : "cyan", "Mode-2": "yellow", "Moving": "green"}

# Anomaly positions refer to rows of `anomaly_data`, so every timestamp lookup
# below goes through its index, clamped to the valid range: a negative
# position would silently wrap to the end and a too-large one would raise.
last_position = len(anomaly_data) - 1

for cluster_anomalies in all_anomalies:
    for anomaly, feature_position in zip(cluster_anomalies[0], cluster_anomalies[1]):
        feature_name = anomaly_data.columns[feature_position]

        # Plot up to 100 rows of context on either side of the anomaly.
        window_start = anomaly_data.index[max(anomaly - 100, 0)]
        window_end = anomaly_data.index[min(anomaly + 100, last_position)]

        # Copies, because columns are added below and the source frames must stay untouched.
        bg_df = min_maxed.loc[window_start:window_end].copy()
        new_df = plot_df.loc[window_start:window_end].copy()
        new_df.insert(loc=0, column='Date', value=pd.to_datetime(new_df.index))

        line_fig = px.line(new_df, x='Date', y=feature_name)
        line_fig.update_traces(line=dict(color='black'))
        fig = go.Figure(data=line_fig.data).update_layout(xaxis_title="Time", yaxis_title=feature_name)

        bg_df['colors'] = bg_df['states'].map(state_labels)
        _shade_state_background(fig, bg_df, color_map)

        # Mark the anomaly itself: from the previous to the next row, in red.
        highlight_start = anomaly_data.index[max(anomaly - 1, 0)]
        highlight_end = anomaly_data.index[min(anomaly + 1, last_position)]
        fig.add_vrect(x0=str(highlight_start), x1=str(highlight_end), fillcolor="red", opacity=0.5)

        fig.update_layout(xaxis=dict(showgrid=False), yaxis=dict(showgrid=False))
        fig.show()
        #fig.write_image(f"pics/ae/anomaly_{anomaly_data.columns[i[1]]}_{(anomaly_data.iloc[anomaly].name).strftime('%Y_%m_%d-%I_%M_%S')}.jpg", width=1920, height=0.75*1080)