In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, Dataset, DataLoader

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn import preprocessing

import matplotlib.pyplot as plt

def label_load(row):
    """Map one sample to an operating-regime label.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'Active Power'`` and ``'Governor speed actual'``
        (for example a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        One of ``'Shutdown'``, ``'Warming'``, ``'No Load'``, ``'Low Load'``,
        ``'Rough Zone'``, ``'Part Load'``, ``'Efficient Load'``, ``'High Load'``
        or ``'Undefined'``.

    Notes
    -----
    Rules are evaluated in order and the first match wins, so with speed above
    250 a power below 3 is always ``'No Load'`` and ``'Low Load'`` effectively
    covers powers from 3 up to (not including) 20.  A speed of exactly 250, a
    speed below 250 with power of 3 or more, and NaN inputs all end up as
    ``'Undefined'``.
    """
    power = row['Active Power']
    speed = row['Governor speed actual']

    # Regimes that are decided while the unit is at or below the speed threshold.
    if power < 1 and speed < 1:
        return 'Shutdown'
    if power < 3 and speed < 250:
        return 'Warming'

    # Every remaining regime requires the speed to be strictly above 250.
    if speed > 250:
        if power < 3:
            return 'No Load'

        # (inclusive lower bound, exclusive upper bound, label)
        power_bands = (
            (1, 20, 'Low Load'),
            (20, 40, 'Rough Zone'),
            (40, 50, 'Part Load'),
            (50, 65, 'Efficient Load'),
        )
        for lower, upper, label in power_bands:
            if lower <= power < upper:
                return label

        if power >= 65:
            return 'High Load'

    return 'Undefined'
      
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over a 2-D time series.

    Sample ``i`` is a pair ``(train_seq, train_label)`` where

    * ``train_seq``   = rows ``[i, i + input_window)`` with all columns, and
    * ``train_label`` = rows ``[i + output_window, i + input_window + output_window)``
      restricted to ``label_columns``.

    The label window therefore has the same length as the input window and is
    shifted forward by ``output_window`` rows.

    Parameters
    ----------
    input_data : 2-D array-like
        Indexed as ``input_data[rows][:, cols]``; must support ``len()``.
    input_window : int
        Number of rows in each input (and label) window.
    output_window : int
        Number of rows the label window is shifted ahead of the input window.
    label_columns : slice or index, optional
        Column selector applied to the label window.  Defaults to
        ``slice(9, 16)``, the selection that used to be hard-coded.

    Attributes
    ----------
    block_len : int
        ``input_window + output_window``.
    block_num : int
        Number of windows that fit in ``input_data`` (0 if the data is shorter
        than one block).
    inout_seq : list of tuple
        All ``(train_seq, train_label)`` pairs, built eagerly in ``__init__``.
    """

    def __init__(self, input_data, input_window, output_window, label_columns=slice(9, 16)):
        self.input_data = input_data
        self.input_window = input_window
        self.output_window = output_window
        self.label_columns = label_columns
        self.block_len = input_window + output_window
        # Clamp at zero so a too-short series yields an empty dataset rather
        # than a negative window count.
        self.block_num = max(len(input_data) - self.block_len + 1, 0)
        self.inout_seq = self.create_input_sequences()

    def create_input_sequences(self):
        """Return the list of ``(train_seq, train_label)`` window pairs."""
        inout_seq = []
        for i in range(self.block_num):
            train_seq = self.input_data[i : i + self.input_window]
            label_rows = self.input_data[i + self.output_window : i + self.input_window + self.output_window]
            train_label = label_rows[:, self.label_columns]
            inout_seq.append((train_seq, train_label))
        return inout_seq

    def __len__(self):
        """Number of windows in the dataset."""
        return len(self.inout_seq)

    def __getitem__(self, idx):
        """Return window ``idx`` as a pair of ``torch.FloatTensor`` objects."""
        train_seq, train_label = self.inout_seq[idx]
        return torch.FloatTensor(train_seq), torch.FloatTensor(train_label)

In [2]:
# Load the pre-processed sensor table.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
df_data_withtime = pd.read_pickle("/run/media/fourier/Data2/Pras/Vale/Data_Raw/LGS2/lgs2_olah.pickle")

# Keep samples from 2020-01-01 onwards.  The explicit .copy() makes the result
# an independent frame, so the column assignments below do not write into a
# slice of the original (which triggers SettingWithCopyWarning).
mask = (df_data_withtime['TimeStamp'] >= '2020-01-01 00:00:00')
df_data_withtime = df_data_withtime.loc[mask].copy()
df_data_withtime['TimeStamp'] = pd.to_datetime(df_data_withtime['TimeStamp'])

# Convert every column except the label and timestamp columns to the smallest
# float dtype that can hold its values.
for column_name in df_data_withtime.columns:
    if column_name not in ('Load_Type', 'TimeStamp'):
        df_data_withtime[column_name] = pd.to_numeric(df_data_withtime[column_name], downcast='float')

# Forward-fill missing values.  DataFrame.ffill() is the supported spelling;
# fillna(method='ffill') is deprecated in recent pandas releases.
df_data_withtime = df_data_withtime.ffill()
print(len(df_data_withtime))

2103840


In [3]:
# Display the column labels of the loaded frame to see which signals are available.
df_data_withtime.columns

Index(['TimeStamp', 'U-Lgs2-Ti-81204D-Ai',
       'Surface Air Cooler Water Inlet Temp',
       'Surface Air  Cooler Water Outlet Temp', 'Governor Unit Speed Actual',
       'Upper Guide Bearing X Vibration', 'Upper Guide Bearing Y Vibration',
       'Lower Guide Bearing X Vibration', 'Lower Guide Bearing Y Vibration',
       'Turbine Guide Bearing X Vibration',
       'Turbine Guide Bearing Y Vibration', 'Gen Voltage Ph 1',
       'Gen Voltage Ph 2', 'Gen Voltage Ph 3', 'Gen Current Ph 1',
       'Gen Current Ph 2', 'Gen Current Ph 3', 'Active Power ',
       'Reactive Power', 'Excitation Field Voltage',
       'Excitation Field Current', 'Gen Frequency', 'Power Factor (Modbus)',
       'Turb Gov Turbine Wicket Gate Position (%)', 'Penstock Pressure',
       'Governor Penstock Pressure', 'Penstock Flow', 'Gov Turbine Flow',
       'Efficiency', 'Generator Coolers Water Flow ',
       'Upper Guide Bearing Cooling Water Flow',
       'Lower Combined Bearing Cooling Water Flow ',
       

In [8]:
# Preview the first five rows as a sanity check of timestamps and values.
df_data_withtime.head()

Unnamed: 0,TimeStamp,U-Lgs2-Ti-81204D-Ai,Surface Air Cooler Water Inlet Temp,Surface Air Cooler Water Outlet Temp,Governor Unit Speed Actual,Upper Guide Bearing X Vibration,Upper Guide Bearing Y Vibration,Lower Guide Bearing X Vibration,Lower Guide Bearing Y Vibration,Turbine Guide Bearing X Vibration,...,L_U2_Gov_Bypass Valve Position,Stator Winding Rtd #13,Stator Winding Rtd #14,Stator Winding Rtd #15,Stator Core Rtd #12 Temp Air Outlet Temp,Upper Guide Bearing Metal Rtd #3 Air Outlet Temp,Upper Guide Bearing Oil Rtd,Lower Guide Bearing Metal Rtd #1,Lower Guide Bearing Metal Rtd #2,Lower Guide Bearing Oil Rtd
0,2020-01-01 00:00:00,70.0,68.43,70.0,274.359985,306.480011,295.019989,115.620003,104.019997,105.599998,...,-0.18,70.0,70.419998,69.470001,59.369999,52.0,51.0,56.0,55.0,72.0
1,2020-01-01 00:01:00,70.0,68.43,70.0,274.359985,290.940002,282.73999,138.580002,133.490005,114.940002,...,-0.18,70.0,70.419998,69.470001,59.360001,52.0,51.0,56.0,55.0,72.0
2,2020-01-01 00:02:00,70.0,68.43,70.0,274.359985,300.26001,274.929993,121.809998,118.300003,144.520004,...,-0.18,70.0,70.419998,69.480003,59.360001,52.0,51.0,56.0,55.0,72.0
3,2020-01-01 00:03:00,70.0,68.43,70.0,274.359985,309.880005,291.089996,114.800003,120.279999,133.279999,...,-0.18,70.0,70.419998,69.480003,59.360001,52.0,51.0,56.0,55.0,72.0
4,2020-01-01 00:04:00,70.0,68.43,70.0,274.359985,308.910004,295.790009,130.809998,116.410004,103.099998,...,-0.18,70.0,70.419998,69.480003,59.360001,52.0,51.0,56.0,55.0,72.0


In [9]:
# Load the shutdown events and parse their start/end times.
df_anomaly = pd.read_excel("/run/media/fourier/Data2/Pras/Vale/Data_Raw/LGS2/shutdown_lgs2.xlsx")
df_anomaly['Start Time'] = pd.to_datetime(df_anomaly['Start Time'])
df_anomaly['End Time'] = pd.to_datetime(df_anomaly['End Time'])

# Disabled filter, kept for reference: it would drop the internal, unplanned events.
# mask = (df_anomaly['Interal/External'] == 'Internal') & (df_anomaly['Shutdown Type'] == 'Unplanned')
# df_anomaly = df_anomaly.loc[~mask]

# Drop every sample that lies within 7 days before an event's start up to
# 7 days after its end: TimeStamp in (start - 7d, end + 7d].
# The per-event masks are OR-ed into one mask and the frame is filtered once,
# instead of re-filtering (and copying) the whole frame for every event.
exclusion_margin = timedelta(days=7)
timestamps = df_data_withtime['TimeStamp']
in_exclusion_window = pd.Series(False, index=df_data_withtime.index)

for event_start, event_end in zip(df_anomaly['Start Time'], df_anomaly['End Time']):
    in_exclusion_window |= (
        (timestamps > (event_start - exclusion_margin))
        & (timestamps <= (event_end + exclusion_margin))
    )

df_data_withtime = df_data_withtime.loc[~in_exclusion_window]

print(len(df_data_withtime))

1895590


In [11]:
# Rows dropped by the exclusion step: row count printed before (2103840) minus
# the count printed after (1895590), divided by 60 and then by 24.
# Assuming one row per minute (the previewed timestamps are one minute apart),
# the result is the amount of removed data expressed in days.
(2103840 - 1895590) / 60 / 24

144.61805555555557

In [13]:
# Renumber the rows 0..n-1 in place and discard the old index labels, which
# have gaps after the row filtering above.
df_data_withtime.reset_index(drop=True, inplace=True)

In [14]:
def filter_noise_es(df, alpha=0.4, reduction=False):
    """Smooth every column of ``df`` with an exponentially weighted mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all smoothed; it is not modified.
    alpha : float, default 0.4
        Smoothing factor passed to ``Series.ewm(alpha=..., adjust=False)``.
    reduction : bool, default False
        When true, the smoothed frame is subsampled with a step of ``len(df)``,
        which keeps only the first row.
        NOTE(review): a step equal to the frame length looks like a placeholder
        for a configurable sparsity; confirm the intended step.

    Returns
    -------
    pandas.DataFrame
        The smoothed frame (or its subsample when ``reduction`` is true).
    """
    # DataFrame.copy() is deep by default, so the caller's data stays untouched.
    new_df = df.copy()

    # Smooth column by column so each column is replaced by its own float result.
    for column in df:
        new_df[column] = df[column].ewm(alpha=alpha, adjust=False).mean()

    if not reduction:
        return new_df

    step = len(df)
    if step == 0:
        # A slice step of 0 raises ValueError; an empty frame has nothing to subsample.
        return new_df
    return new_df[::step]  # Adjust sparsity if needed

def wgn_pandas(df_withtime, snr, alpha=0.15, window_size=120, seed=None):
    """Add windowed white Gaussian noise to every signal column, then smooth.

    The frame (without its ``'TimeStamp'`` column) is processed in consecutive
    windows of ``window_size`` rows.  For each window and column the mean
    signal power is computed (NaNs ignored), the noise power is derived from
    ``snr`` (in dB), and Gaussian noise with that power is generated.  The
    noise is additionally divided by 100 before being added, so the injected
    noise is 40 dB weaker than ``snr`` alone implies.  The noisy signals are
    finally smoothed with ``filter_noise_es``.

    Parameters
    ----------
    df_withtime : pandas.DataFrame
        Must contain a ``'TimeStamp'`` column; all other columns must be numeric.
        The frame is not modified.
    snr : float
        Signal-to-noise ratio in dB used to size the noise.
    alpha : float, default 0.15
        Smoothing factor forwarded to ``filter_noise_es``.
    window_size : int, default 120
        Number of rows per window.
    seed : int or None, default None
        When given, noise is drawn from ``np.random.default_rng(seed)`` so the
        result is reproducible.  When ``None`` the global ``np.random`` state
        is used, as before.

    Returns
    -------
    pandas.DataFrame
        ``'TimeStamp'`` followed by the noisy, smoothed signal columns, with a
        fresh 0..n-1 index.
    """
    signal_df = df_withtime.drop(columns=['TimeStamp'])

    # Work on one float array instead of assigning window by window into an
    # object-dtype DataFrame, which is slow and memory hungry.
    values = signal_df.to_numpy(dtype=np.float64)
    noisy_values = np.empty_like(values)

    snr_linear = np.power(10, snr / 10)
    rng = None if seed is None else np.random.default_rng(seed)

    for start in range(0, len(values), window_size):
        window = values[start:start + window_size]

        # Mean power per column; nansum mirrors pandas' NaN-skipping sum.
        signal_power = np.nansum(np.square(window), axis=0) / len(window)
        noise_power = signal_power / snr_linear

        if rng is None:
            unit_noise = np.random.randn(*window.shape)
        else:
            unit_noise = rng.standard_normal(window.shape)

        noise = unit_noise * np.sqrt(noise_power)
        noisy_values[start:start + window_size] = window + (noise / 100)

    noisy_df = pd.DataFrame(noisy_values, columns=signal_df.columns)
    noisy_df = filter_noise_es(noisy_df, alpha)

    # Non-inplace reset so the caller's column is never touched.
    df_timestamp = df_withtime['TimeStamp'].reset_index(drop=True)

    return pd.concat([df_timestamp, noisy_df], axis=1)

# Build the noisy, smoothed copy of the data: snr=30, smoothing alpha=0.15,
# and the default window of 120 rows.  No random seed is set here, so each run
# produces different noise.
df_noisy_wgn = wgn_pandas(df_data_withtime, 30, alpha=0.15)

In [18]:
# Columns to keep for modelling.
# NOTE(review): several of these names do not appear in the `.columns` listing
# shown earlier in this notebook (which has e.g. 'Active Power ' with a trailing
# space and 'Governor Unit Speed Actual').  That listing is truncated, but
# unless the columns are renamed somewhere not shown, the selection below would
# raise KeyError — confirm.
feature_set = ['Active Power', 'Reactive Power', 'Governor speed actual', 'UGB X displacement', 'UGB Y displacement',
    'LGB X displacement', 'LGB Y displacement', 'TGB X displacement',
    'TGB Y displacement', 'Stator winding temperature 13',
    'Stator winding temperature 14', 'Stator winding temperature 15',
    'Surface Air Cooler Air Outlet Temperature',
    'Surface Air Cooler Water Inlet Temperature',
    'Surface Air Cooler Water Outlet Temperature',
    'Stator core temperature', 'UGB metal temperature',
    'LGB metal temperature 1', 'LGB metal temperature 2',
    'LGB oil temperature', 'Penstock Flow', 'Turbine flow',
    'UGB cooling water flow', 'LGB cooling water flow',
    'Generator cooling water flow', 'Governor Penstock Pressure',
    'Penstock pressure', 'Opening Wicked Gate', 'UGB Oil Contaminant',
    'Gen Thrust Bearing Oil Contaminant']

# Keep the timestamp plus the selected features.
# NOTE(review): the next cell rebinds df_data_withtime to the full noisy frame,
# so when the cells run top to bottom this selection has no lasting effect.
df_data_withtime = df_noisy_wgn[['TimeStamp'] + feature_set]

In [16]:
# Use the full noisy frame (all columns) from here on.
# NOTE(review): this rebinding discards the feature_set selection assigned to
# df_data_withtime in the cell above; only one of the two cells can be the
# intended one — confirm and delete the other.
df_data_withtime = df_noisy_wgn

In [17]:
# Display (rows, columns) of the frame that the following cells work on.
df_data_withtime.shape

(1895590, 44)

In [18]:
# Build a per-sample anomaly label and drop the samples recorded during shutdowns.
#
# * label_anomaly = 1 for samples in the 3 hours before an event start,
#   i.e. TimeStamp in [start - 3h, start); 0 otherwise.
# * samples with TimeStamp in [start, end) are removed from both the data and
#   the label frame.
#
# Both masks are accumulated over all events and applied once, instead of
# re-filtering both frames inside the loop; the resulting frames are the same.
#
# NOTE(review): an earlier cell already drops every sample from 7 days before
# each 'Start Time' to 7 days after each 'End Time'.  If the same df_anomaly is
# used here, the 3-hour pre-shutdown windows lie inside that dropped range, so
# label_anomaly would be 0 everywhere and nothing is removed — confirm this is
# intended.
timestamps = df_data_withtime['TimeStamp']
is_pre_shutdown = pd.Series(False, index=timestamps.index)
is_shutdown = pd.Series(False, index=timestamps.index)

for start_time, end_time in zip(df_anomaly['Start Time'], df_anomaly['End Time']):
    pre_start_time = start_time - pd.Timedelta(hours=3)
    is_pre_shutdown |= (timestamps >= pre_start_time) & (timestamps < start_time)
    is_shutdown |= (timestamps >= start_time) & (timestamps < end_time)

df_label = pd.DataFrame({
    'TimeStamp': timestamps,
    'label_anomaly': is_pre_shutdown.astype(int),
})

# Remove the shutdown samples and renumber both frames so they stay row-aligned.
df_data_withtime = df_data_withtime.loc[~is_shutdown].reset_index(drop=True)
df_label = df_label.loc[~is_shutdown].reset_index(drop=True)


In [19]:
# Alias only: new_df and df_data_withtime refer to the same DataFrame object
# (no copy is made).
new_df = df_data_withtime

In [20]:
# Chronological split: the first 80 % of the rows become the training set and
# the remaining rows the test set.  Labels are exported for the test part only.
sampels = int(0.8 * len(new_df))  # row position where the test set starts

train_data = new_df.iloc[:sampels].reset_index(drop=True)
test_data = new_df.iloc[sampels:].reset_index(drop=True)
test_label = df_label.iloc[sampels:].reset_index(drop=True)

# Write the three tables as CSV files without the index column.
for frame, csv_path in (
    (train_data, "/run/media/fourier/Data2/Pras/Vale/TN_Anom/DTAAD/data/CLGS2AWGN30ES15/train.csv"),
    (test_data, "/run/media/fourier/Data2/Pras/Vale/TN_Anom/DTAAD/data/CLGS2AWGN30ES15/test.csv"),
    (test_label, "/run/media/fourier/Data2/Pras/Vale/TN_Anom/DTAAD/data/CLGS2AWGN30ES15/test_label.csv"),
):
    frame.to_csv(csv_path, index=False)

In [21]:
import pandas as pd
import numpy as np
import os

In [22]:
def load_and_save(category, filename, dataset, dataset_folder, out_dir=None):
    """Read a comma-separated numeric file and store it as a ``.npy`` array.

    The file ``<dataset_folder>/<category>/<filename>`` is parsed as float64
    and written to ``<out_dir>/SMD/<dataset>_<category>.npy``.

    Parameters
    ----------
    category : str
        Sub-folder of ``dataset_folder`` to read from; also part of the output name.
    filename : str
        Name of the file to read.
    dataset : str
        Identifier used in the output file name.
    dataset_folder : str
        Root folder of the raw data.
    out_dir : str or None, default None
        Root folder for the output.  When ``None`` the module-level variable
        ``output_folder`` is used, which must then be defined by the caller.

    Returns
    -------
    tuple
        Shape of the loaded array.
    """
    if out_dir is None:
        # Legacy behaviour: rely on a global defined elsewhere in the session.
        out_dir = output_folder

    temp = np.genfromtxt(os.path.join(dataset_folder, category, filename),
                         dtype=np.float64,
                         delimiter=',')
    print(dataset, category, filename, temp.shape)

    target_path = os.path.join(out_dir, f"SMD/{dataset}_{category}.npy")
    # np.save does not create missing directories.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    np.save(target_path, temp)
    return temp.shape

def load_and_save2(category, filename, dataset, dataset_folder, shape, out_dir=None):
    """Convert an interpretation-label text file into a 0/1 array and save it.

    Each non-blank line of ``<dataset_folder>/interpretation_label/<filename>``
    has the form ``start-end:i1,i2,...``.  For every such line the entries
    ``[start - 1 : end - 1, [i1 - 1, i2 - 1, ...]]`` of a zero array of the
    given ``shape`` are set to 1.  The array is written to
    ``<out_dir>/SMD/<dataset>_<category>.npy``.

    Parameters
    ----------
    category : str
        Used only in the log line and in the output file name.
    filename : str
        Name of the label file inside ``interpretation_label``.
    dataset : str
        Identifier used in the output file name.
    dataset_folder : str
        Root folder of the raw data.
    shape : tuple
        Shape of the label array to create.
    out_dir : str or None, default None
        Root folder for the output.  When ``None`` the module-level variable
        ``output_folder`` is used, which must then be defined by the caller.
    """
    if out_dir is None:
        # Legacy behaviour: rely on a global defined elsewhere in the session.
        out_dir = output_folder

    temp = np.zeros(shape)
    with open(os.path.join(dataset_folder, 'interpretation_label', filename), "r") as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no label.
                continue
            parts = line.split(':')
            bounds = parts[0].split('-')
            start, end = int(bounds[0]), int(bounds[1])
            # The file uses 1-based column numbers.
            indx = [int(i) - 1 for i in parts[1].split(',')]
            temp[start - 1:end - 1, indx] = 1
    print(dataset, category, filename, temp.shape)

    target_path = os.path.join(out_dir, f"SMD/{dataset}_{category}.npy")
    # np.save does not create missing directories.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    np.save(target_path, temp)

def normalize3(a, min_a=None, max_a=None):
    """Min-max scale ``a`` column-wise and return the bounds that were used.

    Parameters
    ----------
    a : numpy.ndarray
        Data to scale; bounds are taken along axis 0.
    min_a, max_a : array-like or None
        Bounds to scale with.  Each bound that is ``None`` is computed from
        ``a``; a bound that is supplied is used as given.  (Previously a
        supplied ``max_a`` was overwritten when ``min_a`` was ``None``, and a
        supplied ``min_a`` without ``max_a`` raised a TypeError.)

    Returns
    -------
    tuple
        ``(scaled, min_a, max_a)`` where
        ``scaled = (a - min_a) / (max_a - min_a + 0.0001)``.  The small
        constant avoids division by zero for constant columns.
    """
    if min_a is None:
        min_a = np.min(a, axis=0)
    if max_a is None:
        max_a = np.max(a, axis=0)
    return ((a - min_a) / (max_a - min_a + 0.0001)), min_a, max_a


def convertNumpy(df):
    """Return a subsampled, min-max scaled array built from ``df``.

    The first three columns are dropped, every 10th row is kept (starting with
    the first), and each remaining column is scaled as
    ``(x - min) / (max - min + 1e-4)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; columns from position 3 onwards must be numeric.

    Returns
    -------
    numpy.ndarray
        The scaled array.
    """
    x = df[df.columns[3:]].values[::10, :]
    # np.ptp replaces the ndarray.ptp method, which was removed in NumPy 2.0.
    return (x - x.min(0)) / (np.ptp(x, axis=0) + 1e-4)

In [23]:
# Load the exported CSV files, min-max scale them and store them as .npy arrays.
# NOTE(review): this path is relative, while the export cell writes to an
# absolute path ending in the same 'data/CLGS2AWGN30ES15'; the notebook's
# working directory presumably has to be that project root — confirm.
dataset_folder = 'data/CLGS2AWGN30ES15'
df_train = pd.read_csv(os.path.join(dataset_folder, 'train.csv'))
df_test = pd.read_csv(os.path.join(dataset_folder, 'test.csv'))

# Drop the first column of each table and continue with plain arrays.
df_train, df_test = df_train.values[:, 1:], df_test.values[:, 1:]

# The scaling bounds are computed over train and test together, then applied
# to each part separately.
_, min_a, max_a = normalize3(np.concatenate((df_train, df_test), axis=0))
train, _, _ = normalize3(df_train, min_a, max_a)
test, _, _ = normalize3(df_test, min_a, max_a)

# Labels: drop the first column as well.
labels = pd.read_csv(os.path.join(dataset_folder, 'test_label.csv'))
labels = labels.values[:, 1:]

folder = os.path.join("processed", "CLGS2AWGN30ES15")
os.makedirs(folder, exist_ok=True)

# Explicit name -> array mapping instead of eval() on variable names.
arrays_to_save = {'train': train, 'test': test, 'labels': labels}
for file, array in arrays_to_save.items():
    np.save(os.path.join(folder, f'{file}.npy'), array.astype('float64'))