In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Reshape, Flatten, Conv1D, Dropout, Activation
from tensorflow.keras.utils import to_categorical
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
import pathlib
import time
import os

In [2]:
# Project locations. Every path is derived from MAIN_FOLDER; components are
# joined with `/` so no OS-specific separator is baked into a path segment.
MAIN_FOLDER = pathlib.Path("F:\\ML\\venv3.9\\Scripts\\Moradzadeh\\First_Project\\New_Section")
DATA_FOLDER = MAIN_FOLDER / "data"
TRAIN_FILE = DATA_FOLDER / "case1_train.csv"    # processed training set
TEST_FILE = DATA_FOLDER / "case3_test.csv"      # processed test set
STATS_FILE = DATA_FOLDER / "x_stats.csv"        # per-feature mean / variance used for normalisation
MODEL_SAVE_FOLDER = MAIN_FOLDER / "models"


In [3]:
def united_cat(df):
    """Re-encode the ``Label`` column as consecutive integers ``0..K-1``.

    The distinct labels found in ``df`` are sorted and numbered in that order.
    The input frame is left untouched; the encoding is applied to a copy so the
    calling cell can be re-run safely.

    Args:
        df: DataFrame that contains a ``Label`` column.

    Returns:
        ``(encoded_df, label_dict)`` where ``encoded_df`` is a copy of ``df``
        with the re-encoded ``Label`` column and ``label_dict`` maps each
        original label to its new index.
    """
    unq_labels = sorted(df["Label"].unique())
    label_dict = {label: index for index, label in enumerate(unq_labels)}
    encoded_df = df.copy()
    encoded_df["Label"] = encoded_df["Label"].map(label_dict)
    return encoded_df, label_dict

In [4]:
# Training data
VALIDATION_SIZE = 1600   # number of rows taken from the end of the shuffled frame for validation

train_init_data = pd.read_csv(TRAIN_FILE, index_col=0)
train_init_data = shuffle(train_init_data, random_state=7)         # Shuffle (fixed seed)
train_init_data, train_label_dict = united_cat(train_init_data)    # Rename classes to 0..K-1
num_classes = len(train_label_dict)
y = to_categorical(train_init_data.pop("Label"), num_classes=num_classes)   # One-hot encoding

# Train / Validation split
x_train = np.array(train_init_data.iloc[:-VALIDATION_SIZE])
x_validation = np.array(train_init_data.iloc[-VALIDATION_SIZE:])
y_train = y[:-VALIDATION_SIZE]
y_validation = y[-VALIDATION_SIZE:]

# Testing data
test_init_data = pd.read_csv(TEST_FILE, index_col=0)
# Encode the test labels with the *training* mapping so that a class index
# means the same thing in both sets; fail loudly on labels never seen in training.
unseen_labels = set(test_init_data["Label"].unique()) - set(train_label_dict)
if unseen_labels:
    raise ValueError(f"Test set contains labels absent from the training set: {sorted(unseen_labels)}")
test_init_data["Label"] = test_init_data["Label"].map(train_label_dict)
test_label_dict = dict(train_label_dict)        # kept under this name for the later cells that read it
y_test = np.array(test_init_data.pop("Label"))
y_test = to_categorical(y_test, num_classes=num_classes)   # One-hot encoding, same width as y_train
x_test = np.array(test_init_data)

print("train shape:", x_train.shape, "  validation shape:", x_validation.shape, "  Test shape:", x_test.shape)
print("Y train shape:", y_train.shape, "  Y test_shape:", y_test.shape)

train shape: (9084, 33)   validation shape: (1600, 33)   Test shape: (10684, 33)
Y train shape: (9084, 16)   Y test_shape: (10684, 16)


In [26]:
# Quick look at the first training sample before normalisation is applied.
x_train[0]

array([ 9.78900000e+03, -3.92489800e+00, -1.04860120e+01, -8.52980500e+00,
       -7.32590300e+00, -1.20769870e+01, -1.12773740e+01, -1.12773740e+01,
       -1.27552790e+01, -1.29843500e+01, -1.26968620e+01, -1.30021810e+01,
       -1.18292740e+01, -1.40319480e+01,  1.15772164e+02,  5.73265450e+01,
        5.78436380e+01,  4.55823490e+01,  3.41377590e+01, -1.99627270e+01,
       -4.98980370e+01,  2.34472310e+01,  1.36840170e+01,  3.53036180e+01,
        5.43934100e+00,  6.31236700e+00,  4.20120900e+00,  0.00000000e+00,
        2.34472310e+01,  4.73142000e+00,  8.24102100e+00, -2.61239100e+00,
       -5.34058000e+00,  8.23158900e+00])

In [5]:
# Standardise every split with the per-feature statistics stored in STATS_FILE
# (rows 'mean' and 'var'), so all splits share the same scaling.
x_stats = pd.read_csv(STATS_FILE, index_col=0)
feature_mean = x_stats.loc['mean']
feature_variance = x_stats.loc['var']
x_norm_layer = tf.keras.layers.experimental.preprocessing.Normalization(
    mean=feature_mean,
    variance=feature_variance,
)
x_train_norm, x_validation_norm, x_test_norm = (
    x_norm_layer(split) for split in (x_train, x_validation, x_test)
)

#### LSTM

In [11]:
class LSTMAttack:
    """Bidirectional-LSTM classifier trained with a hand-written training loop.

    The network maps a flat feature vector of length ``input_shape[0]`` to a
    16-way softmax. Loss and accuracy are accumulated in Keras metric objects,
    printed once per epoch and written to TensorBoard.
    """

    def __init__(self, model_save_folder, dataset, batch_size=512):
        """Create the model, optimizer, loss and metric trackers.

        Args:
            model_save_folder: Directory (``pathlib.Path`` or ``str``) that
                receives TensorBoard logs and saved models.
            dataset: ``(x_train, y_train, x_test, y_test)``; ``train`` unpacks
                the tuple in exactly this order.
            batch_size: Batch size used for both training and evaluation.
        """
        self.input_shape = (33,)
        self.model_save_folder = model_save_folder
        self.dataset = dataset
        self.batch_size = batch_size

        self.lstm_opt = tf.keras.optimizers.Adam(learning_rate=0.0005)
        self.loss_fn = tf.keras.losses.CategoricalCrossentropy()

        # Create LSTM
        self.lstm = self.build_model()
        self.lstm.summary()

        # Running metrics; they are reset at the end of every epoch / test run.
        self.train_lstm_loss = tf.keras.metrics.Mean('lstm_training_loss', dtype=tf.float32)
        self.train_lstm_accuracy = tf.keras.metrics.CategoricalAccuracy('lstm_training_accuracy', dtype=tf.float32)
        self.test_lstm_loss = tf.keras.metrics.Mean('lstm_test_loss', dtype=tf.float32)
        self.test_lstm_accuracy = tf.keras.metrics.CategoricalAccuracy('lstm_test_accuracy', dtype=tf.float32)

    def load_model(self, model_path):
        """Replace the current network with a model loaded from ``model_path``.

        NOTE(review): ``train_step`` is wrapped in ``tf.function``; if it has
        already been traced it may keep using the previous model's variables.
        Confirm before continuing training on a loaded model.
        """
        self.lstm = tf.keras.models.load_model(model_path)

    def build_model(self):
        """Build the Keras functional model.

        Returns:
            A ``tf.keras.Model`` named ``"lstm_model"``:
            Input -> Reshape to one time step -> Bidirectional LSTM(22)
            -> Dropout(0.15) -> Flatten -> Dense(14, tanh) -> Dense(16, softmax).
        """
        input_layer = tf.keras.Input(shape=self.input_shape)
        # The whole feature vector is presented as a single time step.
        reshaper = Reshape((1,) + tuple(self.input_shape))(input_layer)
        bi_lstm = Bidirectional(LSTM(22, return_sequences=False))(reshaper)
        bi_lstm = Dropout(0.15)(bi_lstm)
        # The LSTM output is already 2-D, so Flatten does not change its shape;
        # it is kept so the layer stack stays the same as in earlier runs.
        dense1 = Flatten()(bi_lstm)
        dense1 = Dense(14, activation='tanh')(dense1)
        out = Dense(16, activation='softmax', name="output")(dense1)
        return tf.keras.Model(inputs=input_layer, outputs=[out], name="lstm_model")

    @tf.function
    def train_step(self, one_batch):
        """Run one optimisation step on ``one_batch`` (an ``(x, y)`` pair).

        Returns:
            The batch loss tensor.
        """
        x, y = one_batch
        with tf.GradientTape() as tape:
            # training=True is required in a custom loop; without it the
            # Dropout layer stays in inference mode and never drops anything.
            lstm_pred = self.lstm(x, training=True)
            lstm_loss = self.loss_fn(y, lstm_pred)
        grads = tape.gradient(lstm_loss, self.lstm.trainable_weights)
        self.lstm_opt.apply_gradients(zip(grads, self.lstm.trainable_weights))

        self.train_lstm_loss.update_state(lstm_loss)
        self.train_lstm_accuracy.update_state(y, lstm_pred)
        return lstm_loss

    def test_step(self, one_batch):
        """Evaluate one ``(x, y)`` batch and update the test metrics.

        Returns:
            The batch loss tensor.
        """
        x, y = one_batch
        # Direct call in inference mode; cheaper than predict() for one batch.
        lstm_pred = self.lstm(x, training=False)
        test_loss = self.loss_fn(y, lstm_pred)

        self.test_lstm_loss.update_state(test_loss)
        self.test_lstm_accuracy.update_state(y, lstm_pred)
        return test_loss

    def train(self, epochs, save_interval=50):
        """Train the model and log metrics after every epoch.

        The loop runs over epoch indices ``0..epochs`` inclusive (``epochs + 1``
        passes), so ``train(N, save_interval=N)`` saves once, at epoch ``N``.

        Args:
            epochs: Index of the last epoch to run.
            save_interval: The model is saved whenever the epoch index is a
                non-zero multiple of this value.
        """
        current_time = time.strftime("%Y%m%d-%H%M%S")
        log_dir = str(pathlib.Path(self.model_save_folder) / "logs" / current_time)
        summary_writer = tf.summary.create_file_writer(log_dir)

        x_train, y_train, x_test, y_test = self.dataset
        # Built once: shuffle() reshuffles on every iteration by default. The
        # buffer covers the whole training set so the shuffle is uniform.
        train_dataset = (
            tf.data.Dataset.from_tensor_slices((x_train, y_train))
            .shuffle(len(x_train))
            .batch(self.batch_size)
        )
        test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(self.batch_size)

        for epoch in range(epochs + 1):
            # Training
            for one_batch in train_dataset:
                self.train_step(one_batch)

            # Testing
            for test_batch in test_dataset:
                self.test_step(test_batch)

            epoch_train_loss = self.train_lstm_loss.result().numpy()
            epoch_train_acc = self.train_lstm_accuracy.result().numpy()
            epoch_test_loss = self.test_lstm_loss.result().numpy()
            epoch_test_acc = self.test_lstm_accuracy.result().numpy()

            # Print metrics
            print(f"epoch={epoch}/ train loss={epoch_train_loss:0.4f}" +
                  f"/ train accuracy={epoch_train_acc:0.4f}" +
                  f"/ test loss={epoch_test_loss:0.4f}" +
                  f"/ test accuracy={epoch_test_acc:0.4f}")

            # Tensorboard metrics
            with summary_writer.as_default():
                tf.summary.scalar("Train LSTM Loss", self.train_lstm_loss.result(), step=epoch)
                tf.summary.scalar("Train LSTM Accuracy", self.train_lstm_accuracy.result(), step=epoch)
                tf.summary.scalar("Test LSTM Loss", self.test_lstm_loss.result(), step=epoch)
                tf.summary.scalar("Test LSTM Accuracy", self.test_lstm_accuracy.result(), step=epoch)

            self.train_lstm_loss.reset_state()
            self.train_lstm_accuracy.reset_state()
            self.test_lstm_loss.reset_state()
            self.test_lstm_accuracy.reset_state()

            # Save at requested interval
            if epoch % save_interval == 0 and epoch != 0:
                self.save_models(epoch, epoch_train_loss, epoch_train_acc, epoch_test_acc)

    def test(self, new_dataset):
        """Evaluate on ``new_dataset`` (an ``(x, y)`` pair) and print loss/accuracy."""
        x_new, y_new = new_dataset
        current_dataset = tf.data.Dataset.from_tensor_slices((x_new, y_new)).batch(self.batch_size)
        for new_batch in current_dataset:
            self.test_step(new_batch)

        epoch_new_loss = self.test_lstm_loss.result().numpy()
        epoch_new_acc = self.test_lstm_accuracy.result().numpy()
        print(f"Your dataset loss: {epoch_new_loss:0.4f}   accuracy:{epoch_new_acc:0.4f}")
        self.test_lstm_loss.reset_state()
        self.test_lstm_accuracy.reset_state()

    def save_models(self, epoch, train_loss, train_acc, test_acc):
        """Save the model and a text summary into a time-stamped sub-folder.

        Args:
            epoch: Epoch index, used in the folder name.
            train_loss, train_acc, test_acc: Metric values embedded in the
                saved model's directory name.
        """
        folder_name = pathlib.Path(self.model_save_folder) / (f"epoch {epoch} " + time.strftime("%Y-%m-%d %H %M"))
        # makedirs also creates missing parents and tolerates an existing folder.
        os.makedirs(folder_name, exist_ok=True)

        with open(folder_name / "lstm summary.txt", "w", encoding="utf-8") as sum_file:
            self.lstm.summary(print_fn=lambda line: sum_file.write(line + '\n'))
        model_dir_name = "lstm train loss %.3f acc %.3f test acc %0.3f" % (train_loss, train_acc, test_acc)
        self.lstm.save(str(folder_name / model_dir_name))

    def change_lr(self, new_rate):
        """Set the optimizer's learning rate to ``new_rate``."""
        self.lstm_opt.learning_rate.assign(new_rate)

In [12]:
# One output folder per run, stamped with the start time.
lstm_save_folder_name = MODEL_SAVE_FOLDER / "LSTM" / ("at " + time.strftime("%Y-%m-%d %H %M"))
# parents=True: also creates MODEL_SAVE_FOLDER / "LSTM" on a fresh checkout;
# exist_ok=True: re-running the cell within the same minute is harmless.
lstm_save_folder_name.mkdir(parents=True, exist_ok=True)

lstm_model = LSTMAttack(lstm_save_folder_name, dataset = (x_train_norm, y_train, x_validation_norm, y_validation), batch_size=512)

Model: "lstm_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 33)]              0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 1, 33)             0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 44)                9856      
_________________________________________________________________
dropout_1 (Dropout)          (None, 44)                0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 44)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 14)                630       
_________________________________________________________________
output (Dense)               (None, 16)                2

In [13]:
# Train the LSTM; with these arguments the model is saved once, at epoch 100.
lstm_model.train(100, save_interval=100)

epoch=0/ train loss=2.7650/ train accuracy=0.0633/ test loss=2.7344/ test accuracy=0.1944
epoch=1/ train loss=2.7061/ train accuracy=0.2035/ test loss=2.6784/ test accuracy=0.2231
epoch=2/ train loss=2.6528/ train accuracy=0.2177/ test loss=2.6270/ test accuracy=0.2325
epoch=3/ train loss=2.6034/ train accuracy=0.2292/ test loss=2.5770/ test accuracy=0.2519
epoch=4/ train loss=2.5553/ train accuracy=0.2583/ test loss=2.5247/ test accuracy=0.2794
epoch=5/ train loss=2.5017/ train accuracy=0.2805/ test loss=2.4625/ test accuracy=0.3156
epoch=6/ train loss=2.4386/ train accuracy=0.3306/ test loss=2.3906/ test accuracy=0.3744
epoch=7/ train loss=2.3639/ train accuracy=0.3804/ test loss=2.3066/ test accuracy=0.4219
epoch=8/ train loss=2.2780/ train accuracy=0.4214/ test loss=2.2126/ test accuracy=0.4600
epoch=9/ train loss=2.1821/ train accuracy=0.4614/ test loss=2.1073/ test accuracy=0.5069
epoch=10/ train loss=2.0760/ train accuracy=0.5063/ test loss=1.9958/ test accuracy=0.5487
epoch=11/



INFO:tensorflow:Assets written to: F:\ML\venv3.9\Scripts\Moradzadeh\First_Project\New_Section\models\LSTM\at 2022-02-04 02 35\epoch 100 2022-02-04 02 37\lstm train loss 0.023 acc 1.000 test acc 1.000\assets


INFO:tensorflow:Assets written to: F:\ML\venv3.9\Scripts\Moradzadeh\First_Project\New_Section\models\LSTM\at 2022-02-04 02 35\epoch 100 2022-02-04 02 37\lstm train loss 0.023 acc 1.000 test acc 1.000\assets


In [14]:
# Evaluate the in-memory LSTM on the normalised test set; prints loss and accuracy.
lstm_model.test((x_test_norm, y_test))

Your dataset loss: 0.0949   accuracy:0.9800


l24  d0.15  dense14 96.93

#### CNN

In [12]:
class CNNAttack:
    """1-D convolutional classifier trained with a hand-written training loop.

    The network maps a flat feature vector of length ``input_shape[0]`` to a
    16-way softmax. Loss and accuracy are accumulated in Keras metric objects,
    printed once per epoch and written to TensorBoard.
    """

    def __init__(self, model_save_folder, dataset, batch_size=512):
        """Create the model, optimizer, loss and metric trackers.

        Args:
            model_save_folder: Directory (``pathlib.Path`` or ``str``) that
                receives TensorBoard logs and saved models.
            dataset: ``(x_train, y_train, x_test, y_test)``; ``train`` unpacks
                the tuple in exactly this order.
            batch_size: Batch size used for both training and evaluation.
        """
        self.input_shape = (33,)
        self.model_save_folder = model_save_folder
        self.dataset = dataset
        self.batch_size = batch_size

        self.cnn_opt = tf.keras.optimizers.Adam(learning_rate=0.0005)
        self.loss_fn = tf.keras.losses.CategoricalCrossentropy()

        # Create CNN
        self.cnn = self.build_model()
        self.cnn.summary()

        # Running metrics; they are reset at the end of every epoch / test run.
        self.train_cnn_loss = tf.keras.metrics.Mean('cnn_training_loss', dtype=tf.float32)
        self.train_cnn_accuracy = tf.keras.metrics.CategoricalAccuracy('cnn_training_accuracy', dtype=tf.float32)
        self.test_cnn_loss = tf.keras.metrics.Mean('cnn_test_loss', dtype=tf.float32)
        self.test_cnn_accuracy = tf.keras.metrics.CategoricalAccuracy('cnn_test_accuracy', dtype=tf.float32)

    def load_model(self, model_path):
        """Replace the current network with a model loaded from ``model_path``.

        NOTE(review): ``train_step`` is wrapped in ``tf.function``; if it has
        already been traced it may keep using the previous model's variables.
        Confirm before continuing training on a loaded model.
        """
        self.cnn = tf.keras.models.load_model(model_path)

    def build_model(self):
        """Build the Keras functional model.

        Returns:
            A ``tf.keras.Model`` named ``"cnn_model"``:
            Input -> Reshape to (features, 1) -> Conv1D(16, k=3, tanh)
            -> Dropout(0.15) -> Flatten -> Dense(14, tanh) -> Dense(16, softmax).
        """
        input_layer = tf.keras.Input(shape=self.input_shape)
        # Each feature becomes one position of a single-channel sequence.
        reshaper = Reshape(tuple(self.input_shape) + (1,))(input_layer)
        cnn = Conv1D(16, kernel_size=3, strides=1, padding='same', activation='tanh')(reshaper)
        cnn = Dropout(0.15)(cnn)
        dense1 = Flatten()(cnn)
        dense1 = Dense(14, activation='tanh')(dense1)
        out = Dense(16, activation='softmax', name="output")(dense1)
        return tf.keras.Model(inputs=input_layer, outputs=[out], name="cnn_model")

    @tf.function
    def train_step(self, one_batch):
        """Run one optimisation step on ``one_batch`` (an ``(x, y)`` pair).

        Returns:
            The batch loss tensor.
        """
        x, y = one_batch
        with tf.GradientTape() as tape:
            # training=True is required in a custom loop; without it the
            # Dropout layer stays in inference mode and never drops anything.
            cnn_pred = self.cnn(x, training=True)
            cnn_loss = self.loss_fn(y, cnn_pred)
        grads = tape.gradient(cnn_loss, self.cnn.trainable_weights)
        self.cnn_opt.apply_gradients(zip(grads, self.cnn.trainable_weights))

        self.train_cnn_loss.update_state(cnn_loss)
        self.train_cnn_accuracy.update_state(y, cnn_pred)
        return cnn_loss

    def test_step(self, one_batch):
        """Evaluate one ``(x, y)`` batch and update the test metrics.

        Returns:
            The batch loss tensor.
        """
        x, y = one_batch
        # Direct call in inference mode; cheaper than predict() for one batch.
        cnn_pred = self.cnn(x, training=False)
        test_loss = self.loss_fn(y, cnn_pred)

        self.test_cnn_loss.update_state(test_loss)
        self.test_cnn_accuracy.update_state(y, cnn_pred)
        return test_loss

    def train(self, epochs, save_interval=50):
        """Train the model and log metrics after every epoch.

        The loop runs over epoch indices ``0..epochs`` inclusive (``epochs + 1``
        passes), so ``train(N, save_interval=N)`` saves once, at epoch ``N``.

        Args:
            epochs: Index of the last epoch to run.
            save_interval: The model is saved whenever the epoch index is a
                non-zero multiple of this value.
        """
        current_time = time.strftime("%Y%m%d-%H%M%S")
        log_dir = str(pathlib.Path(self.model_save_folder) / "logs" / current_time)
        summary_writer = tf.summary.create_file_writer(log_dir)

        x_train, y_train, x_test, y_test = self.dataset
        # Built once: shuffle() reshuffles on every iteration by default. The
        # buffer covers the whole training set so the shuffle is uniform.
        train_dataset = (
            tf.data.Dataset.from_tensor_slices((x_train, y_train))
            .shuffle(len(x_train))
            .batch(self.batch_size)
        )
        test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(self.batch_size)

        for epoch in range(epochs + 1):
            # Training
            for one_batch in train_dataset:
                self.train_step(one_batch)

            # Testing
            for test_batch in test_dataset:
                self.test_step(test_batch)

            epoch_train_loss = self.train_cnn_loss.result().numpy()
            epoch_train_acc = self.train_cnn_accuracy.result().numpy()
            epoch_test_loss = self.test_cnn_loss.result().numpy()
            epoch_test_acc = self.test_cnn_accuracy.result().numpy()

            # Print metrics
            print(f"epoch={epoch}/ train loss={epoch_train_loss:0.4f}" +
                  f"/ train accuracy={epoch_train_acc:0.4f}" +
                  f"/ test loss={epoch_test_loss:0.4f}" +
                  f"/ test accuracy={epoch_test_acc:0.4f}")

            # Tensorboard metrics
            with summary_writer.as_default():
                tf.summary.scalar("Train cnn Loss", self.train_cnn_loss.result(), step=epoch)
                tf.summary.scalar("Train cnn Accuracy", self.train_cnn_accuracy.result(), step=epoch)
                tf.summary.scalar("Test cnn Loss", self.test_cnn_loss.result(), step=epoch)
                tf.summary.scalar("Test cnn Accuracy", self.test_cnn_accuracy.result(), step=epoch)

            self.train_cnn_loss.reset_state()
            self.train_cnn_accuracy.reset_state()
            self.test_cnn_loss.reset_state()
            self.test_cnn_accuracy.reset_state()

            # Save at requested interval
            if epoch % save_interval == 0 and epoch != 0:
                self.save_models(epoch, epoch_train_loss, epoch_train_acc, epoch_test_acc)

    def test(self, new_dataset):
        """Evaluate on ``new_dataset`` (an ``(x, y)`` pair) and print loss/accuracy."""
        x_new, y_new = new_dataset
        current_dataset = tf.data.Dataset.from_tensor_slices((x_new, y_new)).batch(self.batch_size)
        for new_batch in current_dataset:
            self.test_step(new_batch)

        epoch_new_loss = self.test_cnn_loss.result().numpy()
        epoch_new_acc = self.test_cnn_accuracy.result().numpy()
        print(f"Your dataset loss: {epoch_new_loss:0.4f}   accuracy:{epoch_new_acc:0.4f}")
        self.test_cnn_loss.reset_state()
        self.test_cnn_accuracy.reset_state()

    def save_models(self, epoch, train_loss, train_acc, test_acc):
        """Save the model and a text summary into a time-stamped sub-folder.

        Args:
            epoch: Epoch index, used in the folder name.
            train_loss, train_acc, test_acc: Metric values embedded in the
                saved model's directory name.
        """
        folder_name = pathlib.Path(self.model_save_folder) / (f"epoch {epoch} " + time.strftime("%Y-%m-%d %H %M"))
        # makedirs also creates missing parents and tolerates an existing folder.
        os.makedirs(folder_name, exist_ok=True)

        with open(folder_name / "cnn summary.txt", "w", encoding="utf-8") as sum_file:
            self.cnn.summary(print_fn=lambda line: sum_file.write(line + '\n'))
        model_dir_name = "cnn train loss %.3f acc %.3f test acc %0.3f" % (train_loss, train_acc, test_acc)
        self.cnn.save(str(folder_name / model_dir_name))

    def change_lr(self, new_rate):
        """Set the optimizer's learning rate to ``new_rate``."""
        self.cnn_opt.learning_rate.assign(new_rate)

In [13]:
# One output folder per run, stamped with the start time.
cnn_save_folder_name = MODEL_SAVE_FOLDER / "CNN" / ("at " + time.strftime("%Y-%m-%d %H %M"))
# parents=True: also creates MODEL_SAVE_FOLDER / "CNN" on a fresh checkout;
# exist_ok=True: re-running the cell within the same minute is harmless.
cnn_save_folder_name.mkdir(parents=True, exist_ok=True)

cnn_model = CNNAttack(cnn_save_folder_name, dataset = (x_train_norm, y_train, x_validation_norm, y_validation), batch_size=512)

Model: "cnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 33)]              0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 33, 1)             0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 33, 16)            64        
_________________________________________________________________
dropout_1 (Dropout)          (None, 33, 16)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 528)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 14)                7406      
_________________________________________________________________
output (Dense)               (None, 16)                24

In [48]:
# Train the CNN; with these arguments the model is saved once, at epoch 100.
cnn_model.train(100, save_interval=100)

epoch=0/ train loss=0.1371/ train accuracy=0.9732/ test loss=0.1323/ test accuracy=0.9719
epoch=1/ train loss=0.1358/ train accuracy=0.9732/ test loss=0.1314/ test accuracy=0.9719
epoch=2/ train loss=0.1352/ train accuracy=0.9732/ test loss=0.1303/ test accuracy=0.9719
epoch=3/ train loss=0.1341/ train accuracy=0.9732/ test loss=0.1294/ test accuracy=0.9719
epoch=4/ train loss=0.1334/ train accuracy=0.9732/ test loss=0.1286/ test accuracy=0.9719
epoch=5/ train loss=0.1323/ train accuracy=0.9732/ test loss=0.1276/ test accuracy=0.9719
epoch=6/ train loss=0.1312/ train accuracy=0.9732/ test loss=0.1267/ test accuracy=0.9719
epoch=7/ train loss=0.1305/ train accuracy=0.9732/ test loss=0.1259/ test accuracy=0.9719
epoch=8/ train loss=0.1296/ train accuracy=0.9732/ test loss=0.1250/ test accuracy=0.9719
epoch=9/ train loss=0.1286/ train accuracy=0.9732/ test loss=0.1241/ test accuracy=0.9719
epoch=10/ train loss=0.1279/ train accuracy=0.9732/ test loss=0.1233/ test accuracy=0.9719
epoch=11/

In [15]:
# Evaluate the in-memory CNN on the normalised test set; prints loss and accuracy.
cnn_model.test((x_test_norm, y_test))

Your dataset loss: 0.1592   accuracy:0.9847


In [14]:
# Reload a previously saved CNN checkpoint. The path is assembled from
# MODEL_SAVE_FOLDER component by component; the original literal contained the
# invalid escape sequence "\C" ("models\CNN").
cnn_checkpoint_path = (
    MODEL_SAVE_FOLDER
    / "CNN"
    / "at 2022-02-04 03 10"
    / "epoch 100 2022-02-04 03 13"
    / "cnn train loss 0.097 acc 1.000 test acc 1.000"
)
cnn_model.load_model(str(cnn_checkpoint_path))



c16 d10 dense14     98.47  first model

#### Testing

In [26]:
# Saved models used for the final evaluation. Components are joined with `/`
# so no OS-specific separator is embedded in a path segment.
BEST_MODELS_FOLDER = MODEL_SAVE_FOLDER / "best"
CNN_PATH = BEST_MODELS_FOLDER / "epoch 100 2022-02-04 03 13 best acc 98.47" / "cnn train loss 0.097 acc 1.000 test acc 1.000"
LSTM_PATH = BEST_MODELS_FOLDER / "epoch 100 2022-02-04 02 37 best acc 98" / "lstm train loss 0.023 acc 1.000 test acc 1.000"
# Shared accuracy metric; each evaluation cell resets it around its own use.
acc_metric = tf.keras.metrics.CategoricalAccuracy('accuracy', dtype=tf.float32)
# Encoded class index -> original label value, for readable CSV output.
reverse_label = {v: k for k, v in test_label_dict.items()}

In [None]:
# Evaluate the saved CNN on the test set and export predictions next to the features.
test_cnn = tf.keras.models.load_model(CNN_PATH)
cnn_out = test_cnn.predict(x_test_norm)

acc_metric.reset_state()        # start clean even if an earlier cell was interrupted
acc_metric.update_state(y_test, cnn_out)
cnn_acc = acc_metric.result().numpy()
print(cnn_acc)
acc_metric.reset_state()

# Build the prediction frame on the test frame's own index so that the
# column-wise concat lines rows up whatever that index looks like.
cnn_pd = pd.DataFrame(
    {
        "predicted": pd.Series(np.argmax(cnn_out, axis=1)).map(reverse_label).to_numpy(),
        "real": pd.Series(np.argmax(y_test, axis=1)).map(reverse_label).to_numpy(),
    },
    index=test_init_data.index,
)
cnn_pd = pd.concat([cnn_pd, test_init_data], axis=1)
cnn_pd.to_csv(MODEL_SAVE_FOLDER / "final_cnn_output.csv")

In [34]:
# Evaluate the saved LSTM on the test set and export predictions next to the features.
test_lstm = tf.keras.models.load_model(LSTM_PATH)
lstm_out = test_lstm.predict(x_test_norm)

acc_metric.reset_state()        # start clean even if an earlier cell was interrupted
acc_metric.update_state(y_test, lstm_out)
lstm_acc = acc_metric.result().numpy()
print(lstm_acc)
acc_metric.reset_state()

# Build the prediction frame on the test frame's own index so that the
# column-wise concat lines rows up whatever that index looks like.
lstm_pd = pd.DataFrame(
    {
        "predicted": pd.Series(np.argmax(lstm_out, axis=1)).map(reverse_label).to_numpy(),
        "real": pd.Series(np.argmax(y_test, axis=1)).map(reverse_label).to_numpy(),
    },
    index=test_init_data.index,
)
lstm_pd = pd.concat([lstm_pd, test_init_data], axis=1)
lstm_pd.to_csv(MODEL_SAVE_FOLDER / "final_lstm_output.csv")

0.97997004


### Pre-Processing

In [4]:
# Raw input files for the pre-processing stage.
# NOTE: TRAIN_FILE / TEST_FILE are re-bound here to the *raw* CSVs; the
# modelling cells use the same names for the processed files.
MAIN_FOLDER = pathlib.Path("F:\\ML\\venv3.9\\Scripts\\Moradzadeh\\First_Project\\New_Section")
TRAIN_FILE = MAIN_FOLDER / "data" / "CaseI-Attacks without any change.csv"
TEST_FILE = MAIN_FOLDER / "data" / "CaseIII-Attacks after DG integration.csv"


In [37]:
# Destinations for the cleaned data sets; joined with `/` so no OS-specific
# separator is embedded in a path segment.
processed_train_file = MAIN_FOLDER / "data" / "case1_train.csv"
processed_test_file = MAIN_FOLDER / "data" / "case3_test.csv"

def catter(label):
    """Convert a textual label into an integer class id.

    ``"Normal"`` becomes 0. Any other label is split on whitespace and the
    first all-digit token is returned as an int.

    Raises:
        IndexError: if the label is not ``"Normal"`` and has no all-digit token.
    """
    if label == "Normal":
        return 0

    numeric_tokens = []
    for token in label.split():
        if token.isdigit():
            numeric_tokens.append(int(token))
    return numeric_tokens[0]

def label_fix(label):
    """Collapse the doubled label codes 1010..1414 to 10..14.

    Any other value is returned unchanged.
    """
    doubled_to_fixed = (
        (1010, 10),
        (1111, 11),
        (1212, 12),
        (1313, 13),
        (1414, 14),
    )
    for doubled, fixed in doubled_to_fixed:
        if label == doubled:
            return fixed
    return label
    

# Load the raw training file, list any rows that contain NaN,
# then turn the textual labels into integer ids.
train_pd = pd.read_csv(TRAIN_FILE)
train_nan_rows = train_pd.loc[train_pd.isna().any(axis=1)]
print(train_nan_rows)
train_pd["Label"] = train_pd["Label"].apply(catter)
train_pd.describe()

Empty DataFrame
Columns: [Angle2, Angle3, Angle4, Angle5, Angle6, Angle7, Angle8, Angle9, Angle10, Angle11, Angle12, Angle13, Angle14, Branch1, Branch2, Branch3, Branch4, Branch5, Branch6, Branch7, Branch8, Branch9, Branch10, Branch11, Branch12, Branch13, Branch14, Branch15, Branch16, Branch17, Branch18, Branch19, Branch20, Label]
Index: []

[0 rows x 34 columns]


Unnamed: 0,Angle2,Angle3,Angle4,Angle5,Angle6,Angle7,Angle8,Angle9,Angle10,Angle11,...,Branch12,Branch13,Branch14,Branch15,Branch16,Branch17,Branch18,Branch19,Branch20,Label
count,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,...,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0
mean,-3.644671,-9.885094,-7.986397,-6.860374,-11.350325,-10.55182,-10.55182,-11.931749,-12.164337,-11.907007,...,6.103789,13.862711,-3.657806e-07,21.892834,4.804057,8.078886,-2.338343,1.22316,4.003366,10.242044
std,0.52268,1.167194,0.975713,0.852689,1.292482,1.27534,1.251084,1.427203,1.41686,1.37184,...,1.633094,3.198992,2.434109,5.329356,5.791752,1.935667,2.119438,2.048785,1.339676,13.294249
min,-4.935881,-12.809498,-10.437004,-8.957523,-14.576502,-13.703503,-13.703503,-15.460537,-15.720765,-15.354577,...,1.130133,4.089672,-7.042618,9.431774,-11.91827,2.808237,-10.444071,-5.988031,-0.099512,0.0
25%,-4.034503,-10.803379,-8.684925,-7.472805,-12.288738,-11.463168,-11.438667,-12.937993,-13.179128,-12.893056,...,5.507815,12.777745,0.0,19.060383,4.402054,7.426043,-2.612391,1.065833,3.382496,3.0
50%,-3.75108,-10.125673,-8.212203,-7.032201,-11.620262,-10.796308,-10.798035,-12.184635,-12.435873,-12.17231,...,6.357663,14.041871,0.0,22.170889,4.843327,8.212131,-2.425146,1.250523,4.120774,7.0
75%,-3.183308,-8.890593,-7.111775,-6.126686,-10.196569,-9.468049,-9.430181,-10.720062,-10.895193,-10.685274,...,6.530216,14.796432,0.0,23.938522,5.237946,8.565498,-1.919751,1.358263,4.40664,12.0
max,-2.292382,-6.725798,-5.534805,-4.750167,-8.130107,-7.449575,-7.449575,-8.47952,-8.656891,-8.508699,...,12.381601,26.28431,7.042618,38.202807,21.824992,14.317385,4.444975,8.413209,9.312726,59.0


In [38]:
# Load the raw test file, list any rows that contain NaN, then turn the
# textual labels into integer ids and collapse the doubled codes (1010 -> 10, ...).
test_pd = pd.read_csv(TEST_FILE)
test_nan_rows = test_pd.loc[test_pd.isna().any(axis=1)]
print(test_nan_rows)
test_pd["Label"] = test_pd["Label"].apply(catter).apply(label_fix)
test_pd.describe()

Empty DataFrame
Columns: [Angle2, Angle3, Angle4, Angle5, Angle6, Angle7, Angle8, Angle9, Angle10, Angle11, Angle12, Angle13, Angle14, Branch1, Branch2, Branch3, Branch4, Branch5, Branch6, Branch7, Branch8, Branch9, Branch10, Branch11, Branch12, Branch13, Branch14, Branch15, Branch16, Branch17, Branch18, Branch19, Branch20, Label]
Index: []

[0 rows x 34 columns]


Unnamed: 0,Angle2,Angle3,Angle4,Angle5,Angle6,Angle7,Angle8,Angle9,Angle10,Angle11,...,Branch12,Branch13,Branch14,Branch15,Branch16,Branch17,Branch18,Branch19,Branch20,Label
count,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,...,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0,10684.0
mean,-3.392706,-9.225727,-7.490133,-6.466731,-10.923167,-10.073946,-10.073946,-11.463768,-11.703609,-11.46277,...,6.081272,13.785654,8.093411e-07,22.049776,4.953864,8.178325,-2.188486,1.201756,3.904733,10.242231
std,0.609407,1.429952,1.155298,0.982335,1.396991,1.404112,1.384133,1.537724,1.52684,1.477777,...,1.578845,3.093755,2.320763,5.147532,5.563867,1.869044,2.047925,1.973673,1.307572,13.294288
min,-4.903846,-12.773253,-10.340843,-8.885402,-14.524701,-13.621052,-13.621052,-15.386231,-15.651771,-15.286318,...,1.15958,4.106071,-7.029676,9.431774,-11.852837,2.860472,-10.42713,-5.985022,-0.099512,0.0
25%,-3.84783,-10.288827,-8.365211,-7.202318,-11.96184,-11.114854,-11.106133,-12.572796,-12.816852,-12.549389,...,5.440254,12.673386,0.0,19.229689,4.540585,7.591718,-2.549857,1.046756,3.2365,3.0
50%,-3.497695,-9.475194,-7.736021,-6.636011,-11.309001,-10.416109,-10.46235,-11.869445,-12.169356,-11.909514,...,6.331139,13.989556,0.0,22.364242,4.986078,8.298088,-2.315938,1.224271,4.060306,7.0
75%,-2.93539,-8.191471,-6.614963,-5.711877,-9.822656,-8.973802,-8.973802,-10.282538,-10.500584,-10.296687,...,6.517659,14.716354,0.0,24.123869,5.363282,8.658781,-1.742206,1.341936,4.355144,12.0
max,-1.787035,-5.256002,-4.304178,-3.804625,-7.144336,-6.313136,-6.313136,-7.393421,-7.60575,-7.487309,...,12.376082,26.28431,7.03,38.173733,21.824992,14.317385,4.425189,8.401993,9.263333,59.0


In [8]:
# Persist the cleaned frames (index included) to the processed CSV files.
for cleaned_frame, destination in (
    (train_pd, processed_train_file),
    (test_pd, processed_test_file),
):
    cleaned_frame.to_csv(destination)

In [9]:
# Show the distinct label ids of each set (training first, then test).
for labelled_frame in (train_pd, test_pd):
    print(labelled_frame["Label"].unique())

[ 0  2  3  4  5  6  7  8  9 10 11 12 13 14 59 27]
[ 0  2  3  4  5  6  7  8  9 10 11 12 13 14 27 59]


In [39]:
def united_cat(df):
    """Re-encode the ``Label`` column as consecutive integers ``0..K-1``.

    The distinct labels found in ``df`` are sorted and numbered in that order.
    The input frame is left untouched; the encoding is applied to a copy so the
    calling cell can be re-run safely.

    Args:
        df: DataFrame that contains a ``Label`` column.

    Returns:
        ``(encoded_df, label_dict)`` where ``encoded_df`` is a copy of ``df``
        with the re-encoded ``Label`` column and ``label_dict`` maps each
        original label to its new index.
    """
    unq_labels = sorted(df["Label"].unique())
    label_dict = {label: index for index, label in enumerate(unq_labels)}
    encoded_df = df.copy()
    encoded_df["Label"] = encoded_df["Label"].map(label_dict)
    return encoded_df, label_dict

# Encode the labels of both sets as 0..K-1.
# NOTE(review): each call derives its mapping from its own frame, so the class
# indices of train and test only agree when both contain the same label set.
new_train_df, train_label_dict = united_cat(train_pd)
new_test_df, test_label_dict = united_cat(test_pd)

In [40]:
# Split each encoded frame into features and one-hot targets.
# drop() is used instead of pop() so the source frames keep their Label column
# and this cell can be re-run without a KeyError.
y_train = to_categorical(new_train_df["Label"])
x_train = new_train_df.drop(columns="Label")

y_test = to_categorical(new_test_df["Label"])
x_test = new_test_df.drop(columns="Label")   # the original referenced `new_test`, an undefined name

In [41]:
# Per-feature mean and population variance (ddof=0) of the training features.
# NOTE(review): the cell that writes these values to data\x_stats.csv is not
# part of this notebook; confirm how that file is produced.
mean_x = x_train.mean(axis=0)
var_x = x_train.var(axis=0, ddof=0)
print(mean_x, var_x)

Angle2     -3.644671e+00
Angle3     -9.885094e+00
Angle4     -7.986397e+00
Angle5     -6.860374e+00
Angle6     -1.135032e+01
Angle7     -1.055182e+01
Angle8     -1.055182e+01
Angle9     -1.193175e+01
Angle10    -1.216434e+01
Angle11    -1.190701e+01
Angle12    -1.224495e+01
Angle13    -1.238503e+01
Angle14    -1.318324e+01
Branch1     1.075064e+02
Branch2     5.368369e+01
Branch3     5.501638e+01
Branch4     4.297721e+01
Branch5     3.227778e+01
Branch6    -1.937585e+01
Branch7    -4.667018e+01
Branch8     2.189283e+01
Branch9     1.277686e+01
Branch10    3.336323e+01
Branch11    4.884840e+00
Branch12    6.103789e+00
Branch13    1.386271e+01
Branch14   -3.657806e-07
Branch15    2.189283e+01
Branch16    4.804057e+00
Branch17    8.078886e+00
Branch18   -2.338343e+00
Branch19    1.223160e+00
Branch20    4.003366e+00
dtype: float64 Angle2        0.273169
Angle3        1.362215
Angle4        0.951927
Angle5        0.727011
Angle6        1.670353
Angle7        1.626340
Angle8        1.565065

In [54]:
# Standardise both splits with the stored per-feature statistics
# (rows 'mean' and 'var' of x_stats.csv).
x_stats = pd.read_csv(MAIN_FOLDER / "data\\x_stats.csv", index_col=0)
feature_mean = x_stats.loc['mean']
feature_variance = x_stats.loc['var']
x_norm_layer = tf.keras.layers.experimental.preprocessing.Normalization(
    mean=feature_mean,
    variance=feature_variance,
)
x_train_norm, x_test_norm = (x_norm_layer(split) for split in (x_train, x_test))

Angle2        0.273169
Angle3        1.362215
Angle4        0.951927
Angle5        0.727011
Angle6        1.670353
Angle7        1.626340
Angle8        1.565065
Angle9        2.036718
Angle10       2.007305
Angle11       1.881770
Angle12       1.896029
Angle13       1.885333
Angle14       2.202255
Branch1     229.063933
Branch2      42.384365
Branch3      33.237365
Branch4      21.426008
Branch5      12.134427
Branch6       7.356930
Branch7      80.801109
Branch8       8.935240
Branch9       2.435188
Branch10     12.809927
Branch11      4.248541
Branch12      2.666747
Branch13     10.232591
Branch14      5.924334
Branch15     28.399375
Branch16     33.541248
Branch17      3.746455
Branch18      4.491599
Branch19      4.197129
Branch20      1.794563
Name: var, dtype: float64

In [55]:
# Display the loaded normalisation statistics (rows: mean, var).
x_stats

Unnamed: 0,Angle2,Angle3,Angle4,Angle5,Angle6,Angle7,Angle8,Angle9,Angle10,Angle11,...,Branch11,Branch12,Branch13,Branch14,Branch15,Branch16,Branch17,Branch18,Branch19,Branch20
mean,-3.644671,-9.885094,-7.986397,-6.860374,-11.350325,-10.55182,-10.55182,-11.931749,-12.164337,-11.907007,...,4.88484,6.103789,13.862711,-3.66e-07,21.892834,4.804057,8.078886,-2.338343,1.22316,4.003366
var,0.273169,1.362215,0.951927,0.727011,1.670353,1.62634,1.565065,2.036718,2.007305,1.88177,...,4.248541,2.666747,10.232591,5.924334,28.399375,33.541248,3.746455,4.491599,4.197129,1.794563
