In [1]:
from os import listdir
import os as os
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import ast
import sklearn_crfsuite
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.optimizers import Adam, RMSprop, SGD, Adagrad, Adadelta, Adamax, Nadam, Ftrl
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from itertools import product

In [2]:
#directory with data
directories = ["extra_data_primary","extra_data_secondary"]

description_nlp = "Log solution.csv"

csv_files = []
solutions = {}
descriptions = {}

# get all the information within the detailed version csv
for directory in directories:
    filenames = listdir(directory)
    for name in filenames:
        if name.endswith(".csv"):
            if name == description_nlp:
                # first read all the files that contain in the log solution file
                df = pd.read_csv(directory+"/"+name, encoding="utf-8", sep='delimiter', header=None, engine='python')
                for index, row in df.iterrows():
                    if index != 0:
                        for i in row:
                            line = i.split(";")
                            if line[0]:
                                file = line[0]
                                if (line[0].endswith(".csv")):
                                    file = line[0].split(".csv")[0]
                                # get the files with suffix equal to 'detailed_version' and its corresponding solution and short version description
                                if os.path.exists(directory+"/"+file+ "_detailed_version.csv"):
                                    csv_files.append(directory+"/"+file+ "_detailed_version.csv")
                                    solutions[directory+"/"+file+ "_detailed_version.csv"] = line[2]
                                    descriptions[directory+"/"+file+ "_detailed_version.csv"] = line[3]

In [3]:
# read csv file
errors = {}
times = {}
error_group_description = {}
for csv_file in csv_files:
    df = pd.read_csv(csv_file, encoding="utf-8", sep='delimiter', header=None, engine='python')
    group_errors = []
    subgroup_id = []
    subgroup_description = []
    lines = []
    for index, row in df.iterrows():
        if index != 0:
            for i in row:
                line = i.split(";")
                lines.append(line)
    if lines:
        lines = lines[::-1]
        # set the group id to group the events
        previous_group_id = lines[0][3]
        subgroup_time = []
        # add the time of the first event
        subgroup_time.append(lines[0][0])
        group_errors = []
        subgroup_errors = []
        subgroup_description = []
        # if the corresponding description of event is not set
        description_set = False
        
        for line in lines:
            # get the time
            time = line[0]
            # get the error code and failure node id
            error = ast.literal_eval(line[1])
            hex_value, decimal_value = error
            # get the group if of each event
            group_id = line[3]
            # get the error description
            error_description = line[4]
            # events with same group id are added to same subgroup
            if group_id == previous_group_id:
                subgroup_errors.append((int(hex_value, 16), int(decimal_value)))
                if description_set == False:
                    if error_description and error_description != '-':
                        subgroup_description.append(error_description)
                        description_set = True
            else:
                # warning if description of error is not set
                if not description_set:
                    print("Group",previous_group_id,"of",csv_file,"has no descriptions")
                # append the previous pattern
                group_errors.append(subgroup_errors)
                # new subgroup
                subgroup_errors = []
                subgroup_errors.append((int(hex_value, 16), int(decimal_value)))
                subgroup_time.append(time)
                # description of new subgroup
                description_set = False
                if error_description and error_description != '-':
                    subgroup_description.append(error_description)
                    description_set = True
                previous_group_id = group_id
        group_errors.append(subgroup_errors)
        errors[csv_file] = group_errors
        times[csv_file] = subgroup_time
        error_group_description[csv_file] = subgroup_description
    else:
        print(csv_file,"is empty")

In [4]:
# system description
system_description = []
# error sequences
sequences = []
# labes for each error sequence
labels = []
for csv_file in csv_files:
    for index,sequence in enumerate(errors[csv_file]):
        system_description.append(descriptions[csv_file])
        sequences.append(sequence)
        labels.append(error_group_description[csv_file][index])

In [5]:
# Initialize dictionaries for error codes and failure node IDs
dict_error = {}
dict_failure_node_id = {}
# Initialize indexing variables for error codes and failure node IDs
index_error = 1
index_failure = 1
max_length = 0
# Iterate through each sequence in the list of sequences
for sequence in sequences:
    # Update max_length if the current sequence is longer than the previous maximum
    if len(sequence) > max_length:
        max_length = len(sequence)
    # Iterate through each (error, failure_node_id) pair in the sequence
    for (error,failure_node_id) in sequence:
        # If the error is not already in the dictionary, add it with a new index
        if error not in dict_error:
            dict_error[error] = index_error
            index_error += 1 # Increment the index for the next unique error
        # If the failure_node_id is not already in the dictionary, add it with a new index
        if failure_node_id not in dict_failure_node_id:
            dict_failure_node_id[failure_node_id] = index_failure
            index_failure += 1

# Function to get the index of an error code from dict_error
def get_dict_error(error):
    if error in dict_error:
        return dict_error[error]
    else:
        return 0
        
# Function to get the index of a failure node ID from dict_failure_node_id
def get_dict_node_id(node_id):
    if node_id in dict_failure_node_id:
        return dict_failure_node_id[node_id]
    else:
        return 0

In [6]:
# 3 functions for observation level feature function

def get_transformed_multi(system_description,sequences,labels):
    transformed_data = []
    transformed_labels = []
    for lbl, seq, txt in zip(labels, sequences, system_description):
        transformed_sequence = [(txt, feat) for feat in seq]
        transformed_data.append(transformed_sequence)
        transformed_labels.append([lbl] * len(seq))
    return transformed_data,transformed_labels

def multi_label_extract_features(observation_sequence, t):
    normalized = True
    if normalized:
        x = len(dict_error)
        y = len(dict_failure_node_id)
    else:
        x = 1
        y = 1

    text = observation_sequence[t][0]
    if t == 0:  
        features = {
            'observation': observation_sequence[t][0],
            'error': get_dict_error(observation_sequence[t][1][0])/x,
            'failure_node': get_dict_node_id(observation_sequence[t][1][1])/y,
        }
    else:
        features = {
            'observation': " ",
            'error': get_dict_error(observation_sequence[t][1][0])/x,
            'failure_node': get_dict_node_id(observation_sequence[t][1][1])/y,
        }
    if t == 0:
        features.update({
            'prev_observation': '<START>',
            'prev_error': '<START>',
            'prev_feature': '<START>',
        })
    elif t == 1:
        features.update({
            'prev_observation': observation_sequence[t-1][0],
            'prev_error': get_dict_error(observation_sequence[t-1][1][0])/x,
            'prev_failure_node': get_dict_node_id(observation_sequence[t-1][1][1])/y,
        })
    else:
        features.update({
            'prev_observation': " ",
            'prev_error': get_dict_error(observation_sequence[t-1][1][0])/x,
            'prev_failure_node': get_dict_node_id(observation_sequence[t-1][1][1])/y,
        })
    return features


def get_features_multi_label(transformed_data):
    X_train_features = []
    for data in transformed_data:
        X_train_features.append([multi_label_extract_features(data, i) for i in range(len(data))])
    return X_train_features

In [7]:
# 3 functions for sequence level feature function
def get_transformed_sequence(system_description,sequences):
    length = len(system_description)
    observation_sequences = []
    for i in range(length):
        observation_sequences.append((system_description[i],sequences[i]))
    return observation_sequences

def single_label_extract_features(observation_sequence):
    features = {}
    normalized = True
    if normalized:
        x = len(dict_error)
        y = len(dict_failure_node_id)
    else:
        x = 1
        y = 1
    features['observation'] = observation_sequence[0]
    event_sequence = observation_sequence[1]
    length = len(event_sequence)
    for i in range(max_length):
        if i < length:
            features[f'error_{i}'] = get_dict_error(event_sequence[i][0])/x
            features[f'failure_node_{i}'] = get_dict_node_id(event_sequence[i][1])/y
        else:
            features[f'error_{i}'] = 0
            features[f'failure_node_{i}'] = 0
    return features


def get_features_sequence_label(observation_sequences):
    X_train_sequence = []
    y_train_sequence = []
    for observation_sequence in observation_sequences:
        X_train_sequence.append([single_label_extract_features(observation_sequence)])
    for label in labels:
        y_train_sequence.append([label])
    return X_train_sequence, y_train_sequence

In [8]:
# Tokenize the system descriptions to prepare them for LSTM input
description_tokenizer = Tokenizer()
description_tokenizer.fit_on_texts(system_description)
text_sequences = description_tokenizer.texts_to_sequences(system_description)

# Find the maximum length of the sequences to ensure consistent input shape for the LSTM
max_text_len = max(len(seq) for seq in text_sequences)
# Pad all sequences to the same length as the longest sequence, adding zeros to the end (post-padding)
lstm_description = pad_sequences(text_sequences, maxlen=max_text_len, padding='post')


# Function to transform each event sequence into a fixed-length sequence of numerical values
def transform_sequence(seq):
    # Convert each (error, node_id) pair in the sequence to normalized values based on dictionaries
    transformed = [(get_dict_error(error)/len(dict_error), get_dict_node_id(node_id)/len(dict_failure_node_id)) for error, node_id in seq]
    while len(transformed) < max_length:
        transformed.append((0, 0))
    return transformed

# Function to pad a list of event sequences for LSTM input
def pad_events_lstm(sequences):
    event_sequences_padded = [transform_sequence(seq) for seq in sequences]
    return event_sequences_padded

# Apply padding and transformation to the original sequences to prepare for LSTM training
lstm_sequences = pad_events_lstm(sequences)

# Tokenize the labels for the output
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)
label_sequences = label_tokenizer.texts_to_sequences(labels)

# Initialize variables to map unique labels to numerical values
numerical_value = 1
lstm_label = [] # List to store the numerical representation of the labels
dict_label = {} # Dictionary to map original label sequences to their numerical representation
for index,label_sequence in enumerate(label_sequences):
    found = False
    for key in dict_label:
        # if the current label sequence is already in the dictionary, map to the existing numerical value
        if dict_label[key][1] == label_sequence:
            found = True
            lstm_label.append(dict_label[key][0])
    if not found:
        # If the label sequence is not found in the dictionary, create and map to the incremented numerical value
        key = labels[index]
        dict_label[key] = [numerical_value,label_sequence]
        lstm_label.append(numerical_value)
        numerical_value += 1


In [10]:
# convertng the data for model training
lstm_label_categorical = to_categorical(np.array(lstm_label) - 1)
lstm_description = np.array(lstm_description)
lstm_sequences = np.array(lstm_sequences)

In [12]:
# split the dataset into training and test dataset
X_train_desc, X_test_desc, X_train_seq, X_test_seq, y_train, y_test = train_test_split(lstm_description, lstm_sequences, lstm_label_categorical, test_size=0.2, random_state=0)

Number of unique classes in y_train: 38
Number of unique classes in y_test: 14
Classes in y_train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 31 32 33 34 35 36 37 38]
Counts in y_train: [ 3  2 20  1  1  1  2  3  1  2  1  4  2  1  1  1  1  2  1  2  1  2  3  2
  2  1  5  2  2  2  3  4  1  2  1  6  1  7]
Classes in y_test: [ 0  2  3  4  7 11 19 21 23 26 30 31 32 33]
Counts in y_test: [1 9 1 1 1 1 1 1 1 1 2 1 1 3]


In [13]:
# hyperparameter values
param_grid = {
    'layers': np.arange(1, 6, 1),
    'neurons': np.arange(16,64,8),
    'optimizer': ['adam', 'rmsprop', 'sgd'],
    'learning_rate': [1e-2,5e-2,1e-3,5e-3,1e-4,5e-4],
    'epochs': np.arange(50, 250, 50),
    'batch_size': np.arange(8,32,8)
}

param_combinations = list(product(
    param_grid['layers'], 
    param_grid['neurons'], 
    param_grid['optimizer'], 
    param_grid['learning_rate'], 
    param_grid['epochs'], 
    param_grid['batch_size']
))

1440
[0.01, 0.05, 0.001, 0.005, 0.0001, 0.0005]


In [14]:
 # List to store the results of each hyperparameter configuration
results = []

for params in param_combinations:
    layers, neurons, optimizer, learning_rate, epochs, batch_size = params
    neurons = int(neurons)  # Ensure that 'neurons' is an integer
    
    # Define input and LSTM layers for the text data
    text_input = Input(shape=(X_train_desc.shape[1],), name='text_input')
    embedding = Embedding(input_dim=np.max(X_train_desc) + 1, output_dim=50)(text_input)
    x = LSTM(neurons, return_sequences=True)(embedding)
    
    # Add additional LSTM layers if specified
    for _ in range(layers - 2):
        x = LSTM(neurons, return_sequences=True)(x)
    
    # Final LSTM layer for text input
    text_lstm = LSTM(neurons)(x)
    
    # Define input and LSTM layers for the sequential data (event sequences)
    seq_input = Input(shape=(X_train_seq.shape[1], X_train_seq.shape[2]), name='seq_input')
    y = LSTM(neurons, return_sequences=True)(seq_input)
    
    # Add additional LSTM layers if specified
    for _ in range(layers - 2):
        y = LSTM(neurons, return_sequences=True)(y)
    
    # Final LSTM layer for sequence input
    seq_lstm = LSTM(neurons)(y)
    
    # Concatenate the outputs of the text and sequence LSTM branches
    concat = Concatenate()([text_lstm, seq_lstm])
    # Output layer for classification, using softmax activation for multi-class classification
    output = Dense(y_train.shape[1], activation='softmax')(concat)

    # Create the model using the specified input and output layers
    model = Model(inputs=[text_input, seq_input], outputs=output)
    
    if optimizer == 'adam':
        opt = Adam(learning_rate=learning_rate)
    elif optimizer == 'rmsprop':
        opt = RMSprop(learning_rate=learning_rate)
    elif optimizer == 'sgd':
        opt = SGD(learning_rate=learning_rate)
    
    # Compile the model with categorical crossentropy loss and accuracy as the metric
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    
    # Train the model using the training data and specified hyperparameters
    model.fit([X_train_desc, X_train_seq], y_train, epochs=epochs, batch_size=batch_size, verbose=0)
    
    # Make predictions on the test data
    y_pred = model.predict([X_test_desc, X_test_seq])
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)
    
    # Calculate performance metrics for the model
    accuracy = accuracy_score(y_test_classes, y_pred_classes)
    precision = precision_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)
    recall = recall_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)
    f1 = f1_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)
    
    # Append the current hyperparameter configuration and performance metrics to the results list
    results.append({
        'layers': layers,
        'neurons': neurons,
        'optimizer': optimizer,
        'learning_rate': learning_rate,
        'epochs': epochs,
        'batch_size': batch_size,
        'accuracy': round(accuracy, 4),
        'precision': round(precision, 4),
        'recall': round(recall, 4),
        'f1_score': round(f1, 4)
    })

# Save the configuration and result in csv file
results_df = pd.DataFrame(results)
results_df.to_csv("lstm_hyperparameter_results.csv", sep=';', encoding="utf-8", index=True)

print("Best accuracy: index =", results_df['accuracy'].idxmax(), results_df.loc[results_df['accuracy'].idxmax()])
print("Best precision: index =", results_df['precision'].idxmax(), results_df.loc[results_df['precision'].idxmax()])
print("Best recall: index =", results_df['recall'].idxmax(), results_df.loc[results_df['recall'].idxmax()])
print("Best f1_score: index =", results_df['f1_score'].idxmax(), results_df.loc[results_df['f1_score'].idxmax()])


In [15]:
# Transform the data for first CRF model usage
transformed_data, y_train_multi = get_transformed_multi(system_description,sequences,labels)
X_train_multi = get_features_multi_label(transformed_data)

# Split the dataset
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(X_train_multi, y_train_multi, test_size=0.2, random_state=0)

# hyperparameter values
param_grid = {
    'c1': np.arange(0, 1, 0.1),
    'c2': np.arange(0, 1, 0.1),
    'min_freq': np.arange(0, 8, 1),
    'max_iterations': np.arange(50, 2000, 50),
    'all_possible_transitions': [True, False],
    'all_possible_states': [True, False],
    'algorithms':['lbfgs','l2sgd']
}

param_combinations = list(ParameterGrid(param_grid))

249600


In [15]:
 # List to store the results of each hyperparameter configuration
results = []

for params in param_combinations:
    if params['algorithms'] == 'lbfgs':
        crf = sklearn_crfsuite.CRF(
            algorithm=params['algorithms'],
            c1=params['c1'],
            c2=params['c2'],
            min_freq=params['min_freq'],
            max_iterations=params['max_iterations'],
            all_possible_transitions=params['all_possible_transitions'],
            all_possible_states=params['all_possible_states']
        )
    else:
        crf = sklearn_crfsuite.CRF(
            algorithm=params['algorithms'],
            c2=params['c2'],
            min_freq=params['min_freq'],
            max_iterations=params['max_iterations'],
            all_possible_transitions=params['all_possible_transitions'],
            all_possible_states=params['all_possible_states']
        )
    
    # Train the model
    crf.fit(X_train_multi, y_train_multi)
    # Calculate the prediction
    y_pred_multi = crf.predict(X_test_multi)
    
    # Calculate evaluation metrics
    accuracy = metrics.flat_accuracy_score(y_test_multi, y_pred_multi)
    precision = metrics.flat_precision_score(y_test_multi, y_pred_multi, average='weighted', labels=labels, zero_division=0)
    recall = metrics.flat_recall_score(y_test_multi, y_pred_multi, average='weighted', labels=labels, zero_division=0)
    f1 = metrics.flat_f1_score(y_test_multi, y_pred_multi, average='weighted', labels=labels, zero_division=0)
    
    # Store the results in a single line format
    if params['algorithms'] == 'lbfgs':
        results.append({
            'c1': params['c1'],
            'c2': params['c2'],
            'min_freq': params['min_freq'],
            'max_iterations': params['max_iterations'],
            'all_possible_transitions': params['all_possible_transitions'],
            'all_possible_states': params['all_possible_states'],
            'algorithms':params['algorithms'],
            'accuracy': round(accuracy,4),
            'precision': round(precision,4),
            'recall': round(recall,4),
            'f1_score': round(f1,4)
        })
    else:
        results.append({
            'c1': 0,
            'c2': params['c2'],
            'min_freq': params['min_freq'],
            'max_iterations': params['max_iterations'],
            'all_possible_transitions': params['all_possible_transitions'],
            'all_possible_states': params['all_possible_states'],
            'algorithms':params['algorithms'],
            'accuracy': round(accuracy,4),
            'precision': round(precision,4),
            'recall': round(recall,4),
            'f1_score': round(f1,4)
        })
        
# Save the configuration and result in csv file
results_df = pd.DataFrame(results)
results_df.to_csv("multi_hyperparameter_results.csv",sep=';' , encoding="utf-8", index=True)

print("accuracy: index =",results_df['accuracy'].idxmax(),results_df.loc[results_df['accuracy'].idxmax()])

print("precision: index =",results_df['precision'].idxmax(),results_df.loc[results_df['precision'].idxmax()])

print("recall: index =",results_df['recall'].idxmax(),results_df.loc[results_df['recall'].idxmax()])

print("f1_score: index =",results_df['f1_score'].idxmax(),results_df.loc[results_df['f1_score'].idxmax()])


accuracy: index = 34328 c1                             0.1
c2                             0.0
min_freq                         0
max_iterations                 100
all_possible_transitions     False
all_possible_states           True
algorithms                   lbfgs
accuracy                    0.6429
precision                   0.7847
recall                      0.8491
f1_score                    0.8087
Name: 34328, dtype: object
precision: index = 140736 c1                             0.0
c2                             0.1
min_freq                         0
max_iterations                 200
all_possible_transitions      True
all_possible_states           True
algorithms                   l2sgd
accuracy                    0.2321
precision                   0.9194
recall                      0.2347
f1_score                    0.3542
Name: 140736, dtype: object
recall: index = 143913 c1                             0.0
c2                             0.1
min_freq                        

In [16]:
# Transform the data for second CRF model usage
observation_sequences = get_transformed_sequence(system_description,sequences)
X_train_sequence, y_train_sequence = get_features_sequence_label(observation_sequences)

# Split the datasest
X_train_sequence, X_test_sequence, y_train_sequence, y_test_sequence = train_test_split(X_train_sequence,  y_train_sequence, test_size=0.2, random_state=0)


In [17]:
 # List to store the results of each hyperparameter configuration
results = []

for params in param_combinations:
    if params['algorithms'] == 'lbfgs':
        crf = sklearn_crfsuite.CRF(
            algorithm=params['algorithms'],
            c1=params['c1'],
            c2=params['c2'],
            min_freq=params['min_freq'],
            max_iterations=params['max_iterations'],
            all_possible_transitions=params['all_possible_transitions'],
            all_possible_states=params['all_possible_states']
        )
    else:
        crf = sklearn_crfsuite.CRF(
            algorithm=params['algorithms'],
            c2=params['c2'],
            min_freq=params['min_freq'],
            max_iterations=params['max_iterations'],
            all_possible_transitions=params['all_possible_transitions'],
            all_possible_states=params['all_possible_states']
        )
    
    # Train the model
    crf.fit(X_train_sequence, y_train_sequence)
    # Calculate the prediction
    y_pred_sequence = crf.predict(X_test_sequence)
    
    # Calculate evaluation metrics
    accuracy = metrics.flat_accuracy_score(y_test_sequence, y_pred_sequence)
    precision = metrics.flat_precision_score(y_test_sequence, y_pred_sequence, average='weighted', labels=labels, zero_division=0)
    recall = metrics.flat_recall_score(y_test_sequence, y_pred_sequence, average='weighted', labels=labels, zero_division=0)
    f1 = metrics.flat_f1_score(y_test_sequence, y_pred_sequence, average='weighted', labels=labels, zero_division=0)
    
    # Store the results in a single line format
    if params['algorithms'] == 'lbfgs':
        results.append({
            'c1': params['c1'],
            'c2': params['c2'],
            'min_freq': params['min_freq'],
            'max_iterations': params['max_iterations'],
            'all_possible_transitions': params['all_possible_transitions'],
            'all_possible_states': params['all_possible_states'],
            'algorithms':params['algorithms'],
            'accuracy': round(accuracy,4),
            'precision': round(precision,4),
            'recall': round(recall,4),
            'f1_score': round(f1,4)
        })
    else:
        results.append({
            'c1': 0,
            'c2': params['c2'],
            'min_freq': params['min_freq'],
            'max_iterations': params['max_iterations'],
            'all_possible_transitions': params['all_possible_transitions'],
            'all_possible_states': params['all_possible_states'],
            'algorithms':params['algorithms'],
            'accuracy': round(accuracy,4),
            'precision': round(precision,4),
            'recall': round(recall,4),
            'f1_score': round(f1,4)
        })

# Save the configuration and results in csv file
results_df = pd.DataFrame(results)
results_df.to_csv("sequenced_hyperparameter_results.csv",sep=';' , encoding="utf-8", index=True)

print("accuracy: index =",results_df['accuracy'].idxmax(),results_df.loc[results_df['accuracy'].idxmax()])

print("precision: index =",results_df['precision'].idxmax(),results_df.loc[results_df['precision'].idxmax()])

print("recall: index =",results_df['recall'].idxmax(),results_df.loc[results_df['recall'].idxmax()])

print("f1_score: index =",results_df['f1_score'].idxmax(),results_df.loc[results_df['f1_score'].idxmax()])


accuracy: index = 0 c1                             0.0
c2                             0.0
min_freq                         0
max_iterations                  50
all_possible_transitions      True
all_possible_states           True
algorithms                   lbfgs
accuracy                      0.56
precision                   0.8283
recall                      0.7329
f1_score                    0.7683
Name: 0, dtype: object
precision: index = 0 c1                             0.0
c2                             0.0
min_freq                         0
max_iterations                  50
all_possible_transitions      True
all_possible_states           True
algorithms                   lbfgs
accuracy                      0.56
precision                   0.8283
recall                      0.7329
f1_score                    0.7683
Name: 0, dtype: object
recall: index = 68640 c1                             0.2
c2                             0.0
min_freq                         0
max_iterations  

In [20]:
# Load the CSV files again
lstm_results = pd.read_csv('lstm_hyperparameter_results.csv', sep=';')

In [21]:
def get_top_5_by_metric(df, optimizer, metric):
    return df[df['optimizer'] == optimizer].nlargest(5, metric)

def extract_params(top_5_df):
    params_list = []
    for _, row in top_5_df.iterrows():
        params = {
            'layers': row['layers'],
            'neurons': row['neurons'],
            'learning_rate': row['learning_rate'],
            'epochs': row['epochs'],
            'batch_size': row['batch_size'],
        }
        params_list.append(params)
    return params_list

In [46]:
top_5_lstm_adam_accuracy = get_top_5_by_metric(lstm_results, 'adam', 'accuracy')
top_5_lstm_adam_precision = get_top_5_by_metric(lstm_results, 'adam', 'precision')
top_5_lstm_adam_recall = get_top_5_by_metric(lstm_results, 'adam', 'recall')
top_5_lstm_adam_f1_score = get_top_5_by_metric(lstm_results, 'adam', 'f1_score')

top_5_adam_accuracy_params = extract_params(top_5_lstm_adam_accuracy)
top_5_adam_precision_params = extract_params(top_5_lstm_adam_precision)
top_5_adam_recall_params = extract_params(top_5_lstm_adam_recall)
top_5_adam_f1_score_params = extract_params(top_5_lstm_adam_f1_score)

top_5_lstm_rmsprop_accuracy = get_top_5_by_metric(lstm_results, 'rmsprop', 'accuracy')
top_5_lstm_rmsprop_precision = get_top_5_by_metric(lstm_results, 'rmsprop', 'precision')
top_5_lstm_rmsprop_recall = get_top_5_by_metric(lstm_results, 'rmsprop', 'recall')
top_5_lstm_rmsprop_f1_score = get_top_5_by_metric(lstm_results, 'rmsprop', 'f1_score')

top_5_rmsprop_accuracy_params = extract_params(top_5_lstm_rmsprop_accuracy)
top_5_rmsprop_precision_params = extract_params(top_5_lstm_rmsprop_precision)
top_5_rmsprop_recall_params = extract_params(top_5_lstm_rmsprop_recall)
top_5_rmsprop_f1_score_params = extract_params(top_5_lstm_rmsprop_f1_score)

top_5_lstm_sgd_accuracy = get_top_5_by_metric(lstm_results, 'sgd', 'accuracy')
top_5_lstm_sgd_precision = get_top_5_by_metric(lstm_results, 'sgd', 'precision')
top_5_lstm_sgd_recall = get_top_5_by_metric(lstm_results, 'sgd', 'recall')
top_5_lstm_sgd_f1_score = get_top_5_by_metric(lstm_results, 'sgd', 'f1_score')

top_5_sgd_accuracy_params = extract_params(top_5_lstm_sgd_accuracy)
top_5_sgd_precision_params = extract_params(top_5_lstm_sgd_precision)
top_5_sgd_recall_params = extract_params(top_5_lstm_sgd_recall)
top_5_sgd_f1_score_params = extract_params(top_5_lstm_sgd_f1_score)

params_list = top_5_adam_accuracy_params + top_5_adam_precision_params + top_5_adam_recall_params + top_5_adam_f1_score_params + top_5_rmsprop_accuracy_params + top_5_rmsprop_precision_params + top_5_rmsprop_recall_params + top_5_rmsprop_f1_score_params + top_5_sgd_accuracy_params + top_5_sgd_precision_params + top_5_sgd_recall_params + top_5_sgd_f1_score_params

unique_params_set = {tuple(param.items()) for param in params_list}
unique_params_list = [dict(param) for param in unique_params_set]

print(len(unique_params_list))

24


In [51]:
def get_top_5_by_metric(df, optimizer, metric):
    return df[df['algorithms'] == optimizer].nlargest(5, metric)

def extract_params(top_5_df,feature_function):
    params_list = []
    for _, row in top_5_df.iterrows():
        params = {
            'feature_function': feature_function,
            'algorithm': row['algorithms'],
            'c1': row['c1'],
            'c2': row['c2'],
            'max_iterations': row['max_iterations'],
            'min_freq': row['min_freq'],
            'possible_states': row['all_possible_states'],
            'possible_transitions': row['all_possible_transitions'],
            'accuracy': row['accuracy'],
            'precision': row['precision'],
            'recall': row['recall'],
            'f1_score': row['f1_score']
        }
        params_list.append(params)
    return params_list

In [40]:
crf_multi_results = pd.read_csv('multi_hyperparameter_results.csv', sep=';')

top_5_multi_lbfgs_accuracy = get_top_5_by_metric(crf_multi_results, 'lbfgs', 'accuracy')
top_5_multi_lbfgs_precision = get_top_5_by_metric(crf_multi_results, 'lbfgs', 'precision')
top_5_multi_lbfgs_recall = get_top_5_by_metric(crf_multi_results, 'lbfgs', 'recall')
# top_5_multi_lbfgs_f1_score = get_top_5_by_metric(crf_multi_results, 'lbfgs', 'f1_score')

print(top_5_multi_lbfgs_accuracy)
top_5_multi_lbfgs_accuracy_params = extract_params(top_5_multi_lbfgs_accuracy,'observation-level')
top_5_multi_lbfgs_precision_params = extract_params(top_5_multi_lbfgs_precision,'observation-level')
top_5_multi_lbfgs_recall_params = extract_params(top_5_multi_lbfgs_recall,'observation-level')
top_5_multi_lbfgs_f1_score_params = extract_params(top_5_multi_lbfgs_f1_score,'observation-level')

top_5_multi_l2sgd_accuracy = get_top_5_by_metric(crf_multi_results, 'l2sgd', 'accuracy')
top_5_multi_l2sgd_precision = get_top_5_by_metric(crf_multi_results, 'l2sgd', 'precision')
top_5_multi_l2sgd_recall = get_top_5_by_metric(crf_multi_results, 'l2sgd', 'recall')
top_5_multi_l2sgd_f1_score = get_top_5_by_metric(crf_multi_results, 'l2sgd', 'f1_score')
print(top_5_multi_l2sgd_accuracy)

top_5_multi_l2sgd_accuracy_params = extract_params(top_5_multi_l2sgd_accuracy,'observation-level')
top_5_multi_l2sgd_precision_params = extract_params(top_5_multi_l2sgd_precision,'observation-level')
top_5_multi_l2sgd_recall_params = extract_params(top_5_multi_l2sgd_recall,'observation-level')
top_5_multi_l2sgd_f1_score_params = extract_params(top_5_multi_l2sgd_f1_score,'observation-level')

params_list = top_5_multi_lbfgs_accuracy_params + top_5_multi_lbfgs_precision_params + top_5_multi_lbfgs_recall_params + top_5_multi_lbfgs_f1_score_params + top_5_multi_l2sgd_accuracy_params + top_5_multi_l2sgd_precision_params + top_5_multi_l2sgd_recall_params + top_5_multi_l2sgd_f1_score_params  
unique_params_set = {tuple(param.items()) for param in params_list}
unique_params_list = [dict(param) for param in unique_params_set]

best_results_df = pd.DataFrame(unique_params_list)

best_results_df.to_csv('best_results_multi.csv', sep=';', encoding="utf-8", index=True)

       Unnamed: 0   c1   c2  min_freq  max_iterations  \
34328       34328  0.1  0.0         0             100   
65536       65536  0.1  0.0         0             150   
65544       65544  0.1  0.0         0             200   
65552       65552  0.1  0.0         0             250   
65560       65560  0.1  0.0         0             300   

       all_possible_transitions  all_possible_states algorithms  accuracy  \
34328                     False                 True      lbfgs    0.6429   
65536                      True                False      lbfgs    0.6071   
65544                      True                False      lbfgs    0.6071   
65552                      True                False      lbfgs    0.6071   
65560                      True                False      lbfgs    0.6071   

       precision  recall  f1_score  
34328     0.7847  0.8491    0.8087  
65536     0.7164  0.8715    0.7781  
65544     0.7164  0.8715    0.7781  
65552     0.7164  0.8715    0.7781  
65560    

In [52]:
crf_sequence_results = pd.read_csv('sequenced_hyperparameter_results.csv', sep=';')

top_5_sequence_lbfgs_accuracy = get_top_5_by_metric(crf_sequence_results, 'lbfgs', 'accuracy')
top_5_sequence_lbfgs_precision = get_top_5_by_metric(crf_sequence_results, 'lbfgs', 'precision')
top_5_sequence_lbfgs_recall = get_top_5_by_metric(crf_sequence_results, 'lbfgs', 'recall')
top_5_sequence_lbfgs_f1_score = get_top_5_by_metric(crf_sequence_results, 'lbfgs', 'f1_score')

print(top_5_sequence_lbfgs_accuracy)
top_5_sequence_lbfgs_accuracy_params = extract_params(top_5_sequence_lbfgs_accuracy,'sequence-level')
top_5_sequence_lbfgs_precision_params = extract_params(top_5_sequence_lbfgs_precision,'sequence-level')
top_5_sequence_lbfgs_recall_params = extract_params(top_5_sequence_lbfgs_recall,'sequence-level')
top_5_sequence_lbfgs_f1_score_params = extract_params(top_5_sequence_lbfgs_f1_score,'sequence-level')

top_5_sequence_l2sgd_accuracy = get_top_5_by_metric(crf_sequence_results, 'l2sgd', 'accuracy')
top_5_sequence_l2sgd_precision = get_top_5_by_metric(crf_sequence_results, 'l2sgd', 'precision')
top_5_sequence_l2sgd_recall = get_top_5_by_metric(crf_sequence_results, 'l2sgd', 'recall')
top_5_sequence_l2sgd_f1_score = get_top_5_by_metric(crf_sequence_results, 'l2sgd', 'f1_score')
print(top_5_sequence_l2sgd_accuracy)

top_5_sequence_l2sgd_accuracy_params = extract_params(top_5_sequence_l2sgd_accuracy,'sequence-level')
top_5_sequence_l2sgd_precision_params = extract_params(top_5_sequence_l2sgd_precision,'sequence-level')
top_5_sequence_l2sgd_recall_params = extract_params(top_5_sequence_l2sgd_recall,'sequence-level')
top_5_sequence_l2sgd_f1_score_params = extract_params(top_5_sequence_l2sgd_f1_score,'sequence-level')

params_list = top_5_sequence_lbfgs_accuracy_params + top_5_sequence_lbfgs_precision_params + top_5_sequence_lbfgs_recall_params + top_5_sequence_lbfgs_f1_score_params + top_5_sequence_l2sgd_accuracy_params + top_5_sequence_l2sgd_precision_params + top_5_sequence_l2sgd_recall_params + top_5_sequence_l2sgd_f1_score_params  
unique_params_set = {tuple(param.items()) for param in params_list}
unique_params_list = [dict(param) for param in unique_params_set]

best_results_df = pd.DataFrame(unique_params_list)

best_results_df.to_csv('best_results_sequence.csv', sep=';', encoding="utf-8", index=True)

       Unnamed: 0   c1   c2  min_freq  max_iterations  \
0               0  0.0  0.0         0              50   
31200       31200  0.0  0.0         0              50   
62400       62400  0.0  0.0         0              50   
62408       62408  0.0  0.0         0             100   
62416       62416  0.0  0.0         0             150   

       all_possible_transitions  all_possible_states algorithms  accuracy  \
0                          True                 True      lbfgs      0.56   
31200                     False                 True      lbfgs      0.56   
62400                      True                False      lbfgs      0.56   
62408                      True                False      lbfgs      0.56   
62416                      True                False      lbfgs      0.56   

       precision  recall  f1_score  
0         0.8283  0.7329    0.7683  
31200     0.8283  0.7329    0.7683  
62400     0.6973  0.8143    0.7392  
62408     0.7485  0.7329    0.7307  
62416    