In [5]:
def evaluate(y_test, predictions, heading='-----Evaluation-----'):
    """Print and plot classification metrics for a set of predictions.

    Prints the confusion matrix, shows it as an annotated heatmap, then
    prints the classification report (as a dict) and the accuracy.

    Args:
        y_test: True labels.
        predictions: Predicted labels, aligned with ``y_test``.
        heading: Text printed before the metrics.

    Returns:
        Tuple ``(cm, cr, acc)``: the confusion matrix, the classification
        report produced with ``output_dict=True``, and the accuracy score.
    """
    print(heading)
    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, predictions)
    print(cm)
    plt.figure(figsize=(15,10))
    # confusion_matrix sizes and orders its rows/columns by the sorted union
    # of the labels seen in y_test and predictions. Labelling the axes from
    # y_test alone fails with a shape mismatch as soon as the model predicts
    # a class that is absent from y_test, so use the same union here.
    categories = list(np.union1d(y_test, predictions))
    df_cm = pd.DataFrame(cm, index=categories, columns=categories)
    sns.heatmap(df_cm,annot=True,cmap='Reds')
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    cr = classification_report(y_test, predictions, output_dict=True)
    print("\nClassification Report:")
    print(cr)
    acc = accuracy_score(y_test, predictions)
    print("Accuracy:", acc)
    return (cm, cr, acc)

def remove_files_from_directory(directory):
    """Delete the regular files matched by ``<directory>/*``.

    Entries that are not regular files (e.g. sub-directories) are left in
    place. Matching is done with ``glob``, so names starting with a dot are
    not matched by the ``*`` pattern and are therefore not removed.

    Args:
        directory: Path of the directory to clean.
    """
    pattern = os.path.join(directory, "*")

    # filter() is lazy, so each entry is checked right before it is removed.
    for entry in filter(os.path.isfile, glob.glob(pattern)):
        os.remove(entry)

    print(f"All files in {directory} have been removed.")

def get_anomaly_X_y_from_csv(csv_file, main_labels, target_column, normal_target, output_folder):
    """Load a CSV and split it into features and a binary normal/anomaly target.

    Only the columns listed in ``main_labels`` are read. Missing values are
    replaced with 0, then ``target_column`` is rewritten so that rows equal
    to ``normal_target`` become 1 and every other row becomes 0.

    Args:
        csv_file: File name of the CSV, relative to ``output_folder``.
        main_labels: Columns to load (passed to ``read_csv`` as ``usecols``).
        target_column: Name of the label column.
        normal_target: Label value that denotes a normal (non-anomalous) row.
        output_folder: Directory containing ``csv_file``.

    Returns:
        Tuple ``(X_df, y_df, df)``: the feature frame without the target
        column, the 0/1 target series, and the full frame with the target
        column already binarised.
    """
    df = pd.read_csv(os.path.join(output_folder, csv_file), usecols=main_labels)
    df = df.fillna(0)

    # Vectorised binarisation: 1 for the normal label, 0 for any anomaly.
    # This replaces a per-row Python loop and yields the same int64 column.
    df[target_column] = (df[target_column] == normal_target).astype("int64")

    y_df = df[target_column]
    X_df = df.drop(columns=[target_column])
    return (X_df, y_df, df)

def one_hot_encode(df, categorical_columns):
    """One-hot encode the given columns in place of the originals.

    Works on a copy of ``df``. Each column in ``categorical_columns`` is
    replaced by its one-hot columns, inserted at the position the original
    column occupied, so the relative column order is preserved. The final
    column list is printed.

    NOTE: a single ``OneHotEncoder`` instance is refit for every column, so
    the encoder that is returned reflects only the last column processed.

    Args:
        df: Input DataFrame (not modified).
        categorical_columns: Names of the columns to encode.

    Returns:
        Tuple ``(ohe, X_encoded)``: the encoder instance and the encoded
        copy of ``df``.
    """
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    result = df.copy()

    for column in categorical_columns:
        # Remember where the column sits before it is dropped.
        insert_at = result.columns.get_loc(column)

        # Encode this single column (kept 2-D via the double brackets).
        dummies = pd.DataFrame(
            encoder.fit_transform(result[[column]]),
            columns=encoder.get_feature_names_out([column]),
            index=result.index,
        )

        result = result.drop(columns=[column])

        # Re-insert the encoded columns, left to right, at the old position.
        for offset, name in enumerate(dummies.columns):
            result.insert(insert_at + offset, name, dummies.iloc[:, offset])

    print(list(result.columns))
    return (encoder, result)

In [4]:
def show_missing_values(all_df):
    """Draw a heatmap of ``all_df.isnull()`` to visualise missing values.

    Args:
        all_df: DataFrame to inspect.
    """
    plt.figure(figsize=(12, 4))
    null_mask = all_df.isnull()
    sns.heatmap(
        null_mask,
        cbar=False,
        cmap='Wistia',
        yticklabels=False,
    )
    plt.title('Missing value in the dataset')

def show_target_values(all_df, target_column):
    """Plot the value counts of ``target_column`` as a bar chart and a pie chart.

    The two plots are drawn side by side on one figure: bar chart on the
    left, pie chart (with percentage labels) on the right.

    Args:
        all_df: DataFrame containing the target column.
        target_column: Name of the column whose values are counted.
    """
    counts = all_df[target_column].value_counts()

    _, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(15, 7))

    bars = sns.barplot(
        x=counts.index,
        y=counts.values,
        ax=bar_ax,
        hue=counts.index,
        palette='Set2',
        legend=False,
    )
    bars.set_ylabel('Number of classes in the dataset')

    counts.plot.pie(autopct="%1.1f%%", ax=pie_ax)

def show_feature_correlation(all_df):
    """Draw a heatmap of the correlation matrix returned by ``all_df.corr()``.

    Args:
        all_df: DataFrame whose column correlations are plotted.
    """
    plt.figure(figsize=(20, 15))
    correlations = all_df.corr()
    sns.heatmap(correlations, cmap='hsv')

In [None]:
def wandb_log(conf_matrix, class_report, acc_score):
    """Log accuracy, the classification report and the confusion matrix to WandB.

    Issues four separate ``wandb.log`` calls, in this order: the accuracy
    score, a per-class metrics table, the weighted-average precision /
    recall / f1-score, and the confusion matrix as a table.

    Args:
        conf_matrix: Confusion matrix; converted with ``pd.DataFrame``.
        class_report: Mapping of row name to metrics; indexed with the keys
            "precision", "recall", "f1-score", "support", and must contain a
            "weighted avg" entry.
        acc_score: Accuracy value to log.
    """
    wandb.log({"Accuracy Score": acc_score})

    # Summary rows are excluded from the per-class table.
    summary_rows = ('accuracy', 'macro avg', 'weighted avg')

    report_table = wandb.Table(
        columns=["class", "precision", "recall", "f1-score", "support"]
    )
    for label, scores in class_report.items():
        if label in summary_rows:
            continue
        report_table.add_data(
            label,
            scores["precision"],
            scores["recall"],
            scores["f1-score"],
            scores["support"],
        )
    wandb.log({"Classification Report": report_table})

    # Weighted averages are logged as plain scalars as well.
    weighted = class_report["weighted avg"]
    averages = {
        "precision_avg": weighted["precision"],
        "recall_avg": weighted["recall"],
        "f1-score_avg": weighted["f1-score"],
    }
    wandb.log(averages)

    # The confusion matrix goes through a DataFrame to become a table.
    conf_table = wandb.Table(dataframe=pd.DataFrame(conf_matrix))
    wandb.log({"Confusion Matrix": conf_table})