In [5]:
def evaluate(y_test, predictions, heading='-----Evaluation-----'):
    """Print and plot classification metrics for a set of predictions.

    Prints the heading, the raw confusion matrix, the classification report
    and the accuracy, and shows the confusion matrix as an annotated heatmap.

    Args:
        y_test: True class labels.
        predictions: Predicted class labels, aligned with ``y_test``.
        heading: Text printed before the metrics.
    """
    print(heading)
    print("Confusion Matrix:")

    # Build the label set from BOTH inputs. The confusion matrix spans every
    # label seen in either of them, so labelling the axes from y_test alone
    # fails with a shape mismatch when a class is predicted but never present
    # in y_test.
    categories = np.union1d(y_test, predictions)
    cm = confusion_matrix(y_test, predictions, labels=categories)
    print(cm)

    plt.figure(figsize=(15,10))
    df_cm = pd.DataFrame(cm, index=categories, columns=categories)
    # fmt='d' prints the integer counts in full; the default format would
    # render large counts in scientific notation.
    sns.heatmap(df_cm, annot=True, fmt='d', cmap='Reds')
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    print("Accuracy:", accuracy_score(y_test, predictions))

def remove_files_from_directory(directory):
    """Delete the regular files located directly inside ``directory``.

    Sub-directories (and anything inside them) are left in place. Entries
    whose names start with a dot are not matched by the ``*`` pattern and are
    therefore not removed either.

    Args:
        directory: Path of the folder to clean.
    """
    # Escape the directory part so that glob metacharacters in the path itself
    # ("[", "]", "*", "?") are taken literally instead of silently matching
    # nothing or a different folder.
    pattern = os.path.join(glob.escape(directory), "*")

    for path in glob.glob(pattern):
        if os.path.isfile(path):
            os.remove(path)

    print(f"All files in {directory} have been removed.")

def get_anomaly_X_y_from_csv(csv_file, main_labels, target_column, normal_target, output_folder):
    """Load a CSV and binarise its target column for anomaly detection.

    Missing values are replaced with 0, then every row whose target equals
    ``normal_target`` is labelled 1 and every other row is labelled 0.

    Args:
        csv_file: File name of the CSV, relative to ``output_folder``.
        main_labels: Columns to load (passed to ``read_csv`` as ``usecols``);
            must include ``target_column``.
        target_column: Name of the column holding the class label.
        normal_target: Label value that denotes normal (non-anomalous) rows.
        output_folder: Folder the CSV is read from.

    Returns:
        A tuple ``(X_df, y_df, df)``: the feature columns, the binarised
        target column, and the full frame containing both.
    """
    df = pd.read_csv(os.path.join(output_folder, csv_file), usecols=main_labels)
    df = df.fillna(0)

    # Vectorised comparison instead of a per-row Python loop:
    # normal -> 1, anything else -> 0.
    df[target_column] = (df[target_column] == normal_target).astype("int64")

    y_df = df[target_column]
    X_df = df.drop(columns=[target_column])
    return (X_df, y_df, df)

In [13]:
def one_hot_encode(df, categorical_columns):
    """Replace categorical columns with their one-hot encoded equivalents.

    A single encoder is fitted on all of ``categorical_columns`` at once, so
    the returned encoder can later be used to transform new data with the same
    columns. (Re-fitting one encoder per column would leave it knowing only
    the last column.)

    Args:
        df: Input DataFrame; it is not modified.
        categorical_columns: Names of the columns to encode.

    Returns:
        A tuple ``(ohe, X_encoded)``: the fitted ``OneHotEncoder`` and a new
        DataFrame in which the categorical columns are dropped and the
        one-hot columns are appended, in the order of ``categorical_columns``.
        When ``categorical_columns`` is empty the encoder is returned unfitted
        together with a copy of ``df``.
    """
    columns = list(categorical_columns)
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    # Nothing to encode: fitting on zero columns would fail, so return early.
    if not columns:
        return (ohe, df.copy())

    # Fit once on every categorical column so the encoder remembers all of them.
    encoded_array = ohe.fit_transform(df[columns])
    encoded_df = pd.DataFrame(
        encoded_array,
        columns=ohe.get_feature_names_out(columns),
        index=df.index,  # keep rows aligned with the original frame
    )

    # Original frame minus the raw categorical columns, plus the encoded ones.
    X_encoded = pd.concat([df.drop(columns=columns), encoded_df], axis=1)
    return (ohe, X_encoded)

def label_encode(df, columns):
    """Integer-encode the given columns of a DataFrame.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Names of the columns to encode.

    Returns:
        A tuple ``(le, X_encoded)``: the ``LabelEncoder`` and a copy of ``df``
        with each listed column replaced by its encoded values.

    NOTE: the same encoder instance is re-fitted for every column, so the
    returned encoder reflects only the last column in ``columns``.
    """
    encoder = LabelEncoder()
    result = df.copy()
    for name in columns:
        codes = encoder.fit_transform(result[name])
        result[name] = codes
    return (encoder, result)

def standardise(df, columns, scaler=None):
    """Scale the given columns of a DataFrame.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Names of the columns to scale.
        scaler: An already fitted scaler exposing ``transform``. When ``None``
            a new ``StandardScaler`` is created and fitted on ``df[columns]``.

    Returns:
        A tuple ``(scaler, X_standardised)``: the scaler that was used and a
        copy of ``df`` with the listed columns replaced by their scaled values.
    """
    X_standardised = df.copy()

    # Compare against None explicitly: a truthiness test would discard a
    # perfectly valid scaler object that happens to evaluate as falsy.
    if scaler is None:
        scaler = StandardScaler()
        # No scaler supplied: fit on this data and transform it in one step.
        X_standardised[columns] = scaler.fit_transform(X_standardised[columns])
    else:
        # Reuse the caller's fitted scaler (e.g. to apply training statistics).
        X_standardised[columns] = scaler.transform(X_standardised[columns])
    return (scaler, X_standardised)

In [4]:
def show_missing_values(all_df):
    """Draw a heatmap marking which cells of ``all_df`` are null.

    Args:
        all_df: DataFrame to inspect.
    """
    plt.figure(figsize=(12, 4))
    null_mask = all_df.isnull()
    sns.heatmap(
        null_mask,
        yticklabels=False,
        cbar=False,
        cmap='Wistia',
    )
    plt.title('Missing value in the dataset')

def show_target_values(all_df, target_column):
    """Plot the value counts of ``target_column`` as a bar chart and a pie chart.

    Args:
        all_df: DataFrame containing the target column.
        target_column: Name of the column whose values are counted.
    """
    class_counts = all_df[target_column].value_counts()
    class_labels = class_counts.index

    # One row, two panels: bar chart on the left, pie chart on the right.
    _, axes = plt.subplots(1, 2, figsize=(15,7))
    bar_axis, pie_axis = axes

    sns.barplot(
        x=class_labels,
        y=class_counts.values,
        hue=class_labels,
        palette='Set2',
        legend=False,
        ax=bar_axis,
    ).set_ylabel('Number of classes in the dataset')

    class_counts.plot.pie(autopct="%1.1f%%", ax=pie_axis)

def show_feature_correlation(all_df):
    """Draw a heatmap of the pairwise correlation between numeric columns.

    Args:
        all_df: DataFrame whose numeric (and boolean) columns are correlated;
            other columns are ignored.
    """
    # Restrict to numeric/bool columns explicitly: recent pandas versions no
    # longer drop non-numeric columns in corr() and raise on them instead.
    numeric_df = all_df.select_dtypes(include=["number", "bool"])

    plt.figure(figsize=(20,15))
    sns.heatmap(numeric_df.corr(), cmap='hsv')