<a href="https://colab.research.google.com/github/afrozmaria07/anomalyDetection-DBSCAN-PID-GAM-PCA/blob/main/Annomaly_Dectection_GAM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install interpret
!pip install liac-arff
import os
import pandas as pd
import numpy as np
from scipy.io import loadmat
import arff
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from interpret.glassbox import ExplainableBoostingClassifier  # GA²M
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

Collecting interpret
  Downloading interpret-0.6.4-py3-none-any.whl.metadata (1.1 kB)
Collecting interpret-core==0.6.4 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.4->interpret)
  Downloading interpret_core-0.6.4-py3-none-any.whl.metadata (2.8 kB)
Collecting shap>=0.28.5 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.4->interpret)
  Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting dill>=0.2.5 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.4->interpret)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting SALib>=1.3.3 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.4->interpret)
  Downloading salib-1.5.1-py3-none-any.whl.metadata (11 kB)
Collecting aplr>=10.6.1 (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sens

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Root folder holding every dataset file (a Colab Google Drive path).
folder_path = '/content/drive/MyDrive/Dataset Anomaly'

# File names of the datasets to pool, relative to folder_path.
_dataset_names = [
    'annthyroid.mat',
    'ecoli.data',
    'cardio.mat',
    'mulcross.arff',
    'abalone (2).data',
]

# Full paths are derived from folder_path so the folder is configured in one
# place only; the resulting strings are identical to the previous literals.
dataset_files = [f"{folder_path}/{name}" for name in _dataset_names]

In [None]:
def load_data(file_path, file_type):
    """Load one dataset file and split it into features and labels.

    For tabular inputs (``.data`` and ``.arff``) the last column is taken as
    the label and every other column as a feature. For ``.mat`` files the
    arrays stored under the keys ``'X'`` and ``'y'`` are used.

    Args:
        file_path: Path of the file to read.
        file_type: File extension including the dot (``".data"``, ``".mat"``
            or ``".arff"``); matched case-insensitively.

    Returns:
        A ``(X, y)`` tuple of ``pd.DataFrame`` and ``pd.Series``. An empty
        DataFrame and an empty Series are returned (after printing a message)
        when the extension is unsupported or a ``.mat`` file lacks the
        expected keys.
    """
    kind = str(file_type).lower()

    if kind == ".data":
        df = pd.read_csv(file_path, header=None, sep=',')
        if df.shape[1] < 2:
            # A single column means the file is not comma-delimited. Retry as
            # whitespace-delimited; otherwise X would have zero columns and
            # the whole file would be silently skipped by the caller.
            try:
                df_whitespace = pd.read_csv(file_path, header=None, sep=r'\s+')
            except pd.errors.ParserError:
                df_whitespace = df
            if df_whitespace.shape[1] > df.shape[1]:
                df = df_whitespace
        # Assuming the last column is the label
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]
        return X, y

    if kind == ".mat":
        mat_data = loadmat(file_path)
        if 'X' in mat_data and 'y' in mat_data:
            X = pd.DataFrame(mat_data['X'])
            y = pd.Series(mat_data['y'].flatten())  # Convert labels to 1D
            return X, y
        print(f"Unexpected structure in {file_path}: {mat_data.keys()}")
        return pd.DataFrame(), pd.Series(dtype=object)

    if kind == ".arff":
        with open(file_path) as f:
            arff_data = arff.load(f)
        # Build the frame straight from the row list. Going through np.array
        # would coerce mixed numeric/string rows to a single string dtype and
        # turn numeric features into text.
        df = pd.DataFrame(arff_data['data'])
        if df.empty:
            return pd.DataFrame(), pd.Series(dtype=object)
        # Assuming the last column is the label
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]
        return X, y

    print(f"Unsupported file type: {file_type}")
    return pd.DataFrame(), pd.Series(dtype=object)

# Preprocess Data
def preprocess_data(X, y=None):
    """Encode, impute and standardise one dataset's feature table.

    Steps, in order:
      1. Every non-numeric column is label-encoded from its string form.
      2. Missing values are replaced by the column mean.
      3. All columns are standardised with ``StandardScaler``.

    The encoder, imputer and scaler are fitted on ``X`` itself on every call.

    Args:
        X: Feature table as a ``pd.DataFrame``. It is not modified.
        y: Optional labels; passed through unchanged.

    Returns:
        ``(X_scaled, y)`` where ``X_scaled`` is a new DataFrame with default
        integer row and column labels.
    """
    # Work on a copy so the caller's frame is not label-encoded in place.
    X = X.copy()

    # Handle non-numeric columns if they exist
    for col in X.columns:
        if not pd.api.types.is_numeric_dtype(X[col]):
            le = LabelEncoder()  # Use label encoding to convert categorical to numeric
            X[col] = le.fit_transform(X[col].astype(str))

    # Handle missing values
    imputer = SimpleImputer(strategy='mean')
    X_imputed = pd.DataFrame(imputer.fit_transform(X))

    # Normalize the data
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed))

    return X_scaled, y

# Load and preprocess all datasets
def load_and_preprocess_all_data(folder_path, dataset_files):
    """Load, preprocess and pool every dataset into one feature/label pair.

    Each file is read with ``load_data`` and preprocessed independently with
    ``preprocess_data``; the results are then stacked row-wise. Files that
    yield an empty feature table or empty labels are reported and skipped.

    Because the per-file feature tables carry default integer column labels,
    stacking aligns columns by position, and files with fewer columns get NaN
    in the remaining ones.
    NOTE(review): labels from different files are pooled as-is, so the
    combined label vector may mix label types and label spaces; confirm this
    is intended.

    Args:
        folder_path: Folder joined in front of each entry of ``dataset_files``
            (an absolute entry overrides it, per ``os.path.join``).
        dataset_files: Iterable of file names or paths to load.

    Returns:
        ``(combined_X, combined_y)``; an empty DataFrame and an empty float64
        Series when nothing could be loaded.
    """
    feature_frames = []
    label_series = []

    for file in dataset_files:
        file_path = os.path.join(folder_path, file)
        file_extension = os.path.splitext(file)[-1]

        X, y = load_data(file_path, file_extension)
        if not X.empty and not y.empty:  # Check for non-empty DataFrames/Series
            X_preprocessed, y_preprocessed = preprocess_data(X, y)
            feature_frames.append(X_preprocessed)
            label_series.append(y_preprocessed)
        else:
            print(f"Skipped file {file} due to empty data.")

    if not feature_frames:
        return pd.DataFrame(), pd.Series(dtype=np.float64)

    # Concatenate once at the end: this avoids the pandas FutureWarning about
    # concatenating with an empty entry and the quadratic cost of growing the
    # result inside the loop.
    combined_X = pd.concat(feature_frames, ignore_index=True)
    combined_y = pd.concat(label_series, ignore_index=True)
    return combined_X, combined_y

# Apply GA²M model (Training Phase)
def train_ga2m(X_train, y_train):
    """Fit an ExplainableBoostingClassifier (GA²M) with default settings.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted classifier.
    """
    classifier = ExplainableBoostingClassifier()
    classifier.fit(X_train, y_train)
    return classifier

# Test the model with PID (Testing Phase)
def pid_test(model, X_test):
    """Return the model's predictions for the test samples.

    Args:
        model: Any fitted object exposing ``predict``.
        X_test: Samples to score.

    Returns:
        Whatever ``model.predict(X_test)`` returns.
    """
    # The predicted label is taken directly as the anomaly decision.
    return model.predict(X_test)

# Evaluation metrics
def evaluate_model(y_true, y_pred):
    """Print accuracy and weighted precision, recall and F1.

    Both label vectors are converted to strings before scoring, so labels
    that differ only in type (for example ``9`` and ``'9'``) compare equal.
    The string labels are scored directly rather than through an encoder
    fitted on ``y_true``, so a predicted label that never occurs in
    ``y_true`` no longer raises.

    Args:
        y_true: Ground-truth labels (any array-like).
        y_pred: Predicted labels (any array-like).
    """
    y_true_labels = np.asarray(y_true).astype(str)
    y_pred_labels = np.asarray(y_pred).astype(str)

    accuracy = accuracy_score(y_true_labels, y_pred_labels)
    # zero_division=0 keeps the default score for labels that are never
    # predicted but suppresses the undefined-metric warning.
    precision = precision_score(y_true_labels, y_pred_labels, average='weighted', zero_division=0)
    recall = recall_score(y_true_labels, y_pred_labels, average='weighted', zero_division=0)
    f1 = f1_score(y_true_labels, y_pred_labels, average='weighted', zero_division=0)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

In [None]:
def evaluate_model(y_true, y_pred):
    """Print accuracy and weighted precision, recall and F1.

    NOTE: this cell redefines ``evaluate_model`` and therefore replaces any
    earlier definition once it is run.

    Both label vectors are converted to strings before scoring. Without this,
    ground-truth labels of mixed types (ints, floats, strings) would be
    compared against string predictions and equal labels such as ``9`` and
    ``'9'`` would never match.

    Args:
        y_true: Ground-truth labels (any array-like).
        y_pred: Predicted labels (any array-like).
    """
    y_true_labels = np.asarray(y_true).astype(str)
    y_pred_labels = np.asarray(y_pred).astype(str)

    accuracy = accuracy_score(y_true_labels, y_pred_labels)
    # zero_division=0 keeps the default score for labels that are never
    # predicted but suppresses the undefined-metric warning.
    precision = precision_score(y_true_labels, y_pred_labels, average='weighted', zero_division=0)
    recall = recall_score(y_true_labels, y_pred_labels, average='weighted', zero_division=0)
    f1 = f1_score(y_true_labels, y_pred_labels, average='weighted', zero_division=0)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")


In [None]:
#def load_and_preprocess_all_data(folder_path, dataset_files):
 #   combined_data = pd.DataFrame()

  #  for file in dataset_files:
   #     file_path = os.path.join(folder_path, file)
    #    file_extension = os.path.splitext(file)[-1]

     #   df = load_data(file_path, file_extension)
      #  if df is not None:
       #     df = preprocess_data(df)  # Preprocess the data
        #    combined_data = pd.concat([combined_data, df], ignore_index=True)

    #return combined_data

In [None]:
#def train_ga2m(X_train, y_train):
 #   model = ExplainableBoostingClassifier()  # GA²M model
  #  model.fit(X_train, y_train)
   # return model

In [None]:
#def pid_test(model, X_test):
 #   predictions = model.predict(X_test)
    # Assuming anomaly detection is based on the prediction of the model
  #  return predictions

In [None]:
#def evaluate_model(y_true, y_pred):
 #   accuracy = accuracy_score(y_true, y_pred)
  #  precision = precision_score(y_true, y_pred, average='weighted')
   # recall = recall_score(y_true, y_pred, average='weighted')
    #f1 = f1_score(y_true, y_pred, average='weighted')

    #print(f"Accuracy: {accuracy}")
    #print(f"Precision: {precision}")
    #print(f"Recall: {recall}")
    #print(f"F1 Score: {f1}")

In [None]:
# Build the pooled feature table and label vector from all configured files.
combined_X, combined_y = load_and_preprocess_all_data(
    folder_path=folder_path,
    dataset_files=dataset_files,
)

# Report the dimensions of both results as a quick sanity check.
print(f"Shape of combined_X: {combined_X.shape}")
print(f"Shape of combined_y: {combined_y.shape}")


  combined_y = pd.concat([combined_y, y_preprocessed], ignore_index=True)


Skipped file /content/drive/MyDrive/Dataset Anomaly/ecoli.data due to empty data.
Shape of combined_X: (275352, 21)
Shape of combined_y: (275352,)


In [None]:
if not (combined_X.empty or combined_y.empty):
    # Hold out 30% of the pooled rows for testing; the seed fixes the split.
    X_train, X_test, y_train, y_test = train_test_split(
        combined_X,
        combined_y,
        test_size=0.3,
        random_state=42,
    )

    # Fit the GA²M model on the training part, then predict the held-out part.
    ga2m_model = train_ga2m(X_train, y_train)
    y_pred = pid_test(ga2m_model, X_test)

  warn(
  warn(


In [None]:
# Inspect the label dtypes first, then the distinct values on each side,
# to see how ground truth and predictions are represented.
for labels in (y_test, y_pred):
    print(labels.dtype)
print(y_test.unique())
print(np.unique(y_pred))

object
<U7
['Normal' 9 0.0 'Anomaly' 10 7 15 6 12 11 1.0 8 18 20 13 16 19 21 5 17 14
 4 3 22 26 23]
['0.0' '1.0' '10' '11' '16' '4' '5' '6' '7' '8' '9' 'Anomaly' 'Normal']


In [None]:
if combined_X.empty or combined_y.empty:
    # Nothing was loaded, so there is nothing to score.
    print("No data loaded. Please check your data files.")
else:
    # Score the held-out predictions against the ground truth.
    evaluate_model(y_test, y_pred)

Accuracy: 0.9874585381207176
Precision: 0.9865180538112541
Recall: 0.9874585381207176
F1 Score: 0.9866645272105448


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
