In [2]:
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

data = pd.read_csv('Train_Data.csv')

data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)


# Identify columns
categorical_columns = data.select_dtypes(include="object").drop(columns=["class"]).columns
numerical_columns = data.select_dtypes(include=np.number).columns

# Ensure class labels are consistent
data["class"] = data["class"].map({"normal": 0, "anomaly": 1})

# Calculate priors (class probabilities)
priors = data["class"].value_counts(normalize=True).to_dict()

# Calculate probabilities for categorical features
categorical_probs = {}
for col in categorical_columns:
    categorical_probs[col] = data.groupby("class")[col].value_counts(normalize=True).unstack(fill_value=0)

# Numerical Feature Probabilities
numerical_dists = {col: {} for col in numerical_columns}
for col in numerical_columns:
    for cls in data["class"].unique():
        cls_data = data[data["class"] == cls][col]
        mean = cls_data.mean()
        std = cls_data.std()
        
        # Handle zero standard deviation
        if std == 0 or np.isnan(std):
            std = 1e-6  # Small positive value to avoid division by zero
        
        numerical_dists[col][cls] = (mean, std)

X_categorical = data[categorical_columns]
X_numerical = data[numerical_columns]

# Naïve Bayes Prediction Function
def naive_bayes_predict(row):
    posteriors = {}
    for cls in data["class"].unique():
        posterior = priors[cls]  # Start with the prior
        
        # Categorical features
        for col in categorical_columns:
            if row[col] in categorical_probs[col].columns:
                posterior *= categorical_probs[col].loc[cls, row[col]]
            else:
                posterior *= 0  # Handle unseen categories
        
        # Numerical features
        for col in numerical_columns:
            mean, std = numerical_dists[col][cls]
            try:
                posterior *= norm.pdf(row[col], loc=mean, scale=std)
            except RuntimeWarning:
                posterior *= 1e-9  # Assign a very small probability in case of errors
        
        posteriors[cls] = posterior
    
    # Return class with highest posterior probability
    return max(posteriors, key=posteriors.get)



# Apply prediction to the entire dataset
data["predicted_class"] = data.apply(naive_bayes_predict, axis=1)

#print("Categorical Columns:", categorical_columns)
#print("Numerical Columns:", numerical_columns)

# Map class labels back to original
data["class"] = data["class"].map({0: "normal", 1: "anomaly"})
data["predicted_class"] = data["predicted_class"].map({0: "normal", 1: "anomaly"})

# Compare with actual labels and calculate accuracy
accuracy = (data["predicted_class"] == data["class"]).mean()
print(f"Accuracy: {accuracy:.2f}")



#Calculate confusion matrix components
TP = ((data["class"] == "anomaly") & (data["predicted_class"] == "anomaly")).sum()  # True Positives
TN = ((data["class"] == "normal") & (data["predicted_class"] == "normal")).sum()  # True Negatives
FP = ((data["class"] == "normal") & (data["predicted_class"] == "anomaly")).sum()  # False Positives
FN = ((data["class"] == "anomaly") & (data["predicted_class"] == "normal")).sum()  # False Negatives

# Calculate Accuracy, Precision, and Recall
#accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = TP / (TP + FP) if (TP + FP) > 0 else 0  # Avoid division by zero
recall = TP / (TP + FN) if (TP + FN) > 0 else 0  # Avoid division by zero

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

# Display predictions for reference
#print(data[["class", "predicted_class"]])

Accuracy: 0.90
Accuracy: 0.90
Precision: 0.91
Recall: 0.88


In [6]:
from sklearn.preprocessing import OneHotEncoder



# Load your dataset
data = pd.read_csv('Train_Data.csv')

# Clean data (replace NaN and inf values)
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)

# Identify categorical columns and separate the target column ('class')
categorical_columns = data.select_dtypes(include="object").drop(columns=["class"]).columns
numerical_columns = data.select_dtypes(include=np.number).columns

# Ensure class labels are consistent (map 'normal' to 0, 'anomaly' to 1)
data["class"] = data["class"].map({"normal": 0, "anomaly": 1})

# One-hot encode categorical columns
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Separate features (X) and target (y)
X = data_encoded.drop(columns=["class"])  # Drop the target column from features
y = data_encoded["class"]  # Target column

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models to train
models = {
    "Gaussian Naive Bayes": GaussianNB(),
    "Multinomial Naive Bayes": MultinomialNB(),
    "Bernoulli Naive Bayes": BernoulliNB()
}

metrics = {
    "Model": [],
    "Accuracy": [],
    "Precision": [],
    "Recall": []
}

# Evaluate each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Append metrics to the dictionary
    metrics["Model"].append(model_name)
    metrics["Accuracy"].append(accuracy)
    metrics["Precision"].append(precision)
    metrics["Recall"].append(recall)

    # Print results for the current model
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print("-" * 40)


# You can compare these results to determine which model performs best based on your needs.
metrics_df = pd.DataFrame(metrics)

# Find the best model based on a chosen metric (e.g., accuracy)
best_model_index = metrics_df["Accuracy"].idxmax()
best_model_name = metrics_df.loc[best_model_index, "Model"]

print("\nModel Comparison:")
print(metrics_df)
print(f"\nBest Model Based on Accuracy: {best_model_name}")


Model: Gaussian Naive Bayes
Accuracy: 0.56
Precision: 0.80
Recall: 0.08
----------------------------------------
Model: Multinomial Naive Bayes
Accuracy: 0.55
Precision: 0.64
Recall: 0.10
----------------------------------------
Model: Bernoulli Naive Bayes
Accuracy: 0.90
Precision: 0.95
Recall: 0.84
----------------------------------------

Model Comparison:
                     Model  Accuracy  Precision    Recall
0     Gaussian Naive Bayes  0.557253   0.799107  0.075687
1  Multinomial Naive Bayes  0.551498   0.638522  0.102326
2    Bernoulli Naive Bayes  0.902957   0.945394  0.841860

Best Model Based on Accuracy: Bernoulli Naive Bayes
