In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from scipy.stats import norm, expon, uniform, pareto
from sklearn.metrics import mean_squared_error

In [55]:
df = pd.read_csv("Train_data.csv")
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [57]:
original_copy = pd.read_csv("Train_data.csv")
data = original_copy

#Exluding the class column
without_column = df.drop(columns='class')

#splitting data
train_data, test_data = train_test_split(without_column, test_size=0.3, random_state=42)

# Save the splits if needed
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

print("Data is prepared")

Data is prepared


In [49]:
n_columns = train_data.select_dtypes(include=['number'])
z_scores = (n_columns - n_columns.mean()) / n_columns.std()

def detect_a(z_scores, threshold=3):

    anomalies = (np.abs(z_scores) > threshold).any(axis=1)
    return pd.DataFrame({'is_anomaly': anomalies})

# Example: Use a threshold of 3
threshold = 3
anomaly_results = detect_a(z_scores, threshold)

# Add 'is_anomaly' to the original training dataset
train_data['is_anomaly'] = anomaly_results['is_anomaly']

#Testing
thresholds = [1.5, 2.5, 3, 4.5]
for t in thresholds:
    anomalies = detect_a(z_scores, t)
    print(f"Threshold: {t}, Anomalies Detected: {anomalies['is_anomaly'].sum()}")

print("Z-score computation and anomaly detection successful!")

Threshold: 1.5, Anomalies Detected: 18195
Threshold: 2.5, Anomalies Detected: 7643
Threshold: 3, Anomalies Detected: 4563
Threshold: 4.5, Anomalies Detected: 2146
Z-score computation and anomaly detection successful!


In [35]:
original_data = original_copy
train_data = original_copy

# Bring the 'class' column back from original data
train_data['class'] = original_data.loc[train_data.index, 'class']

# Convert 'class' column to binary values
train_data['class'] = train_data['class'].map({'anomaly': 1, 'normal': 0})

# Exclude columns with zero variance
numerical_columns = train_data.select_dtypes(include=['float64', 'int64']).columns
numerical_columns = [col for col in numerical_columns if train_data[col].std() > 0]


# Replace NaN/Inf values
train_data = train_data.replace([np.inf, -np.inf], np.nan).dropna()


distributions = [
    stats.uniform, stats.expon, stats.norm, stats.pareto, stats.gamma, stats.lognorm
]

# Function to fit PDFs and calculate MSE
def fit_pdfs(column_data):
    range_min = column_data.min() - 0.1 * abs(column_data.min())
    range_max = column_data.max() + 0.1 * abs(column_data.max())
    x = np.linspace(range_min, range_max, 100)  # Adjusted range for fitting
    
    results = []
    for dist in distributions:
        try:
            params = dist.fit(column_data)
            pdf = dist.pdf(x, *params)
            histogram_density, bins = np.histogram(column_data, bins=30, density=True)
            bin_centers = 0.5 * (bins[1:] + bins[:-1])
            mse = np.mean((histogram_density - np.interp(bin_centers, x, pdf))**2)
            results.append((dist.name, mse, params, pdf))
        except Exception:
            pass  # Skip distributions that fail
    return sorted(results, key=lambda x: x[1])  # Sort by MSE

# Dictionary to store results for each column
results_summary = {}

# Loop through all numerical columns
for column in numerical_columns:
    print(f"Processing column: {column}")
    column_data = data[column]
    
    # Fit PDFs for the column
    fit_results = fit_pdfs(column_data)
    best_fit = fit_results[0]
    
    # Store the best fit results in the summary dictionary
    results_summary[column] = {
        "Best Fit": best_fit[0],
        "MSE": best_fit[1],
        "Parameters": best_fit[2]
    }
    
    print(f"  Best fit: {best_fit[0]} with MSE = {best_fit[1]:.4f}")
    
    # Plot histogram and best-fit PDF
    #x = np.linspace(column_data.min(), column_data.max(), 100)
    #plt.hist(column_data, bins=30, density=True, alpha=0.5, label="Data Histogram")
    #plt.plot(x, best_fit[3], label=f"Best Fit: {best_fit[0]} (MSE={best_fit[1]:.4f})", color="red")
    #plt.title(f"Best PDF Fit for {column}")
    #plt.xlabel("Value")
    #plt.ylabel("Density")
    #plt.legend()
    #plt.show()

# Print a summary of the best fits
print("\nSummary of Best Fits:")
for column, result in results_summary.items():
    print(f"Column: {column}")
    print(f"  Best Fit: {result['Best Fit']}")
    print(f"  MSE: {result['MSE']:.4f}")
    print(f"  Parameters: {result['Parameters']}")



Processing column: duration
  Best fit: expon with MSE = 0.0000
Processing column: src_bytes
  Best fit: norm with MSE = 0.0000
Processing column: dst_bytes
  Best fit: norm with MSE = 0.0000
Processing column: land
  Best fit: norm with MSE = 11.5011
Processing column: wrong_fragment
  Best fit: expon with MSE = 0.4016
Processing column: urgent
  Best fit: norm with MSE = 17.9016
Processing column: hot
  Best fit: norm with MSE = 0.0018
Processing column: num_failed_logins
  Best fit: norm with MSE = 0.5391
Processing column: logged_in
  Best fit: gamma with MSE = 8.2968
Processing column: num_compromised
  Best fit: norm with MSE = 0.0000
Processing column: root_shell
  Best fit: norm with MSE = 14.9667
Processing column: su_attempted
  Best fit: norm with MSE = 2.4453
Processing column: num_root
  Best fit: norm with MSE = 0.0000
Processing column: num_file_creations
  Best fit: norm with MSE = 0.0049
Processing column: num_shells
  Best fit: norm with MSE = 8.1430
Processing column

In [46]:
# Convert 'class' column to binary (0 for normal, 1 for anomaly)
train_data['class'] = train_data['class'].map({'normal': 0, 'anomaly': 1})

# Identify categorical columns
categorical_columns = train_data.select_dtypes(include=['object']).columns



def calculate_and_display_pmfs(data, class_col="class"):
    # Check for the class column
    if class_col not in data.columns:
        raise ValueError(f"'{class_col}' column not found in dataset.")
    
    # Identify categorical columns (excluding the class column)
    categorical_columns = data.select_dtypes(include="object").columns

    # Calculate PMFs for each column
    for column in categorical_columns:
        print(f"Calculating PMF for column: {column}")
        
        # Overall PMF
        overall_pmf = data[column].value_counts(normalize=True)
        
        # Conditional PMFs
        anomaly_pmf = data[data[class_col] == 1][column].value_counts(normalize=True)
        normal_pmf = data[data[class_col] == 0][column].value_counts(normalize=True)
        
        # Display PMFs
        print(f"\nPMF for column: {column}")
        print("Overall PMF:")
        print(overall_pmf)
        print("\nConditional PMF for Anomalies (class=1):")
        print(anomaly_pmf)
        print("\nConditional PMF for Normal (class=0):")
        print(normal_pmf)
        print("\n" + "-"*50 + "\n")

# Calculate and display PMFs
calculate_and_display_pmfs(data, class_col="class")



Calculating PMF for column: protocol_type

PMF for column: protocol_type
Overall PMF:
tcp     0.814782
udp     0.119522
icmp    0.065695
Name: protocol_type, dtype: float64

Conditional PMF for Anomalies (class=1):
Series([], Name: protocol_type, dtype: float64)

Conditional PMF for Normal (class=0):
Series([], Name: protocol_type, dtype: float64)

--------------------------------------------------

Calculating PMF for column: service

PMF for column: service
Overall PMF:
http         0.317680
private      0.172714
domain_u     0.072245
smtp         0.057518
ftp_data     0.055414
               ...   
urh_i        0.000159
red_i        0.000119
pm_dump      0.000119
tim_i        0.000079
http_8001    0.000040
Name: service, Length: 66, dtype: float64

Conditional PMF for Anomalies (class=1):
Series([], Name: service, dtype: float64)

Conditional PMF for Normal (class=0):
Series([], Name: service, dtype: float64)

--------------------------------------------------

Calculating PMF for c

In [63]:
data = pd.read_csv("Train_data.csv") 

# True labels and predicted labels
y_true = data['class']
y_pred = data['is_anomaly']

# Calculate confusion matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

# Calculate accuracy
accuracy = (tp + tn) / len(data)

# Print results
print(f"True Positives (TP): {tp}")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"Accuracy: {accuracy:.4f}")




KeyError: 'class'