Below is a Python script that augments a simulated security-incident dataset with SMOTE and a GAN, applies three anomaly detection algorithms (Isolation Forest, One-Class SVM, and Local Outlier Factor), and compares their precision, recall, and F1 scores to find the most effective method for detecting unauthorized-access anomalies.

In [None]:
#!pip install numpy pandas scikit-learn tensorflow imbalanced-learn

#### Necessary libraries

In [None]:

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics import precision_score, recall_score, f1_score

# 1. Simulate a sample dataset
# A dedicated, seeded generator keeps the simulated dataset reproducible
# from run to run without touching NumPy's global random state.
N_SAMPLES = 1000
rng = np.random.default_rng(42)
data = {
    'Issue_ID': np.arange(1, N_SAMPLES + 1),
    'Category': rng.choice(['Network', 'Access', 'Data Leak', 'Phishing', 'Malware'], N_SAMPLES),
    'Severity': rng.choice(['Low', 'Medium', 'High', 'Critical'], N_SAMPLES),
    # integers() excludes the upper bound, so scores fall in 1..10.
    'Impact Score': rng.integers(1, 11, N_SAMPLES),
    'Risk Level': rng.choice(['Low', 'Medium', 'High'], N_SAMPLES),
    'User_ID': rng.integers(1000, 2000, N_SAMPLES),
    'Activity Type': rng.choice(['login', 'file_access', 'data_modification'], N_SAMPLES),
    # A duration or transfer size cannot be negative, so the lower tail of
    # the Gaussian draw is clipped at zero.
    'Session_Duration': rng.normal(500, 200, N_SAMPLES).clip(min=0),
    'Data_Transfer_MB': rng.normal(100, 50, N_SAMPLES).clip(min=0),
    'Login_Attempts': rng.poisson(2, N_SAMPLES),
}
df = pd.DataFrame(data)

# 2. Data Preprocessing
# Replace each categorical column with integer codes. Every fitted encoder
# is kept in `label_encoders`, keyed by its column name.
categorical_columns = ['Category', 'Severity', 'Risk Level', 'Activity Type']
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

# Normalize numerical columns
numeric_columns = ['Impact Score', 'Session_Duration', 'Data_Transfer_MB', 'Login_Attempts']
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# 3. Address class imbalance using SMOTE

# 'Risk Level' is the target, so it is removed from the feature matrix
# together with the identifier columns; keeping it in X would leak the
# label into every feature row used downstream.
X = df.drop(columns=['Issue_ID', 'User_ID', 'Risk Level'])
y = df['Risk Level']
# print() is used instead of display(), which only exists inside
# IPython/Jupyter sessions.
print(X.head())
print(y.head())
# sampling_strategy='minority' oversamples only the least frequent class.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 4. Synthetic data generation using GAN
# Length of the random noise vector fed to the generator.
latent_dim = 100
# The generator emits one value per feature column of the resampled data.
n_outputs = len(X_resampled.columns)

def build_generator(latent_dim, n_outputs):
    """Build the GAN generator: an MLP mapping a noise vector to one row.

    Args:
        latent_dim: Length of the input noise vector.
        n_outputs: Number of values emitted per sample.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The final ``tanh``
        activation bounds every output to the range [-1, 1].
    """
    # The input is declared with an explicit Input object; passing
    # ``input_dim`` to the first Dense layer is deprecated in current Keras.
    return tf.keras.Sequential([
        tf.keras.Input(shape=(latent_dim,)),
        layers.Dense(128, activation="relu"),
        layers.Dense(256, activation="relu"),
        layers.Dense(n_outputs, activation="tanh"),
    ])

def build_discriminator(n_outputs):
    """Build the GAN discriminator: an MLP scoring one row as real or fake.

    Args:
        n_outputs: Number of features in each input row.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model whose single sigmoid
        unit outputs a value in (0, 1).
    """
    # The input is declared with an explicit Input object; passing
    # ``input_shape`` to the first Dense layer is deprecated in current Keras.
    return tf.keras.Sequential([
        tf.keras.Input(shape=(n_outputs,)),
        layers.Dense(256, activation="relu"),
        layers.Dense(128, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ])

# Initialize models
generator = build_generator(latent_dim, n_outputs)
discriminator = build_discriminator(n_outputs)
discriminator.compile(optimizer='adam', loss='binary_crossentropy')

# Train GAN
# Each network is updated by its own optimizer inside an explicit
# GradientTape step. Calling discriminator.train_on_batch on generated rows
# labelled as real would only update the discriminator, leaving the
# generator at its random initial weights.
epochs = 1000
batch_size = 64
bce = tf.keras.losses.BinaryCrossentropy()
d_optimizer = tf.keras.optimizers.Adam()
g_optimizer = tf.keras.optimizers.Adam()

# Convert once, outside the loop, so each step only indexes a NumPy array.
real_features = X_resampled.to_numpy(dtype="float32")

# Labels for real and generated data
real_labels = np.ones((batch_size, 1), dtype="float32")
fake_labels = np.zeros((batch_size, 1), dtype="float32")

for epoch in range(epochs):
    # --- Discriminator step: real rows -> 1, generated rows -> 0 ---
    idx = np.random.randint(0, real_features.shape[0], batch_size)
    real_data = real_features[idx]
    noise = np.random.normal(0, 1, (batch_size, latent_dim)).astype("float32")
    # Generated outside the tape: this step must not touch the generator.
    fake_data = generator(noise, training=False)

    with tf.GradientTape() as d_tape:
        d_loss_real = bce(real_labels, discriminator(real_data, training=True))
        d_loss_fake = bce(fake_labels, discriminator(fake_data, training=True))
        d_loss = d_loss_real + d_loss_fake
    d_grads = d_tape.gradient(d_loss, discriminator.trainable_variables)
    d_optimizer.apply_gradients(zip(d_grads, discriminator.trainable_variables))

    # --- Generator step: make the discriminator answer "real" on fakes ---
    noise = np.random.normal(0, 1, (batch_size, latent_dim)).astype("float32")
    with tf.GradientTape() as g_tape:
        fake_scores = discriminator(generator(noise, training=True), training=False)
        g_loss = bce(real_labels, fake_scores)
    # Only the generator's variables receive gradients here.
    g_grads = g_tape.gradient(g_loss, generator.trainable_variables)
    g_optimizer.apply_gradients(zip(g_grads, generator.trainable_variables))

# Sample the synthetic batch from the fully trained generator.
noise = np.random.normal(0, 1, (batch_size, latent_dim)).astype("float32")
gen_data = generator(noise, training=False).numpy()

# 5. Concatenate SMOTE and GAN data to form the augmented dataset
synthetic_data = pd.DataFrame(gen_data, columns=X_resampled.columns)
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without
# it the synthetic rows would reuse index labels already taken by the
# resampled rows, making label-based lookups ambiguous.
X_augmented = pd.concat([X_resampled, synthetic_data], axis=0, ignore_index=True)
# Generated rows carry no label of their own, so they are assigned the most
# frequent class of the resampled target as a placeholder.
synthetic_labels = pd.Series(
    np.repeat(y_resampled.mode()[0], len(synthetic_data)),
    name=y_resampled.name,
)
y_augmented = pd.concat([y_resampled, synthetic_labels], ignore_index=True)

# 6. Anomaly Detection
# Configure the three detectors first, then fit each one on the augmented
# features. fit_predict labels every row -1 (outlier) or 1 (inlier).
iso_forest = IsolationForest(random_state=42, contamination=0.05)    # Isolation Forest
one_class_svm = OneClassSVM(nu=0.05, gamma=0.001, kernel="rbf")      # One-Class SVM
lof = LocalOutlierFactor(contamination=0.05, n_neighbors=20)         # Local Outlier Factor

iso_pred, svm_pred, lof_pred = (
    detector.fit_predict(X_augmented)
    for detector in (iso_forest, one_class_svm, lof)
)

# 7. Model Evaluation
# Convert -1 for anomalies, 1 for normal to binary labels for evaluation
def to_binary(predictions):
    """Map detector output to 0/1 labels.

    A prediction equal to -1 becomes 0; every other value becomes 1.

    Args:
        predictions: Iterable of detector labels.

    Returns:
        A list of ints, one per prediction, in the original order.
    """
    outlier_flag = -1
    return [int(label != outlier_flag) for label in predictions]

iso_pred_bin = to_binary(iso_pred)
svm_pred_bin = to_binary(svm_pred)
lof_pred_bin = to_binary(lof_pred)

# Assume true labels for demonstration
# to_binary encodes anomalies as 0 and normal points as 1, so the simulated
# ground truth uses the same encoding: about 5% anomalies (0) and 95%
# normal points (1).
ANOMALY_LABEL = 0
true_labels = np.random.choice([0, 1], len(X_augmented), p=[0.05, 0.95])  # Randomly simulate true anomalies

# Evaluate each model
models = {'Isolation Forest': iso_pred_bin, 'One-Class SVM': svm_pred_bin, 'Local Outlier Factor': lof_pred_bin}
best_model, best_f1 = None, 0

for name, pred in models.items():
    # Score the anomaly class explicitly; the metrics default to
    # pos_label=1, which here would measure detection of *normal* points.
    # zero_division=0 reports 0 instead of warning when a detector flags
    # nothing (or the truth contains no anomaly).
    precision = precision_score(true_labels, pred, pos_label=ANOMALY_LABEL, zero_division=0)
    recall = recall_score(true_labels, pred, pos_label=ANOMALY_LABEL, zero_division=0)
    f1 = f1_score(true_labels, pred, pos_label=ANOMALY_LABEL, zero_division=0)
    print(f"{name} - Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

    # Strict comparison: on a tie the earlier model is kept, and
    # best_model stays None if every F1 score is 0.
    if f1 > best_f1:
        best_f1 = f1
        best_model = name

print(f"\nBest Model: {best_model} with F1 Score: {best_f1:.2f}")




### Explanation

1. **Data Augmentation**:
   - We address class imbalance using **SMOTE** and **GAN** for synthetic data generation.
  
2. **Anomaly Detection**:
   - We apply three algorithms: **Isolation Forest**, **One-Class SVM**, and **Local Outlier Factor (LOF)**.
   - Each model is evaluated using `precision`, `recall`, and `F1 Score`.

3. **Evaluation and Selection**:
   - The model with the highest `F1 Score` is chosen as the best approach to detect the security breach in our scenario.

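This setup demonstrates an end-to-end workflow for augmenting data and comparing anomaly detectors on critical cybersecurity events. Note that both the dataset and the ground-truth anomaly labels are randomly simulated here, so the reported scores only illustrate the procedure; substitute real, labelled data before drawing conclusions about which detector is best.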