In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# Load dataset
# NOTE(review): absolute Google Drive path; presumably requires Drive to be
# mounted at /content/drive (Colab) — confirm before running elsewhere.
df = pd.read_csv('/content/drive/MyDrive/DESIGN PROJECT/final_feature_vector.csv')

# Encode categorical columns
# The integer user code is stored on the frame but is not part of `features`.
user_encoder = LabelEncoder()
df['user_id_encoded'] = user_encoder.fit_transform(df['user'])

# Extract target variable (pop removes the label column from df so it cannot
# be picked up as a model input)
y_true = df.pop('is_anomaly')

# Define feature columns
features = [
    'total_working_seconds', 'total_logon_own_pc', 'total_logon_other_pc',
    'total_logon_own_pc_normal', 'total_logon_own_pc_off',
    'total_logon_other_pc_normal', 'total_logon_other_pc_off',
    'total_emails_sent', 'after_hour_emails', 'total_internal_recipients',
    'total_external_recipients', 'total_bcc_recipients', 'mails_with_attachments',
    'documents_copy_own_pc', 'documents_copy_other_pc', 'program_files_copy_own_pc',
    'program_files_copy_other_pc', 'documents_copy_own_pc_off_hour',
    'documents_copy_other_pc_off_hour', 'program_files_copy_own_pc_off_hour',
    'program_files_copy_other_pc_off_hour', 'device_connects_on_own_pc_normal_hour',
    'device_connects_on_other_pc_normal_hour', 'device_connects_on_own_pc_off_hour',
    'device_connects_on_other_pc_off_hour'
]

# Normalize features
# The min/max are learned from normal rows only (is_anomaly == 0), i.e. the same
# population the autoencoder is trained on. Fitting on every row would let the
# extremes of anomalous rows define the scaling range and leak evaluation data
# into preprocessing. Rows outside the normal range may therefore scale to
# values below 0 or above 1.
scaler = MinMaxScaler()
scaler.fit(df.loc[y_true == 0, features])
df[features] = scaler.transform(df[features])


In [None]:
def create_sequences(data, seq_length=7):
    """Slice `data` into overlapping windows of `seq_length` consecutive rows.

    Window ``i`` holds rows ``i`` through ``i + seq_length - 1``, so an input
    with ``n`` rows produces ``n - seq_length + 1`` windows (stride 1).

    Args:
        data: Array-like indexed by row along its first axis (for example a
            2-D ``(rows, features)`` array or a 1-D label vector).
        seq_length: Number of consecutive rows per window; must be >= 1.

    Returns:
        np.ndarray of shape ``(n_windows, seq_length, *data.shape[1:])``.
        When the input has fewer than ``seq_length`` rows the result has zero
        windows but keeps the remaining dimensions, so downstream code that
        relies on the array rank still works.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    data = np.asarray(data)
    n_windows = len(data) - seq_length + 1
    if n_windows <= 0:
        # Not enough rows for a single window: return a correctly shaped empty
        # array instead of the shapeless (0,) array np.array([]) would give.
        return np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)

    return np.stack([data[start:start + seq_length] for start in range(n_windows)])

# Window length in rows; it has to equal the time dimension the model's Input
# layer is declared with.
SEQ_LENGTH = 7

# Prepare Test Data (All Data): window the full table in its stored row order.
# NOTE(review): windows are cut across the whole frame, so this presumably
# relies on rows being ordered per user and by date — confirm, otherwise a
# window can mix rows from different users.
X_test = create_sequences(df[features].values, seq_length=SEQ_LENGTH)

# One label per window: the label of the window's last row. The slice drops the
# first SEQ_LENGTH - 1 labels, which have no complete window ending on them.
y_test = y_true.iloc[len(y_true) - len(X_test):].values

# Prepare Training Data (Only Normal Data)
# Windows are cut from the unfiltered table and kept only when every row inside
# them is normal. Dropping anomalous rows first and windowing afterwards would
# splice rows that were never adjacent into artificial sequences.
window_labels = create_sequences(y_true.values, seq_length=SEQ_LENGTH)
# reshape keeps the mask 2-D even when there are no windows at all.
normal_window_mask = (window_labels.reshape(-1, SEQ_LENGTH) == 0).all(axis=1)
X_train = X_test[normal_window_mask]


In [None]:
# Optimized LSTM Autoencoder Model
# Each input sample is a window of 7 time steps with one value per feature column.
n_features = len(features)
input_layer = Input(shape=(7, n_features))

# Encoder: stacked LSTMs narrowing 512 -> 256 -> 128 -> 64. Only the last LSTM
# drops the time axis (return_sequences=False), leaving one 64-wide vector per window.
encoder_layers = [
    LSTM(512, activation='relu', return_sequences=True),
    BatchNormalization(),
    LSTM(256, activation='relu', return_sequences=True),
    Dropout(0.3),
    LSTM(128, activation='relu', return_sequences=True),
    Dropout(0.3),
    LSTM(64, activation='relu', return_sequences=False),
]
encoded = input_layer
for encoder_layer in encoder_layers:
    encoded = encoder_layer(encoded)

# Decoder: repeat the bottleneck vector once per time step, widen back through
# 64 -> 128 -> 256 -> 512, then project every time step to n_features outputs.
decoded = RepeatVector(7)(encoded)
for decoder_units in (64, 128, 256, 512):
    decoded = LSTM(decoder_units, activation='relu', return_sequences=True)(decoded)
decoded = TimeDistributed(Dense(n_features))(decoded)

# Compile Model with Lower Learning Rate
autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(loss='mse', optimizer=Adam(learning_rate=0.0001))


In [None]:
# Early Stopping with Patience
# Watches val_loss with a patience of 15 epochs and restores the best weights
# when training is stopped.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1,
)

# Train Model
# The autoencoder is fitted to reproduce its own input, so X_train is both the
# input and the target; 10% of it is held out for validation.
fit_options = {
    'epochs': 100,
    'batch_size': 128,
    'validation_split': 0.1,
    'callbacks': [early_stopping],
    'verbose': 1,
}
history = autoencoder.fit(X_train, X_train, **fit_options)



In [None]:
# Compute Reconstruction Errors
# One score per window: the absolute difference between the input and its
# reconstruction, averaged over both the time-step and feature axes.
X_pred = autoencoder.predict(X_test)
abs_residuals = np.abs(X_test - X_pred)
errors = abs_residuals.mean(axis=(1, 2))


In [None]:
# Adjust Threshold Using Percentile Method
# theta is the chosen percentile of the reconstruction errors; a window is
# predicted anomalous (1) only when its error is strictly greater than theta.
percentile_threshold = 90  # Adjust as needed (a higher value flags fewer windows)
theta = np.percentile(errors, q=percentile_threshold)
y_pred = np.greater(errors, theta).astype(int)


In [None]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Both metrics score the raw reconstruction errors against y_test; the
# thresholded y_pred is not used in this cell.

# Compute Precision-Recall AUC
precision, recall, _ = precision_recall_curve(y_test, errors)
pr_auc = auc(recall, precision)

# Compute ROC-AUC
roc_auc = roc_auc_score(y_test, errors)

# Print evaluation metrics
report_lines = (
    f"📊 Percentile-Based Threshold Evaluation:",
    f"ROC-AUC Score: {roc_auc:.4f}",
    f"PR-AUC Score: {pr_auc:.4f}",
)
for report_line in report_lines:
    print(report_line)


In [None]:
from sklearn.metrics import roc_curve

# ROC curve of the reconstruction errors against the true labels.
fpr, tpr, _ = roc_curve(y_test, errors)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, marker='.', label=f'ROC-AUC: {roc_auc:.4f}')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')  # Random guess line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
ax.grid()
plt.show()


In [None]:
import seaborn as sns
import numpy as np

# Confusion matrix of the thresholded predictions against the true labels.
# The label order is pinned to [0, 1] so the matrix is always 2x2 and lines up
# with the two tick labels below, even if one class never occurs in y_test or
# y_pred (without `labels` the matrix would shrink and the ticks would mismatch).
# NOTE(review): assumes is_anomaly only takes the values 0 (normal) and
# 1 (anomaly); any other label value would be left out of the matrix — confirm.
class_names = ["Normal", "Anomaly"]
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

plt.figure(figsize=(6, 5))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
