In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

# Load and prepare data
# NOTE(review): the recorded run of this cell ended with a MemoryError; if the
# CSV is large, loading with narrower dtypes may help — not changed here.
df = pd.read_csv('Dos attacks-SlowHTTPTEST.csv')

# Separate features and target
X = df.drop('Label', axis=1)  # Assuming 'Label' is the target column
y = df['Label']

# Encode labels (string class names -> integer codes; `le` is reused later to
# map predictions back to the original label text)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split BEFORE scaling so the scaler never sees the held-out rows. Fitting the
# scaler on the full dataset leaks test-set mean/variance into training and
# makes the reported metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Scale features: statistics come from the training split only and are then
# applied unchanged to the test split (and later to new logs via `scaler`).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# 1. Random Forest Feature Extraction
def random_forest_feature_importance(X_train, y_train, X_test, n_features=10):
    """Select the most important columns according to a random forest.

    A 100-tree ``RandomForestClassifier`` (``random_state=42``) is fitted on the
    training split, and the columns with the highest ``feature_importances_``
    are kept in both splits.

    Args:
        X_train: 2-D array of training features; indexed as ``X_train[:, idx]``,
            so it must support NumPy-style column indexing.
        y_train: Training labels passed to ``fit``.
        X_test: 2-D array of test features with the same column layout.
        n_features: How many top-ranked columns to keep. Defaults to 10; if the
            data has fewer columns, all of them are returned. Code that
            re-derives the selected columns from the returned model must use
            the same count.

    Returns:
        Tuple ``(X_train_selected, X_test_selected, rf)`` where the selected
        columns are ordered from most to least important and ``rf`` is the
        fitted forest.

    Raises:
        ValueError: If ``n_features`` is smaller than 1.
    """
    if n_features < 1:
        raise ValueError("n_features must be a positive integer")

    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Rank columns by importance, highest first, and keep the leading ones.
    importances = rf.feature_importances_
    feature_indices = np.argsort(importances)[::-1][:n_features]

    return X_train[:, feature_indices], X_test[:, feature_indices], rf

# 2. LDA Feature Extraction
def lda_feature_extraction(X_train, y_train, X_test):
    """Project both splits with an LDA model fitted on the training split.

    Args:
        X_train: Training features used to fit the projection.
        y_train: Training labels used to fit the projection.
        X_test: Test features, transformed with the already-fitted model.

    Returns:
        Tuple ``(train_projection, test_projection, fitted_lda)``.
    """
    projector = LinearDiscriminantAnalysis()
    # Fit on the training data only; the test split is merely transformed.
    train_projection = projector.fit_transform(X_train, y_train)
    return train_projection, projector.transform(X_test), projector

# 3. Autoencoder Feature Extraction
def autoencoder_feature_extraction(X_train, X_test, encoding_dim=10, epochs=50, batch_size=256):
    """Learn a compressed representation with a single-hidden-layer autoencoder.

    The network is ``input -> Dense(encoding_dim, relu) -> Dense(input_dim)``
    and is trained to reconstruct ``X_train`` with Adam and MSE loss. The
    encoder half is then used to transform both splits.

    Args:
        X_train: 2-D training feature array; also the reconstruction target.
        X_test: 2-D test feature array with the same number of columns.
        encoding_dim: Width of the bottleneck layer (default 10).
        epochs: Number of training epochs (default 50).
        batch_size: Mini-batch size used for training (default 256).

    Returns:
        Tuple ``(X_train_encoded, X_test_encoded, encoder)`` where ``encoder``
        is the Keras model mapping inputs to the bottleneck activations.
    """
    input_dim = X_train.shape[1]

    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='relu')(input_layer)
    # Linear output layer: this notebook feeds standardized features, which are
    # centred on zero and can be negative or larger than 1. A sigmoid output is
    # bounded to (0, 1) and therefore cannot reconstruct such targets.
    decoded = Dense(input_dim, activation='linear')(encoded)

    autoencoder = Model(input_layer, decoded)
    # Shares layers (and therefore trained weights) with `autoencoder`.
    encoder = Model(input_layer, encoded)

    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.fit(
        X_train, X_train,
        epochs=epochs, batch_size=batch_size, shuffle=True, verbose=0,
    )

    # verbose=0 keeps inference as quiet as training.
    X_train_encoded = encoder.predict(X_train, verbose=0)
    X_test_encoded = encoder.predict(X_test, verbose=0)

    return X_train_encoded, X_test_encoded, encoder

# Run each feature-extraction routine on the same train/test split.
# Every call yields (transformed train, transformed test, fitted model).
(X_train_rf, X_test_rf, rf_model) = random_forest_feature_importance(
    X_train=X_train, y_train=y_train, X_test=X_test
)
(X_train_lda, X_test_lda, lda_model) = lda_feature_extraction(
    X_train=X_train, y_train=y_train, X_test=X_test
)
(X_train_ae, X_test_ae, encoder_model) = autoencoder_feature_extraction(
    X_train=X_train, X_test=X_test
)

# Create ensemble model
def create_ensemble_model():
    """Train and return the final classifier.

    Fits a 100-tree random forest (``random_state=42``) on the module-level
    ``X_train_rf`` / ``y_train``, i.e. on the random-forest-selected columns.
    """
    # `fit` returns the estimator itself, so it can be returned directly.
    return RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_rf, y_train)


ensemble_model = create_ensemble_model()

# Function to predict single log
def predict_log(log_data):
    """Predict the label of a single log record.

    Args:
        log_data: One-row, 2-D feature table (e.g. a ``DataFrame``) with the
            same columns, in the same order, that ``scaler`` was fitted on.

    Returns:
        The original (decoded) label of the first row in ``log_data``.
    """
    # Apply the scaling learned at training time.
    log_scaled = scaler.transform(log_data)

    # Re-derive the columns the classifier was trained on: the 10 highest
    # ranked features of `rf_model`, most important first.
    top_feature_idx = rf_model.feature_importances_.argsort()[::-1][:10]
    log_rf = log_scaled[:, top_feature_idx]

    # The LDA and autoencoder projections are intentionally not computed here:
    # `ensemble_model` only consumes the random-forest-selected columns, so
    # producing them would just add latency to every prediction.
    prediction = ensemble_model.predict(log_rf)

    # Map the integer class code back to the original label text.
    return le.inverse_transform(prediction)[0]

# Function to test random log from dataset
def test_random_log():
    """Pick one random row of ``df``, classify it, and print the outcome.

    The row is drawn from the whole dataset, so it may be one the model was
    trained on; treat this as a smoke test rather than an evaluation.

    Raises:
        ValueError: If ``df`` has no rows to sample from.
    """
    if len(df) == 0:
        raise ValueError("Cannot sample a random log: the dataset is empty")

    # Select random row (sliced so it stays a one-row DataFrame)
    random_idx = np.random.randint(0, len(df))
    random_log = df.iloc[random_idx:random_idx+1].copy()
    actual_label = random_log['Label'].values[0]
    random_log_features = random_log.drop('Label', axis=1)

    # Make prediction
    predicted_label = predict_log(random_log_features)

    # Compare case-insensitively: the label text stored in the data may not use
    # exactly the same casing as the string below, and an exact match would
    # then silently report "No" for a detected attack.
    is_attack = str(predicted_label).strip().lower() == "dos attacks-slowhttptest"

    print("Random Log Details:")
    print(random_log)
    print("\nActual Label:", actual_label)
    print("Predicted Label:", predicted_label)
    print("Attack Detected:", "Yes" if is_attack else "No")

# Smoke-test the pipeline on one randomly chosen row of the dataset
test_random_log()

# Template for classifying a hand-written log (kept as an inert string)
"""
# To test your own log, use this format:
custom_log = pd.DataFrame({
    'feature1': [value1],
    'feature2': [value2],
    ...
})
result = predict_log(custom_log)
print("Prediction for custom log:", result)
"""

# Score the classifier on the held-out split and print per-class metrics
y_pred = ensemble_model.predict(X_test_rf)
print("\nModel Performance Metrics:")
print(classification_report(y_true=y_test, y_pred=y_pred))

MemoryError: Unable to allocate 64.0 KiB for an array with shape (8192,) and data type int64

In [2]:
# Re-run the random-log check. The cell that defines test_random_log() must
# have executed successfully in this kernel first; otherwise this raises
# NameError, as in the recorded output below.
test_random_log()

NameError: name 'test_random_log' is not defined