# Cybersecurity Threat Detection - Exploratory Analysis

This notebook demonstrates the complete workflow of the AI-driven cybersecurity threat detection system.

## 1. Setup and Imports

In [None]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_collection.collector import load_sample_threat_data
from preprocessing.preprocessor import DataPreprocessor
from feature_engineering.engineer import FeatureEngineer
from model_training.trainer import RandomForestModel, GradientBoostingModel, ModelTrainer

# Notebook-wide plotting defaults: seaborn 'whitegrid' theme and a 12x6 figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## 2. Load and Explore Data

In [None]:
# Fetch the sample threat dataset; `df` is the frame every later cell starts from
df = load_sample_threat_data()

# Report dimensions and column names, then preview the first rows as the cell output
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
df.head()

In [None]:
# Descriptive statistics as computed by DataFrame.describe()
summary_stats = df.describe()
print("Dataset Statistics:")
print(summary_stats)

In [None]:
# Check class distribution
# NOTE(review): assumes 'is_malicious' is encoded as integers 0 (benign) / 1 (malicious) — confirm against the loader.
threat_counts = df['is_malicious'].value_counts()

# .get(label, 0) reports a class that is absent from the sample as 0 rows
# instead of raising KeyError on the missing index label.
benign_count = threat_counts.get(0, 0)
malicious_count = threat_counts.get(1, 0)
# Guard the percentage computation against an empty DataFrame.
total_rows = max(len(df), 1)

print(f"\nClass Distribution:")
print(f"Benign: {benign_count} ({benign_count/total_rows*100:.1f}%)")
print(f"Malicious: {malicious_count} ({malicious_count/total_rows*100:.1f}%)")

# Visualize
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='is_malicious')
plt.title('Distribution of Threats')
plt.xlabel('Is Malicious')
plt.ylabel('Count')
plt.xticks([0, 1], ['Benign', 'Threat'])
plt.show()

## 3. Feature Engineering

In [None]:
# Engineer features
# The statistical and temporal feature groups are explicitly switched off here.
engineer = FeatureEngineer()
df_engineered = engineer.engineer_features_pipeline(
    df,
    include_statistical=False,
    include_temporal=False
)

# List the added columns in the order they appear in df_engineered.
# A plain set difference has arbitrary ordering, so the printed list could
# change between kernel sessions; the set is used only for membership tests.
original_columns = set(df.columns)
new_columns = [col for col in df_engineered.columns if col not in original_columns]

print(f"Original features: {df.shape[1]}")
print(f"After engineering: {df_engineered.shape[1]}")
print(f"\nNew columns: {new_columns}")

In [None]:
# Visualize some engineered features on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Split the rows by class once so every panel compares the same two subsets
benign_rows = df_engineered[df_engineered['is_malicious'] == 0]
threat_rows = df_engineered[df_engineered['is_malicious'] == 1]

# Packet size distribution
axes[0, 0].hist([benign_rows['packet_size'], threat_rows['packet_size']],
                label=['Benign', 'Threat'], alpha=0.7)
axes[0, 0].set_title('Packet Size Distribution')
axes[0, 0].legend()

# Port distribution (points coloured by the is_malicious value)
axes[0, 1].scatter(df_engineered['source_port'], df_engineered['destination_port'],
                   c=df_engineered['is_malicious'], alpha=0.5, cmap='coolwarm')
axes[0, 1].set_title('Source vs Destination Port')
axes[0, 1].set_xlabel('Source Port')
axes[0, 1].set_ylabel('Destination Port')

# Payload ratio (only drawn when the engineered column exists)
if 'payload_ratio' in df_engineered.columns:
    # NaN values are dropped because they turn the box statistics into NaN
    # and leave the box blank.
    axes[1, 0].boxplot([benign_rows['payload_ratio'].dropna(),
                        threat_rows['payload_ratio'].dropna()])
    # Tick labels are set on the axes rather than through boxplot's `labels`
    # keyword, which matplotlib 3.9 deprecated in favour of `tick_labels`;
    # set_xticklabels works on both older and newer releases.
    axes[1, 0].set_xticklabels(['Benign', 'Threat'])
    axes[1, 0].set_title('Payload Ratio by Class')

# Protocol distribution: row counts per (protocol, class), one bar group per protocol
protocol_threat = df_engineered.groupby(['protocol', 'is_malicious']).size().unstack(fill_value=0)
protocol_threat.plot(kind='bar', ax=axes[1, 1], alpha=0.7)
axes[1, 1].set_title('Protocol Distribution by Threat Status')
axes[1, 1].legend(['Benign', 'Threat'])

plt.tight_layout()
plt.show()

## 4. Data Preprocessing

In [None]:
# Run the preprocessing pipeline on the engineered frame
preprocessor = DataPreprocessor()
pipeline_options = {
    'target_col': 'is_malicious',
    'test_size': 0.2,
    'scaling_method': 'standard',
}
data = preprocessor.preprocess_pipeline(df_engineered, **pipeline_options)

# Unpack the train/test splits from the returned mapping
X_train, X_test, y_train, y_test = (
    data[split_key] for split_key in ('X_train', 'X_test', 'y_train', 'y_test')
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Features: {data['feature_names']}")

## 5. Model Training

In [None]:
# Create the trainer and register one instance of each candidate model
trainer = ModelTrainer()

# (model class, constructor arguments) pairs, built and added in listed order
candidate_models = [
    (RandomForestModel, {'n_estimators': 100, 'max_depth': 20, 'random_state': 42}),
    (GradientBoostingModel, {'n_estimators': 100, 'learning_rate': 0.1, 'random_state': 42}),
]
for model_cls, model_params in candidate_models:
    trainer.add_model(model_cls(**model_params))

print("Models added to trainer")

In [None]:
# Train all models
# Calls trainer.train_all with the training split (X_train, y_train) taken from
# the preprocessing result; presumably this fits each model registered through
# add_model — confirm against ModelTrainer.
# NOTE(review): any value returned by train_all is discarded here.
trainer.train_all(X_train, y_train)
print("Training completed!")

## 6. Model Evaluation

In [None]:
# Score every model on the held-out split; the raw return value is kept in `results`
results = trainer.evaluate_all(X_test, y_test)

# Tabular comparison supplied by the trainer, printed without the index column
comparison_df = trainer.get_comparison_dataframe()
print("\nModel Comparison:", comparison_df.to_string(index=False), sep="\n")

In [None]:
# Visualize model comparison: grouped metric bars (left) and F1-Score per model (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Bar plot of metrics
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
x = np.arange(len(metrics))
n_models = len(comparison_df)
# Keep the 0.35 bar width used for two models, but shrink it when more models
# are compared so each group of bars stays within 0.8 of a tick interval and
# neighbouring groups do not overlap.
width = min(0.35, 0.8 / max(n_models, 1))

for i, model_name in enumerate(comparison_df['Model']):
    # Select the metric columns first and cast to float, so the bar heights
    # are a numeric array rather than an object-dtype row that also holds the
    # model name.
    values = comparison_df[metrics].iloc[i].to_numpy(dtype=float)
    axes[0].bar(x + i*width, values, width, label=model_name)

axes[0].set_ylabel('Score')
axes[0].set_title('Model Performance Comparison')
# Centre each tick under its group of bars for any number of models
# (this equals x + width / 2 when there are exactly two models).
axes[0].set_xticks(x + width * max(n_models - 1, 0) / 2)
axes[0].set_xticklabels(metrics, rotation=45)
axes[0].legend()
axes[0].grid(axis='y', alpha=0.3)

# F1-Score comparison
axes[1].barh(comparison_df['Model'], comparison_df['F1-Score'])
axes[1].set_xlabel('F1-Score')
axes[1].set_title('F1-Score by Model')
axes[1].grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nBest Model: {trainer.best_model_name}")
print(f"Best F1-Score: {trainer.best_score:.4f}")

In [None]:
# Heatmap of the best model's stored confusion matrix, followed by its classification report
if trainer.best_model:
    best_metrics = trainer.best_model.metrics
    cm = np.array(best_metrics['confusion_matrix'])
    class_labels = ['Benign', 'Threat']

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax)
    ax.set_title(f'Confusion Matrix - {trainer.best_model_name}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    plt.show()

    print("\nClassification Report:")
    print(best_metrics['classification_report'])

## 7. Feature Importance Analysis

In [None]:
# Rank features by the best model's importances, when its estimator exposes them
if trainer.best_model and hasattr(trainer.best_model.model, 'feature_importances_'):
    importance_df = pd.DataFrame({
        'feature': data['feature_names'],
        'importance': trainer.best_model.model.feature_importances_
    }).sort_values('importance', ascending=False)

    # Horizontal bars for the 15 highest-ranked features, largest at the top
    top_features = importance_df.head(15)
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(top_features['feature'], top_features['importance'])
    ax.set_xlabel('Importance')
    ax.set_title(f'Top 15 Features - {trainer.best_model_name}')
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

    print("\nTop 10 Most Important Features:")
    print(importance_df.head(10).to_string(index=False))

## 8. Save Models

In [None]:
# Persist every trained model and list the path reported for each one
MODEL_OUTPUT_DIR = '../models/trained'
saved_paths = trainer.save_all_models(MODEL_OUTPUT_DIR)

print("Models saved:")
for saved_name, saved_location in saved_paths.items():
    print(f"  {saved_name}: {saved_location}")

## 9. Conclusion

This notebook demonstrated:
1. Data loading and exploration
2. Feature engineering for network traffic
3. Data preprocessing and splitting
4. Training multiple ML models
5. Model evaluation and comparison
6. Feature importance analysis
7. Saving the trained models

The best-performing model can now be used for real-time threat detection via the API service.