# AI Cybersecurity Threat Detection - Model Training

This notebook demonstrates the complete machine learning pipeline for cybersecurity threat detection:
1. Load the LogHub dataset
2. Extract features from log entries
3. Train Random Forest and Isolation Forest models
4. Evaluate model performance
5. Export models for the web application

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import re
import json
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


## 2. Load Dataset

In [2]:
# Load the LogHub dataset
print("📊 Loading LogHub dataset...")
df = pd.read_csv('data/training_dataset.csv')

print(f"✅ Loaded {len(df)} log entries")
print(f"   Normal logs: {len(df[df['label'] == 0])}")
print(f"   Threat logs: {len(df[df['label'] == 1])}")

print("\n📋 Dataset columns:", list(df.columns))
print("\n🎯 Threat type distribution:")
print(df['threat_type'].value_counts())

# Show sample data
print("\n📝 Sample log entries:")
df.head()

📊 Loading LogHub dataset...
✅ Loaded 5000 log entries
   Normal logs: 3000
   Threat logs: 2000

📋 Dataset columns: ['log', 'label', 'threat_type']

🎯 Threat type distribution:
threat_type
normal                  3000
brute_force              400
privilege_escalation     400
dos_attack               400
unauthorized_access      400
network_scan             400
Name: count, dtype: int64

📝 Sample log entries:


Unnamed: 0,log,label,threat_type
0,Jan 15 14:23:18 server4 apache2[8924]: 192.168...,0,normal
1,Jan 21 17:52:15 server3 cron[2786]: (www-data)...,0,normal
2,Jan 30 07:50:19 server1 cron[7164]: (service) ...,0,normal
3,Jan 16 20:29:27 server2 apache2[7234]: 192.168...,0,normal
4,Jan 10 04:03:05 server3 postfix/smtpd[1404]: c...,0,normal


## 3. Feature Engineering

In [3]:
def extract_features(log_text):
    """Extract numerical features from log text"""
    features = {}
    
    # Basic text features
    features['log_length'] = len(log_text)
    features['word_count'] = len(log_text.split())
    features['char_count'] = len(log_text)
    
    # Security keywords
    features['failed_count'] = len(re.findall(r'failed|fail', log_text, re.IGNORECASE))
    features['password_count'] = len(re.findall(r'password', log_text, re.IGNORECASE))
    features['root_count'] = len(re.findall(r'\\broot\\b', log_text, re.IGNORECASE))
    features['admin_count'] = len(re.findall(r'admin', log_text, re.IGNORECASE))
    features['sudo_count'] = len(re.findall(r'sudo|su:', log_text, re.IGNORECASE))
    features['error_count'] = len(re.findall(r'error|denied|invalid|unauthorized', log_text, re.IGNORECASE))
    features['connection_count'] = len(re.findall(r'connection|connect', log_text, re.IGNORECASE))
    features['attack_count'] = len(re.findall(r'attack|scan|probe|flood', log_text, re.IGNORECASE))
    
    # IP addresses
    ip_pattern = r'\\b(?:\\d{1,3}\\.){3}\\d{1,3}\\b'
    ip_matches = re.findall(ip_pattern, log_text)
    features['ip_count'] = len(ip_matches)
    features['has_external_ip'] = int(any(not ip.startswith(('192.168.', '10.', '172.')) for ip in ip_matches))
    
    # Ports
    port_pattern = r'port\\s+(\\d+)'
    port_matches = re.findall(port_pattern, log_text, re.IGNORECASE)
    features['port_count'] = len(port_matches)
    features['has_suspicious_port'] = int(any(int(port) in [22, 23, 21, 3389] for port in port_matches if port.isdigit()))
    
    # Time features
    time_pattern = r'(\\d{2}):(\\d{2}):(\\d{2})'
    time_match = re.search(time_pattern, log_text)
    if time_match:
        hour = int(time_match.group(1))
        features['hour'] = hour
        features['is_night_time'] = int(hour < 6 or hour > 22)
    else:
        features['hour'] = 12
        features['is_night_time'] = 0
    
    # Character analysis
    features['digit_ratio'] = sum(c.isdigit() for c in log_text) / len(log_text) if log_text else 0
    features['special_char_ratio'] = sum(not c.isalnum() and c != ' ' for c in log_text) / len(log_text) if log_text else 0
    features['uppercase_ratio'] = sum(c.isupper() for c in log_text) / len(log_text) if log_text else 0
    
    # HTTP status
    http_pattern = r'HTTP/1\\.[01]"\\s+(\\d{3})'
    http_match = re.search(http_pattern, log_text)
    if http_match:
        status_code = int(http_match.group(1))
        features['http_status'] = status_code
        features['is_http_error'] = int(status_code >= 400)
    else:
        features['http_status'] = 0
        features['is_http_error'] = 0
    
    return features

# Extract features from all logs
print("🔧 Extracting features from logs...")
feature_list = [extract_features(log) for log in df['log']]
features_df = pd.DataFrame(feature_list)

print(f"✅ Extracted {len(features_df.columns)} features")
print("Features:", list(features_df.columns))
features_df.head()

🔧 Extracting features from logs...
✅ Extracted 22 features
Features: ['log_length', 'word_count', 'char_count', 'failed_count', 'password_count', 'root_count', 'admin_count', 'sudo_count', 'error_count', 'connection_count', 'attack_count', 'ip_count', 'has_external_ip', 'port_count', 'has_suspicious_port', 'hour', 'is_night_time', 'digit_ratio', 'special_char_ratio', 'uppercase_ratio', 'http_status', 'is_http_error']


Unnamed: 0,log_length,word_count,char_count,failed_count,password_count,root_count,admin_count,sudo_count,error_count,connection_count,...,has_external_ip,port_count,has_suspicious_port,hour,is_night_time,digit_ratio,special_char_ratio,uppercase_ratio,http_status,is_http_error
0,123,15,123,0,0,0,0,0,0,0,...,0,0,0,12,0,0.414634,0.195122,0.073171,0,0
1,71,8,71,0,0,0,0,0,0,0,...,0,0,0,12,0,0.183099,0.197183,0.056338,0,0
2,70,8,70,0,0,0,0,0,0,0,...,0,0,0,12,0,0.185714,0.185714,0.057143,0,0
3,123,15,123,0,0,0,0,0,0,0,...,0,0,0,12,0,0.398374,0.195122,0.073171,0,0
4,90,8,90,0,0,0,0,0,0,1,...,0,0,0,12,0,0.266667,0.144444,0.011111,0,0


## 4. Data Preparation

In [4]:
# Prepare data
X = features_df.values
y = df['label'].values

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"✅ Data prepared - Feature matrix shape: {X_train_scaled.shape}")

Training set: 4000 samples
Test set: 1000 samples
✅ Data prepared - Feature matrix shape: (4000, 22)


## 5. Model Training

In [5]:
# Train Random Forest
print("🌲 Training Random Forest...")
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42
)
rf_model.fit(X_train_scaled, y_train)

# Train Isolation Forest
print("🔍 Training Isolation Forest...")
isolation_forest = IsolationForest(
    contamination=0.1,
    random_state=42,
    n_estimators=100
)
isolation_forest.fit(X_train_scaled)

print("✅ Models trained successfully!")

🌲 Training Random Forest...
🔍 Training Isolation Forest...
✅ Models trained successfully!


## 6. Model Evaluation

In [6]:
# Evaluate Random Forest
rf_pred = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, rf_pred)

print("🌲 Random Forest Results:")
print(f"Accuracy: {rf_accuracy:.3f}")
print(classification_report(y_test, rf_pred))

# Evaluate Isolation Forest
if_pred = isolation_forest.predict(X_test_scaled)
if_pred_binary = (if_pred == -1).astype(int)
if_accuracy = accuracy_score(y_test, if_pred_binary)

print("\n🔍 Isolation Forest Results:")
print(f"Accuracy: {if_accuracy:.3f}")
print(classification_report(y_test, if_pred_binary))

# Feature importance
print("\n🎯 Top 10 Important Features:")
feature_importance = pd.DataFrame({
    'feature': features_df.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

feature_importance.head(10)

🌲 Random Forest Results:
Accuracy: 0.998
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       600
           1       1.00      1.00      1.00       400

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000


🔍 Isolation Forest Results:
Accuracy: 0.691
              precision    recall  f1-score   support

           0       0.66      1.00      0.80       600
           1       1.00      0.23      0.37       400

    accuracy                           0.69      1000
   macro avg       0.83      0.61      0.58      1000
weighted avg       0.80      0.69      0.63      1000


🎯 Top 10 Important Features:


Unnamed: 0,feature,importance
17,digit_ratio,0.167738
19,uppercase_ratio,0.147564
1,word_count,0.132499
18,special_char_ratio,0.131115
2,char_count,0.117146
0,log_length,0.099466
3,failed_count,0.086359
10,attack_count,0.027913
7,sudo_count,0.025745
9,connection_count,0.024042


## 7. Export Models

In [7]:
# Create models directory
os.makedirs('models', exist_ok=True)

# Save models
print("💾 Saving models...")
joblib.dump(rf_model, 'models/random_forest.pkl')
joblib.dump(isolation_forest, 'models/isolation_forest.pkl')
joblib.dump(scaler, 'models/scaler.pkl')

# Save metadata
metadata = {
    'model_version': '1.0',
    'training_date': datetime.now().isoformat(),
    'feature_names': list(features_df.columns),
    'training_samples': len(X_train),
    'test_samples': len(X_test),
    'rf_accuracy': float(rf_accuracy),
    'if_accuracy': float(if_accuracy),
    'feature_count': len(features_df.columns)
}

with open('models/model_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print("✅ Models exported successfully!")
print("Files saved:")
print("  - models/random_forest.pkl")
print("  - models/isolation_forest.pkl")
print("  - models/scaler.pkl")
print("  - models/model_metadata.json")

print(f"\n🎉 Training Complete!")
print(f"Random Forest Accuracy: {rf_accuracy:.3f}")
print(f"Isolation Forest Accuracy: {if_accuracy:.3f}")
print(f"Models ready for web application!")

💾 Saving models...
✅ Models exported successfully!
Files saved:
  - models/random_forest.pkl
  - models/isolation_forest.pkl
  - models/scaler.pkl
  - models/model_metadata.json

🎉 Training Complete!
Random Forest Accuracy: 0.998
Isolation Forest Accuracy: 0.691
Models ready for web application!
