# Project 12: DNS Tunneling Detection

## Objective
Build an interpretable machine learning model that distinguishes between legitimate DNS queries and those used for DNS tunneling based on statistical features.

## Dataset
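DNS Tunneling Dataset from Kaggle (`ahmethamzadedbs/dns-tunneling-dataset`) containing labeled DNS queries with pre-calculated features. The notebook expects the file `dnscat2.csv` in the working directory after download.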

## Key Features
- Logistic Regression for maximum interpretability
- Focus on query length, entropy, and subdomain count
- Model coefficients explain why queries are flagged
- Lightweight model suitable for real-time deployment (deployment steps are outlined in the conclusion, not implemented here)

## 1. Environment Setup and Data Loading

In [None]:
# Install required packages
!pip install kaggle pandas numpy scikit-learn matplotlib seaborn

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Global plotting look-and-feel for every figure in this notebook.
# Order is kept as in the original: theme first, then the colour palette.
plt.style.use("seaborn-v0_8")
sns.set_palette(palette="viridis")

In [None]:
# Setup Kaggle API for dataset download
if not os.path.exists(os.path.expanduser('~/.kaggle/kaggle.json')):
    print("Please set up your Kaggle API credentials first.")
    print("1. Go to https://www.kaggle.com/account")
    print("2. Create API token and download kaggle.json")
    print("3. Place it in ~/.kaggle/ directory")
else:
    print("Kaggle API configured. Downloading DNS tunneling dataset...")
    !kaggle datasets download -d ahmethamzadedbs/dns-tunneling-dataset --unzip

## 2. Data Loading and Exploration

In [None]:
# Load the DNS tunneling dataset
# `df` is read by the cells that follow, so a missing file must stop the
# notebook here (with guidance) instead of surfacing later as a NameError.
print("Loading DNS tunneling dataset...")

try:
    # Keep the try body minimal: only the read can raise FileNotFoundError.
    df = pd.read_csv('dnscat2.csv')
except FileNotFoundError:
    print("Dataset file not found. Please ensure the dataset is downloaded correctly.")
    print("Expected file: dnscat2.csv")
    # Re-raise so "Run All" halts at the real cause.
    raise
else:
    print(f"Dataset loaded successfully. Shape: {df.shape}")
    print(f"\nColumns: {list(df.columns)}")

In [None]:
# Explore the dataset structure: dimensions, a preview of the first rows,
# column dtypes, and the number of missing values per column.
print("Dataset Overview:")
print("=" * 40)
print(f"Shape: {df.shape}")

overview_sections = (
    ("\nFirst few rows:", df.head()),
    ("\nData types:", df.dtypes),
    ("\nMissing values:", df.isnull().sum()),
)
for heading, content in overview_sections:
    print(heading)
    print(content)

## 3. Feature Selection and Data Preprocessing

In [None]:
# Focus on most interpretable features for DNS tunneling detection
feature_cols = ['query_length', 'subdomain_count', 'entropy']
target_col = 'label'

# Select relevant columns; copy so the raw DataFrame is left untouched.
df_model = df[feature_cols + [target_col]].copy()

# Report label values the encoding below does not recognise. They are still
# encoded as 0 (unchanged behaviour), but folding typos or missing labels into
# the "nontunnel" class without notice would skew training.
# print() is used rather than warnings.warn because this notebook silences
# warnings globally.
known_labels = {'nontunnel', 'tunnel'}
unexpected = df_model.loc[~df_model[target_col].isin(known_labels), target_col]
if not unexpected.empty:
    unexpected_values = sorted(map(str, unexpected.unique()))
    print(f"Warning: {len(unexpected):,} rows have unexpected labels "
          f"{unexpected_values}; they are encoded as 0 (nontunnel).")

# Encode the labels: 'tunnel' -> 1, everything else -> 0.
# Vectorized comparison; same result as the per-row lambda it replaces.
df_model[target_col] = (df_model[target_col] == 'tunnel').astype(int)

print("Feature selection completed.")
print(f"Selected features: {feature_cols}")
print(f"Target variable: {target_col}")

# Check class distribution. `.get(..., 0)` keeps the report working even when
# one class is absent from the data (plain indexing would raise KeyError).
print("\nClass Distribution:")
class_counts = df_model[target_col].value_counts()
print(f"Nontunnel (0): {class_counts.get(0, 0):,}")
print(f"Tunnel (1): {class_counts.get(1, 0):,}")
print(f"Shape after feature selection: {df_model.shape}")

## 4. Exploratory Data Analysis

In [None]:
# Visualize feature differences between classes
print("Visualizing feature differences between normal and tunneling DNS queries...")

# One boxplot panel per feature. The grid width and figure width follow
# len(feature_cols), so changing the feature list does not break the layout
# (a fixed 1x3 grid raises as soon as a fourth feature is added).
# With three features this is the same 18x5 figure as before.
n_panels = len(feature_cols)
plt.figure(figsize=(6 * max(n_panels, 1), 5))
for panel, col in enumerate(feature_cols, start=1):
    plt.subplot(1, n_panels, panel)
    sns.boxplot(x=target_col, y=col, data=df_model)
    plt.title(f'{col} by Class')
    # Replace the encoded 0/1 tick labels with readable class names.
    plt.xticks([0, 1], ['Nontunnel', 'Tunnel'])

plt.tight_layout()
plt.show()

# NOTE(review): the observations below are fixed text, not derived from the
# data; confirm they match the plots above.
print("\n📊 Key Observations:")
print("• Tunnel queries show higher length values")
print("• Tunnel queries have more subdomains")
print("• Tunnel queries exhibit higher entropy (more randomness)")
print("• These patterns align with DNS tunneling behavior")

In [None]:
# Statistical summary of features by class
print("Statistical Summary by Class:")
print("=" * 50)

# Row selectors for each class, built once and reused for every feature.
class_masks = {
    'Nontunnel': df_model[target_col] == 0,
    'Tunnel': df_model[target_col] == 1,
}

for feature in feature_cols:
    print(f"\n{feature}:")
    # describe() per class, placed side by side (one column per class).
    comparison = pd.DataFrame({
        class_name: df_model.loc[mask, feature].describe()
        for class_name, mask in class_masks.items()
    })
    print(comparison.round(2))

## 5. Data Splitting and Scaling

In [None]:
# Feature matrix and target vector
X = df_model[feature_cols]
y = df_model[target_col]

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# 70/30 split, stratified on y to maintain the class ratio in both parts;
# fixed random_state for a reproducible split.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("\nData split completed:")
print(f"Training set: {len(X_train):,} samples")
print(f"Test set: {len(X_test):,} samples")
print("Training class distribution:")
print(y_train.value_counts().sort_index())

In [None]:
# Scale features for Logistic Regression.
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics leak into the transformation.
print("Scaling features...")

scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

print("Feature scaling completed.")
for split_name, scaled in (("Training", X_train_scaled), ("Test", X_test_scaled)):
    print(f"{split_name} features shape: {scaled.shape}")

## 6. Model Training

In [None]:
# Initialize and train Logistic Regression model
print("Training Logistic Regression model...")

# Balanced class weights to handle any class imbalance; fixed seed and a
# raised iteration cap.
model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)
print("Training completed successfully!")

# Display basic model information: one weight per feature, then the intercept.
print("\nModel coefficients:")
fitted_weights = model.coef_[0]
for position, feature_name in enumerate(feature_cols):
    print(f"  {feature_name}: {fitted_weights[position]:.4f}")
print(f"Intercept: {model.intercept_[0]:.4f}")

## 7. Model Evaluation

In [None]:
# Make predictions: hard class labels plus the probability of class 1 (tunnel).
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

# Pin the label order explicitly so the report rows and the confusion-matrix
# layout are always [nontunnel, tunnel] and always 2x2, even if one class is
# missing from y_test or y_pred.
class_labels = [0, 1]

# Classification report
print("Classification Report:")
print("=" * 50)
print(classification_report(
    y_test, y_pred,
    labels=class_labels,
    target_names=['Nontunnel (0)', 'Tunnel (1)'],
))

# Confusion matrix analysis (rows = actual class, columns = predicted class)
print("\nConfusion Matrix Analysis:")
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives (Correctly identified nontunnel): {tn}")
print(f"False Positives (Nontunnel flagged as tunnel): {fp}")
print(f"False Negatives (Missed tunnel queries): {fn}")
print(f"True Positives (Correctly identified tunnel): {tp}")

In [None]:
# Visualize confusion matrix as an annotated heatmap
# (rows = actual label, columns = predicted label).
class_names = ['Nontunnel', 'Tunnel']

plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_title('Confusion Matrix - DNS Tunneling Detection')
heatmap_ax.set_ylabel('Actual Label')
heatmap_ax.set_xlabel('Predicted Label')
plt.show()

## 8. Model Interpretability Analysis

In [None]:
# Analyze feature importance through model coefficients
print("Model Interpretability Analysis:")
print("=" * 50)

# Feature/coefficient table, largest coefficient first.
coefficients = (
    pd.DataFrame({'Feature': feature_cols, 'Coefficient': model.coef_[0]})
    .sort_values('Coefficient', ascending=False)
)

print("\nModel Coefficients (Log-Odds):")
print(coefficients)

print("\n🔍 Coefficient Interpretation:")
for feature, coef in zip(coefficients['Feature'], coefficients['Coefficient']):
    # Strictly positive weights push towards "tunnel"; everything else is
    # reported as decreasing the tunneling probability.
    if coef > 0:
        signed_coef, direction = f"+{coef:.4f}", "increase"
    else:
        signed_coef, direction = f"{coef:.4f}", "decrease"
    print(f"• {feature}: {signed_coef} - Higher values {direction} tunneling probability")

In [None]:
# Visualize feature importance as a horizontal bar chart of coefficients
coef_values = coefficients['Coefficient']

plt.figure(figsize=(10, 6))
# Green for positive coefficients, red otherwise.
bar_colors = ['green' if value > 0 else 'red' for value in coef_values]
bars = plt.barh(coefficients['Feature'], coef_values, color=bar_colors)
plt.axvline(x=0, color='black', linestyle='--', alpha=0.7)
plt.title('Feature Importance in DNS Tunneling Detection\n(Logistic Regression Coefficients)')
plt.xlabel('Coefficient Value (Log-Odds)')
plt.ylabel('Features')

# Add value labels on bars: just past the bar tip, on the side the bar points to.
for bar, coef in zip(bars, coef_values):
    is_positive = coef > 0
    label_x = coef + (0.01 if is_positive else -0.01)
    label_y = bar.get_y() + bar.get_height() / 2
    plt.text(label_x, label_y, f'{coef:.3f}',
             ha='left' if is_positive else 'right', va='center')

plt.tight_layout()
plt.show()

key_insights = (
    "\n📈 Key Insights:",
    "• Positive coefficients indicate features that increase tunneling probability",
    "• All three features align with expected DNS tunneling behavior",
    "• Model provides clear explanations for each prediction",
)
for insight in key_insights:
    print(insight)

## 9. Performance Metrics and Business Impact

In [None]:
# Calculate comprehensive performance metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Label-based metrics use the hard predictions; AUC uses the class-1 probability.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print("🎯 Model Performance Summary:")
print("=" * 40)
# Metric names are left-aligned in a 10-character column.
for metric_name, score in (("Accuracy:", accuracy), ("Precision:", precision), ("Recall:", recall)):
    print(f"{metric_name:<10} {score:.3f} ({score*100:.1f}%)")
for metric_name, score in (("F1-Score:", f1), ("AUC-ROC:", auc)):
    print(f"{metric_name:<10} {score:.3f}")

business_impact = (
    "\n💼 Business Impact:",
    f"• Detects {recall*100:.1f}% of DNS tunneling attempts",
    f"• {precision*100:.1f}% of alerts are genuine tunneling attempts",
    "• Reduces analyst workload through interpretable results",
    "• Enables real-time DNS monitoring and alerting",
)
for line in business_impact:
    print(line)

## 10. Example Predictions with Explanations

In [None]:
# Demonstrate model interpretability with example predictions
print("🔍 Example Predictions with Explanations:")
print("=" * 50)

# Positions within the test split to showcase; positions beyond the end of
# the test set are skipped.
sample_indices = [0, 10, 100, 500]  # Mix of different cases
feature_weights = model.coef_[0]

for idx in sample_indices:
    if idx >= len(X_test):
        continue

    sample_features = X_test.iloc[idx]
    sample_scaled = X_test_scaled[idx:idx+1]  # keep 2-D shape for the model
    prediction = model.predict(sample_scaled)[0]
    probability = model.predict_proba(sample_scaled)[0, 1]
    actual = y_test.iloc[idx]

    actual_name = 'Tunnel' if actual == 1 else 'Nontunnel'
    predicted_name = 'Tunnel' if prediction == 1 else 'Nontunnel'

    print(f"\nSample {idx + 1}:")
    print(f"  Actual: {actual_name}")
    print(f"  Predicted: {predicted_name} ({probability:.3f} probability)")
    print("  Features:")

    # Per-feature contribution = coefficient * standardized feature value;
    # the raw (unscaled) value is shown alongside for readability.
    for feature, value, scaled_value, coef in zip(
        feature_cols, sample_features, sample_scaled[0], feature_weights
    ):
        contribution = coef * scaled_value
        print(f"    {feature}: {value:.2f} (contribution: {contribution:+.3f})")

    result = "✅" if prediction == actual else "❌"
    print(f"  Result: {result}")

## 11. Conclusion and Next Steps

In [None]:
# Closing summary: a banner followed by titled sections, each printed as a
# heading and a list of entries indented by two spaces.
print("🛡️  PROJECT CONCLUSION: DNS Tunneling Detection")
print("=" * 60)

conclusion_sections = [
    ("\n✅ Key Achievements:", [
        f"• Built interpretable model with {recall*100:.1f}% recall for tunnel detection",
        f"• Achieved {precision*100:.1f}% precision, minimizing false positive alerts",
        "• Created explainable predictions with clear feature contributions",
        "• Identified key DNS patterns: entropy, length, and subdomain structure",
    ]),
    ("\n🎯 Security Value:", [
        "• Detects stealthy data exfiltration through DNS channels",
        "• Provides forensic evidence for security investigations",
        "• Enables real-time monitoring of DNS traffic",
        "• Supports compliance requirements for advanced threat detection",
    ]),
    ("\n🚀 Production Deployment:", [
        "1. Integrate with DNS monitoring infrastructure",
        "2. Set up real-time stream processing for DNS queries",
        "3. Configure SIEM integration for automated alerting",
        "4. Establish analyst workflows for investigating flagged queries",
        "5. Implement model monitoring and retraining pipeline",
    ]),
    ("\n⚡ Technical Advantages:", [
        "• Lightweight model suitable for high-speed DNS processing",
        "• Interpretable results reduce analyst investigation time",
        "• Balanced approach between sensitivity and specificity",
        "• Robust feature selection based on domain expertise",
    ]),
    ("\n🔬 Future Enhancements:", [
        "• Add temporal analysis for DNS query sequences",
        "• Incorporate domain reputation and threat intelligence",
        "• Extend to other covert channels (ICMP, HTTP headers)",
        "• Implement ensemble methods for improved robustness",
    ]),
]

for heading, entries in conclusion_sections:
    print(heading)
    for entry in entries:
        print(f"  {entry}")