In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pathlib import Path
import logging
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder

# Configure the root logger so that INFO-level messages (and above) are shown.
logging.basicConfig(level=logging.INFO)
# Notebook-level logger; load_arff_dataset reports the file it is loading through it.
logger = logging.getLogger("6.1_section")

## 1. Data Loading & Initial Setup

We start by importing necessary libraries and setting up logging for better debugging.

# NSL-KDD Network Intrusion Detection Analysis

## Overview
This notebook explores the NSL-KDD dataset for network intrusion detection. We analyze network traffic patterns to understand the characteristics that distinguish normal traffic from anomalies/attacks.

## Dataset Information
- **Source**: NSL-KDD dataset (improved version of KDD Cup 1999)
- **Purpose**: Binary classification (Normal vs Anomaly detection)
- **Features**: 41 network traffic features + 1 target variable
- **Classes**: 
  - `normal`: Legitimate network traffic
  - `anomaly`: Malicious network traffic/attacks

## Analysis Goals
1. **Data Loading & Preprocessing**: Load and prepare the dataset
2. **Exploratory Data Analysis**: Understand data distributions and patterns
3. **Correlation Analysis**: Identify features most correlated with attack detection
4. **Feature Engineering**: Prepare data for machine learning models

---

In [None]:
# Quick look at the raw comma-separated training file. It is read with
# header=None, so pandas labels the columns with integers. The frame gets its
# own name so it cannot be mistaken for the ARFF-backed `df` (named columns)
# that every later cell relies on, e.g. when cells are run out of order.
df_csv_preview = pd.read_csv('../datasets/NSL-KDD/KDDTrain+.txt', header=None)
df_csv_preview.head()

In [None]:
import arff

def load_arff_dataset(file_path: "Path | str") -> pd.DataFrame:
    """Load an ARFF file into a DataFrame named after the ARFF attributes.

    Args:
        file_path: Location of the ARFF file. A plain string is accepted as
            well as a ``Path``.

    Returns:
        A DataFrame built from the file's data section, with one column per
        entry of the file's attribute list (in that list's order).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Coerce up front so a caller passing a str does not fail on .exists()/.open().
    file_path = Path(file_path)
    # Lazy %-style arguments: the message is only formatted if INFO is enabled.
    logger.info("Loading dataset from %s", file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"Dataset file not found: {file_path}")

    with file_path.open('r', encoding='utf-8') as file:
        arff_data = arff.load(file)
        # The first element of each attribute entry is used as the column name.
        columns = [attr[0] for attr in arff_data['attributes']]
        data = arff_data['data']

    return pd.DataFrame(data, columns=columns)

# Read the training split from its ARFF file and show the resulting frame.
train_arff_path = Path('../datasets/NSL-KDD/KDDTrain+.arff')
df = load_arff_dataset(train_arff_path)
df

## 2. Dataset Loading

Loading the NSL-KDD dataset from ARFF format. The ARFF format includes metadata about the dataset structure, making it more reliable than plain CSV files.

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

## 3. Initial Data Exploration

Let's examine the dataset structure, data types, and basic statistics to understand what we're working with.

In [None]:
# First rows of the dataset (pandas shows five by default).
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
# Absolute number of records per class label.
print("Conteo por clase:")
class_counts = df["class"].value_counts()
print(class_counts)

# Same counts expressed as a fraction of all records.
print("Conteo relativo por clase:")
relative_class_counts = df["class"].value_counts(normalize=True)
print(relative_class_counts)

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
class_counts.plot(kind='bar', color=['skyblue', 'salmon'], ax=ax)
# The accented characters in the labels were mojibake (UTF-8 bytes decoded
# with a single-byte codec); they are restored here.
ax.set_title('Distribución de Clases en el Conjunto de Datos')
ax.set_xlabel('Clase')
ax.set_ylabel('Número de Instancias')
ax.tick_params(axis='x', labelrotation=0)
ax.grid(axis='y')
plt.show()

## 4. Class Distribution Analysis

Understanding the balance between normal and anomaly classes is crucial for:
- Choosing appropriate evaluation metrics
- Deciding on sampling strategies
- Understanding potential model bias

In [None]:
# Per-class views of the data; the protocol bar-chart cell reuses both frames.
normal_df = df.loc[df['class'] == 'normal']
anomaly_df = df.loc[df['class'] == 'anomaly']

variables = ['duration', 'src_bytes', 'dst_bytes', "hot", "num_failed_logins"]
n_variables = len(variables)
n_columns = 3
n_rows = int(np.ceil(n_variables / n_columns))

# Accented characters restored (they were mojibake in the original strings).
print(f"Número de variables a graficar: {n_variables}")
print(f"Número de columnas: {n_columns}")
print(f"Número de filas: {n_rows}")

bins = 30
# squeeze=False keeps `axes` two-dimensional even for a single-row grid, so the
# cell keeps working when `variables` holds n_columns entries or fewer.
fig, axes = plt.subplots(n_rows, n_columns, figsize=(12, 4 * n_rows), squeeze=False)
flat_axes = axes.ravel()  # row-major: panel i sits at (i // n_columns, i % n_columns)
for ax, var in zip(flat_axes, variables):
    normal_df[var].hist(ax=ax, label='Normal', alpha=0.5, bins=bins)
    anomaly_df[var].hist(ax=ax, label='Anomaly', alpha=0.5, bins=bins)
    ax.set_title(f'Distribución de {var}')
    ax.set_xlabel(var)
    # The histograms show raw counts (density is not enabled), so the axis is
    # labelled as a frequency rather than a density.
    ax.set_ylabel('Frecuencia')
    ax.legend()
    # Explicit True: a bare ax.grid() toggles the current state, and pandas'
    # hist() is expected to have switched the grid on already.
    ax.grid(True)
    # Log scale so sparsely populated bins remain visible next to the dominant ones.
    ax.set_yscale('log')
# Hide the panels of the grid that have no variable assigned to them.
for unused_ax in flat_axes[n_variables:]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

## 5. Feature Distribution Comparison

Analyzing how key numerical features differ between normal and anomaly classes. This helps us understand:
- Which features show clear separation between classes
- The nature of the data distributions
- Potential outliers and data quality issues

In [None]:
# Header text had a mojibake "ó"; restored.
print("Distribución de protocolos por clase:")
# Relative frequency of each protocol within every class: one row per class,
# one column per protocol.
protocol_by_class = (
    df.groupby('class')['protocol_type']
    .value_counts(normalize=True)
    .unstack()
)
print(protocol_by_class)

## 6. Categorical Feature Analysis

Examining categorical features like `protocol_type` to understand:
- How different protocols relate to attack patterns
- The distribution of network protocols in our dataset
- Protocol-specific attack patterns

In [None]:
# Side-by-side protocol counts for the normal and the anomaly records.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
normal_protocols = normal_df['protocol_type'].value_counts()
anomaly_protocols = anomaly_df['protocol_type'].value_counts()

# One entry per panel so the shared formatting is written only once.
panels = [
    (axes[0], normal_protocols, 'skyblue', 'Protocolos en Clase Normal'),
    (axes[1], anomaly_protocols, 'salmon', 'Protocolos en Clase Anomaly'),
]
for ax, counts, color, title in panels:
    counts.plot(kind='bar', ax=ax, color=color)
    ax.set_title(title)
    ax.set_xlabel('Protocolo')
    # Accented "ú" restored (it was mojibake in the original label).
    ax.set_ylabel('Número de Instancias')
    ax.grid(axis='y')
plt.show()

In [None]:
# Work on a copy: encode the target as 0/1 and drop the text-valued columns.
df_corr = df.copy()
df_corr["class_num"] = df_corr["class"].map({"normal": 0, "anomaly": 1})
df_corr = df_corr.drop(columns=["class", "protocol_type", "service", "flag"])

top_n = 5
# Rank the features by |correlation| with the target. The target is removed
# before ranking: its self-correlation is always 1, so leaving it in would
# consume one of the `top_n` slots and show only top_n - 1 real features.
target_corr = df_corr.corr()["class_num"].drop("class_num")
top_features = target_corr.abs().sort_values(ascending=False).head(top_n).index.to_list()
# Put the target first so its row/column leads the heatmap.
correlation_matrix = df_corr[["class_num"] + top_features].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap="RdBu_r", annot=True, center=0, linewidths=0.5, fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.setp(ax.get_yticklabels(), rotation=0)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
# Lay out after rotating the tick labels so the rotated text is accounted for.
fig.tight_layout()
plt.show()


### Key Correlation Insights

From the correlation matrix above, we can observe:

**🔴 Strong Positive Correlations** (Red - Higher values → More likely anomaly):
- **Error-related features**: `serror_rate`, `dst_host_serror_rate`, etc.
- **Interpretation**: Anomalies are associated with network errors and failed connections

**🔵 Strong Negative Correlations** (Blue - Higher values → More likely normal):
- **Service consistency features**: `same_srv_rate`, `dst_host_same_srv_rate`
- **Login features**: `logged_in` status
- **Interpretation**: Normal traffic shows consistent patterns and successful authentication

**🟡 Feature Redundancy**:
- Multiple error rate features show high correlation (0.95+) with each other
- Suggests we could potentially reduce dimensionality without losing information

## 8. Correlation Analysis

**Objective**: Identify which features have the strongest correlation with our target variable (`class`).

This analysis helps us understand:
- **Feature Importance**: Which features are most predictive
- **Redundancy**: Features that provide similar information  
- **Attack Patterns**: What network characteristics indicate anomalies

**Methodology**:
1. Convert categorical target to numeric: `normal=0, anomaly=1`
2. Remove non-numeric features that can't be correlated
3. Focus on top correlated features for visualization

In [None]:
# Columns fed to the preprocessing pipeline, grouped by how they are treated.
numeric_features = ["duration", "src_bytes", "dst_bytes"]
categorical_features = ["protocol_type"]
binary_features = ["class"]

# Scale the numeric columns and one-hot encode the categorical one; columns not
# listed in a transformer are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('n', StandardScaler(), numeric_features),
        # drop='first' removes one indicator column per categorical feature;
        # sparse_output=False requests a dense array.
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ],
    remainder='drop'
)

X = df.loc[:, numeric_features + categorical_features]
y = df[binary_features]

# Fit first: get_feature_names_out() is only available on a fitted transformer.
X_transformed_values = preprocessor.fit_transform(X)
# Carry over the row index of X so the features stay aligned with `y` even if
# `df` no longer has a default 0..n-1 index (e.g. after filtering rows).
X_transformed = pd.DataFrame(
    X_transformed_values,
    columns=preprocessor.get_feature_names_out(),
    index=X.index,
)
X_transformed


## 9. Feature Engineering & Preprocessing

**Purpose**: Prepare the data for machine learning models by:
- **Scaling numerical features**: Ensure all features have similar ranges
- **Encoding categorical features**: Convert text categories to numerical format
- **Creating a clean feature matrix**: Ready for ML algorithms

**Preprocessing Pipeline**:
1. **StandardScaler**: Normalize numerical features (mean=0, std=1)
2. **OneHotEncoder**: Convert categorical variables to binary columns
3. **Drop first**: Avoid multicollinearity in categorical encoding

## Summary & Next Steps

### Key Findings from EDA:

1. **Dataset Characteristics**:
   - 125,973 samples with 42 columns (41 features + 1 target)
   - Binary classification: Normal vs Anomaly
   - Mixed data types: numerical + categorical

2. **Class Distribution**:
   - Dataset shows class imbalance (**TODO**: confirm with the relative class counts from Section 4 and report the actual percentages)
   - Important for model selection and evaluation metrics

3. **Feature Insights**:
   - **Error rates** are strong predictors of anomalies
   - **Service consistency** patterns distinguish normal traffic
   - **Protocol type** shows different attack patterns

4. **Data Quality**:
   - No missing values detected
   - Some features show high correlation (potential redundancy)

### Recommended Next Steps:

1. **Feature Selection**: 
   - Use correlation insights to select most predictive features
   - Consider removing highly correlated features

2. **Model Development**:
   - Start with simple models (Logistic Regression, Decision Trees)
   - Progress to ensemble methods (Random Forest, XGBoost)

3. **Evaluation Strategy**:
   - Use appropriate metrics for imbalanced classes
   - Implement cross-validation for robust evaluation

4. **Advanced Analysis**:
   - Explore specific attack types (if available in data)
   - Time-series analysis for temporal patterns