In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import arff
import os
from pathlib import Path
import logging

# Notebook-wide logger used by the helper functions defined in later cells.
logger = logging.getLogger(name="6.1_section")

# Ask the root logger for INFO-level output. basicConfig() leaves an already
# configured root logger untouched, so this is safe to re-run.
logging.basicConfig(level=logging.INFO)

In [None]:
# Preview of the plain-text export of the training split. It is read with
# header=None, so pandas assigns integer column labels.
# NOTE: `df` is rebound by the ARFF loading cell further down; this frame is
# only used for the preview below.
df = pd.read_csv(
    '../datasets/NSL-KDD/KDDTrain+.txt',
    header=None,
)
df.head()

In [None]:
import arff

def load_arff_dataset(file_path: Path) -> pd.DataFrame:
    """Read an ARFF file into a pandas DataFrame.

    Args:
        file_path: Location of the ARFF file. It is opened as UTF-8 text.

    Returns:
        A DataFrame built from the parsed ``data`` entry, with one column per
        entry of the parsed ``attributes`` list (named after each entry's
        first element).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    logger.info(f"Loading dataset from {file_path}")

    # Fail early with an explicit message rather than relying on open().
    if not file_path.exists():
        raise FileNotFoundError(f"Dataset file not found: {file_path}")

    with file_path.open('r', encoding='utf-8') as handle:
        parsed = arff.load(handle)

    # Each attribute entry starts with the attribute name.
    column_names = [attribute[0] for attribute in parsed['attributes']]
    return pd.DataFrame(parsed['data'], columns=column_names)

# Load the ARFF version of the training split; every later cell that reads
# `df` works on this frame.
df = load_arff_dataset(
    file_path=Path('../datasets/NSL-KDD/KDDTrain+.arff'),
)

In [None]:
# Print pandas' concise summary of the frame returned by load_arff_dataset.
df.info()

In [None]:
# Show the first rows of the ARFF-loaded frame.
df.head()

In [None]:
# Summary statistics of the ARFF-loaded frame, using describe()'s defaults.
df.describe()

In [None]:
# Absolute and relative frequency of every label in the `class` column.
class_labels = df["class"]

print("Conteo por clase:")
class_counts = class_labels.value_counts()
print(class_counts)

print("Conteo relativo por clase:")
relative_class_counts = class_labels.value_counts(normalize=True)
print(relative_class_counts)

# Draw on an explicit Axes instead of pyplot's implicit "current" figure so
# the cell does not depend on which figure happens to be active.
class_fig, class_ax = plt.subplots(figsize=(10, 6))
class_counts.plot(kind='bar', color=['skyblue', 'salmon'], ax=class_ax, rot=0)
class_ax.set_title('Distribución de Clases en el Conjunto de Datos')
class_ax.set_xlabel('Clase')
class_ax.set_ylabel('Número de Instancias')
# Pass the visibility flag explicitly: grid(axis='y') without it *toggles*
# the grid, so the result would depend on the active rcParams/style.
class_ax.grid(True, axis='y')
plt.show()

In [None]:
# Split the records by label so each class is drawn as its own histogram.
# `normal_df` and `anomaly_df` are reused by the protocol bar-chart cell.
normal_df = df.loc[df['class'] == 'normal']
anomaly_df = df.loc[df['class'] == 'anomaly']

variables = ['duration', 'src_bytes', 'dst_bytes', "hot", "num_failed_logins"]
n_variables = len(variables)
n_columns = 3
n_rows = int(np.ceil(n_variables / n_columns))

print(f"Número de variables a graficar: {n_variables}")
print(f"Número de columnas: {n_columns}")
print(f"Número de filas: {n_rows}")

bins = 30
# squeeze=False keeps `axes` two-dimensional even when a single row is
# enough, so the layout also works for three or fewer variables.
fig, axes = plt.subplots(
    n_rows, n_columns, figsize=(12, 4 * n_rows), squeeze=False
)
flat_axes = axes.ravel()
for ax, var in zip(flat_axes, variables):
    # Use one set of bin edges for both classes. Binning each class over its
    # own value range would give bars of different widths and positions,
    # which makes the overlaid histograms impossible to compare.
    pooled_values = pd.concat([normal_df[var], anomaly_df[var]]).dropna().to_numpy()
    bin_edges = np.histogram_bin_edges(pooled_values, bins=bins)

    normal_df[var].hist(ax=ax, label='Normal', alpha=0.5, bins=bin_edges)
    anomaly_df[var].hist(ax=ax, label='Anomaly', alpha=0.5, bins=bin_edges)
    ax.set_title(f'Distribución de {var}')
    ax.set_xlabel(var)
    # The bars are raw counts (hist is not called with density=True), so the
    # axis is labelled as a number of instances rather than a density.
    ax.set_ylabel('Número de Instancias')
    ax.legend()
    # Series.hist already enables the grid; a bare ax.grid() would toggle it
    # back off, so the visibility is stated explicitly.
    ax.grid(True)
    ax.set_yscale('log')

# Hide the grid cells that have no variable assigned to them.
for unused_ax in flat_axes[n_variables:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Share of each protocol within every class (each row sums to 1).
print("Distribución de protocolos por clase:")
protocol_by_class = (
    df.groupby('class')['protocol_type']
    .value_counts(normalize=True)
    # A protocol that never occurs in a class has proportion 0, not NaN.
    .unstack(fill_value=0)
)
print(protocol_by_class)

In [None]:
# Protocol usage per class, based on the per-class frames created in the
# histogram cell (`normal_df`, `anomaly_df`).
normal_protocols = normal_df['protocol_type'].value_counts()
anomaly_protocols = anomaly_df['protocol_type'].value_counts()

# value_counts() orders by frequency, so the two panels could list the
# protocols in different orders. Align both to one shared order so bars at
# the same x position refer to the same protocol.
protocol_order = normal_protocols.index.union(anomaly_protocols.index)
normal_protocols = normal_protocols.reindex(protocol_order, fill_value=0)
anomaly_protocols = anomaly_protocols.reindex(protocol_order, fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = [
    (axes[0], normal_protocols, 'skyblue', 'Protocolos en Clase Normal'),
    (axes[1], anomaly_protocols, 'salmon', 'Protocolos en Clase Anomaly'),
]
for ax, protocol_counts, bar_color, panel_title in panels:
    protocol_counts.plot(kind='bar', ax=ax, color=bar_color)
    ax.set_title(panel_title)
    ax.set_xlabel('Protocolo')
    ax.set_ylabel('Número de Instancias')
    # Explicit visibility: grid(axis='y') alone toggles the current state.
    ax.grid(True, axis='y')

plt.tight_layout()
plt.show()