# Importacion de Bibliotecas

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import hdbscan

url_dataset = "../../Dataset/target/access_log_master_manual_labeling.csv"
url_dataset_modified= "../../Dataset/target/access_log_master_automatic_label.csv"

# Cargar dataset y definicion de variables 

In [3]:
Day_to_analyze=["2025-11-19","2025-11-20","2025-11-21"]

df_original = pd.read_csv(url_dataset)
df_original["anomaly"] = df_original["anomaly"].replace({-1: np.nan})
df= df_original.copy()
mask = df["timestamp"].astype(str).str.startswith(tuple(Day_to_analyze))
df = df[mask]
# Aseguramos NaN en no etiquetados

FEATURE_COLUMNS = [
    "status",
    "size",
    "status_category",

    "url__count_sql_words",
    "url__count_xss_words",
    "url__count_command_words",
    "url__count_auth_words",
    "url__count_error_words",
    "url__count_malware_words",
    "url__count_danger_characters",
    "url__count_obfuscation_code_words",
    "url__count_dir_words",
    "url__count_dot",
    "url__count_http",
    "url__count_percentage_symbol",
    "url__count_question_symbol",
    "url__count_hyphen",
    "url__count_equal",
    "url__url_length",
    "url__digit_count",
    "url__letter_count",
    "url__count_special_characters",
    "url__is_encoded",
    "url__unusual_character_ratio",
]


# Preprocesamiento

In [11]:
X = df[FEATURE_COLUMNS]
y = df["anomaly"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Clustering con HDBSCAN

In [None]:
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=30,
    min_samples=10,
    metric="euclidean"
)

cluster_labels = clusterer.fit_predict(X_scaled)
df["cluster"] = cluster_labels


# Funcion de Propagacion de etiquetas

In [None]:
def propagate_labels(df, label_col="anomaly", cluster_col="cluster", threshold=0.7):
    df = df.copy()
    new_labels = 0

    for cluster_id in df[cluster_col].unique():
        if cluster_id == -1:
            continue  # ruido

        cluster_data = df[df[cluster_col] == cluster_id]
        labeled = cluster_data[cluster_data[label_col].notna()]

        if len(labeled) == 0:
            continue

        label_counts = labeled[label_col].value_counts(normalize=True)
        dominant_label = label_counts.idxmax()
        confidence = label_counts.max()

        if confidence >= threshold:
            mask = (
                (df[cluster_col] == cluster_id) &
                (df[label_col].isna())
            )
            assigned = mask.sum()
            df.loc[mask, label_col] = dominant_label
            new_labels += assigned

    return df, new_labels


# Propagacion de etiquetas 

In [None]:
MAX_ITERS = 5

for i in range(MAX_ITERS):
    df, assigned = propagate_labels(df, threshold=0.75)
    print(f"Iteración {i+1}: {assigned} nuevas etiquetas")
    if assigned == 0:
        break


In [None]:
df["anomaly"].value_counts(dropna=False)
df.to_csv(url_dataset_modified, index=False)

# Probar la consistencia del Metodo

In [5]:
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split

# Solo datos con etiqueta original
labeled_df = df_original[df_original["anomaly"].notna()].copy()

# Ocultamos el 30%
train_idx, test_idx = train_test_split(
    labeled_df.index,
    test_size=0.3,
    stratify=labeled_df["anomaly"],
    random_state=42
)

df_masked = df_original.copy()
df_masked.loc[test_idx, "anomaly"] = np.nan



# Comparar Resultados 

In [8]:
y_true = df_original.loc[test_idx, "anomaly"]
y_pred = df_masked.loc[test_idx, "anomaly"]

print(classification_report(y_true, y_pred))


  return x.astype(dtype=dtype, copy=copy)


ValueError: Input y_pred contains NaN.