In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

## Start Session

In [None]:
# Configure a session named "iot", then obtain it via getOrCreate().
session_builder = SparkSession.builder.appName("iot")
spark = session_builder.getOrCreate()

# Restrict Spark's log output in the notebook to ERROR-level messages.
spark.sparkContext.setLogLevel("ERROR")

## Read data

In [None]:
# Load the Parquet dataset that every later cell works from.
df = spark.read.format("parquet").load("feature_engineered.pq")

In [None]:
# Quick sanity check: print the first five rows of the loaded frame.
df.show(n=5)

In [None]:
# Numeric columns that are listed by name.
_connection_stats = [
    "duration",
    "orig_bytes",
    "resp_bytes",
    "orig_pkts",
    "orig_ip_bytes",
    "resp_pkts",
    "resp_ip_bytes",
]

# Windowed columns: every statistic exists for a "last_min" and a
# "last_30_mins" window, in that order.
_window_stats = [
    f"{stat}_last_{window}"
    for stat in (
        "source_ip_count",
        "source_port_count",
        "source_ip_avg_pkts",
        "source_ip_avg_bytes",
    )
    for window in ("min", "30_mins")
]

numerical_features = _connection_stats + _window_stats

# String-valued columns and the names of their indexed counterparts.
categorical_features = ["proto", "service", "conn_state", "history"]
categorical_features_indexed = [f"{name}_index" for name in categorical_features]

# Full, ordered list of model inputs: numeric columns first, then the
# indexed categorical columns.
input_features = numerical_features + categorical_features_indexed

In [None]:
# A category value must appear in more than this many rows to keep its own
# level; rarer values are folded into a shared "Other" bucket.
MIN_CATEGORY_COUNT = 100

categorical_valid_values = {}

# Start from the raw frame and accumulate the replacement for every
# categorical column. Rebuilding from `df` on each iteration would keep only
# the replacement of the last column processed.
df_fe = df

for c in categorical_features:
    # Find frequent values
    categorical_valid_values[c] = (
        df.groupby(c)
        .count()
        .filter(F.col("count") > MIN_CATEGORY_COUNT)
        .select(c)
        .toPandas()
        .values.ravel()
    )

    # Keep frequent values unchanged; everything else becomes "Other".
    is_frequent = F.col(c).isin(list(categorical_valid_values[c]))
    df_fe = df_fe.withColumn(
        c,
        F.when(is_frequent, F.col(c)).otherwise(F.lit("Other")),
    )

In [None]:
# Two successive 80/20 random splits with a fixed seed: the first holds out the
# test set, the second carves a validation set out of the remaining rows.
split_weights = [0.8, 0.2]
df_train_val, df_test = df_fe.randomSplit(weights=split_weights, seed=42)
df_train, df_val = df_train_val.randomSplit(weights=split_weights, seed=42)

## Hyperparameter Tuning

In [None]:
from tuning import tune_rf

In [None]:
# Integer-valued hyperopt search ranges for the random forest.
search_space = dict(
    numTrees=hp.uniformint("numTrees", 10, 500),
    maxDepth=hp.uniformint("maxDepth", 2, 10),
)

# Area under the ROC curve, computed against the "is_bad" label column.
roc = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="is_bad")

# Turn each categorical string column into its "<name>_index" counterpart.
ind = StringIndexer(
    handleInvalid="skip",
    inputCols=categorical_features,
    outputCols=categorical_features_indexed,
)

# Pack all model inputs into a single "features" vector column.
va = VectorAssembler(
    handleInvalid="skip",
    inputCols=input_features,
    outputCol="features",
)

# The cell that builds the final model reads "numTrees" and "maxDepth" from
# this result.
best_params = tune_rf(df_train, df_val, ind, va, roc, search_space)

In [None]:
# Final model configured with the hyperparameters selected during tuning.
# The seed is fixed (same value as the data splits) so the reported test score
# is reproducible across kernel restarts.
# The int() casts are defensive: the tuner may hand back float/numpy values
# -- TODO confirm what tune_rf actually returns.
best_rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="is_bad",
    numTrees=int(best_params["numTrees"]),
    maxDepth=int(best_params["maxDepth"]),
    seed=42,
)

# Keep the unfitted estimator and the fitted model under separate names so
# `best_pipeline` always refers to the fitted pipeline model.
pipeline = Pipeline(stages=[ind, va, best_rf])
best_pipeline = pipeline.fit(df_train)

# Score the held-out test split with the same evaluator used for tuning.
test_preds = best_pipeline.transform(df_test)

score = roc.evaluate(test_preds)
score

In [None]:
# Persist the fitted pipeline. `overwrite()` replaces any copy left by a
# previous run, so the notebook can be re-run end to end; a plain `save()`
# raises when the target path already exists.
best_pipeline.write().overwrite().save("best_pipeline")