In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("Final_IDS").getOrCreate()
sc = spark.sparkContext
spark

In [2]:
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline
from pyspark.sql.types import DoubleType
import pyarrow as pa
from pyspark.sql.types import DataType
from pyspark.sql.functions import col
from pyspark.sql.functions import split
from pyspark.sql.dataframe import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression

In [3]:
df_col = spark.read.csv('NUSW-NB15_features.csv', header="true", inferSchema=False)
#resolving the columns containing features to a list in the order they appear to use as headers for the dataframe
distinct_names = df_col.select(col("`No.`").cast("int"), "Name").distinct()
ordered_dataset_names = [row.Name for row in distinct_names.orderBy("`No.`").collect()]

In [4]:
dfs = []
csv_files = ["C:/users/python/UNSW-NB15_1.csv", "C:/users/python/UNSW-NB15_2.csv", "C:/users/python/UNSW-NB15_3.csv", "C:/users/python/UNSW-NB15_4.csv"]

for csv_file in csv_files:
    df = spark.read.option('header', False).csv(csv_file)
    renamed_df = df.toDF(*ordered_dataset_names)
    dfs.append(renamed_df)

def union_all(*dfss):
    return reduce(DataFrame.unionAll, dfss)


dataframe = union_all(*dfs)

In [5]:
dataframe = dataframe.na.fill('normal', ['attack_cat'])
dataframe = dataframe.na.fill('0', ['ct_flw_http_mthd'])
dataframe = dataframe.na.fill('0', ['is_ftp_login'])
dataframe = dataframe.withColumn("is_ftp_login", F.when(F.col("is_ftp_login") > 1, 1).otherwise(F.col("is_ftp_login")))
dataframe = dataframe.withColumn("service", F.when(F.col("service") == "-", None).otherwise(F.col("service")))
dataframe = dataframe.fillna("unknown", ["service"])
dataframe = dataframe.fillna("0", ["sport"])
dataframe = dataframe.fillna("0", ["dsport"])

In [6]:
#resolving the ip address columns from string to numeric type while preserving the value
dataframe = dataframe.withColumn("srcip_int",split(col("srcip"),"\.")[0]*16777216 +split(col("srcip"),"\.")[1]*65536+ split(col("srcip"),"\.")[2]*256 + split(col("srcip"),"\.")[3])
dataframe = dataframe.withColumn("dstip_int",split(col("dstip"),"\.")[0]*16777216 +split(col("dstip"),"\.")[1]*65536+ split(col("dstip"),"\.")[2]*256 + split(col("dstip"),"\.")[3])

In [7]:
columns_to_convert = ["dur", "dload", "sload", "sjit", "djit", "Sintpkt", "Dintpkt", "tcprtt", "synack", "ackdat"]
for col_name in columns_to_convert:
    dataframe = dataframe.withColumn(col_name, dataframe[col_name].cast("double"))

In [8]:
#specifying the columns to exclude from the transformation of dataframe from string type to double type
#'srcip','dstip' will be dropped so no need to transform that, 'srcip_int' and 'dstip_int' have been transformed to int type, the rest are nominal datatypes that will be transformed later user stringindexer
columns_to_exclude = ["srcip", "dstip", "srcip_int","dstip_int","proto", "state", "service","attack_cat", "dur", "dload", "sload", "sjit", "djit", "Sintpkt" , "Dintpkt","tcprtt", "synack", "ackdat"]

In [9]:
#performing stringindexing of the nominal datatypes
for col_name, col_type in dataframe.dtypes:
        if col_name not in columns_to_exclude:
            dataframe = dataframe.withColumn(col_name, dataframe[col_name].cast("int"))
columns_to_convert = ["proto", "state", "service", "attack_cat"]
indexers = [StringIndexer(inputCol="proto", outputCol="proto_index", handleInvalid="skip"), StringIndexer(inputCol="state", outputCol="state_index", handleInvalid="skip"),StringIndexer(inputCol="service", outputCol="service_index",  handleInvalid="skip"),StringIndexer(inputCol="attack_cat", outputCol="attackcat_index",  handleInvalid="skip")]
pipeline = Pipeline(stages=indexers)
dataframe_r = pipeline.fit(dataframe).transform(dataframe)

In [10]:
cols = ("srcip", "dstip", "proto","state","service", "attack_cat", "ct_ftp_cmd")

In [11]:
dataframe_d = dataframe_r.drop(*cols)

In [12]:
training, test = dataframe_d.randomSplit([0.8, 0.2], seed=123)

In [13]:
#attack cat is for multiclassification, srcip,dstip, sport,dsport, and the int versions of srcip and dstip are dropped
columns_to_drop = ('attackcat_index', 'srcip', 'dstip', 'sport', 'dsport', 'srcip_int', 'dstip_int')

In [14]:
training_d = training.drop(*map(str, columns_to_drop))

In [15]:
#standardizing the dataset and excluding the label
columns = training_d.columns
column_to_exclude = 'Label'

In [16]:
feature_columns = [col for col in columns if col != column_to_exclude]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features', handleInvalid='skip')
temptraining = assembler.transform(training_d)
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
training = scaler.fit(temptraining).transform(temptraining)

In [17]:
test_d = test.drop(*map(str, columns_to_drop))
temptest = assembler.transform(test_d)
test = scaler.fit(temptest).transform(temptest)

In [18]:
training = training.withColumnRenamed('Label', 'label')
test = test.withColumnRenamed('Label', 'label')

In [19]:
lr = LogisticRegression(labelCol='label', featuresCol='scaled_features')
lr_param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.fitIntercept, [False, True]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=lr_param_grid,
                           evaluator=BinaryClassificationEvaluator(),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)
model = tvs.fit(training)
best_model = model.bestModel
predictions = best_model.transform(test)
true_positives = predictions.filter("label = 1 AND prediction = 1").count()
false_positives = predictions.filter("label = 0 AND prediction = 1").count()
true_negatives = predictions.filter("label = 0 AND prediction = 0").count()
false_negatives = predictions.filter("label = 1 AND prediction = 0").count()
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1_score = 2 * (precision * recall) / (precision + recall)

false_alarm_rate = false_positives / (false_positives + true_negatives)
print("F1 Score:", f1_score)
print("False Alarm Rate:", false_alarm_rate)


F1 Score: 0.9613000204573553
False Alarm Rate: 0.012976256212037547


In [20]:
evaluator_auc = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol='rawPrediction')
auc_score = evaluator_auc.evaluate(predictions)
print("AUC Score:", auc_score)

AUC Score: 0.9963040710896558


In [21]:
rf = RandomForestClassifier(labelCol='label', featuresCol='scaled_features')
rf_param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 50, 100]) \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .build()

tvs_rf = TrainValidationSplit(estimator=rf,
                              estimatorParamMaps=rf_param_grid,
                              evaluator=BinaryClassificationEvaluator(),
                              trainRatio=0.8)

model_rf = tvs_rf.fit(training)
best_model_rf = model_rf.bestModel
predictions_rf = best_model_rf.transform(test)
true_positives = predictions_rf.filter("label = 1 AND prediction = 1").count()
false_positives = predictions_rf.filter("label = 0 AND prediction = 1").count()
true_negatives = predictions_rf.filter("label = 0 AND prediction = 0").count()
false_negatives = predictions_rf.filter("label = 1 AND prediction = 0").count()
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1_score = 2 * (precision * recall) / (precision + recall)
false_alarm_rate = false_positives / (false_positives + true_negatives)
print("rf_F1 Score:", f1_score)
print("rf_False Alarm Rate:", false_alarm_rate)
auc_score = evaluator_auc.evaluate(predictions_rf)
print("rf_AUC Score:", auc_score)

rf_F1 Score: 0.9854765071718733
rf_False Alarm Rate: 0.0044393608722708665
rf_AUC Score: 0.9998539982433695


In [22]:
from pyspark.ml.classification import LinearSVC

In [23]:
svm = LinearSVC(labelCol='label', featuresCol='scaled_features')
svm_param_grid = ParamGridBuilder() \
    .addGrid(svm.maxIter, [10, 100]) \
    .addGrid(svm.regParam, [0.01, 0.1, 0.5]) \
    .build()

tvs_svm = TrainValidationSplit(estimator=svm,
                               estimatorParamMaps=svm_param_grid,
                               evaluator=BinaryClassificationEvaluator(),
                               trainRatio=0.8)
model_svm = tvs_svm.fit(training)
best_model_svm = model_svm.bestModel
predictions_svm = best_model_svm.transform(test)
true_positives = predictions_svm.filter("label = 1 AND prediction = 1").count()
false_positives = predictions_svm.filter("label = 0 AND prediction = 1").count()
true_negatives = predictions_svm.filter("label = 0 AND prediction = 0").count()
false_negatives = predictions_svm.filter("label = 1 AND prediction = 0").count()
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1_score = 2 * (precision * recall) / (precision + recall)
false_alarm_rate = false_positives / (false_positives + true_negatives)
print("svm_F1 Score:", f1_score)
print("svm_False Alarm Rate:", false_alarm_rate)
auc_score = evaluator_auc.evaluate(predictions_svm)
print("svm_AUC Score:", auc_score)

svm_F1 Score: 0.9621809292685952
svm_False Alarm Rate: 0.013843969393389603
svm_AUC Score: 0.9964232647552888


In [25]:
from pyspark.ml.classification import DecisionTreeClassifier


In [26]:
dt = DecisionTreeClassifier(labelCol='label', featuresCol='scaled_features')
dt_param_grid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10, 15]) \
    .build()

tvs_dt = TrainValidationSplit(estimator=dt,
                              estimatorParamMaps=dt_param_grid,
                              evaluator=BinaryClassificationEvaluator(),
                              trainRatio=0.8)
model_dt = tvs_dt.fit(training)
best_model_dt = model_dt.bestModel
predictions_dt = best_model_dt.transform(test)
true_positives = predictions_dt.filter("label = 1 AND prediction = 1").count()
false_positives = predictions_dt.filter("label = 0 AND prediction = 1").count()
true_negatives = predictions_dt.filter("label = 0 AND prediction = 0").count()
false_negatives = predictions_dt.filter("label = 1 AND prediction = 0").count()
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1_score = 2 * (precision * recall) / (precision + recall)
false_alarm_rate = false_positives / (false_positives + true_negatives)
print("dt_F1 Score:", f1_score)
print("dt_False Alarm Rate:", false_alarm_rate)
auc_score = evaluator_auc.evaluate(predictions_dt)
print("dt_AUC Score:", auc_score)

dt_F1 Score: 0.9814850127579976
dt_False Alarm Rate: 0.002866082932344666
dt_AUC Score: 0.9994499606650452


In [27]:
from pyspark.ml.classification import GBTClassifier


In [28]:
gbt = GBTClassifier(labelCol='label', featuresCol='scaled_features')
gbt_param_grid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [5, 10]) \
    .addGrid(gbt.maxIter, [20, 50]) \
    .build()

tvs_gbt = TrainValidationSplit(estimator=gbt,
                               estimatorParamMaps=gbt_param_grid,
                               evaluator=BinaryClassificationEvaluator(),
                               trainRatio=0.8)

model_gbt = tvs_gbt.fit(training)
best_model_gbt = model_gbt.bestModel
predictions_gbt = best_model_gbt.transform(test)
true_positives = predictions_gbt.filter("label = 1 AND prediction = 1").count()
false_positives = predictions_gbt.filter("label = 0 AND prediction = 1").count()
true_negatives = predictions_gbt.filter("label = 0 AND prediction = 0").count()
false_negatives = predictions_gbt.filter("label = 1 AND prediction = 0").count()
precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
f1_score = 2 * (precision * recall) / (precision + recall)
false_alarm_rate = false_positives / (false_positives + true_negatives)
print("gbt_F1 Score:", f1_score)
print("gbt_False Alarm Rate:", false_alarm_rate)
auc_score = evaluator_auc.evaluate(predictions_gbt)
print("gbt_AUC Score:", auc_score)

gbt_F1 Score: 0.9839253892009734
gbt_False Alarm Rate: 0.0032122917269245266
gbt_AUC Score: 0.9998140903812993


In [29]:
dataframe_d = dataframe_r.drop(*cols)

In [30]:
training, test = dataframe_d.randomSplit([0.8, 0.2], seed=123)

In [31]:
columns_to_drop = ('Label', 'srcip', 'dstip', 'sport', 'dsport', 'srcip_int', 'dstip_int')

In [32]:
training_d = training.drop(*map(str, columns_to_drop))

In [33]:
#standardizing the dataset and excluding the label
columns = training_d.columns
column_to_exclude = 'attackcat_index'

In [34]:
feature_columns = [col for col in columns if col != column_to_exclude]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features', handleInvalid='skip')
temptraining = assembler.transform(training_d)
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
training = scaler.fit(temptraining).transform(temptraining)

In [35]:
test_d = test.drop(*map(str, columns_to_drop))
temptest = assembler.transform(test_d)
test = scaler.fit(temptest).transform(temptest)

In [36]:
training = training.withColumnRenamed('attackcat_index', 'label')
test = test.withColumnRenamed('attackcat_index', 'label')

In [37]:
dt = DecisionTreeClassifier(labelCol='label', featuresCol='scaled_features')
dt_param_grid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10, 15]) \
    .build()

tvs_dt = TrainValidationSplit(estimator=dt,
                              estimatorParamMaps=dt_param_grid,
                              evaluator=MulticlassClassificationEvaluator(),
                              trainRatio=0.8)
model_dt = tvs_dt.fit(training)
best_model_dt = model_dt.bestModel
predictions_gbt = best_model_dt.transform(test)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol= 'label')
f1_score = evaluator.evaluate(predictions_gbt, {evaluator.metricName: "f1"})
print("F1 Score:", f1_score)
far = 1.0 - evaluator.evaluate(predictions_gbt, {evaluator.metricName: "weightedPrecision"})
print("False Alarm Rate:", far)

F1 Score: 0.9663811897856818
False Alarm Rate: 0.028522615901736326


In [38]:
rf = RandomForestClassifier(labelCol='label', featuresCol='scaled_features')
rf_param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 50, 100]) \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .build()

tvs_rf = TrainValidationSplit(estimator=rf,
                              estimatorParamMaps=rf_param_grid,
                              evaluator=MulticlassClassificationEvaluator(),
                              trainRatio=0.8)

model_rf = tvs_rf.fit(training)
best_model_rf = model_rf.bestModel
predictions_rf = best_model_rf.transform(test)
f1_score = evaluator.evaluate(predictions_rf, {evaluator.metricName: "f1"})
print("F1 Score:", f1_score)
far = 1.0 - evaluator.evaluate(predictions_rf, {evaluator.metricName: "weightedPrecision"})
print("False Alarm Rate:", far)

F1 Score: 0.9713371120818522
False Alarm Rate: 0.02473790940764775
