In [1]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.ml import feature, evaluation, Pipeline
from pyspark.ml.classification import LogisticRegression

In [2]:
conf = (SparkConf()
            .setAppName('data_explore')
            .setMaster('spark://spark-master:7077')
       )
conf.set("spark.executor.memory", "6g")
conf.set("spark.driver.maxResultSize", "0")
conf.set("spark.sql.shuffle.partitions", "30")
conf.set("spark.driver.memory", "3g")

<pyspark.conf.SparkConf at 0x7f9c2a6ada58>

In [3]:
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
df = (spark.read.format('csv')
      .option('inferSchema', 'true')
      .option('header', 'true')
      .option('escape', '"')
      .load('hdfs://namenode:9000/data/labeled_process_logs.csv') 
      .dropna() # orginal dataset had 52,025 rows, this operation drops 682 rows
     )

In [5]:
df.show(5)

+--------------------+---------------+--------------------+-------------+--------------------+--------------------+--------+--------+--------------------+-----------+
|           timestamp|       hostname|              kernel|host_platform|        process_name|      parent_process|login_id|event_id|        command_line|class_label|
+--------------------+---------------+--------------------+-------------+--------------------+--------------------+--------+--------+--------------------+-----------+
|2019-11-02T01:22:...|DESKTOP-1QAE25C|10.0.18362.30 (Wi...|      windows|C:\Windows\System...|C:\Windows\System...|   0x3e5|    4688|C:\Windows\system...|          0|
|2019-11-02T01:22:...|DESKTOP-1QAE25C|10.0.18362.30 (Wi...|      windows|C:\Windows\System...|C:\Windows\System...|   0x3e5|    4688|C:\Windows\system...|          0|
|2019-11-02T01:22:...|DESKTOP-1QAE25C|10.0.18362.30 (Wi...|      windows|C:\Windows\System...|C:\Windows\System...|     0x0|    4688|  oobe\windeploy.exe|          0

## clean process,path fields and create command line tokens

In [6]:
from pyspark.sql.functions import regexp_extract, regexp_replace, col, count, split, size, to_date
clean_df = df.select(
    '*',
    regexp_extract(col('process_name'), r"(\w?:?.*\\?\w+\s?\w+\\)", 0).alias('process_path'),
    split(col('command_line'), '\s+(?=[-/]+)').alias('cmd_line_split')
).withColumn(
    'class_label', (col('class_label').cast('int'))
).withColumn(
    'trusted_path', (col('process_path').like('%C:\Program Files%') | col('process_path').like('%C:\Windows%') | col('process_path').like('%C:\PROGRA~2%') |col('process_path').like('%Program Files (x86)%')).cast('int')
).withColumn(
    '_timestamp', F.to_timestamp(col('timestamp'))
).withColumn(
    'process_name', regexp_extract(col('process_name'), r'(\w+.?\w+$)', 0)
).withColumn(
    'parent_process', regexp_extract(col('parent_process'), r'(\w+.?\w+$)', 0)
).withColumn(
    'command_line', regexp_replace(col('command_line'), r'["|\{}();=:]'," ")
).withColumn(
    'command_line', regexp_replace(col('command_line'), r'[(\s+\\\s+)]', " ")
).withColumn(
    'command_line', regexp_replace(col('command_line'), r'\s+\.\s+', "")  
).withColumn(
    'command_line', regexp_replace(col('command_line'), r':\/\/(\d+\.?)+:\d+', " ")  # remove ip addresss
).withColumn(
    'command_line', regexp_replace(col('command_line'), r'\/\/raw.*\.com(\/\w+)*.*\/', "")  # remove github url
).withColumn(
    'cmd_line_tokens', regexp_replace(col('command_line'), r'[\?\&]'," ")
).withColumn(
    'cmd_line_tokens', regexp_replace(col('command_line'), r'\d{10,}',"")  # long string of digits
).withColumn(
    'cmd_line_tokens', regexp_replace(col('command_line'), r'[C,]',"")
).withColumn(
    'cmd_line_tokens', split(col('command_line'), '\s+')
)

## cleaned command line tokens

In [7]:
@F.udf(returnType=ArrayType(StringType()))
def clean_array(l):
    arr = [x for x in l if x is not "," and x is not " "]
    return arr

In [8]:
clean_df.select('cmd_line_tokens').sample(.10).show(10, False)

+---------------------------------------------------------------------------------------------+
|cmd_line_tokens                                                                              |
+---------------------------------------------------------------------------------------------+
|[C, Windows, system32, svchost.exe, -k, LocalServiceNoNetwork, -p]                           |
|[C, Windows, system32, oobe, setup.exe]                                                      |
|[C, Windows, system32, svchost.exe, -k, LocalServiceNetworkRestricted, -p, -s, TimeBrokerSvc]|
|[C, Windows, System32, Upfc.exe, /launchtype, boot, /cv, 09o3CnnAskG8AMTNUwkQhQ.0]           |
|[C, Windows, System32, svchost.exe, -k, LocalServiceNetworkRestricted, -p, -s, EventLog]     |
|[C, Windows, system32, svchost.exe, -k, LocalService, -p, -s, SstpSvc]                       |
|[C, Windows, system32, svchost.exe, -k, netsvcs, -p, -s, LanmanServer]                       |
|[C, Windows, system32, svchost.exe, -k,

## create freq and tfidf pipeline

In [9]:
cv_pipeline = Pipeline(stages=[
    feature.CountVectorizer(minTF=1, minDF=3, inputCol='cmd_line_tokens', outputCol='tf')
]).fit(clean_df)

In [10]:
cv_pipeline.transform(clean_df).select('tf').show(20,truncate=False)

+--------------------------------------------------------------------------+
|tf                                                                        |
+--------------------------------------------------------------------------+
|(5362,[0,15,26,49,50,89,455],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])               |
|(5362,[0,15,26,49,50,89,91,155,843],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|(5362,[1420,4759],[1.0,1.0])                                              |
|(5362,[0,15,26,1420,2407],[1.0,1.0,1.0,1.0,1.0])                          |
|(5362,[0,15,31,49,50,89,91,140,685],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|(5362,[0,15,31,49,50,89,91,145,829],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|(5362,[0,15,26,49,50,89,91,140,828],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|(5362,[0,15,31,579,652,654,722],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])            |
|(5362,[0,15,26,49,50,91,145,764],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])       |
|(5362,[0,15,31,49,50,89,91,140,820],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|

In [11]:
len(cv_pipeline.stages[0].vocabulary)

5362

In [12]:
idf_pipeline = Pipeline(stages=[
    cv_pipeline, feature.IDF(inputCol='tf', outputCol='tfidf')
]).fit(clean_df)

In [13]:
tfidf_df = idf_pipeline.transform(clean_df)

## create logistic regression model

In [14]:
training_df, validation_df, testing_df = clean_df.randomSplit([0.6, 0.3, 0.1], seed=0)

In [15]:
[training_df.count(), validation_df.count(), testing_df.count()]

[30843, 15310, 5190]

In [16]:
lr = LogisticRegression(featuresCol='tfidf', labelCol='class_label')

In [17]:
lr_pipeline = Pipeline(stages=[
    idf_pipeline, 
    lr
]).fit(training_df)

In [18]:
lr_pipeline.transform(validation_df).\
    select(F.expr('float(prediction = class_label)').alias('correct')).\
    select(F.avg('correct')).show()

+------------+
|avg(correct)|
+------------+
|         1.0|
+------------+



In [19]:
import pandas as pd
vocabulary = idf_pipeline.stages[0].stages[0].vocabulary
weights = lr_pipeline.stages[-1].coefficients.toArray()
coeffs_df = pd.DataFrame({'token': vocabulary, 'weight': weights})

## the positive class is 1 (malicious) so the positive weights indicate a feature that contributes positively to being classified as malicious

In [20]:
coeffs_df.sort_values('weight', ascending=False).head(20)

Unnamed: 0,token,weight
271,/C,6.502964
641,wmic,3.980105
5104,calc,3.703205
144,-ExecutionPolicy,1.282716
148,Bypass,1.281894
147,-C,1.281894
2557,reg,1.023405
704,RegSvr32.sct,0.9517
702,/u,0.9517
688,scrobj.dll,0.9517


## example of malicious tokens

In [30]:
training_df.select('cmd_line_tokens').filter('class_label == 1').sample(.20).show(5, truncate=False)

+--------------------------------------------------------------------------------------+
|cmd_line_tokens                                                                       |
+--------------------------------------------------------------------------------------+
|[powershell.exe, -ExecutionPolicy, Bypass, -C, Remove-Item, -Path, staged, -recurse, ]|
|[cmd.exe, /C, arp, -a, ]                                                              |
|[cmd.exe, /C, net, share, ]                                                           |
|[powershell.exe, -ExecutionPolicy, Bypass, -C, Remove-Item, -Path, staged, -recurse, ]|
|[powershell.exe, -ExecutionPolicy, Bypass, -C, Remove-Item, -Path, staged, -recurse, ]|
+--------------------------------------------------------------------------------------+
only showing top 5 rows



## the negative class is 0 (benign) so the negative weights indicate a feature that contributes negatively to being classified as benign

In [21]:
coeffs_df.sort_values('weight').head(20)

Unnamed: 0,token,weight
0,C,-56.615276
1,,-7.581101
651,Wbem,-4.560307
645,WMIC.exe,-4.560307
15,Windows,-2.367351
123,taskhostw.exe,-2.011005
6,Files,-1.539467
7,Program,-1.539467
718,sihost.exe,-1.471196
329,whoami,-1.446494


## example of benign tokens

In [31]:
training_df.select('cmd_line_tokens').filter('class_label == 0').sample(.20).show(5, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|cmd_line_tokens                                                                                                                             |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|[C, Windows, servicing, TrustedInstaller.exe]                                                                                               |
|[, C, ProgramData, Microsoft, Windows, Defender, platform, 4.18.1910.4-0, NisSrv.exe, ]                                                     |
|[C, Windows, System32, RuntimeBroker.exe, -Embedding]                                                                                       |
|[C, Windows, winsxs, amd64_microsoft-windows-servicingstack_31bf3856ad364e35_10.0.18362.411_none_5f53d2d858cf8961, TiWorker.exe, -Embedding]|

## create another model with elastic net regularization to control overfitting and mitigate noisy data

In [22]:
en_lr = Pipeline(stages=[
    idf_pipeline,
    LogisticRegression(featuresCol='tfidf',labelCol='class_label',regParam=0.02,elasticNetParam=0.3)
]).fit(training_df)

In [23]:
validation_model = en_lr.transform(validation_df)

In [24]:
from pyspark.sql.functions import avg
validation_model.selectExpr('avg(float(prediction = class_label)) as prediction').show()

+------------------+
|        prediction|
+------------------+
|0.9998693664271718|
+------------------+



## re-examine the coefficients of the model to determine how well it learned the weights for the tokens

In [25]:
coefs = en_lr.stages[-1].coefficients.toArray()
word_coefs = pd.DataFrame({'token': cv_pipeline.stages[0].vocabulary, 'weights': coefs})

+ malicious tokens

In [26]:
word_coefs.sort_values('weights', ascending=False).head(15)

Unnamed: 0,token,weights
271,/C,0.783473
243,cmd.exe,0.497236
144,-ExecutionPolicy,0.435198
147,-C,0.43492
148,Bypass,0.43492
139,powershell.exe,0.394196
704,RegSvr32.sct,0.25158
688,scrobj.dll,0.25158
702,/u,0.25158
701,/i,0.25158


+ benign tokens

In [27]:
word_coefs.sort_values('weights', ascending=True).head(15)

Unnamed: 0,token,weights
0,C,0.0
3578,"1568,2869842192297148915,8002914160209687559,1...",0.0
3577,"1624,3163048634562658899,4908755355501109964,1...",0.0
3576,Microsoft.Pcd26229b#,0.0
3575,"1532,12424944089969324879,5506418820526302287,...",0.0
3574,4fc95f0b13ec2127b5cd6cebbc9e3a31,0.0
3573,"1552,16080360943918942232,14127310776218489240...",0.0
3572,270f73da430d947b2d1f419de547829d,0.0
3579,"1572,945730016226139338,4365584540048621087,13...",0.0
3571,Microsoft.ConfigCI.Commands.ni.dll,0.0


+ The 'C' is the directory path for all programs, not very informative here and should probably be removed 
+ The random numbers look to be from Microsoft Edge browser url srings, and other seemingly benign system processes--however hackers use similar encoding strategies for malicious code execution; therefore this feature shouldn't be ignored