In [1]:
import keras
from tensorflow.keras import backend
import pandas as pd
import re
import os

Using TensorFlow backend.


In [2]:
def read_data(direc):
    df_list = []
    for f in os.listdir(direc):
        if f.endswith('.csv'):
            print('reading ...', f)
            df = pd.read_csv(direc+'/'+f)
            df_list.append(df)
    print('Joining all together ...')
    df_full = pd.concat(df_list, ignore_index=1)
    print('Done.')
    return df_full

In [98]:
df = read_data(r'C:\Users\csob\OneDrive - Chevron\powershell_data')

reading ... 20191230-09.csv
reading ... data20200102-17.csv
reading ... data20200106-12.csv
reading ... data20200107-14.csv
reading ... data20200107-15.csv
reading ... data20200107-16.csv
reading ... data20200107-18.csv
reading ... data20200108-09.csv
reading ... data20200108-14.csv
reading ... data20200108-15.csv
reading ... data20200108-16.csv
reading ... data20200109-09.csv
reading ... data20200109-10.csv
reading ... data20200109-15.csv
Joining all together ...
Done.


In [4]:
df_sample=df.sample(100000, replace=False)
del df

In [5]:
df_sample['hash']=df_sample['scripts'].apply(hash)

In [6]:
df_sample = df_sample.drop_duplicates(subset='hash')

In [7]:
def clean_script_block(x):
    block = x.split(':')[0]
    if 'of' in block:
        x = x.split(':')[1:]
        x = ' '.join(x)
    x = x.replace('\n',' ')
    x = x.replace('\t',' ')
    #replace all numbers with *
    regex = re.compile('[0-9]')
    x = regex.sub('*', x)
    #replace all chars not in a-z, A-Z, *(numbers replaced with *) and $ with empty spaces
    regex = re.compile('[^a-zA-Z*$-]')
    x = regex.sub(' ', x)
    #replace all upper case with lower case
    x = x.lower()
    x = ' '.join(x.split())
    return x

In [8]:
df_sample['scripts_clean']=df_sample['scripts'].apply(lambda x: clean_script_block(x))

In [9]:
scripts=list(df_sample['scripts_clean'])

In [10]:
word_token=keras.preprocessing.text.Tokenizer(oov_token='UNK', num_words=20000)

In [11]:
word_token.fit_on_texts(scripts)

In [12]:
len(word_token.word_index)

20528

In [13]:
embeddings = word_token.texts_to_sequences(scripts)

In [14]:
max_len=max([len(e) for e in embeddings]);max_len

3136

In [15]:
max_len=200

In [16]:
padded_embeddings = keras.preprocessing.sequence.pad_sequences(embeddings, maxlen=max_len, padding='post')

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [88]:
scaler=MinMaxScaler()
scaled_data=scaler.fit_transform(padded_embeddings)
X_train, X_test = train_test_split(scaled_data ,test_size=0.2, random_state=43)

In [89]:
from keras.models import Model, Sequential
from keras.layers import Dense, Input, LSTM, Embedding, RepeatVector
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers

In [90]:
input_dim=padded_embeddings.shape[1]
encoding_dim=56
hidden_dim=int(encoding_dim/2)
hidden_dim_2=int(encoding_dim/4)
hidden_dim_3=int(encoding_dim/8)
epochs=30
batch_size=32
learning_rate=0.1

In [157]:
nb_epoch = 30
batch_size = 32
learning_rate = 0.1
input_layer = Input(shape=(input_dim, ))
encoder = Dense(encoding_dim, activation="tanh", activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoder = Dense(hidden_dim, activation="relu")(encoder)
encoder = Dense(hidden_dim_2, activation='relu')(encoder)
encoder = Dense(hidden_dim_3, activation='relu')(encoder)
decoder = Dense(hidden_dim_3, activation='relu')(encoder)
decoder = Dense(hidden_dim_2, activation='relu')(decoder)
decoder = Dense(hidden_dim, activation='relu')(decoder)
decoder = Dense(encoding_dim, activation='relu')(decoder)
decoder = Dense(input_dim, activation='tanh')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

In [158]:
autoencoder.summary()

Model: "model_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_22 (InputLayer)        (None, 200)               0         
_________________________________________________________________
dense_69 (Dense)             (None, 56)                11256     
_________________________________________________________________
dense_70 (Dense)             (None, 28)                1596      
_________________________________________________________________
dense_71 (Dense)             (None, 14)                406       
_________________________________________________________________
dense_72 (Dense)             (None, 7)                 105       
_________________________________________________________________
dense_73 (Dense)             (None, 7)                 56        
_________________________________________________________________
dense_74 (Dense)             (None, 14)                112

In [159]:
autoencoder.compile(optimizer='adam', 
                    loss='mean_squared_error', 
                    metrics=['mse'])

In [None]:
autoencoder.fit(X_train, X_train,
                epochs=epochs,
                batch_size=batch_size,
                shuffle=True,
                validation_data=(X_test, X_test),
                verbose=1)

Train on 79802 samples, validate on 19951 samples
Epoch 1/30

In [99]:
autoencoder.save('autoencoder.h5')

In [100]:
import joblib
joblib.dump(word_token, 'word_token.pkl')
joblib.dump(scaler, 'min_max_scaler.pkl')

['min_max_scaler.pkl']

In [101]:
test_data=df.sample(20000, replace=False)
del df

In [103]:
test_scripts=list(test_data['scripts'])
del test_data

In [104]:
test_tokens = word_token.texts_to_sequences(test_scripts)

In [106]:
test_padded = keras.preprocessing.sequence.pad_sequences(test_tokens, maxlen=max_len, padding='post')

In [134]:
def data_prep(data):
    clean_data = [clean_script_block(s) for s in data]
    embeddings = word_token.texts_to_sequences(clean_data)
    padded = keras.preprocessing.sequence.pad_sequences(embeddings, maxlen=max_len, padding='post')
    scaled_data = scaler.transform(padded)
    return scaled_data

In [135]:
test=data_prep(test_scripts)

In [138]:
predicted=autoencoder.predict(test)

In [139]:
import numpy as np
mse = np.mean(np.power(scaled_data - predicted, 2), axis=1)

In [140]:
mse

array([3.96752827e-04, 2.36310445e-05, 1.05877488e-05, ...,
       1.00342790e-05, 7.73146450e-05, 6.62749999e-05])

In [141]:
df=pd.DataFrame()

In [142]:
df['scripts']=test_scripts

In [143]:
df['mse']=mse

In [144]:
df.head()

Unnamed: 0,scripts,mse
0,"Creating Scriptblock text (2 of 3):\n\nme},\n\...",0.000397
1,Creating Scriptblock text (1 of 1):\n\n\n\n#re...,2.4e-05
2,Creating Scriptblock text (4 of 5):\n\netName=...,1.1e-05
3,Creating Scriptblock text (1 of 1):\n\n{ Set-S...,7.6e-05
4,Creating Scriptblock text (4 of 4):\n\nss {\n\...,1.4e-05


In [153]:
mse_threshold = np.quantile(df['mse'], 0.9998)
print(f'MSE 0.9998 thresholde:{mse_threshold}')

MSE 0.9998 thresholde:0.03513053949470924


In [154]:
df['outlier'] = 0
df.loc[df['mse'] > mse_threshold, 'outlier'] =1

In [155]:
df['outlier'].sum()

4

In [156]:
for i in df['scripts'].loc[df['outlier']==1]:
    print(i)

Creating Scriptblock text (1 of 1):

# Copyright © 2008, Microsoft Corporation. All rights reserved.



trap {break}



# Include common library

. .\CL_Utility.ps1



Import-LocalizedData -BindingVariable localizationString -FileName CL_LocalizationData



# function library

# Function to check whether have unnecessary files

function Test-UnnecessaryFiles([string]$folder = $(throw "No folder is specified")) {

    if([String]::IsNullOrEmpty($folder) -or (-not(Test-Path $folder))) {

        return $false

    }



    [int]$threshold = -1

    $folders = Get-ChildItem -literalPath $folder -Recurse -Force | Where-Object {($_.PSIsContainer) -and ((($_.CreationTime).CompareTo((Get-Date).AddMonths($threshold))) -lt 0)}



    return ($folders -ne $null)

}



# Check troubleshooting history

Write-DiagProgress -Activity $localizationString.CheckTSHistory



[string]$userTSHistoryPath = Get-UserTSHistoryPath

[string]$adminTSHistoryPath = Get-AdminTSHistoryPath

[double]$userTSHistorySiz