## Word Embedding

In [1]:
## import libraries
import joblib
import numpy as np
import pandas as pd
import spacy
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

#### Install the spaCy large model if you are using it for the first time; be patient, the download can take some time

In [2]:
# !python -m spacy download en_core_web_lg 

In [3]:
# Load the large English spaCy pipeline once; it is reused below for both
# text preprocessing (stop words, lemmas) and the `.vector` embeddings.
nlp = spacy.load("en_core_web_lg")

#### Loading train and test data

In [4]:
# Read the labeled log samples; later cells rely on its `log_line` and `label` columns.
data_log= pd.read_csv('./SAMPLE_DATA/labeled-encoded-data-samples/may_jun_jul_2021.csv')


In [5]:
def balance_data(data, safe_ratio=1.5, random_state=2024):
    """Down-sample the safe class to a fixed multiple of the suspicious class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``label`` column in which 1 marks suspicious rows and
        0 marks safe rows.
    safe_ratio : float, default 1.5
        How many safe rows to keep per suspicious row. The default keeps the
        original behaviour (suspicious count plus half of it).
    random_state : int, default 2024
        Seed passed to ``DataFrame.sample`` so the selection is repeatable.

    Returns
    -------
    pandas.DataFrame
        The sampled safe rows followed by the (shuffled) suspicious rows, with
        the original index values preserved. If ``data`` contains no
        suspicious rows the result has zero rows.
    """
    labels = data["label"]
    safe_rows = data[labels == 0]
    suspicious_rows = data[labels == 1]

    n_suspicious = len(suspicious_rows)
    # Sampling without replacement cannot return more rows than exist, so cap
    # the request at the number of safe rows instead of letting sample() raise.
    n_safe = min(int(n_suspicious * safe_ratio), len(safe_rows))

    df_safe = safe_rows.sample(n_safe, random_state=random_state)
    df_suspicious = suspicious_rows.sample(n_suspicious, random_state=random_state)
    return pd.concat([df_safe, df_suspicious], axis=0)


In [6]:
# Build the working dataset with balance_data: the label==1 rows plus a
# down-sampled set of label==0 rows.
data_log_balanced=balance_data(data_log)

In [7]:
# Sanity check: per-class row counts after balancing.
data_log_balanced.label.value_counts()

label
0    4033
1    2689
Name: count, dtype: int64

In [8]:
# Utility used to clean text before it is embedded.
def preprocess(text):
    """Return ``text`` with stop words and punctuation removed and every
    remaining token replaced by its lemma, joined by single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return ' '.join(kept_lemmas)

In [9]:
# Add a "preprocessed_log_line" column holding the cleaned form of each raw log line.
data_log_balanced['preprocessed_log_line'] = data_log_balanced['log_line'].apply(preprocess)

In [10]:
# Preview the first few cleaned log lines.
data_log_balanced['preprocessed_log_line'].head()

81269     54.36.148.59 06 Jul/2021:00:51:26 -0700 /self....
40563     85.208.98.51 31 May/2021:22:52:16 -0700 /datas...
45812     157.55.39.45 05 Jun/2021:03:30:46 -0700 /self....
105414    49.7.20.120 26 Jul/2021:12:35:29 -0700 http/1....
55681     40.77.167.38 13 Jun/2021:04:06:18 -0700 /self....
Name: preprocessed_log_line, dtype: object

#### Get spacy embeddings for each preprocessed text

In [11]:
# Create a new column "vector" that stores the spaCy document vector (`Doc.vector`)
# of each preprocessed log line; these vectors are the model features.
data_log_balanced['vector'] = data_log_balanced['preprocessed_log_line'].apply(lambda text: nlp(text).vector) 

In [12]:
# Columns worth inspecting together: cleaned text, its embedding, and the target label.
selected_features = ['preprocessed_log_line','vector','label']
data_log_balanced[selected_features].head()

Unnamed: 0,preprocessed_log_line,vector,label
81269,54.36.148.59 06 Jul/2021:00:51:26 -0700 /self....,"[0.14392935, -1.8637211, -0.040199377, 0.30865...",0
40563,85.208.98.51 31 May/2021:22:52:16 -0700 /datas...,"[-0.69720566, -1.4630636, 0.7257287, 0.0254308...",0
45812,157.55.39.45 05 Jun/2021:03:30:46 -0700 /self....,"[0.08847688, -2.0407524, 0.6192496, -0.1505259...",0
105414,49.7.20.120 26 Jul/2021:12:35:29 -0700 http/1....,"[0.8372246, -1.3550147, -0.18667156, -0.060749...",0
55681,40.77.167.38 13 Jun/2021:04:06:18 -0700 /self....,"[-0.59797686, -2.1597211, -0.294015, 0.7068584...",0


## Train-Test splitting

In [13]:
# Split into train and test sets: 20% held out for testing, fixed random state
# (2022) for repeatability, stratified on the label.
# X is the object array of per-row embedding vectors; it is stacked into 2D later.
X_train, X_test, y_train, y_test = train_test_split(
    data_log_balanced.vector.values, 
    data_log_balanced.label, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=data_log_balanced.label # keep the same ratio of 0 and 1 in train and test data
)

#### Reshape the X_train and X_test so as to fit for models
In simple terms, `np.stack` takes the array of 1-D embedding vectors (e.g. `X_train`) and stacks them along a new first axis, producing a single two-dimensional array with one row per sample and one column per embedding dimension.

In [14]:

# Before stacking, each X is a 1-D object array whose elements are embedding vectors.
print("Shape of X_train before reshaping: ", X_train.shape)
print("Shape of X_test before reshaping: ", X_test.shape)

# Convert X_train and X_test to 2D (n_samples, n_features) arrays,
# which is the input layout the scikit-learn estimators below are fitted on.
X_train_2d = np.stack(X_train)
X_test_2d =  np.stack(X_test)

# Confirm the stacked shapes.
print("Shape of X_train after reshaping: ", X_train_2d.shape)
print("Shape of X_test after reshaping: ", X_test_2d.shape)

Shape of X_train before reshaping:  (5377,)
Shape of X_test before reshaping:  (1345,)
Shape of X_train after reshaping:  (5377, 300)
Shape of X_test after reshaping:  (1345, 300)


### Using DecisionTreeClassifier

In [15]:
# 1. Create the Decision Tree model. A fixed random_state makes the fitted tree,
#    and therefore the report below, reproducible across kernel restarts.
DecisionTreeModel = DecisionTreeClassifier(random_state=2022)

# 2. Fit on the stacked training embeddings and their labels
DecisionTreeModel.fit(X_train_2d, y_train)

# 3. Predict labels for the stacked test embeddings and store them in y_pred
y_pred = DecisionTreeModel.predict(X_test_2d)

# 4. Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       807
           1       0.85      0.85      0.85       538

    accuracy                           0.88      1345
   macro avg       0.87      0.87      0.87      1345
weighted avg       0.88      0.88      0.88      1345


#### Using RandomForest

In [16]:

# 1. Create the Random Forest model. A fixed random_state makes the bootstrap
#    samples and feature choices, and therefore the report below, reproducible.
RandomForestModel = RandomForestClassifier(random_state=2022)

# 2. Fit on the stacked training embeddings and their labels
RandomForestModel.fit(X_train_2d, y_train)

# 3. Predict labels for the stacked test embeddings and store them in y_pred
y_pred = RandomForestModel.predict(X_test_2d)

# 4. Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       807
           1       0.89      0.86      0.88       538

    accuracy                           0.90      1345
   macro avg       0.90      0.90      0.90      1345
weighted avg       0.90      0.90      0.90      1345


#### Test the model with some logs 

The ground truth is non-risky

In [17]:
def vectorize_log(text):
    """Return the spaCy document vector computed for ``text``."""
    doc = nlp(text)
    return doc.vector

In [18]:
# Sample access-log line whose expected class is 0 (non-risky).
raw_log_line = '[01/Aug/2021:03:03:55 -0700] "GET / HTTP/1.1" 200 12883 "-" "Mozilla/5.0 (Macintosh# Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML# like Gecko) Chrome/92.0.4515.107 Safari/537.36'
# Apply the same cleaning that was applied to the training data.
preprocessed_log_line = preprocess(raw_log_line)
# predict() expects a 2D (n_samples, n_features) array; reshape(1, -1) turns the
# single vector into one row without hard-coding the embedding width.
log_for_prediction = vectorize_log(preprocessed_log_line).reshape(1, -1)
RandomForestModel.predict(log_for_prediction)

array([0])

The ground truth is risky

In [19]:
# Sample access-log line whose expected class is 1 (risky).
raw_log_line = '77.75.76.168 - - [01/Aug/2021:04:07:07 -0700] "GET /honeypot/BSidesDFW%20-%202014.ipynb HTTP/1.1" 304 265 "-" "Mozilla/5.0 (compatible# SeznamBot/3.2# +http://napoveda.seznam.cz/en/seznambot-intro/)"'
# Apply the same cleaning that was applied to the training data.
preprocessed_log_line = preprocess(raw_log_line)
# predict() expects a 2D (n_samples, n_features) array; reshape(1, -1) turns the
# single vector into one row without hard-coding the embedding width.
log_for_prediction = vectorize_log(preprocessed_log_line).reshape(1, -1)
print(log_for_prediction.shape)
RandomForestModel.predict(log_for_prediction)

(1, 300)


array([1])

In [20]:
# Save the fitted Random Forest to disk so it can be reloaded without retraining.
# joblib is imported with the other libraries in the notebook's import cell.
filename = 'risky_safe_model.pkl'
joblib.dump(RandomForestModel, filename)

['risky_safe_model.pkl']

In [21]:
# Load the model from the file written by the save cell above.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
loaded_model = joblib.load('risky_safe_model.pkl')

# Check the reloaded model by predicting on the most recently vectorized sample log
predictions = loaded_model.predict(log_for_prediction)
print(type(predictions))
print(predictions)

<class 'numpy.ndarray'>
[1]
