In [93]:
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import re
import nltk
import functools
from collections import Counter

In [84]:
# importing file with data
raw_logs=[]
with open('LinuxLogsDataset.txt','r') as log_file:
    raw_logs = log_file.readlines()

In [85]:
# Bag of Words technique
def clear_words(raw_text_lst):
    cleared = []
    for sentence in raw_text_lst:
        cleared.append(re.sub(r'\d+[.;[]]|([*\d+:]*)|[^\w\s]','',sentence))
    return cleared

cleared_logs = clear_words(raw_logs)
print(cleared_logs)



In [97]:
def tokenize_words(logs):
    docs_lst=[]
    for log in logs:
        tokens = nltk.word_tokenize(log)
        docs_lst.append(tokens)
    return docs_lst
        
    

docs = tokenize_words(cleared_logs)

In [98]:
print(docs)



In [105]:
# Count Vectors algorithm
# distant words
distant_words = set([token.lower() for doc in docs for token in doc])
n_distant_words = len(distant_words)
print(distant_words)
print("Distant words:%d"%n_distant_words)

Distant words:395


In [None]:
# calculating words occurence in docs


In [116]:
# mapping word(dist)->doc[i]:occur_num ->doc[i+1]:occur_num
word_occurrence = [{'word': word, 'count_vector': [1 if word in doc else 0 for doc in docs]} for word in distant_words]
print(word_occurrence)





In [152]:
# data preparation
unique_words = list(distant_words)
num_words = len(unique_words)
num_docs = len(docs)
count_matrix = np.array([ [0] * num_words for _ in range(num_docs)])

for i,doc in enumerate(docs):
    for j,word in enumerate(unique_words):
        count_matrix[i][j] = doc.count(word)
print(count_matrix)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [128]:
# SOM
!pip install minisom


Collecting minisom
  Downloading MiniSom-2.3.1.tar.gz (10 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: minisom
  Building wheel for minisom (setup.py): started
  Building wheel for minisom (setup.py): finished with status 'done'
  Created wheel for minisom: filename=MiniSom-2.3.1-py3-none-any.whl size=10603 sha256=deafcf335d68e7bc7a809e0a4ba338054716dd33cdbebb86db5f0c662055ed29
  Stored in directory: c:\users\asus\appdata\local\pip\cache\wheels\b4\aa\7a\d9a88098f7877aa95dd4f227fe614f75773654baa39b47bba6
Successfully built minisom
Installing collected packages: minisom
Successfully installed minisom-2.3.1



[notice] A new release of pip is available: 23.1.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [129]:
from minisom import MiniSom

In [153]:
grid_width = 10
grid_height = 10
input_shape = count_matrix.shape[1]
random_seed = 0
print(input_shape)

395


In [154]:
som = MiniSom(grid_width,grid_height,input_shape,random_seed=0)
som.train_random(count_matrix,num_iteration=100)

cluster_labels_som = []
for i in range(count_matrix.shape[0]):
    x = count_matrix[i]
    winner = som.winner(x)
    cluster_label = f"Cluster_{winner[0]}_{winner[1]}"
    cluster_labels_som.append(cluster_label)

# Convert cluster_labels to a numpy array if needed
cluster_labels_som = np.array(cluster_labels_som)



In [156]:
# Print the cluster labels
# getting result

af = AffinityPropagation(preference=-50, random_state=0)
cluster_labels_af = af.fit_predict(count_matrix)

# Step 3: Assigning final cluster labels to log messages # Replace with your log messages

# Iterate over log messages and their corresponding cluster labels
for i, log_message in enumerate(docs):
    cluster_label_som = cluster_labels_som[i]
    cluster_label_af = cluster_labels_af[i]
    log_message_str = ' '.join(log_message)  # Convert the list of tokens to a string
    print(f"Log Message: {log_message_str} | SOM Cluster Label: {cluster_label_som} | AP Cluster Label: {cluster_label_af}")

Log Message: Jun combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhost | SOM Cluster Label: Cluster_1_4 | AP Cluster Label: 216
Log Message: Jun combo sshdpam_unix check pass user unknown | SOM Cluster Label: Cluster_1_7 | AP Cluster Label: 151
Log Message: Jun combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhost | SOM Cluster Label: Cluster_1_4 | AP Cluster Label: 8
Log Message: Jun combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhosthinetiphinetnet userroot | SOM Cluster Label: Cluster_0_5 | AP Cluster Label: 0
Log Message: Jun combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhosthinetiphinetnet userroot | SOM Cluster Label: Cluster_0_5 | AP Cluster Label: 0
Log Message: Jun combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhosthinetiphinetnet userroot | SOM Cluster Label: Cluster_0_5 | AP Cluster Label: 2
Log Message: Jun combo sshdpam_uni



In [157]:
print("Number of clusters: %d"%len(cluster_labels))

Number of clusters: 22
