In [18]:
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
import matplotlib.pyplot as plt
import re
import nltk
import functools
from collections import Counter

In [20]:
# importing file with data
raw_logs=[]
with open('LinuxLogsDataset.txt','r') as log_file:
    raw_logs = log_file.readlines()

raw_logs[:10]

['Jun 14 15:16:01 combo sshd(pam_unix)[19939]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=218.188.2.4 \n',
 'Jun 14 15:16:02 combo sshd(pam_unix)[19937]: check pass; user unknown\n',
 'Jun 14 15:16:02 combo sshd(pam_unix)[19937]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=218.188.2.4 \n',
 'Jun 15 02:04:59 combo sshd(pam_unix)[20882]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=220-135-151-1.hinet-ip.hinet.net  user=root\n',
 'Jun 15 02:04:59 combo sshd(pam_unix)[20884]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=220-135-151-1.hinet-ip.hinet.net  user=root\n',
 'Jun 15 02:04:59 combo sshd(pam_unix)[20883]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=220-135-151-1.hinet-ip.hinet.net  user=root\n',
 'Jun 15 02:04:59 combo sshd(pam_unix)[20885]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=220-135-151-1.hinet-ip.hinet.net

In [21]:
# Bag of Words technique
def clear_words(raw_text_lst):
    cleared = []
    for sentence in raw_text_lst:
        cleared.append(re.sub(r'\d+[.;[]]|([*\d+:]*)|[^\w\s]','',sentence))
    return cleared

cleared_logs = clear_words(raw_logs)
print(cleared_logs[:10])

['Jun   combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhost \n', 'Jun   combo sshdpam_unix check pass user unknown\n', 'Jun   combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhost \n', 'Jun   combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhosthinetiphinetnet  userroot\n', 'Jun   combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhosthinetiphinetnet  userroot\n', 'Jun   combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhosthinetiphinetnet  userroot\n', 'Jun   combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhosthinetiphinetnet  userroot\n', 'Jun   combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhosthinetiphinetnet  userroot\n', 'Jun   combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhosthinetiphinetnet  userroot\n', 'Jun   combo sshdpam_unix authentication 

In [22]:
def tokenize_words(logs):
    docs_lst=[]
    for log in logs:
        tokens = nltk.word_tokenize(log)
        docs_lst.append(tokens)
    return docs_lst
        
    

docs = tokenize_words(cleared_logs)

In [23]:
print(docs[:20])

[['Jun', 'combo', 'sshdpam_unix', 'authentication', 'failure', 'logname', 'uid', 'euid', 'ttyNODEVssh', 'ruser', 'rhost'], ['Jun', 'combo', 'sshdpam_unix', 'check', 'pass', 'user', 'unknown'], ['Jun', 'combo', 'sshdpam_unix', 'authentication', 'failure', 'logname', 'uid', 'euid', 'ttyNODEVssh', 'ruser', 'rhost'], ['Jun', 'combo', 'sshdpam_unix', 'authentication', 'failure', 'logname', 'uid', 'euid', 'ttyNODEVssh', 'ruser', 'rhosthinetiphinetnet', 'userroot'], ['Jun', 'combo', 'sshdpam_unix', 'authentication', 'failure', 'logname', 'uid', 'euid', 'ttyNODEVssh', 'ruser', 'rhosthinetiphinetnet', 'userroot'], ['Jun', 'combo', 'sshdpam_unix', 'authentication', 'failure', 'logname', 'uid', 'euid', 'ttyNODEVssh', 'ruser', 'rhosthinetiphinetnet', 'userroot'], ['Jun', 'combo', 'sshdpam_unix', 'authentication', 'failure', 'logname', 'uid', 'euid', 'ttyNODEVssh', 'ruser', 'rhosthinetiphinetnet', 'userroot'], ['Jun', 'combo', 'sshdpam_unix', 'authentication', 'failure', 'logname', 'uid', 'euid', '

In [24]:
# Count Vectors algorithm
# distant words
distant_words = set([token.lower() for doc in docs for token in doc])
n_distant_words = len(distant_words)
print(distant_words)
print("Distant words:%d"%n_distant_words)

Distant words:395


In [None]:
# calculating words occurence in docs


In [25]:
# mapping word(dist)->doc[i]:occur_num ->doc[i+1]:occur_num
word_occurrence = [{'word': word, 'count_vector': [1 if word in doc else 0 for doc in docs]} for word in distant_words]
print(word_occurrence[:5])



[{'word': 'reset', 'count_vector': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [35]:
# data preparation
unique_words = list(distant_words)
num_words = len(unique_words)
num_docs = len(docs)
count_matrix = np.array([ [0] * num_words for _ in range(num_docs)])

for i,doc in enumerate(docs):
    for j,word in enumerate(unique_words):
        count_matrix[i][j] = doc.count(word)
print(count_matrix)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# SOM
!pip install minisom


In [36]:
from minisom import MiniSom

In [37]:
grid_width = 10
grid_height = 10
input_shape = count_matrix.shape[1]
random_seed = 0
print(input_shape)

395


In [38]:
som = MiniSom(grid_width,grid_height,input_shape,random_seed=0)
som.train_random(count_matrix,num_iteration=100)

cluster_labels_som = []
for i in range(count_matrix.shape[0]):
    x = count_matrix[i]
    winner = som.winner(x)
    cluster_label = f"Cluster_{winner[0]}_{winner[1]}"
    cluster_labels_som.append(cluster_label)

# Convert cluster_labels to a numpy array if needed
cluster_labels_som = np.array(cluster_labels_som)
print(cluster_labels_som)



['Cluster_6_4' 'Cluster_9_5' 'Cluster_6_4' ... 'Cluster_8_2' 'Cluster_8_7'
 'Cluster_8_7']


In [39]:
# Print the cluster labels
# getting result

af = AffinityPropagation(preference=-50, random_state=0)
cluster_labels_af = af.fit_predict(count_matrix)

# Step 3: Assigning final cluster labels to log messages # Replace with your log messages

# Iterate over log messages and their corresponding cluster labels
for i, log_message in enumerate(docs):
    cluster_label_som = cluster_labels_som[i]
    cluster_label_af = cluster_labels_af[i]
    log_message_str = ' '.join(log_message)  # Convert the list of tokens to a string
    print(f"Log Message: {log_message_str} | SOM Cluster Label: {cluster_label_som} | AP Cluster Label: {cluster_label_af}")

Log Message: Jun combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhost | SOM Cluster Label: Cluster_6_4 | AP Cluster Label: 216
Log Message: Jun combo sshdpam_unix check pass user unknown | SOM Cluster Label: Cluster_9_5 | AP Cluster Label: 151
Log Message: Jun combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhost | SOM Cluster Label: Cluster_6_4 | AP Cluster Label: 8
Log Message: Jun combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhosthinetiphinetnet userroot | SOM Cluster Label: Cluster_7_4 | AP Cluster Label: 0
Log Message: Jun combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhosthinetiphinetnet userroot | SOM Cluster Label: Cluster_7_4 | AP Cluster Label: 0
Log Message: Jun combo sshdpam_unix authentication failure logname uid euid ttyNODEVssh ruser rhosthinetiphinetnet userroot | SOM Cluster Label: Cluster_7_4 | AP Cluster Label: 2
Log Message: Jun combo sshdpam_uni



In [41]:
print("Number of clusters: %d"%len(cluster_labels))

NameError: name 'cluster_labels' is not defined