In [9]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data
df = pd.read_csv('system_logs_600.csv')

# Split into training and testing sets
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Display the shapes to confirm about the split
print("Training data shape:", train.shape)
print("Testing data shape:", test.shape)

Training data shape: (480, 5)
Testing data shape: (120, 5)


In [12]:
# Import necessary libraries
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np
import pandas as pd

# Label encoding for categorical features
le = LabelEncoder()
train['level_encoded'] = le.fit_transform(train['level'])
test['level_encoded'] = le.transform(test['level'])

# Model architecture
model = Sequential()
model.add(Dense(8, input_dim=3, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Convert 'timestamp' column to datetime
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

# Convert 'timestamp' datetime object to Unix timestamp (number of seconds since 1970-01-01 00:00:00)
train['timestamp'] = (train['timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
test['timestamp'] = (test['timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

# Select feature columns
feature_cols = ['level_encoded', 'message_length', 'timestamp']

# Convert pandas dataframe to np arrays
x_train = np.array(train[feature_cols]).astype('float32')
y_train = np.array(train['is_anomaly']).astype('float32')
x_test = np.array(test[feature_cols]).astype('float32')
y_test = np.array(test['is_anomaly']).astype('float32')

# Train the model
model.fit(x_train, y_train, epochs=50, batch_size=10)

# Evaluate the model
_, accuracy = model.evaluate(x_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy: 97.50


In [16]:
# Load new log data
new_logs = pd.read_csv('system_logs_200.csv')

# Preprocessing steps
new_logs['level_encoded'] = le.transform(new_logs['level']) # Use the LabelEncoder instance 'le' trained above
new_logs['timestamp'] = pd.to_datetime(new_logs['timestamp'])
new_logs['timestamp'] = (new_logs['timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

# Convert pandas dataframe to np array
new_features = np.array(new_logs[feature_cols]).astype('float32')

# Predict anomalies
predictions = model.predict(new_features)

# To make the prediction results more understandable, convert the output to binary (0: normal, 1: anomaly)
predictions = (predictions > 0.5).astype(int)

# Append the predictions to the 'new_logs' dataframe
new_logs['predicted_anomaly'] = predictions

# Display the dataframe
new_logs.head()



Unnamed: 0,timestamp,level,message,message_length,is_anomaly,level_encoded,predicted_anomaly
0,1672531200,WARNING,Disk space low,14,0,3,0
1,1672534800,DEBUG,Disk space low,14,0,0,0
2,1672538400,DEBUG,Database connection lost,24,0,0,0
3,1672542000,DEBUG,System rebooted,15,0,0,0
4,1672545600,INFO,Configuration updated,21,0,2,0


In [17]:
from sklearn.metrics import confusion_matrix, classification_report

# Actual labels
actual = new_logs['is_anomaly']

# Predicted labels
predicted = new_logs['predicted_anomaly']

# Compute confusion matrix
print('Confusion Matrix:')
print(confusion_matrix(actual, predicted))

# Compute precision, recall, F1-score and support
print('Classification Report:')
print(classification_report(actual, predicted))

Confusion Matrix:
[[195   0]
 [  5   0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       195
           1       0.00      0.00      0.00         5

    accuracy                           0.97       200
   macro avg       0.49      0.50      0.49       200
weighted avg       0.95      0.97      0.96       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
