In [6]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Load the labelled log dataset; the "message" column holds the raw log text
# and the "label" column holds the classification target.
df = pd.read_csv("logs_5000_final.csv")

# Empty CSV fields are parsed as NaN, and TfidfVectorizer raises on NaN
# documents. Fill the gaps with an empty string and force every entry to str
# so the vectorizer always receives text. This is a no-op when the column is
# already fully populated with strings.
X = df["message"].fillna("").astype(str)
y = df["label"]

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF features limited to max_features=3000 terms. The vocabulary and IDF
# weights are learned from the training messages only; the held-out messages
# are merely projected onto that fitted vocabulary, so no test information
# leaks into the features.
vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),  # evaluated first: fits, then encodes train
    vectorizer.transform(X_test),       # evaluated second: reuses the fitted state
)

# Random-forest hyper-parameters gathered in one place so they are easy to
# inspect. class_weight="balanced" re-weights classes inversely to their
# frequency, and random_state pins the forest's randomness.
forest_settings = {
    "n_estimators": 200,
    "max_depth": 15,
    "class_weight": "balanced",
    "random_state": 42,
}

# fit() returns the estimator itself, so `model` is the fitted forest.
model = RandomForestClassifier(**forest_settings).fit(X_train_tfidf, y_train)
# Persist the fitted classifier together with the vectorizer it was trained
# on: new messages must be encoded with this exact fitted vocabulary before
# they can be passed to the model, so the two files belong together.
# NOTE(review): joblib files are pickle-based, so only load these artifacts
# from a trusted location.
joblib.dump(model, "log_anomaly_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")



# Score the held-out split: predict once, compute each metric into a named
# value, then report them in a fixed order.
y_pred = model.predict(X_test_tfidf)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Confusion Matrix:\n", test_confusion)
print(test_report)


Accuracy: 0.787
Confusion Matrix:
 [[492 188]
 [ 25 295]]
              precision    recall  f1-score   support

           0       0.95      0.72      0.82       680
           1       0.61      0.92      0.73       320

    accuracy                           0.79      1000
   macro avg       0.78      0.82      0.78      1000
weighted avg       0.84      0.79      0.79      1000

