In [7]:
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Téléchargement des ressources NLP
nltk.download("punkt")
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

# ===== Étape 1 : Lecture et parsing =====
log_file = "output_0.1.log"
data = []
with open(log_file, "r", encoding="utf-8") as f:
    for line in f:
        if "," in line:
            category, log = line.strip().split(",", 1)
            data.append({"category": category.strip(), "log": log.strip()})
df = pd.DataFrame(data)

# ===== Étape 2 : Nettoyage du texte =====
def clean_text(text):
    text = re.sub(r'\[.*?\]|\d+\.\d+\.\d+\.\d+|user\s\w+|\".*?\"', '', text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)  # Garde que les lettres
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return " ".join(tokens)

df["clean_log"] = df["log"].apply(clean_text)
counts = df_small["category"].value_counts()
valid_classes = counts[counts >= 2].index
df_small = df_small[df_small["category"].isin(valid_classes)]

# ===== Étape 3 : Réduire la taille pour forcer surapprentissage =====

df_small = df.groupby("category").apply(lambda x: x.sample(min(len(x), 30), random_state=42)).reset_index(drop=True)
counts = df_small["category"].value_counts()
valid_classes = counts[counts >= 2].index
df_small = df_small[df_small["category"].isin(valid_classes)]


# ===== Étape 4 : Vectorisation très complexe =====
vectorizer = TfidfVectorizer(ngram_range=(1, 4), max_features=15000)
X = vectorizer.fit_transform(df_small["clean_log"])
y = df_small["category"]

# ===== Étape 5 : Split + Modèle sans régularisation =====
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

model = LogisticRegression(penalty=None, max_iter=5000, solver="lbfgs")
model.fit(X_train, y_train)

# ===== Étape 6 : Évaluation =====
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print("=== Performances Entraînement ===")
print(classification_report(y_train, y_pred_train))

print("\n=== Performances Test ===")
print(classification_report(y_test, y_pred_test))


[nltk_data] Downloading package punkt to /Users/mac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df_small = df.groupby("category").apply(lambda x: x.sample(min(len(x), 30), random_state=42)).reset_index(drop=True)
  df_small = df.groupby("category").apply(lambda x: x.sample(min(len(x), 30), random_state=42)).reset_index(drop=True)


=== Performances Entraînement ===
                              precision    recall  f1-score   support

       authentication-failed       1.00      0.95      0.98        21
      authentication-success       1.00      1.00      1.00        21
           connection-closed       1.00      1.00      1.00        21
           connection-failed       1.00      1.00      1.00        21
           connection-opened       1.00      1.00      1.00        21
          database-operation       1.00      0.95      0.98        21
           directory-changed       1.00      1.00      1.00        21
           directory-created       1.00      1.00      1.00        21
           directory-deleted       1.00      0.95      0.98        21
         file-action-failure       1.00      1.00      1.00        21
                file-deleted       1.00      1.00      1.00        21
           file-modification       1.00      1.00      1.00        21
                   file-read       1.00      1.00      

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Division des données
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modèle avec régularisation L2 (ridge)
model = LogisticRegression(penalty='l2', C=1.0, max_iter=1000)
model.fit(X_train, y_train)

# Évaluation
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print("Accuracy Train :", accuracy_score(y_train, y_pred_train))
print("Accuracy Test  :", accuracy_score(y_test, y_pred_test))



Accuracy Train : 0.9472222222222222
Accuracy Test  : 0.7611111111111111


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

# Exemple de pipeline simplifié (suppose que df_small est prêt)

vectorizer = TfidfVectorizer(ngram_range=(1,4), max_features=15000)
X = vectorizer.fit_transform(df_small["clean_log"])
y = df_small["category"]

# Recherche des meilleurs hyperparamètres C (inverse de la force de régularisation)
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

logreg = LogisticRegression(penalty='l2', max_iter=1000, solver='lbfgs', multi_class='auto')

grid = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X, y)

print(f"Meilleur paramètre C : {grid.best_params_}")
print(f"Meilleure accuracy CV : {grid.best_score_:.3f}")

# Évaluer sur un split test si besoin
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

best_model = grid.best_estimator_
best_model.fit(X_train, y_train)

y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

from sklearn.metrics import classification_report
print("=== Train ===")
print(classification_report(y_train, y_pred_train))
print("=== Test ===")
print(classification_report(y_test, y_pred_test))




Meilleur paramètre C : {'C': 10}
Meilleure accuracy CV : 0.831




=== Train ===
                              precision    recall  f1-score   support

       authentication-failed       1.00      0.95      0.98        21
      authentication-success       1.00      1.00      1.00        21
           connection-closed       1.00      1.00      1.00        21
           connection-failed       1.00      1.00      1.00        21
           connection-opened       1.00      1.00      1.00        21
          database-operation       1.00      0.95      0.98        21
           directory-changed       1.00      1.00      1.00        21
           directory-created       1.00      1.00      1.00        21
           directory-deleted       1.00      0.95      0.98        21
         file-action-failure       1.00      1.00      1.00        21
                file-deleted       1.00      1.00      1.00        21
           file-modification       1.00      1.00      1.00        21
                   file-read       1.00      1.00      1.00        21
     

In [None]:
# Comparaison et ✅ Conclusion
#Le modèle de base présentait un surapprentissage manifeste (écart élevé entre entraînement et test).
#L’ajout de régularisation a limité la complexité du modèle → meilleure généralisation.
#La validation croisée a permis une évaluation plus fiable et moins dépendante d’un seul split.
#Le surapprentissage est normal et fréquent, mais il se corrige avec les bonnes pratiques.

In [10]:
import joblib

# Sauvegarder le modèle entraîné
joblib.dump(model, "model.pkl")

# Sauvegarder le vecteur TF-IDF
joblib.dump(vectorizer, "vectorizer.pkl")


['vectorizer.pkl']