1.1 Code - Chargement et Nettoyage

In [3]:
import pandas as pd
import nltk

print(pd.__version__)


2.3.1


In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('wordnet')

# 1) Charger les données
# df = pd.read_csv(r"C:\Users\UNiK\Desktop\output_0.1.log", header=None, names=["log"], sep="\n")

with open(r"C:\Users\UNiK\Desktop\output_0.1.log", "r") as f:
    lines = f.readlines()

# Exemple : extraction manuelle des catégories et logs
categories = []
logs = []

for line in lines[1:]:  # si la première ligne est header
    # Supposons que la catégorie est tout avant la première virgule
    parts = line.split(",", 1)
    if len(parts) == 2:
        categories.append(parts[0].strip())
        logs.append(parts[1].strip())
    else:
        categories.append("unknown")
        logs.append(line.strip())

df = pd.DataFrame({"category": categories, "log": logs})

# 2) Nettoyage des logs
def nettoyer_texte(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)        # Retirer les crochets et leur contenu
    text = re.sub(r'\d+', '', text)            # Retirer les chiffres (dates, ports)
    text = re.sub(r'[^\w\s]', '', text)        # Retirer ponctuation
    text = re.sub(r'http\S+', '', text)        # Retirer URLs
    return text

df['clean_log'] = df['log'].apply(nettoyer_texte)

# 3) Tokenisation, Stopwords et Lemmatisation
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def pretraitement(text):
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

df['clean_log'] = df['clean_log'].apply(pretraitement)

print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\UNiK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\UNiK\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                category                                                log  \
0  authentication-failed  [Tue Apr 11 14:36:11 2000] [error] [client 28....   
1  authentication-failed  [Tue Jan 21 17:01:07 2020] [error] [client 108...   
2  authentication-failed  [Thu Oct 12 01:17:44 2023] [ malfunction error...   
3  authentication-failed  [Tue Jul 30 16:18:08 2013] [error] [client 217...   
4  authentication-failed  [Thu Sep 10 05:09:58 2015] [error] [client 2.5...   

                                           clean_log  
0  user jessicakaiser authentication failure here...  
1  user mejianathan authentication failure powerg...  
2  user anthony authentication failure beathoweve...  
3  user bushcassandra authentication failure agre...  
4  user dylanlewis authentication failure expectt...  


1.2 Vectorisation TF-IDF



In [5]:
# Division en Train/Test
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_log'], df['category'], test_size=0.2, random_state=42
)

# TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


2. Modélisation & Surapprentissage
2.1 Sur-apprentissage volontaire
On va entraîner un petit réseau de neurones MLPClassifier avec beaucoup d’itérations et peu de régularisation.

In [6]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# Sur-apprentissage forcé (trop de neurones, beaucoup d'epochs)
model_overfit = MLPClassifier(hidden_layer_sizes=(100,100,100),
                              max_iter=1000,
                              alpha=0.00001,  # quasi pas de régularisation
                              random_state=42)
model_overfit.fit(X_train_vec, y_train)

y_pred_train = model_overfit.predict(X_train_vec)
y_pred_test = model_overfit.predict(X_test_vec)

print("=== SURAPPRENTISSAGE ===")
print("Train Accuracy :", accuracy_score(y_train, y_pred_train))
print("Test Accuracy  :", accuracy_score(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))


=== SURAPPRENTISSAGE ===
Train Accuracy : 0.95975
Test Accuracy  : 0.9174285714285715
                              precision    recall  f1-score   support

       authentication-failed       0.93      0.91      0.92       222
      authentication-success       0.84      0.86      0.85       236
           connection-closed       0.89      0.91      0.90       253
           connection-failed       0.96      0.95      0.95       233
           connection-opened       0.92      0.89      0.90       241
          database-operation       1.00      0.98      0.99       221
           directory-changed       0.88      0.68      0.76       231
           directory-created       0.99      0.98      0.98       243
           directory-deleted       0.80      0.95      0.87       236
         file-action-failure       1.00      0.99      1.00       217
                file-deleted       0.98      0.77      0.87       248
           file-modification       0.78      0.97      0.86       222
   

3. Correction
Méthode 1 : Régularisation + Dropout simulé
On augmente alpha (régularisation L2) et réduit la complexité du modèle.

In [7]:
model_reg = MLPClassifier(hidden_layer_sizes=(50,),
                          max_iter=300,
                          alpha=0.01,
                          random_state=42)
model_reg.fit(X_train_vec, y_train)

y_pred_test_reg = model_reg.predict(X_test_vec)
print("=== CORRECTION MÉTHODE 1 ===")
print("Test Accuracy :", accuracy_score(y_test, y_pred_test_reg))


=== CORRECTION MÉTHODE 1 ===
Test Accuracy : 0.925


Méthode 2 : Validation croisée + Optimisation Hyperparamètres (GridSearch)

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'hidden_layer_sizes': [(50,), (100,)],
    'alpha': [0.001, 0.01, 0.1],
    'max_iter': [200, 300]
}

grid = GridSearchCV(MLPClassifier(random_state=42),
                    param_grid, cv=3, scoring='accuracy')
grid.fit(X_train_vec, y_train)

best_model = grid.best_estimator_
y_pred_best = best_model.predict(X_test_vec)
print("=== CORRECTION MÉTHODE 2 ===")
print("Meilleurs paramètres :", grid.best_params_)
print("Test Accuracy :", accuracy_score(y_test, y_pred_best))




In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

nltk.download('stopwords')
nltk.download('wordnet')

# ----------- Prétraitement identique -----------------

def nettoyer_texte(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)        # Retirer crochets
    text = re.sub(r'\d+', '', text)            # Retirer chiffres
    text = re.sub(r'[^\w\s]', '', text)        # Retirer ponctuation
    text = re.sub(r'http\S+', '', text)        # Retirer URLs
    return text

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def pretraitement(text):
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

# Charger fichier brut, extraire catégories et logs
with open(r"C:\Users\UNiK\Desktop\output_0.1.log", "r") as f:
    lines = f.readlines()

categories = []
logs = []
for line in lines[1:]:
    parts = line.split(",", 1)
    if len(parts) == 2:
        categories.append(parts[0].strip())
        logs.append(parts[1].strip())
    else:
        categories.append("unknown")
        logs.append(line.strip())

df = pd.DataFrame({"category": categories, "log": logs})
df['clean_log'] = df['log'].apply(nettoyer_texte).apply(pretraitement)

# Vectorisation TF-IDF
vectorizer = TfidfVectorizer(max_features=2000, min_df=2, max_df=0.8)
X = vectorizer.fit_transform(df['clean_log'])
y = df['category']

# Train/test split
X_train_vec, X_test_vec, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ----------- Modèle 1 : Naive Bayes ---------------
model_nb = MultinomialNB()
model_nb.fit(X_train_vec, y_train)
y_pred_nb = model_nb.predict(X_test_vec)
print("=== Naive Bayes ===")
print("Test Accuracy :", accuracy_score(y_test, y_pred_nb))

# ----------- Modèle 2 : Logistic Regression + GridSearch ---------------
param_grid_lr = {'C': [0.1, 1, 10]}
grid_lr = GridSearchCV(LogisticRegression(max_iter=200), param_grid_lr, cv=3, scoring='accuracy', n_jobs=-1)
grid_lr.fit(X_train_vec, y_train)
best_lr = grid_lr.best_estimator_
y_pred_lr = best_lr.predict(X_test_vec)
print("\n=== Logistic Regression ===")
print("Meilleurs paramètres :", grid_lr.best_params_)
print("Test Accuracy :", accuracy_score(y_test, y_pred_lr))

# ----------- Modèle 3 : MLPClassifier + GridSearch rapide ---------------
param_grid_mlp = {
    'hidden_layer_sizes': [(50,)],
    'alpha': [0.01],
    'max_iter': [200]
}
grid_mlp = GridSearchCV(MLPClassifier(random_state=42), param_grid_mlp, cv=3, scoring='accuracy', n_jobs=-1)
grid_mlp.fit(X_train_vec, y_train)
best_mlp = grid_mlp.best_estimator_
y_pred_mlp = best_mlp.predict(X_test_vec)
print("\n=== MLPClassifier ===")
print("Meilleurs paramètres :", grid_mlp.best_params_)
print("Test Accuracy :", accuracy_score(y_test, y_pred_mlp))

# ----------- Rapport détaillé pour le meilleur modèle ---------------
best_score = max(
    accuracy_score(y_test, y_pred_nb),
    accuracy_score(y_test, y_pred_lr),
    accuracy_score(y_test, y_pred_mlp)
)

if best_score == accuracy_score(y_test, y_pred_nb):
    print("\nMeilleur modèle : Naive Bayes")
    print(classification_report(y_test, y_pred_nb))
elif best_score == accuracy_score(y_test, y_pred_lr):
    print("\nMeilleur modèle : Logistic Regression")
    print(classification_report(y_test, y_pred_lr))
else:
    print("\nMeilleur modèle : MLPClassifier")
    print(classification_report(y_test, y_pred_mlp))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\UNiK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\UNiK\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


=== Naive Bayes ===
Test Accuracy : 0.8985714285714286

=== Logistic Regression ===
Meilleurs paramètres : {'C': 10}
Test Accuracy : 0.9264285714285714

=== MLPClassifier ===
Meilleurs paramètres : {'alpha': 0.01, 'hidden_layer_sizes': (50,), 'max_iter': 200}
Test Accuracy : 0.9271428571428572

Meilleur modèle : MLPClassifier
                              precision    recall  f1-score   support

       authentication-failed       0.93      0.94      0.93       222
      authentication-success       0.90      0.88      0.89       236
           connection-closed       0.90      0.95      0.92       253
           connection-failed       0.96      0.94      0.95       233
           connection-opened       0.93      0.92      0.92       241
          database-operation       1.00      1.00      1.00       221
           directory-changed       0.96      0.69      0.80       231
           directory-created       1.00      0.99      0.99       243
           directory-deleted       0.80  

5. Déploiement Streamlit

In [None]:
import streamlit as st

st.title("Détection d'échec d'authentification")
text_input = st.text_area("Entrez un log à analyser")

if st.button("Prédire"):
    text_clean = pretraitement(nettoyer_texte(text_input))
    vec = vectorizer.transform([text_clean])
    pred = best_model.predict(vec)[0]
    st.write("### Prédiction :", pred)


In [None]:
joblib.dump(best_model, "best_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
print("✅ Modèle et vectorizer sauvegardés !")


NameError: name 'best_model' is not defined