In [None]:
import os
import re
import pandas as pd
from drain3 import TemplateMiner
from drain3.file_persistence import FilePersistence
from tqdm import tqdm
import sys
!git clone https://github.com/logpai/logparser.git

sys.path.append('/content/logparser')
from logparser.Drain import LogParser

def mount_drive():
    """Mount Google Drive at ``/content/drive`` and print a confirmation.

    ``force_remount=True`` is passed to ``drive.mount``. The Colab module is
    imported inside the function, so it is only required when this is called.
    """
    from google.colab import drive as colab_drive

    mount_point = "/content/drive"
    colab_drive.mount(mount_point, force_remount=True)
    print("Drive mounted")

!pip install drain3
!pip install mlflow
!pip install boto3 awscli


def split_logfile(input_file, train_file, valid_file, test_file, train_ratio=0.6, valid_ratio=0.2):
    """Cut a log file into consecutive train / validation / test files.

    The whole input is read into memory, then written back out in three
    contiguous slices (original line order is kept, nothing is shuffled).

    Parameters:
        input_file (str): Path of the log file to split.
        train_file (str): Destination of the first slice.
        valid_file (str): Destination of the second slice.
        test_file (str): Destination of the remaining lines.
        train_ratio (float): Fraction of lines for the train slice.
        valid_ratio (float): Fraction of lines for the validation slice.
    """
    with open(input_file, 'r') as source:
        records = source.readlines()

    n_records = len(records)
    # Both slice sizes are truncated independently, so the test slice
    # absorbs whatever the two int() truncations leave over.
    first_cut = int(train_ratio * n_records)
    second_cut = first_cut + int(valid_ratio * n_records)

    partitions = (
        (train_file, records[:first_cut]),
        (valid_file, records[first_cut:second_cut]),
        (test_file, records[second_cut:]),
    )
    for destination, chunk in partitions:
        with open(destination, 'w') as sink:
            sink.writelines(chunk)

def parse_with_drain3(input_path, output_path, depth=4, max_children=100, sim_th=0.4,
                      state_file="drain3_state.bin"):
    """Parse an HDFS log file with drain3 and write one structured CSV row per log line.

    Each line is expected to follow the HDFS layout used elsewhere in this
    notebook, ``<Date> <Time> <Pid> <Level> <Component>: <Content>``. Only the
    ``Content`` part is fed to the template miner.

    Parameters:
        input_path (str): Raw log file to read.
        output_path (str): CSV file to write (columns: Date, Time, Pid, Level,
            Component, Content, EventId, EventTemplate).
        depth (int): Drain parse-tree depth.
        max_children (int): Maximum children per Drain tree node.
        sim_th (float): Drain similarity threshold.
        state_file (str): Path handed to ``FilePersistence``. drain3 restores any
            state already saved there, so calls sharing a state file keep building
            on the same clusters; pass a different path to start from scratch.

    Notes:
        ``EventTemplate`` is the template returned at the moment the line was
        added; drain3 may generalise a cluster's template on later lines.
    """
    # Function-scope import: the import cell only brings in TemplateMiner and FilePersistence.
    from drain3.template_miner_config import TemplateMinerConfig

    # Drain settings must go through the config object. The previous code assigned
    # `template_miner.drain.depth`, an attribute drain3 does not read, so `depth`
    # was silently ignored.
    config = TemplateMinerConfig()
    config.load("drain3.ini")  # same default file TemplateMiner reads when no config is given
    config.drain_depth = depth
    config.drain_sim_th = sim_th
    config.drain_max_children = max_children

    template_miner = TemplateMiner(FilePersistence(state_file), config)

    # Third field is the PID and the component ends at the first ": ".
    # (Previously the PID was captured as "Component" and the real component leaked into Content.)
    line_pattern = re.compile(
        r'(?P<Date>\d{6}) (?P<Time>\d{6}) (?P<Pid>\d+) (?P<Level>\w+) '
        r'(?P<Component>[^\s:]+): (?P<Content>.*)'
    )
    columns = ["Date", "Time", "Pid", "Level", "Component", "Content", "EventId", "EventTemplate"]

    structured_logs = []
    skipped = 0
    with open(input_path, 'r') as f:
        for line in tqdm(f, desc="Parsing log lines"):
            match = line_pattern.match(line)
            if match is None:
                # Keep track of dropped lines instead of discarding them silently.
                skipped += 1
                continue
            row = match.groupdict()
            result = template_miner.add_log_message(row["Content"])
            row["EventId"] = result["cluster_id"]
            row["EventTemplate"] = result["template_mined"]
            structured_logs.append(row)

    # Explicit columns keep the CSV header stable even when no line matched.
    df = pd.DataFrame(structured_logs, columns=columns)
    df.to_csv(output_path, index=False)
    if skipped:
        print(f"Skipped {skipped} line(s) that did not match the expected HDFS log format")
    print(f"Structured log saved to {output_path}")

# Example usage
if __name__ == "__main__":
    mount_drive()

    base_path = "/content/drive/MyDrive/ProjetEts/"
    input_log = f"{base_path}HDFS.log"
    output_dir = f"{base_path}Ingestion_des_données/HDFS_results/"
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: split the raw log into train / valid / test files
    split_paths = [f"{output_dir}HDFS_{split_name}.log" for split_name in ("train", "valid", "test")]
    split_logfile(input_log, *split_paths)

    # Step 2: parse each split; the CSV sits next to its log with a "_structured.csv" suffix
    for split_path in split_paths:
        parse_with_drain3(split_path, f"{split_path}_structured.csv")


fatal: destination path 'logparser' already exists and is not an empty directory.
Collecting cachetools<7,>=5.0.0 (from mlflow-skinny==3.1.4->mlflow)
  Using cached cachetools-6.1.0-py3-none-any.whl.metadata (5.4 kB)
  Using cached cachetools-5.5.2-py3-none-any.whl.metadata (5.4 kB)
Using cached cachetools-5.5.2-py3-none-any.whl (10 kB)
Installing collected packages: cachetools
  Attempting uninstall: cachetools
    Found existing installation: cachetools 4.2.1
    Uninstalling cachetools-4.2.1:
      Successfully uninstalled cachetools-4.2.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
drain3 0.9.11 requires cachetools==4.2.1, but you have cachetools 5.5.2 which is incompatible.[0m[31m
[0mSuccessfully installed cachetools-5.5.2


Mounted at /content/drive
Drive mounted


Parsing log lines: 6705377it [02:02, 54752.63it/s]


Structured log saved to /content/drive/MyDrive/ProjetEts/Ingestion_des_données/HDFS_results/HDFS_train.log_structured.csv


Parsing log lines: 2235125it [00:43, 51804.33it/s]


Structured log saved to /content/drive/MyDrive/ProjetEts/Ingestion_des_données/HDFS_results/HDFS_valid.log_structured.csv


Parsing log lines: 2235127it [00:35, 62454.61it/s]


Structured log saved to /content/drive/MyDrive/ProjetEts/Ingestion_des_données/HDFS_results/HDFS_test.log_structured.csv


In [None]:
# 1_data_ingestion.py

import os
import re
import json
import sys
import pandas as pd
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive
drive.mount("/content/drive", force_remount=True)

# Install dependencies
!pip install drain3 mlflow boto3 awscli
!git clone https://github.com/logpai/logparser.git
sys.path.append('/content/logparser')

from logparser.Drain import LogParser

# Locations: raw/split logs live in input_dir, parser results go to output_dir
input_dir = '/content/drive/MyDrive/ProjetEts/'
output_dir = os.path.join(input_dir, 'HDFS_results')
os.makedirs(output_dir, exist_ok=True)

# Split files (relative to input_dir) handed to the Drain parser
log_files = [
    'HDFS_train.log',
    'HDFS_valid.log',
    'HDFS_test.log',
]

# Header layout of each log line, in logparser's <Field> notation
log_format = '<Date> <Time> <Pid> <Level> <Component>: <Content>'

# Preprocessing patterns passed to the parser as `rex`
regex = [
    r'blk_(|-)[0-9]+',  # block id
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',  # IP address, optional port
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',  # standalone numbers
]

# Split the raw file into train/val/test
def split_logfile(input_file, train_file, valid_file, test_file, train_ratio=0.6, valid_ratio=0.2):
    """Write consecutive train / validation / test slices of a log file.

    The input is loaded fully into memory and sliced in its original order.
    The validation boundary is computed from the cumulative ratio
    ``train_ratio + valid_ratio``. A one-line summary of the three slice
    sizes is printed at the end.

    Parameters:
        input_file (str): Log file to split.
        train_file (str): Output path for the first slice.
        valid_file (str): Output path for the second slice.
        test_file (str): Output path for the remaining lines.
        train_ratio (float): Fraction of lines in the train slice.
        valid_ratio (float): Fraction of lines in the validation slice.
    """
    with open(input_file, 'r') as source:
        lines = source.readlines()

    n_lines = len(lines)
    train_idx = int(n_lines * train_ratio)
    valid_idx = int(n_lines * (train_ratio + valid_ratio))

    slices = {
        train_file: lines[:train_idx],
        valid_file: lines[train_idx:valid_idx],
        test_file: lines[valid_idx:],
    }
    for destination in (train_file, valid_file, test_file):
        with open(destination, 'w') as sink:
            sink.writelines(slices[destination])

    n_train = train_idx
    n_valid = valid_idx - train_idx
    n_test = n_lines - valid_idx
    print(f"Split done: {n_train} train, {n_valid} valid, {n_test} test")

# Split the raw log; log_files is ordered train, valid, test, matching the
# positional order of split_logfile's output arguments.
split_logfile(
    os.path.join(input_dir, 'HDFS.log'),
    *(os.path.join(input_dir, split_name) for split_name in log_files),
)

# Parse every split with the logparser Drain implementation
parser = LogParser(
    log_format,
    indir=input_dir,
    outdir=output_dir,
    depth=4,
    st=0.5,
    rex=regex,
)
for log_file in log_files:
    parser.parse(log_file)

# Generate template mapping
def mapping(file_names, output_dir):
    """Write an ``EventId -> "E<rank>"`` JSON mapping for each template CSV.

    Each CSV is read from ``output_dir``, sorted by its ``Occurrences`` column
    in descending order, and its ``EventId`` values are numbered ``E1``,
    ``E2``, ... in that order. The JSON is written at the CSV's path with
    ``.csv`` replaced by ``.json``.

    Parameters:
        file_names (list): Template CSV file names, relative to ``output_dir``.
        output_dir (str): Directory holding the CSVs; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    for file_name in file_names:
        csv_path = os.path.join(output_dir, file_name)
        templates = pd.read_csv(csv_path)
        ranked = templates.sort_values(by="Occurrences", ascending=False)

        event_codes = {}
        for rank, event in enumerate(ranked["EventId"], start=1):
            event_codes[event] = f"E{rank}"

        json_path = csv_path.replace(".csv", ".json")
        with open(json_path, "w") as sink:
            json.dump(event_codes, sink)

        print(f"Mapping done for {file_name}")

# Build the EventId mapping for each split's "<log>_templates.csv" file in output_dir
mapping(
    [log_name.replace(".log", ".log_templates.csv") for log_name in log_files],
    output_dir,
)

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Processed 92.3% of log lines.
Processed 92.3% of log lines.
Processed 92.3% of log lines.
Processed 92.3% of log lines.
Processed 92.4% of log lines.
Processed 92.4% of log lines.
Processed 92.4% of log lines.
Processed 92.4% of log lines.
Processed 92.4% of log lines.
Processed 92.4% of log lines.
Processed 92.4% of log lines.
Processed 92.5% of log lines.
Processed 92.5% of log lines.
Processed 92.5% of log lines.
Processed 92.5% of log lines.
Processed 92.5% of log lines.
Processed 92.5% of log lines.
Processed 92.6% of log lines.
Processed 92.6% of log lines.
Processed 92.6% of log lines.
Processed 92.6% of log lines.
Processed 92.6% of log lines.
Processed 92.6% of log lines.
Processed 92.6% of log lines.
Processed 92.7% of log lines.
Processed 92.7% of log lines.
Processed 92.7% of log lines.
Processed 92.7% of log lines.
Processed 92.7% of log lines.
Processed 92.7% of log lines.
Processe

In [1]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

!pip install drain3
!pip install mlflow
!pip install boto3 awscli


!git clone https://github.com/logpai/logparser.git


import sys
sys.path.append('/content/logparser')


from logparser.Drain import LogParser

import os
import pandas as pd
import sys
import re
import mlflow
import mlflow.sklearn
import json
from collections import defaultdict
from tqdm import tqdm

# Tracking server: use MLFLOW_TRACKING_URI when it is set, so the notebook can be
# pointed at another server without editing code; otherwise fall back to the
# project's server.
mlflow.set_tracking_uri(
    os.environ.get(
        "MLFLOW_TRACKING_URI",
        "http://ec2-18-207-206-140.compute-1.amazonaws.com:5000",
    )
)
mlflow.set_experiment("HDFS")


import os

def split_logfile(input_file, train_file, valid_file, test_file, train_ratio=0.6, valid_ratio=0.2):
    """
    Splits a file into train, validation, and test sets based on the given ratios.

    The file is read fully into memory and cut into three consecutive slices
    (line order is preserved, nothing is shuffled). A summary of the slice
    sizes is printed once the three files are written.

    Parameters:
    input_file (str): Path to the input file.
    train_file (str): Path to the output train file.
    valid_file (str): Path to the output validation file.
    test_file (str): Path to the output test file.
    train_ratio (float): Ratio of the train set size to the total size (default is 0.6).
    valid_ratio (float): Ratio of the validation set size to the total size (default is 0.2).
    """
    with open(input_file, 'r') as file:
        lines = file.readlines()

    # Slice boundaries: the validation boundary uses the cumulative ratio
    train_split_index = int(len(lines) * train_ratio)
    valid_split_index = int(len(lines) * (train_ratio + valid_ratio))

    train_lines = lines[:train_split_index]
    valid_lines = lines[train_split_index:valid_split_index]
    test_lines = lines[valid_split_index:]

    for path, subset in ((train_file, train_lines), (valid_file, valid_lines), (test_file, test_lines)):
        with open(path, 'w') as file:
            file.writelines(subset)

    print(f"Split completed: \n {len(train_lines)} lines in {train_file} \n {len(valid_lines)} lines in {valid_file} \n {len(test_lines)} lines in {test_file}")


# This call must sit at module level. It was previously indented into the
# function body, so the split never ran here and any call to split_logfile
# would have recursed without end.
split_logfile(
    input_file='/content/drive/MyDrive/ProjetEts/HDFS.log',
    train_file='/content/drive/MyDrive/ProjetEts/HDFS_train.log',
    valid_file='/content/drive/MyDrive/ProjetEts/HDFS_valid.log',
    test_file='/content/drive/MyDrive/ProjetEts/HDFS_test.log'
)




# Locations
input_dir = '/content/drive/MyDrive/ProjetEts/'  # folder containing the log files
output_dir = '/content/drive/MyDrive/ProjetEts/HDFS_results/'  # folder for the parsing results

# Log files to parse (relative to input_dir)
log_files = [
    'HDFS_train.log',
    'HDFS_valid.log',
    'HDFS_test.log',
]

# HDFS log line layout, in logparser's <Field> notation
log_format = '<Date> <Time> <Pid> <Level> <Component>: <Content>'

# Regular expressions handed to the parser for optional preprocessing
regex = [
    r'blk_(|-)[0-9]+',  # block id
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',  # IP
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',  # Numbers
]

# Drain hyper-parameters
st = 0.5  # similarity threshold
depth = 4  # depth of all leaf nodes

parser = LogParser(
    log_format,
    indir=input_dir,
    outdir=output_dir,
    depth=depth,
    st=st,
    rex=regex,
)

# Parse each split in turn
for log_file in log_files:
    parser.parse(log_file)



def mapping(file_names, output_dir):
    """
    Build an ``EventId -> "E<rank>"`` JSON mapping for each log-template CSV.

    Every CSV is read from ``output_dir`` and sorted by ``Occurrences`` in
    descending order; its ``EventId`` values are then numbered ``E1``, ``E2``, ...
    in that order. The JSON file is written to ``output_dir`` under the CSV's
    name with ``.csv`` removed and ``.json`` appended.

    Parameters:
        file_names (list): A list of CSV file names to process.
        output_dir (str): Directory holding the CSVs and receiving the JSON files.
    """
    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    for file_name in file_names:
        # Read the template CSV, most frequent events first
        templates_path = os.path.join(output_dir, file_name)
        ranked_templates = pd.read_csv(templates_path).sort_values(by="Occurrences", ascending=False)

        # Number the events by rank
        event_codes = {}
        for rank, event in enumerate(ranked_templates["EventId"], start=1):
            event_codes[event] = f"E{rank}"

        # Output file name
        json_name = file_name.replace('.csv', '') + ".json"
        output_file = os.path.join(output_dir, json_name)

        # Save the mapping dictionary
        with open(output_file, "w") as sink:
            json.dump(event_codes, sink)

        print(f"Mapping completed and saved to {output_file}")


def process_log_files(input_dir, output_dir, json_filename, structured_log_filename, anomaly_label_filename, output_filename):
    """
    Process log files to add BlockId, map EventId values, and label anomalies.

    Rows whose ``Content`` contains no block identifier are dropped. Labels are
    attached with a left merge on ``BlockId``, so blocks missing from the label
    file end up with an empty ``Label``.

    Every file argument is joined onto its directory with ``os.path.join``, so
    either a bare file name or an absolute path may be passed.

    Parameters:
        input_dir (str): The directory containing the input files.
        output_dir (str): The directory for output files.
        json_filename (str): The JSON file with EventId mappings (in output_dir).
        structured_log_filename (str): The CSV file with structured log data (in output_dir).
        anomaly_label_filename (str): The CSV file with anomaly labels (in input_dir).
        output_filename (str): The filename for the processed output (in output_dir).
    """
    # Resolve all paths up front
    json_file_path = os.path.join(output_dir, json_filename)
    anomaly_label_path = os.path.join(input_dir, anomaly_label_filename)
    structured_log_path = os.path.join(output_dir, structured_log_filename)
    # Previously os.path.join(output_filename) alone, which ignored output_dir
    output_path = os.path.join(output_dir, output_filename)

    # Load the structured log file
    df_structured = pd.read_csv(structured_log_path)

    # Load the JSON mapping file
    with open(json_file_path, 'r') as json_file:
        event_mapping = json.load(json_file)

    # Load the anomaly label file and rename its labels
    df_labels = pd.read_csv(anomaly_label_path)
    df_labels['Label'] = df_labels['Label'].replace({'Normal': 'Success', 'Anomaly': 'Fail'})

    # Extract the first block identifier of each row in one vectorised pass.
    # astype(str) keeps missing/non-string Content from raising; such rows
    # simply yield no match and are dropped below.
    block_ids = df_structured['Content'].astype(str).str.extract(r'(blk_-?[0-9]+)', expand=False)

    # assign() returns new frames, which avoids pandas' SettingWithCopyWarning
    df_structured = df_structured.assign(BlockId=block_ids).dropna(subset=['BlockId'])

    # JSON object keys are always strings, so look up the string form of the id;
    # ids absent from the mapping are left unchanged.
    df_structured = df_structured.assign(
        EventId=df_structured['EventId'].map(lambda event_id: event_mapping.get(str(event_id), event_id))
    )

    # Merge DataFrames to add the Label column
    df_structured = pd.merge(df_structured, df_labels, on='BlockId', how='left')

    # Reorder columns so BlockId and Label are first
    columns = ['BlockId', 'Label'] + [col for col in df_structured.columns if col not in ['BlockId', 'Label']]
    df_structured = df_structured[columns]

    # Save the processed structured log file
    df_structured.to_csv(output_path, index=False)

    print(f"Le fichier structuré avec BlockId et les EventId remplacés est généré et sauvegardé dans {output_path}")


# Build the EventId mappings once, at module level, before process_log_files is
# called: it reads these JSON files. This was previously indented into the
# function body, so the mappings were only written after a call had already
# tried to read them.
files = [
    "HDFS_train.log_templates.csv",
    "HDFS_valid.log_templates.csv",
    "HDFS_test.log_templates.csv"
]
output_directory = "/content/drive/MyDrive/ProjetEts/HDFS_results/"

mapping(files, output_directory)



# Train, valid and test: the same processing for each split, only the split name changes
for split_name in ("train", "valid", "test"):
    process_log_files(
        input_dir='/content/drive/MyDrive/ProjetEts/',
        output_dir='/content/drive/MyDrive/ProjetEts/HDFS_results/',
        json_filename=f'/content/drive/MyDrive/ProjetEts/HDFS_results/HDFS_{split_name}.log_templates.json',
        structured_log_filename=f'/content/drive/MyDrive/ProjetEts/HDFS_results/HDFS_{split_name}.log_structured.csv',
        anomaly_label_filename='/content/drive/MyDrive/ProjetEts/anomaly_label.csv',
        output_filename=f'/content/drive/MyDrive/ProjetEts/HDFS_results/HDFS_{split_name}.log_structured_blk.csv',
    )




[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Processed 92.4% of log lines.
Processed 92.4% of log lines.
Processed 92.5% of log lines.
Processed 92.5% of log lines.
Processed 92.5% of log lines.
Processed 92.5% of log lines.
Processed 92.5% of log lines.
Processed 92.5% of log lines.
Processed 92.6% of log lines.
Processed 92.6% of log lines.
Processed 92.6% of log lines.
Processed 92.6% of log lines.
Processed 92.6% of log lines.
Processed 92.6% of log lines.
Processed 92.6% of log lines.
Processed 92.7% of log lines.
Processed 92.7% of log lines.
Processed 92.7% of log lines.
Processed 92.7% of log lines.
Processed 92.7% of log lines.
Processed 92.7% of log lines.
Processed 92.7% of log lines.
Processed 92.8% of log lines.
Processed 92.8% of log lines.
Processed 92.8% of log lines.
Processed 92.8% of log lines.
Processed 92.8% of log lines.
Processed 92.8% of log lines.
Processed 92.9% of log lines.
Processed 92.9% of log lines.
Processe