In [2]:
import pandas as pd
import numpy as np
import re
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
def extract_log_features(df, log_type):
    """
    Extract features from log data
    """
    print(f"Extracting features from {log_type} logs...")
    
    # Create a copy of the dataframe to avoid modifying the original
    df_features = df.copy()
    
    # 1. Extract time-based features
    if 'datetime' in df_features.columns:
        df_features['hour'] = df_features['datetime'].dt.hour
        df_features['day'] = df_features['datetime'].dt.day
        df_features['weekday'] = df_features['datetime'].dt.weekday
        df_features['minute'] = df_features['datetime'].dt.minute
    
    # 2. Calculate time intervals between consecutive log entries
    if 'datetime' in df_features.columns:
        df_features['time_delta'] = df_features['datetime'].diff().dt.total_seconds()
        df_features['time_delta'] = df_features['time_delta'].fillna(0)
    
    # 3. Process based on log type
    if log_type == 'hdfs':
        # One-hot encode message types
        if 'msg_type' in df_features.columns and len(df_features) > 0:
            msg_type_dummies = pd.get_dummies(df_features['msg_type'], prefix='msg')
            df_features = pd.concat([df_features, msg_type_dummies], axis=1)
        
        # Count logs per block ID
        if 'block_id' in df_features.columns:
            block_counts = df_features.groupby('block_id').size().reset_index(name='block_log_count')
            df_features = df_features.merge(block_counts, on='block_id', how='left')
    
    print(f"Extracted {df_features.shape[1]} features from {log_type} logs")
    return df_features


In [14]:
def examine_log_file(log_file):
    """
    Examine a log file to determine its format and structure
    """
    print(f"Examining log file: {log_file}")
    
    try:
        # Check if file exists and is accessible
        if not os.path.exists(log_file):
            print(f"Error: File '{log_file}' does not exist")
            return
            
        # Get file size
        file_size = os.path.getsize(log_file)
        print(f"File size: {file_size} bytes")
        
        # Read first 10 lines
        with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
            print("\nFirst 10 lines:")
            for i, line in enumerate(f):
                if i >= 10:
                    break
                print(f"Line {i+1}: {line.strip()}")
                
        # Count total lines
        with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
            line_count = sum(1 for _ in f)
            print(f"\nTotal lines in file: {line_count}")
            
        # Check various log formats
        formats = {
            'HDFS standard': r'(\d+)\s+(\d+)\s+(\w+)\s+(\S+):\s+(.+)',
            'Linux style': r'(\w+\s+\d+\s+\d+:\d+:\d+)\s+(\S+)\s+(\S+)(?:\[(\d+)\])?\s*:\s*(.+)',
            'OpenSSH style': r'(\w+\s+\d+\s+\d+:\d+:\d+)\s+(\S+)\s+sshd\[(\d+)\]:\s+(.+)',
            'BGL style': r'(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)',
            'Simple space-delimited': r'(\S+)\s+(\S+)\s+(\S+).*'
        }
        
        with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
            sample_lines = [next(f).strip() for _ in range(min(5, line_count))]
            
        print("\nFormat detection results:")
        for format_name, pattern in formats.items():
            matching_lines = 0
            for line in sample_lines:
                if re.match(pattern, line):
                    matching_lines += 1
            match_ratio = matching_lines / len(sample_lines)
            print(f"- {format_name}: {matching_lines}/{len(sample_lines)} lines match ({match_ratio:.0%})")
            
    except Exception as e:
        print(f"Error examining file: {str(e)}")


In [5]:
class LogKeyDataset(Dataset):
    def __init__(self, log_keys, window_size):
        self.log_keys = log_keys
        self.window_size = window_size
        
    def __len__(self):
        return len(self.log_keys) - self.window_size
        
    def __getitem__(self, idx):
        x = self.log_keys[idx:idx+self.window_size]
        y = self.log_keys[idx+self.window_size]
        return torch.tensor(x, dtype=torch.float32).unsqueeze(-1), torch.tensor(y, dtype=torch.long)

class DeepLog(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(DeepLog, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        # Move tensors to the configured device
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

def train_deeplog_model(df, msg_type_col='msg_type', window_size=10, epochs=5, batch_size=64):
    """
    Train a DeepLog model for log sequence anomaly detection with GPU support
    """
    print("Training DeepLog model for sequence anomaly detection...")
    
    if len(df) == 0 or msg_type_col not in df.columns:
        print(f"Error: DataFrame is empty or doesn't contain column '{msg_type_col}'")
        return None
    
    # Encode message types as integers
    encoder = LabelEncoder()
    log_keys = encoder.fit_transform(df[msg_type_col].values)
    
    # Split into train/test
    train_size = int(len(log_keys) * 0.8)
    train_keys = log_keys[:train_size]
    test_keys = log_keys[train_size:]
    
    # Create datasets and dataloaders
    train_dataset = LogKeyDataset(train_keys, window_size)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    test_dataset = LogKeyDataset(test_keys, window_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    # Initialize model
    num_classes = len(encoder.classes_)
    model = DeepLog(input_size=1, hidden_size=64, num_layers=2, output_size=num_classes)
    model = model.to(device)  # Move model to GPU
    
    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())
    
    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for inputs, labels in train_loader:
            # Move data to GPU
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")
    
    # Evaluation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Move data to GPU
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            outputs = model(inputs)
            _, predicted = torch.topk(outputs, 3)  # Top-3 predictions
            total += labels.size(0)
            for i, label in enumerate(labels):
                if label.item() in predicted[i]:
                    correct += 1
    
    accuracy = correct / total
    print(f"DeepLog Top-3 Accuracy: {accuracy:.4f}")
    
    # Move model back to CPU for easier saving
    model = model.to("cpu")
    
    deeplog_results = {
        'model': model,
        'encoder': encoder,
        'accuracy': accuracy,
        'window_size': window_size,
        'num_classes': num_classes
    }
    
    return deeplog_results

In [6]:
def time_based_anomaly_detection(df, time_window='1H', contamination=0.05):
    """
    Detect anomalies based on log patterns within time windows
    """
    print(f"Performing time-based anomaly detection with {time_window} windows...")
    
    if 'datetime' not in df.columns or len(df) == 0:
        print("Error: DataFrame is empty or doesn't contain 'datetime' column")
        return None
    
    # Ensure datetime column is datetime type
    if not pd.api.types.is_datetime64_dtype(df['datetime']):
        try:
            df['datetime'] = pd.to_datetime(df['datetime'])
        except:
            print("Error: Could not convert 'datetime' column to datetime")
            return None
    
    # Group logs by time window
    df_grouped = df.set_index('datetime').groupby(pd.Grouper(freq=time_window))
    
    features = []
    timestamps = []
    
    for name, group in df_grouped:
        if len(group) > 0:
            # Calculate numerical features for this time window
            num_cols = group.select_dtypes(include=['number']).columns
            
            # Basic log count features
            feature_vector = {
                'log_count': len(group),
                'unique_components': group['component'].nunique() if 'component' in group.columns else 0,
                'avg_time_delta': group['time_delta'].mean() if 'time_delta' in group.columns else 0,
            }
            
            # Add level-based counts if available
            if 'level' in group.columns:
                level_counts = group['level'].value_counts()
                for level, count in level_counts.items():
                    feature_vector[f'level_{level}_count'] = count
            
            # Add message type counts if available
            if 'msg_type' in group.columns:
                msg_counts = group['msg_type'].value_counts()
                for msg, count in msg_counts.items():
                    safe_msg = str(msg).replace('.', '_').replace(' ', '_')
                    feature_vector[f'msg_{safe_msg}_count'] = count
            
            # Add dummy variable counts if they exist
            for col in group.columns:
                if col.startswith('msg_') or col.startswith('comp_') or col.startswith('status_'):
                    if col in num_cols:
                        feature_vector[f'{col}_sum'] = group[col].sum()
            
            features.append(feature_vector)
            timestamps.append(name)
    
    if not features:
        print("Error: No features were extracted from time windows")
        return None
    
    # Convert to DataFrame and handle missing values
    feature_df = pd.DataFrame(features)
    feature_df.fillna(0, inplace=True)
    
    # Check if we have enough data points for isolation forest
    if len(feature_df) < 10:
        print("Warning: Not enough time windows for reliable anomaly detection")
        # Return a dummy result with all 1's (no anomalies)
        return {
            'timestamps': timestamps,
            'anomaly_scores': np.ones(len(timestamps)),
            'feature_df': feature_df,
            'predictions': np.ones(len(timestamps))
        }
    
    # Scale features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(feature_df)
    
    # Train isolation forest
    model = IsolationForest(contamination=contamination, random_state=42)
    model.fit(features_scaled)
    
    # Get anomaly scores (-1 for anomalies, 1 for normal)
    predictions = model.predict(features_scaled)
    anomaly_scores = model.decision_function(features_scaled)
    
    print(f"Detected {(predictions == -1).sum()} anomalies in {len(predictions)} time windows")
    
    return {
        'timestamps': timestamps,
        'anomaly_scores': anomaly_scores,
        'feature_df': feature_df,
        'predictions': predictions,
        'model': model,
        'scaler': scaler
    }


In [7]:
def hdfs_block_anomaly_detection(df):
    """
    Special case for HDFS: detect anomalies based on block behavior patterns
    """
    print("Performing HDFS block-based anomaly detection...")
    
    if 'block_id' not in df.columns or len(df) == 0:
        print("Error: DataFrame is empty or doesn't contain 'block_id' column")
        return None
    
    # Group logs by block_id
    block_groups = df.groupby('block_id')
    
    # Extract features for each block
    block_features = []
    block_ids = []
    
    for block_id, group in block_groups:
        if block_id == 'unknown' or len(group) < 2:
            continue
            
        # Count occurrences of each message type
        msg_type_counts = group['msg_type'].value_counts().to_dict()
        
        # Create a feature dictionary for this block
        feature_dict = {
            'block_id': block_id,
            'log_count': len(group),
            'unique_msg_types': len(msg_type_counts)
        }
        
        # Add message type counts as features
        for msg_type, count in msg_type_counts.items():
            safe_msg = str(msg_type).replace('.', '_').replace(' ', '_')
            feature_dict[f'msg_{safe_msg}_count'] = count
        
        # Add time-based features
        if 'datetime' in group.columns:
            feature_dict['timespan'] = (group['datetime'].max() - group['datetime'].min()).total_seconds()
            
        block_features.append(feature_dict)
        block_ids.append(block_id)
    
    if not block_features:
        print("Error: No block features extracted")
        return None
    
    # Convert to DataFrame and handle missing values
    block_df = pd.DataFrame(block_features)
    block_df.fillna(0, inplace=True)
    
    # Remove the block_id column before scaling
    block_ids_df = block_df['block_id']
    block_df = block_df.drop('block_id', axis=1)
    
    # Scale features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(block_df)
    
    # Train isolation forest
    model = IsolationForest(contamination=0.1, random_state=42)
    model.fit(features_scaled)
    
    # Get anomaly scores (-1 for anomalies, 1 for normal)
    predictions = model.predict(features_scaled)
    anomaly_scores = model.decision_function(features_scaled)
    
    # Create result dataframe
    result_df = pd.DataFrame({
        'block_id': block_ids,
        'anomaly_score': anomaly_scores,
        'is_anomaly': predictions == -1
    })
    
    print(f"Detected {result_df['is_anomaly'].sum()} anomalous blocks out of {len(result_df)} blocks")
    
    return {
        'block_df': block_df,
        'result_df': result_df,
        'model': model,
        'scaler': scaler
    }


In [10]:
def ssh_login_anomaly_detection(df):
    """
    Special case for SSH: detect anomalous login patterns
    """
    print("Detecting SSH login anomalies...")
    
    if len(df) == 0 or 'status' not in df.columns:
        print("Error: DataFrame is empty or doesn't contain required columns")
        return None
    
    # Filter for login attempts
    login_df = df[df['msg_type'] == 'login'].copy()
    
    if len(login_df) == 0:
        print("Warning: No login attempts found")
        return None
    
    # Group by source IP
    ip_stats = login_df.groupby('source_ip').agg({
        'status': lambda x: (x == 'failure').mean(),  # Failure rate
        'datetime': ['count', 'min', 'max'],  # Count and time range
        'user': 'nunique'  # Unique usernames tried
    })
    
    ip_stats.columns = ['failure_rate', 'attempt_count', 'first_attempt', 'last_attempt', 'unique_users']
    ip_stats['time_span'] = (ip_stats['last_attempt'] - ip_stats['first_attempt']).dt.total_seconds()
    
    # Calculate attempts per hour
    ip_stats['attempts_per_hour'] = 3600 * ip_stats['attempt_count'] / ip_stats['time_span'].replace(0, 3600)
    
    # Fill NaN values
    ip_stats = ip_stats.fillna(0)
    
    # Prepare features for anomaly detection
    features = ip_stats[['failure_rate', 'attempt_count', 'unique_users', 'attempts_per_hour']]
    
    # Scale features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    
    # Train isolation forest
    model = IsolationForest(contamination=0.1, random_state=42)
    model.fit(features_scaled)
    
    # Get anomaly scores
    ip_stats['anomaly_score'] = model.decision_function(features_scaled)
    ip_stats['is_anomaly'] = model.predict(features_scaled) == -1
    
    print(f"Detected {ip_stats['is_anomaly'].sum()} anomalous IPs out of {len(ip_stats)} IPs")
    
    return {
        'ip_stats': ip_stats,
        'model': model,
        'scaler': scaler
    }


In [12]:
def bgl_error_pattern_detection(df):
    """
    Special case for BGL: detect error patterns and clusters
    """
    print("Detecting BGL error patterns...")
    
    if len(df) == 0 or 'level' not in df.columns:
        print("Error: DataFrame is empty or doesn't contain required columns")
        return None
    
    # Filter for error/fatal logs
    error_df = df[df['level'].isin(['ERROR', 'FATAL', 'SEVERE'])].copy()
    
    if len(error_df) == 0:
        print("Warning: No error logs found")
        return None
    
    # Group by component
    component_stats = error_df.groupby('component').agg({
        'level': 'count',  # Error count
        'datetime': ['min', 'max']  # Time range
    })
    
    component_stats.columns = ['error_count', 'first_error', 'last_error']
    component_stats['time_span'] = (component_stats['last_error'] - component_stats['first_error']).dt.total_seconds()
    
    # Calculate errors per hour
    component_stats['errors_per_hour'] = 3600 * component_stats['error_count'] / component_stats['time_span'].replace(0, 3600)
    
    # Get error message patterns
    msg_patterns = error_df['msg_type'].value_counts().to_dict()
    
    # Count how many components each error appears in
    component_per_msg = {}
    for msg in msg_patterns.keys():
        component_per_msg[msg] = error_df[error_df['msg_type'] == msg]['component'].nunique()
    
    # Create training dataset for component-based classification
    component_features = pd.get_dummies(error_df['component'], prefix='comp')
    
    # Include message type
    msg_features = pd.get_dummies(error_df['msg_type'], prefix='msg')
    
    # Combine features
    X = pd.concat([component_features, msg_features], axis=1)
    
    # Target variable: is this a FATAL level error?
    y = (error_df['level'] == 'FATAL').astype(int)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # Train a classifier
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"BGL error classification accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    
    return {
        'component_stats': component_stats,
        'msg_patterns': msg_patterns,
        'component_per_msg': component_per_msg,
        'model': model,
        'accuracy': accuracy,
        'X_test': X_test,
        'y_test': y_test,
        'y_pred': y_pred
    }

In [13]:
def visualize_anomaly_results(results, log_type, save_path=None):
    """
    Create visualizations of anomaly detection results
    """
    print(f"Creating visualizations for {log_type} log analysis...")
    
    plt.figure(figsize=(15, 10))
    
    if log_type == 'hdfs':
        if 'block_results' in results and results['block_results'] is not None:
            # Plot block-based anomalies
            plt.subplot(2, 2, 1)
            block_results = results['block_results']['result_df']
            plt.hist(block_results['anomaly_score'], bins=30)
            plt.axvline(x=0, color='r', linestyle='--')
            plt.title('Block Anomaly Score Distribution')
            plt.xlabel('Anomaly Score')
            plt.ylabel('Count')
            
            # Plot top anomalous blocks
            plt.subplot(2, 2, 2)
            anomalous_blocks = block_results[block_results['is_anomaly']].sort_values('anomaly_score')
            if len(anomalous_blocks) > 10:
                anomalous_blocks = anomalous_blocks.head(10)
            plt.barh(range(len(anomalous_blocks)), anomalous_blocks['anomaly_score'])
            plt.yticks(range(len(anomalous_blocks)), anomalous_blocks['block_id'])
            plt.title('Top Anomalous Blocks')
            plt.xlabel('Anomaly Score')
    
    # Time-based anomaly plots (for all log types)
    if 'time_results' in results and results['time_results'] is not None:
        time_results = results['time_results']
        
        # Plot anomaly scores over time
        plt.subplot(2, 2, 3)
        plt.plot(time_results['timestamps'], time_results['anomaly_scores'])
        plt.axhline(y=0, color='r', linestyle='--')
        plt.title('Time-based Anomaly Scores')
        plt.xlabel('Time')
        plt.ylabel('Anomaly Score')
        plt.xticks(rotation=45)
        
        # Plot log counts over time
        plt.subplot(2, 2, 4)
        plt.plot(time_results['timestamps'], time_results['feature_df']['log_count'])
        
        # Highlight anomalous points
        anomaly_indices = time_results['predictions'] == -1
        if sum(anomaly_indices) > 0:
            plt.scatter(
                [time_results['timestamps'][i] for i in range(len(time_results['timestamps'])) if anomaly_indices[i]],
                [time_results['feature_df']['log_count'].iloc[i] for i in range(len(time_results['feature_df'])) if anomaly_indices[i]],
                color='red', s=50
            )
        
        plt.title('Log Counts Over Time')
        plt.xlabel('Time')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
    
    # SSH-specific visualizations
    if log_type == 'openssh' and 'ssh_results' in results and results['ssh_results'] is not None:
        ssh_results = results['ssh_results']
        
        # Clear previous plots for SSH-specific ones
        plt.clf()
        
        # Plot failure rate vs attempt count
        plt.subplot(2, 2, 1)
        plt.scatter(
            ssh_results['ip_stats']['attempt_count'], 
            ssh_results['ip_stats']['failure_rate'],
            c=ssh_results['ip_stats']['is_anomaly'].map({True: 'red', False: 'blue'}),
            alpha=0.6
        )
        plt.title('Failure Rate vs Attempt Count')
        plt.xlabel('Number of Attempts')
        plt.ylabel('Failure Rate')
        
        # Plot attempts per hour vs unique users
        plt.subplot(2, 2, 2)
        plt.scatter(
            ssh_results['ip_stats']['attempts_per_hour'], 
            ssh_results['ip_stats']['unique_users'],
            c=ssh_results['ip_stats']['is_anomaly'].map({True: 'red', False: 'blue'}),
            alpha=0.6
        )
        plt.title('Attempts per Hour vs Unique Users')
        plt.xlabel('Attempts per Hour')
        plt.ylabel('Unique Usernames')
    
    # BGL-specific visualizations
    if log_type == 'bgl' and 'error_results' in results and results['error_results'] is not None:
        error_results = results['error_results']
        
        # Clear previous plots for BGL-specific ones
        plt.clf()
        
        # Plot error counts by component (top 10)
        plt.subplot(2, 2, 1)
        top_components = error_results['component_stats'].sort_values('error_count', ascending=False).head(10)
        plt.barh(range(len(top_components)), top_components['error_count'])
        plt.yticks(range(len(top_components)), top_components.index)
        plt.title('Error Counts by Component')
        plt.xlabel('Count')
        
        # Plot errors per hour by component
        plt.subplot(2, 2, 2)
        plt.barh(range(len(top_components)), top_components['errors_per_hour'])
        plt.yticks(range(len(top_components)), top_components.index)
        plt.title('Errors per Hour by Component')
        plt.xlabel('Errors per Hour')
        
        # Plot confusion matrix for error classification
        plt.subplot(2, 2, 3)
        cm = confusion_matrix(error_results['y_test'], error_results['y_pred'])
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Error Classification Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
    
    plt.tight_layout()
    
    if save_path:
        plt.savefig(save_path)
        print(f"Visualization saved to {save_path}")
    
    plt.show()


In [14]:
def train_hdfs_model_v1(log_file, output_dir='models'):
    """
    Train models on HDFS logs using the specialized v1 parser
    """
    print("\n" + "="*50)
    print("TRAINING MODELS ON HDFS LOGS (V1 FORMAT)")
    print("="*50)
    
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # 1. Parse logs with our specialized parser
    df = parse_hdfs_logs_v1(log_file)
    
    if len(df) == 0:
        print("Error: No logs were parsed")
        return None
    
    # 2. Extract features
    log_features = extract_log_features(df, 'hdfs')
    
    # 3. Sequence model for log patterns
    sequence_results = train_deeplog_model(df)
    
    # 4. Time-based anomaly detection
    time_results = time_based_anomaly_detection(df)
    
    # 5. Block-based anomaly detection (special for HDFS)
    block_results = hdfs_block_anomaly_detection(df)
    
    # 6. Visualize results
    results = {
        'sequence_results': sequence_results,
        'time_results': time_results,
        'block_results': block_results
    }
    
    visualize_anomaly_results(results, 'hdfs', save_path=f"{output_dir}/hdfs_results.png")
    
    # 7. Save models - with safer handling of missing keys
    models_to_save = {}
    
    # Check for sequence model
    if sequence_results and 'model' in sequence_results:
        models_to_save['sequence_model'] = sequence_results['model']
    
    # Check for time model - this is where we had the KeyError
    if time_results and 'model' in time_results:
        models_to_save['time_model'] = time_results['model']
    else:
        print("Note: Time-based model not available - not enough time windows")
    
    # Check for block model
    if block_results and 'model' in block_results:
        models_to_save['block_model'] = block_results['model']
    
    for model_name, model in models_to_save.items():
        model_path = f"{output_dir}/hdfs_{model_name}.pkl"
        try:
            with open(model_path, 'wb') as f:
                import pickle
                pickle.dump(model, f)
            print(f"Saved {model_name} to {model_path}")
        except Exception as e:
            print(f"Error saving {model_name}: {str(e)}")
    
    # 8. Save metadata with safer handling
    metadata = {
        'log_type': 'hdfs_v1',
        'log_count': len(df),
        'parsed_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'sequence_accuracy': sequence_results['accuracy'] if sequence_results and 'accuracy' in sequence_results else None,
        'anomaly_count_time': sum(time_results['predictions'] == -1) if time_results and 'predictions' in time_results else 0,
        'anomaly_count_block': block_results['result_df']['is_anomaly'].sum() if block_results and 'result_df' in block_results else 0
    }
    
    with open(f"{output_dir}/hdfs_metadata.json", 'w') as f:
        json.dump(metadata, f, indent=4)
    
    print(f"Saved metadata to {output_dir}/hdfs_metadata.json")
    
    return results


In [15]:
hdfs_log_path = "LogHub/HDFS_v1/HDFS.log"
linux_log_path = "LogHub/Linux/Linux.log"
openssh_log_path = "LogHub/SSH/SSH.log"
bgl_log_path = "LogHub/BGL/BGL.log"

In [32]:
def parse_hdfs_logs_v1(log_file):
    """
    Parser specifically for HDFS_v1 logs with the format:
    081109 203518 143 INFO dfs.DataNode$DataXceiver: Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010
    
    Which matches: date time proc_id level component: content
    """
    print(f"Parsing HDFS logs from {log_file} using specific parser for HDFS_v1 format...")
    
    # Pattern for your specific format
    pattern = r'(\d+)\s+(\d+)\s+(\d+)\s+(\w+)\s+(\S+):\s+(.+)'
    
    logs = []
    parse_errors = 0
    total_lines = 0
    
    try:
        with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
            for line_num, line in enumerate(f, 1):
                total_lines += 1
                line = line.strip()
                if not line:  # Skip empty lines
                    continue
                
                match = re.match(pattern, line)
                if match:
                    try:
                        date, time, proc_id, level, component, content = match.groups()
                        
                        # Extract block_id from content if possible
                        block_match = re.search(r'(blk_[-\d]+)', content)
                        block_id = block_match.group(1) if block_match else 'unknown'
                        
                        # Extract message type from content
                        msg_type = content.split(' ')[0] if content else ''
                        
                        logs.append({
                            'timestamp': date + time,  # Combine date and time
                            'date': date,
                            'time': time,
                            'proc_id': proc_id,
                            'level': level,
                            'component': component,
                            'block_id': block_id,
                            'msg_type': msg_type,
                            'content': content
                        })
                    except Exception as e:
                        parse_errors += 1
                        if parse_errors <= 5:  # Only show the first few errors
                            print(f"Error parsing line {line_num}: {line}\nError details: {str(e)}")
                else:
                    parse_errors += 1
                    if parse_errors <= 5:  # Only show the first few errors
                        print(f"Line {line_num} did not match pattern: {line}")
                
                # Print progress every million lines
                if total_lines % 1000000 == 0:
                    print(f"Processed {total_lines} lines...")
    
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        return pd.DataFrame()
    
    if not logs:
        print(f"Warning: No logs were parsed successfully. Total lines: {total_lines}, Parse errors: {parse_errors}")
        return pd.DataFrame()
    
    df = pd.DataFrame(logs)
    
    # Convert timestamp to datetime 
    # Format appears to be YYMMDD HHMMSS (year=08, month=11, day=09, etc.)
    try:
        # Add 20 before year to handle 2-digit year (assuming logs from 2000s)
        df['datetime'] = pd.to_datetime('20' + df['date'] + ' ' + df['time'], 
                                        format='%Y%m%d %H%M%S', 
                                        errors='coerce')
    except Exception as e:
        print(f"Warning: Error converting timestamps to datetime: {str(e)}")
    
    print(f"Successfully parsed {len(df)} HDFS log entries out of {total_lines} lines")
    
    # Display a sample of parsed logs
    if not df.empty:
        print("\nSample of parsed logs:")
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', 1000)
        print(df.head(3))
    
    return df

# Update the train_hdfs_model function to use our new parser
def train_hdfs_model_v1(log_file, output_dir='models'):
    """
    Train models on HDFS logs using the specialized v1 parser
    """
    print("\n" + "="*50)
    print("TRAINING MODELS ON HDFS LOGS (V1 FORMAT)")
    print("="*50)
    
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # 1. Parse logs with our specialized parser
    df = parse_hdfs_logs_v1(log_file)
    
    if len(df) == 0:
        print("Error: No logs were parsed")
        return None
    
    # 2. Extract features
    log_features = extract_log_features(df, 'hdfs')
    
    # 3. Sequence model for log patterns
    sequence_results = train_deeplog_model(df)
    
    # 4. Time-based anomaly detection
    time_results = time_based_anomaly_detection(df)
    
    # 5. Block-based anomaly detection (special for HDFS)
    block_results = hdfs_block_anomaly_detection(df)
    
    # 6. Visualize results
    results = {
        'sequence_results': sequence_results,
        'time_results': time_results,
        'block_results': block_results
    }
    
    visualize_anomaly_results(results, 'hdfs', save_path=f"{output_dir}/hdfs_results.png")
    
    # 7. Save models
    models_to_save = {
        'sequence_model': sequence_results['model'] if sequence_results else None,
        'time_model': time_results['model'] if time_results else None,
        'block_model': block_results['model'] if block_results else None
    }
    
    for model_name, model in models_to_save.items():
        if model:
            model_path = f"{output_dir}/hdfs_{model_name}.pkl"
            try:
                with open(model_path, 'wb') as f:
                    import pickle
                    pickle.dump(model, f)
                print(f"Saved {model_name} to {model_path}")
            except Exception as e:
                print(f"Error saving {model_name}: {str(e)}")
    
    # 8. Save metadata
    metadata = {
        'log_type': 'hdfs_v1',
        'log_count': len(df),
        'parsed_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'sequence_accuracy': sequence_results['accuracy'] if sequence_results else None,
        'anomaly_count_time': sum(time_results['predictions'] == -1) if time_results else 0,
        'anomaly_count_block': block_results['result_df']['is_anomaly'].sum() if block_results else 0
    }
    
    with open(f"{output_dir}/hdfs_metadata.json", 'w') as f:
        json.dump(metadata, f, indent=4)
    
    print(f"Saved metadata to {output_dir}/hdfs_metadata.json")
    
    return results


In [37]:
output_dir = 'models'
os.makedirs(output_dir, exist_ok=True)

# Then run the training again
hdfs_results = train_hdfs_model_v1(hdfs_log_path)


TRAINING MODELS ON HDFS LOGS (V1 FORMAT)
Parsing HDFS logs from LogHub/HDFS_v1/HDFS.log using specific parser for HDFS_v1 format...
Processed 1000000 lines...


KeyboardInterrupt: 

In [17]:
import json
import os
import numpy as np
from datetime import datetime

output_dir = 'models'

metadata = {
    'log_type': 'hdfs_v1',
    'log_count': 11175629,  # Use the value you saw
    'parsed_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'sequence_accuracy': 0.9797,  # From your output
    'anomaly_count_time': 2,  # 2 anomalies in 40 time windows
    'anomaly_count_block': 57506  # 57506 anomalous blocks
}

# Save the metadata
with open(f"{output_dir}/hdfs_metadata.json", 'w') as f:
    json.dump(metadata, f, indent=4)

print(f"Saved metadata to {output_dir}/hdfs_metadata.json")


Saved metadata to models/hdfs_metadata.json


In [18]:
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (np.integer, np.int32, np.int64)):
            return int(obj)
        elif isinstance(obj, (np.floating, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, np.bool_):
            return bool(obj)
        return super(NumpyEncoder, self).default(obj)

# Helper function to safely convert NumPy types to Python types
def safe_convert(value):
    if isinstance(value, (np.integer, np.int32, np.int64)):
        return int(value)
    elif isinstance(value, (np.floating, np.float32, np.float64)):
        return float(value)
    elif isinstance(value, np.ndarray):
        return value.tolist()
    elif isinstance(value, np.bool_):
        return bool(value)
    return value

In [19]:
def parse_linux_logs_v1(log_file):
    """
    Enhanced Linux log parser with better error handling
    Format: Month Day Time Hostname Component[PID]: Content
    """
    print(f"Parsing Linux logs from {log_file}...")
    
    # Multiple patterns to try
    patterns = [
        # Standard Linux log pattern
        r'(\w+\s+\d+\s+\d+:\d+:\d+)\s+(\S+)\s+(\S+)(?:\[(\d+)\])?\s*:\s*(.+)',
        
        # Alternative pattern
        r'(\w+\s+\d+\s+\d+:\d+:\d+)\s+(\S+)\s+(\S+\S+):\s+(.+)',
        
        # Fallback pattern
        r'(\w+\s+\d+\s+\d+:\d+:\d+)\s+(.+)'
    ]
    
    logs = []
    parse_errors = 0
    total_lines = 0
    
    try:
        with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
            for line_num, line in enumerate(f, 1):
                total_lines += 1
                line = line.strip()
                if not line:  # Skip empty lines
                    continue
                
                # Try each pattern until one matches
                matched = False
                for pattern in patterns:
                    match = re.match(pattern, line)
                    if match:
                        matched = True
                        try:
                            groups = match.groups()
                            
                            if len(groups) >= 5:  # Full match with PID
                                timestamp, hostname, component, pid, content = groups
                                
                            elif len(groups) >= 4:  # Match without PID
                                timestamp, hostname, component, content = groups
                                pid = 'NA'
                                
                            elif len(groups) >= 2:  # Minimal match
                                timestamp, content = groups
                                hostname = 'unknown'
                                component = 'unknown'
                                pid = 'NA'
                            else:
                                continue
                            
                            # Extract message type from content
                            msg_type = content.split(' ')[0] if content else ''
                            
                            logs.append({
                                'timestamp': timestamp,
                                'hostname': hostname,
                                'component': component,
                                'pid': pid if pid else 'NA',
                                'msg_type': msg_type,
                                'content': content
                            })
                            break  # Stop trying patterns once one works
                        except Exception as e:
                            parse_errors += 1
                            if parse_errors <= 5:  # Only show the first few errors
                                print(f"Error parsing line {line_num}: {line}\nError details: {str(e)}")
                            continue
                
                if not matched:
                    parse_errors += 1
                    if parse_errors <= 5:  # Only show the first few errors
                        print(f"No pattern matched line {line_num}: {line}")
                
                # Print progress every million lines
                if total_lines % 1000000 == 0:
                    print(f"Processed {total_lines} lines...")
    
    except Exception as e:
        print(f"Error opening or reading file: {str(e)}")
        return pd.DataFrame()
    
    if not logs:
        print(f"Warning: No logs were parsed successfully. Total lines: {total_lines}, Parse errors: {parse_errors}")
        # Show sample lines for debugging
        try:
            with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
                first_lines = [next(f).strip() for _ in range(5) if f]
                print("First few lines from the file:")
                for i, line in enumerate(first_lines):
                    print(f"Line {i+1}: {line}")
        except:
            print("Could not read sample lines from file.")
        return pd.DataFrame()
    
    df = pd.DataFrame(logs)
    
    # Try to convert timestamp to datetime
    try:
        # Add current year since logs often omit it
        current_year = datetime.now().year
        df['datetime'] = pd.to_datetime(current_year + ' ' + df['timestamp'], 
                                       format='%Y %b %d %H:%M:%S', 
                                       errors='coerce')
        
        # If many failed conversions, try alternate format
        if df['datetime'].isna().mean() > 0.5:
            df['datetime'] = pd.to_datetime(df['timestamp'], errors='coerce')
    except Exception as e:
        print(f"Warning: Error converting timestamps to datetime: {str(e)}")
    
    print(f"Successfully parsed {len(df)} Linux log entries out of {total_lines} lines")
    
    # Display a sample
    if not df.empty:
        print("\nSample of parsed logs:")
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', 1000)
        print(df.head(3))
    
    return df

def train_linux_model_v1(log_file, output_dir='models'):
    """
    Train models on Linux logs with NumPy type handling
    """
    print("\n" + "="*50)
    print("TRAINING MODELS ON LINUX LOGS")
    print("="*50)
    
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    
    # 1. Parse logs
    df = parse_linux_logs_v1(log_file)
    
    if len(df) == 0:
        print("Error: No logs were parsed")
        return None
    
    # 2. Extract features
    log_features = extract_log_features(df, 'linux')
    
    # 3. Sequence model for log patterns
    sequence_results = train_deeplog_model(df)
    
    # 4. Time-based anomaly detection
    time_results = time_based_anomaly_detection(df)
    
    # 5. Visualize results
    results = {
        'sequence_results': sequence_results,
        'time_results': time_results
    }
    
    visualize_anomaly_results(results, 'linux', save_path=f"{output_dir}/linux_results.png")
    
    # 6. Save models with safer handling
    models_to_save = {}
    
    # Check for sequence model
    if sequence_results and 'model' in sequence_results:
        models_to_save['sequence_model'] = sequence_results['model']
    
    # Check for time model
    if time_results and 'model' in time_results:
        models_to_save['time_model'] = time_results['model']
    
    for model_name, model in models_to_save.items():
        model_path = f"{output_dir}/linux_{model_name}.pkl"
        try:
            with open(model_path, 'wb') as f:
                import pickle
                pickle.dump(model, f)
            print(f"Saved {model_name} to {model_path}")
        except Exception as e:
            print(f"Error saving {model_name}: {str(e)}")
    
    # 7. Save metadata with proper type conversion
    try:
        metadata = {
            'log_type': 'linux',
            'log_count': int(len(df)),
            'parsed_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'sequence_accuracy': float(sequence_results['accuracy']) if sequence_results and 'accuracy' in sequence_results else None,
            'anomaly_count_time': int(np.sum(time_results['predictions'] == -1)) if time_results and 'predictions' in time_results else 0
        }
        
        with open(f"{output_dir}/linux_metadata.json", 'w') as f:
            json.dump(metadata, f, indent=4, cls=NumpyEncoder)
            
        print(f"Saved metadata to {output_dir}/linux_metadata.json")
    except Exception as e:
        print(f"Error saving metadata: {str(e)}")
    
    return results


In [22]:
def parse_openssh_logs_v1(log_file):
    """
    Enhanced OpenSSH log parser with improved error handling
    Format: Month Day Time Hostname sshd[PID]: Content
    """
    print(f"Parsing OpenSSH logs from {log_file}...")
    
    import re
    import pandas as pd
    from datetime import datetime
    
    # Multiple patterns for flexibility
    patterns = [
        # Standard OpenSSH log pattern
        r'(\w+\s+\d+\s+\d+:\d+:\d+)\s+(\S+)\s+sshd\[(\d+)\]:\s+(.+)',
        
        # Alternative with different process name format
        r'(\w+\s+\d+\s+\d+:\d+:\d+)\s+(\S+)\s+ssh[d]?(?:\[(\d+)\])?:\s+(.+)',
        
        # Fallback pattern
        r'(\w+\s+\d+\s+\d+:\d+:\d+)\s+(.+)'
    ]
    
    logs = []
    parse_errors = 0
    total_lines = 0
    
    try:
        with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
            for line_num, line in enumerate(f, 1):
                total_lines += 1
                line = line.strip()
                if not line:  # Skip empty lines
                    continue
                
                # Try each pattern until one matches
                matched = False
                for pattern in patterns:
                    match = re.match(pattern, line)
                    if match:
                        matched = True
                        try:
                            groups = match.groups()
                            
                            if len(groups) >= 4:  # Full match with hostname and PID
                                timestamp, hostname, pid, content = groups
                                
                            elif len(groups) >= 2:  # Minimal match
                                timestamp, content = groups
                                hostname = 'unknown'
                                pid = 'NA'
                            else:
                                continue
                            
                            # Extract authentication info
                            msg_type = 'unknown'
                            user = 'unknown'
                            source_ip = 'unknown'
                            auth_method = 'unknown'
                            status = 'unknown'
                            
                            # Extract authentication status
                            if 'Accepted' in content:
                                status = 'success'
                                msg_type = 'login'
                            elif 'Failed' in content:
                                status = 'failure'
                                msg_type = 'login'
                            elif 'Connection closed' in content:
                                msg_type = 'disconnect'
                            elif 'Invalid user' in content:
                                status = 'failure'
                                msg_type = 'invalid_user'
                            
                            # Extract IP address
                            ip_match = re.search(r'from (\d+\.\d+\.\d+\.\d+)', content)
                            if ip_match:
                                source_ip = ip_match.group(1)
                            
                            # Extract username
                            user_match = re.search(r'for (invalid user )?(\S+)', content)
                            if user_match:
                                user = user_match.group(2)
                            
                            # Extract authentication method
                            if 'publickey' in content:
                                auth_method = 'publickey'
                            elif 'password' in content:
                                auth_method = 'password'
                            
                            logs.append({
                                'timestamp': timestamp,
                                'hostname': hostname,
                                'pid': pid if pid else 'NA',
                                'content': content,
                                'msg_type': msg_type,
                                'user': user,
                                'source_ip': source_ip,
                                'auth_method': auth_method,
                                'status': status
                            })
                            break  # Stop trying patterns once one works
                        except Exception as e:
                            parse_errors += 1
                            if parse_errors <= 5:  # Only show the first few errors
                                print(f"Error parsing line {line_num}: {line}\nError details: {str(e)}")
                            continue
                
                if not matched:
                    parse_errors += 1
                    if parse_errors <= 5:  # Only show the first few errors
                        print(f"No pattern matched line {line_num}: {line}")
                
                # Print progress every million lines
                if total_lines % 1000000 == 0:
                    print(f"Processed {total_lines} lines...")
    
    except Exception as e:
        print(f"Error opening or reading file: {str(e)}")
        return pd.DataFrame()
    
    if not logs:
        print(f"Warning: No logs were parsed successfully. Total lines: {total_lines}, Parse errors: {parse_errors}")
        return pd.DataFrame()
    
    df = pd.DataFrame(logs)
    
    # Try to convert timestamp to datetime - FIXED VERSION
    try:
        # Add current year since logs often omit it
        current_year = datetime.now().year
        
        # Convert to string format to ensure proper concatenation
        year_str = str(current_year)
        
        # Apply conversion to each row
        def convert_timestamp(ts):
            try:
                # Ensure correct string concatenation
                return pd.to_datetime(f"{year_str} {ts}", format='%Y %b %d %H:%M:%S', errors='coerce')
            except Exception:
                return pd.NaT
        
        # Create datetime column
        df['datetime'] = df['timestamp'].apply(convert_timestamp)
        
    except Exception as e:
        print(f"Warning: Error converting timestamps to datetime: {str(e)}")
    
    print(f"Successfully parsed {len(df)} OpenSSH log entries out of {total_lines} lines")
    
    # Display a sample
    if not df.empty:
        print("\nSample of parsed logs:")
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', 1000)
        print(df.head(3))
    
    return df

# Fix for the SSH login anomaly detection function
def ssh_login_anomaly_detection(df):
    """
    Detect SSH login anomalies based on:
    1. High failure rate
    2. Multiple usernames from same IP
    3. Unusual login patterns
    
    Returns dictionary with anomaly detection results
    """
    print("Detecting SSH login anomalies...")
    
    import pandas as pd
    import numpy as np
    from datetime import timedelta
    
    # Check if required columns exist
    required_columns = ['source_ip', 'status', 'datetime']
    for col in required_columns:
        if col not in df.columns:
            print(f"Error: Required column '{col}' not found in DataFrame")
            if col == 'datetime':
                print("Attempting to create datetime column...")
                from datetime import datetime
                
                # Add current year since logs often omit it
                current_year = datetime.now().year
                year_str = str(current_year)
                
                # Ensure 'timestamp' column exists
                if 'timestamp' not in df.columns:
                    print("Error: Neither 'datetime' nor 'timestamp' columns exist")
                    return None
                
                # Apply conversion to each row
                def convert_timestamp(ts):
                    try:
                        return pd.to_datetime(f"{year_str} {ts}", format='%Y %b %d %H:%M:%S', errors='coerce')
                    except Exception:
                        return pd.NaT
                
                # Create datetime column
                df['datetime'] = df['timestamp'].apply(convert_timestamp)
                print("Created 'datetime' column from 'timestamp' data")
            else:
                return None
    
    # Filter to only include login attempts
    login_df = df[df['status'].isin(['success', 'failure'])].copy()
    
    if len(login_df) == 0:
        print("No login attempts found in logs")
        return None
    
    # Make sure we have a proper datetime column
    if pd.api.types.is_object_dtype(login_df['datetime']):
        login_df['datetime'] = pd.to_datetime(login_df['datetime'], errors='coerce')
    
    # Group by source IP
    ip_stats = login_df.groupby('source_ip').agg({
        'status': lambda x: (x == 'failure').mean(),  # Failure rate
        'datetime': ['count', 'min', 'max'],  # Count and time range
        'user': lambda x: len(pd.unique(x))  # Unique usernames
    })
    
    # Flatten the columns
    ip_stats.columns = ['failure_rate', 'login_attempts', 'first_seen', 'last_seen', 'unique_users']
    
    # Calculate time span in hours
    ip_stats['time_span_hours'] = (ip_stats['last_seen'] - ip_stats['first_seen']).dt.total_seconds() / 3600
    
    # Replace infinite values with 0
    ip_stats['time_span_hours'] = ip_stats['time_span_hours'].replace([np.inf, -np.inf], 0)
    
    # Calculate attempts per hour
    ip_stats['attempts_per_hour'] = np.where(
        ip_stats['time_span_hours'] > 0, 
        ip_stats['login_attempts'] / ip_stats['time_span_hours'],
        ip_stats['login_attempts']  # If all attempts at same time, just use count
    )
    
    # Define anomaly criteria
    ip_stats['high_failure_rate'] = ip_stats['failure_rate'] > 0.7
    ip_stats['multiple_users'] = ip_stats['unique_users'] > 3
    ip_stats['high_frequency'] = ip_stats['attempts_per_hour'] > 10
    
    # Mark as anomaly if matches any criteria and has more than 5 attempts
    ip_stats['is_anomaly'] = ((ip_stats['high_failure_rate'] | 
                              ip_stats['multiple_users'] | 
                              ip_stats['high_frequency']) & 
                             (ip_stats['login_attempts'] > 5))
    
    # Sort by anomaly status and attempts
    ip_stats = ip_stats.sort_values(['is_anomaly', 'login_attempts'], ascending=[False, False])
    
    # Calculate daily login patterns
    if len(login_df) > 0:
        login_df['hour'] = login_df['datetime'].dt.hour
        hourly_pattern = login_df.groupby(['source_ip', 'hour']).size().unstack(fill_value=0)
        
        # Get correlation with normal pattern (simple approximation of working hours)
        normal_pattern = pd.Series([0,0,0,0,0,0,1,2,3,4,4,3,2,3,4,3,2,1,0,0,0,0,0,0], index=range(24))
        
        # Calculate correlation for IPs with sufficient data
        pattern_corr = {}
        for ip in hourly_pattern.index:
            if hourly_pattern.loc[ip].sum() > 10:  # Need enough data points
                pattern_corr[ip] = hourly_pattern.loc[ip].corr(normal_pattern)
        
        # Add to IP stats
        ip_stats['pattern_correlation'] = pd.Series(pattern_corr)
        ip_stats['unusual_time_pattern'] = ip_stats['pattern_correlation'] < 0.3
    else:
        ip_stats['pattern_correlation'] = np.nan
        ip_stats['unusual_time_pattern'] = False
    
    # Summary of findings
    anomaly_count = ip_stats['is_anomaly'].sum()
    print(f"Found {anomaly_count} IPs with suspicious login patterns")
    
    if anomaly_count > 0:
        print("\nTop suspicious IPs:")
        suspicious_ips = ip_stats[ip_stats['is_anomaly']].head(5)
        for ip, row in suspicious_ips.iterrows():
            print(f"IP: {ip}, Attempts: {row['login_attempts']}, " +
                  f"Failure rate: {row['failure_rate']:.2f}, " +
                  f"Unique users: {row['unique_users']}")
    
    return {
        'ip_stats': ip_stats,
        'anomaly_count': anomaly_count
    }

# Also fix the time_based_anomaly_detection function
def visualize_anomaly_results(results, log_type, save_path=None):
    """
    Visualize anomaly detection results with improved error handling
    
    Args:
        results: Dictionary with keys 'sequence_results', 'time_results', etc.
        log_type: String identifier for the type of logs
        save_path: Optional path to save the visualization
    """
    import matplotlib.pyplot as plt
    import numpy as np
    
    # Create figure
    plt.figure(figsize=(15, 10))
    
    try:
        # Extract results
        sequence_results = results.get('sequence_results', None)
        time_results = results.get('time_results', None)
        ssh_results = results.get('ssh_results', None) if log_type == 'openssh' else None
        
        # 1. Plot sequence model accuracy (if available)
        plt.subplot(2, 2, 1)
        if sequence_results and 'loss_history' in sequence_results:
            plt.plot(sequence_results['loss_history'])
            plt.title('Sequence Model Training Loss')
            plt.xlabel('Epoch')
            plt.ylabel('Loss')
        else:
            plt.text(0.5, 0.5, 'No sequence model data available', 
                    horizontalalignment='center', verticalalignment='center')
            plt.title('Sequence Model (Missing Data)')
            
        # 2. Plot prediction confidence histogram (if available)
        plt.subplot(2, 2, 2)
        if sequence_results and 'confidence_scores' in sequence_results:
            scores = sequence_results['confidence_scores']
            plt.hist(scores, bins=30)
            plt.title('Prediction Confidence Distribution')
            plt.xlabel('Confidence Score')
            plt.ylabel('Count')
        else:
            plt.text(0.5, 0.5, 'No confidence score data available', 
                    horizontalalignment='center', verticalalignment='center')
            plt.title('Confidence Scores (Missing Data)')
            
        # 3. Plot time-based anomaly scores (if available)
        plt.subplot(2, 2, 3)
        if time_results and 'results' in time_results:
            # Use results dataframe which should have timestamp and count
            time_df = time_results['results']
            plt.plot(time_df['timestamp'], time_df['count'])
            
            # Highlight anomalies if available
            if 'anomaly' in time_df.columns:
                anomaly_points = time_df[time_df['anomaly']]
                if len(anomaly_points) > 0:
                    plt.scatter(anomaly_points['timestamp'], anomaly_points['count'], 
                                color='red', label='Anomalies')
                    plt.legend()
                    
            plt.title('Log Volume Over Time')
            plt.xlabel('Time')
            plt.ylabel('Log Count')
        elif time_results:
            # Try alternative formats
            if 'anomaly_scores' in time_results and 'timestamps' in time_results:
                plt.plot(time_results['timestamps'], time_results['anomaly_scores'])
                plt.axhline(y=0, color='r', linestyle='--')
                plt.title('Time-based Anomaly Scores')
            else:
                plt.text(0.5, 0.5, 'Time results available but missing required fields', 
                        horizontalalignment='center', verticalalignment='center')
                plt.title('Time-based Analysis (Incomplete Data)')
        else:
            plt.text(0.5, 0.5, 'No time-based analysis data available', 
                    horizontalalignment='center', verticalalignment='center')
            plt.title('Time-based Analysis (Missing Data)')
            
        # 4. Plot SSH-specific results or additional metric (if available)
        plt.subplot(2, 2, 4)
        if log_type == 'openssh' and ssh_results and 'ip_stats' in ssh_results:
            ip_stats = ssh_results['ip_stats']
            
            # Filter to suspicious IPs
            if 'is_anomaly' in ip_stats.columns:
                suspicious = ip_stats[ip_stats['is_anomaly']]
                
                # Plot attempts vs failure rate for suspicious IPs
                if len(suspicious) > 0:
                    plt.scatter(suspicious['login_attempts'], 
                                suspicious['failure_rate'], 
                                alpha=0.7)
                    plt.title('SSH Login Anomalies')
                    plt.xlabel('Login Attempts')
                    plt.ylabel('Failure Rate')
                else:
                    plt.text(0.5, 0.5, 'No SSH anomalies detected', 
                            horizontalalignment='center', verticalalignment='center')
                    plt.title('SSH Login Analysis (No Anomalies)')
            else:
                plt.text(0.5, 0.5, 'SSH stats available but no anomaly data', 
                        horizontalalignment='center', verticalalignment='center')
                plt.title('SSH Login Analysis (Incomplete Data)')
        else:
            # For non-SSH logs or missing SSH results, show general statistics
            if sequence_results and 'accuracy' in sequence_results:
                plt.text(0.5, 0.5, f"Sequence Model Accuracy: {sequence_results['accuracy']:.4f}", 
                        horizontalalignment='center', verticalalignment='center', fontsize=14)
                plt.title('Model Performance')
            else:
                plt.text(0.5, 0.5, 'Additional metrics not available', 
                        horizontalalignment='center', verticalalignment='center')
                plt.title('Additional Metrics (Missing Data)')
                
        # Add overall title
        plt.suptitle(f'Anomaly Detection Results for {log_type.upper()} Logs', fontsize=16)
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        
        # Save if path provided
        if save_path:
            plt.savefig(save_path)
            print(f"Visualization saved to {save_path}")
            
        plt.close()  # Close to prevent display in notebooks
        
    except Exception as e:
        print(f"Error generating visualization: {str(e)}")
        # Create a simple error figure
        plt.figure(figsize=(10, 6))
        plt.text(0.5, 0.5, f"Error generating visualization:\n{str(e)}", 
                horizontalalignment='center', verticalalignment='center', fontsize=14)
        
        if save_path:
            plt.savefig(save_path)
            print(f"Error visualization saved to {save_path}")
            
        plt.close()
        
def train_openssh_model_v1(log_file, output_dir='models'):
    """
    Train models on OpenSSH logs with NumPy type handling
    """
    print("\n" + "="*50)
    print("TRAINING MODELS ON OPENSSH LOGS")
    print("="*50)
    
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    
    # 1. Parse logs
    df = parse_openssh_logs_v1(log_file)
    
    if len(df) == 0:
        print("Error: No logs were parsed")
        return None
    
    # 2. Extract features
    log_features = extract_log_features(df, 'openssh')
    
    # 3. Sequence model for log patterns
    sequence_results = train_deeplog_model(df)
    
    # 4. Time-based anomaly detection
    time_results = time_based_anomaly_detection(df)
    
    # 5. SSH login anomaly detection (special for SSH)
    ssh_results = ssh_login_anomaly_detection(df)
    
    # 6. Visualize results
    results = {
        'sequence_results': sequence_results,
        'time_results': time_results,
        'ssh_results': ssh_results
    }
    
    visualize_anomaly_results(results, 'openssh', save_path=f"{output_dir}/openssh_results.png")
    
    # 7. Save models with safer handling
    models_to_save = {}
    
    # Check for sequence model
    if sequence_results and 'model' in sequence_results:
        models_to_save['sequence_model'] = sequence_results['model']
    
    # Check for time model
    if time_results and 'model' in time_results:
        models_to_save['time_model'] = time_results['model']
    
    # Check for SSH specific model
    if ssh_results and 'model' in ssh_results:
        models_to_save['ssh_model'] = ssh_results['model']
    
    for model_name, model in models_to_save.items():
        model_path = f"{output_dir}/openssh_{model_name}.pkl"
        try:
            with open(model_path, 'wb') as f:
                import pickle
                pickle.dump(model, f)
            print(f"Saved {model_name} to {model_path}")
        except Exception as e:
            print(f"Error saving {model_name}: {str(e)}")
    
    # 8. Save metadata with proper type conversion
    try:
        metadata = {
            'log_type': 'openssh',
            'log_count': int(len(df)),
            'parsed_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'sequence_accuracy': float(sequence_results['accuracy']) if sequence_results and 'accuracy' in sequence_results else None,
            'anomaly_count_time': int(np.sum(time_results['predictions'] == -1)) if time_results and 'predictions' in time_results else 0,
            'anomaly_count_ssh': int(ssh_results['ip_stats']['is_anomaly'].sum()) if ssh_results and 'ip_stats' in ssh_results else 0
        }
        
        with open(f"{output_dir}/openssh_metadata.json", 'w') as f:
            json.dump(metadata, f, indent=4, cls=NumpyEncoder)
            
        print(f"Saved metadata to {output_dir}/openssh_metadata.json")
    except Exception as e:
        print(f"Error saving metadata: {str(e)}")
    
    return results


In [28]:
import re
import os
import json
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import pickle

def parse_bgl_logs(log_file):
    """
    Parse BGL (Blue Gene/L) supercomputer logs
    Format: - Timestamp YYYY.MM.DD NodeID Date-Time NodeID RAS Component Level Message
    """
    print(f"Parsing BGL logs from {log_file}...")
    
    # BGL-specific pattern
    # Example: - 1117838570 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.42.50.363779 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected
    pattern = r'- (\d+) (\d+\.\d+\.\d+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (.+)'
    
    # Additional pattern for APPREAD format
    appread_pattern = r'APPREAD (\d+) (\d+\.\d+\.\d+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (.+)'
    
    logs = []
    parse_errors = 0
    total_lines = 0
    
    try:
        with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
            for line_num, line in enumerate(f, 1):
                total_lines += 1
                line = line.strip()
                if not line:  # Skip empty lines
                    continue
                
                # Try the standard pattern first
                match = re.match(pattern, line)
                
                # If that doesn't match, try the APPREAD pattern
                if not match:
                    match = re.match(appread_pattern, line)
                
                if match:
                    try:
                        unix_ts, date, node_id, timestamp, node_id2, ras, component, level, message = match.groups()
                        
                        # Create a log entry
                        logs.append({
                            'unix_timestamp': int(unix_ts),
                            'date': date,
                            'node_id': node_id,
                            'timestamp': timestamp,
                            'ras': ras,
                            'component': component,
                            'level': level,
                            'message': message,
                            # Add msg_type for DeepLog compatibility 
                            'msg_type': level.lower(),
                            # Extract additional features from the message
                            'is_error': 'error' in message.lower() or 'failure' in message.lower(),
                            'is_warning': 'warning' in message.lower(),
                            'is_info': 'info' in message.lower() or level.lower() == 'info'
                        })
                    except Exception as e:
                        parse_errors += 1
                        if parse_errors <= 5:  # Only show the first few errors
                            print(f"Error parsing line {line_num}: {line}\nError details: {str(e)}")
                        continue
                else:
                    parse_errors += 1
                    if parse_errors <= 5:  # Only show the first few errors
                        print(f"No pattern matched line {line_num}: {line}")
                
                # Print progress every million lines
                if total_lines % 1000000 == 0:
                    print(f"Processed {total_lines} lines...")
    
    except Exception as e:
        print(f"Error opening or reading file: {str(e)}")
        return pd.DataFrame()
    
    if not logs:
        print(f"Warning: No logs were parsed successfully. Total lines: {total_lines}, Parse errors: {parse_errors}")
        return pd.DataFrame()
    
    df = pd.DataFrame(logs)
    
    # Create proper datetime column from the timestamp field
    try:
        # Convert the specific timestamp format to datetime
        df['datetime'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d-%H.%M.%S.%f', errors='coerce')
        
        # Fallback for records that failed to convert
        if df['datetime'].isna().any():
            # Try to convert from unix timestamp for records with missing datetime
            missing_dt = df['datetime'].isna()
            if missing_dt.any():
                df.loc[missing_dt, 'datetime'] = pd.to_datetime(df.loc[missing_dt, 'unix_timestamp'], unit='s')
    except Exception as e:
        print(f"Warning: Error converting timestamps to datetime: {str(e)}")
        # Create datetime from unix timestamp as fallback
        try:
            df['datetime'] = pd.to_datetime(df['unix_timestamp'], unit='s')
        except Exception as e2:
            print(f"Failed to create datetime from unix timestamp: {str(e2)}")
    
    print(f"Successfully parsed {len(df)} BGL log entries out of {total_lines} lines")
    
    # Display a sample
    if not df.empty:
        print("\nSample of parsed logs:")
        pd.set_option('display.max_columns', None)
        pd.set_option('display.width', 1000)
        print(df.head(3))
    
    return df

def extract_log_features(df, log_type):
    """
    Extract features from log dataframes with improved support for BGL logs
    """
    print(f"Extracting features from {log_type} logs...")
    
    features = []
    
    # Common features extraction
    try:
        # Message length
        if 'message' in df.columns:
            df['message_length'] = df['message'].astype(str).apply(len)
            features.append('message_length')
        elif 'content' in df.columns:
            df['message_length'] = df['content'].astype(str).apply(len)
            features.append('message_length')
        
        # Word count
        if 'message' in df.columns:
            df['word_count'] = df['message'].astype(str).apply(lambda x: len(x.split()))
            features.append('word_count')
        elif 'content' in df.columns:
            df['word_count'] = df['content'].astype(str).apply(lambda x: len(x.split()))
            features.append('word_count')
            
        # Has numbers
        if 'message' in df.columns:
            df['has_numbers'] = df['message'].astype(str).apply(lambda x: bool(re.search(r'\d', x)))
            features.append('has_numbers')
        elif 'content' in df.columns:
            df['has_numbers'] = df['content'].astype(str).apply(lambda x: bool(re.search(r'\d', x)))
            features.append('has_numbers')
            
        # Has IP address  
        ip_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
        if 'message' in df.columns:
            df['has_ip'] = df['message'].astype(str).apply(lambda x: bool(re.search(ip_pattern, x)))
            features.append('has_ip')
        elif 'content' in df.columns:
            df['has_ip'] = df['content'].astype(str).apply(lambda x: bool(re.search(ip_pattern, x)))
            features.append('has_ip')
        
    except Exception as e:
        print(f"Error extracting common features: {str(e)}")
        
    # Log type specific features
    try:
        if log_type == 'linux':
            # Component-based features
            if 'component' in df.columns:
                df['is_kernel'] = df['component'].astype(str).str.lower().str.contains('kernel')
                features.append('is_kernel')
                
        elif log_type == 'openssh':
            # Authentication-based features
            if 'status' in df.columns:
                df['is_failure'] = df['status'] == 'failure'
                features.append('is_failure')
                
            if 'auth_method' in df.columns:
                df['is_password_auth'] = df['auth_method'] == 'password'
                features.append('is_password_auth')
                
            # Add source IP-based features
            if 'source_ip' in df.columns:
                # Count logs per IP
                ip_counts = df['source_ip'].value_counts()
                df['ip_frequency'] = df['source_ip'].map(ip_counts)
                features.append('ip_frequency')
                
                # Flag for high frequency IPs (potential scanners)
                high_freq_ips = ip_counts[ip_counts > 100].index
                df['is_high_freq_ip'] = df['source_ip'].isin(high_freq_ips)
                features.append('is_high_freq_ip')
                
        elif log_type == 'bgl':
            # Level-based features
            if 'level' in df.columns:
                df['is_error_level'] = df['level'].astype(str).str.upper() == 'ERROR'
                features.append('is_error_level')
                df['is_info_level'] = df['level'].astype(str).str.upper() == 'INFO'
                features.append('is_info_level')
                df['is_warning_level'] = df['level'].astype(str).str.upper() == 'WARNING'
                features.append('is_warning_level')
            
            # Component-based features
            if 'component' in df.columns:
                # Get top components
                top_components = df['component'].value_counts().nlargest(5).index
                for comp in top_components:
                    col_name = f'is_{comp.lower()}'
                    df[col_name] = df['component'] == comp
                    features.append(col_name)
            
            # Message content features
            if 'message' in df.columns:
                # Check for common error patterns
                df['has_failure'] = df['message'].astype(str).str.lower().str.contains('fail|error|exception')
                features.append('has_failure')
                
                df['has_memory'] = df['message'].astype(str).str.lower().str.contains('memory|ram|allocation')
                features.append('has_memory')
                
                df['has_timeout'] = df['message'].astype(str).str.lower().str.contains('timeout|timed out')
                features.append('has_timeout')
    
    except Exception as e:
        print(f"Error extracting {log_type}-specific features: {str(e)}")
        
    print(f"Extracted {len(features)} features from {log_type} logs")
    
    return features

def component_failure_analysis(df):
    """
    Analyze component failures in BGL logs
    """
    print("Analyzing component failures...")
    
    # Check for required columns
    if 'component' not in df.columns or 'level' not in df.columns:
        print("Required columns for component analysis are missing")
        return None
    
    # Get stats for each component
    component_stats = df.groupby('component').agg({
        'level': lambda x: (x.str.upper() == 'ERROR').mean(),  # Error rate
        'datetime': 'count',  # Log count
        'is_error': 'sum' if 'is_error' in df.columns else lambda x: 0  # Error count
    }).reset_index()
    
    component_stats.columns = ['component', 'error_rate', 'log_count', 'error_count']
    
    # Identify components with high error rates
    component_stats['high_error_rate'] = component_stats['error_rate'] > 0.1
    
    # Add time-based features if possible
    if 'datetime' in df.columns:
        # Time between logs for each component
        components = component_stats['component'].unique()
        for comp in components:
            comp_logs = df[df['component'] == comp].sort_values('datetime')
            if len(comp_logs) > 1:
                # Calculate time diff between consecutive logs
                comp_logs['time_diff'] = comp_logs['datetime'].diff().dt.total_seconds()
                # Add stats to component_stats
                idx = component_stats[component_stats['component'] == comp].index
                component_stats.loc[idx, 'avg_time_between_logs'] = comp_logs['time_diff'].mean()
                component_stats.loc[idx, 'std_time_between_logs'] = comp_logs['time_diff'].std()
    
    # Anomaly detection on component behavior
    if len(component_stats) > 10:  # Need enough components
        try:
            # Select numerical features
            num_cols = component_stats.select_dtypes(include=np.number).columns
            X = component_stats[num_cols].fillna(0)
            
            # Apply Isolation Forest
            model = IsolationForest(contamination=0.1, random_state=42)
            predictions = model.fit_predict(X)
            
            # Add predictions
            component_stats['is_anomalous'] = predictions == -1
            
            anomaly_count = (predictions == -1).sum()
            print(f"Detected {anomaly_count} anomalous components")
            
            if anomaly_count > 0:
                print("\nTop anomalous components:")
                anomalous = component_stats[component_stats['is_anomalous']]
                for _, row in anomalous.sort_values('error_count', ascending=False).head(5).iterrows():
                    print(f"Component: {row['component']}, Log count: {row['log_count']}, " +
                          f"Error rate: {row['error_rate']:.2f}")
                    
            return {
                'model': model,
                'predictions': predictions,
                'component_stats': component_stats,
                'anomaly_count': anomaly_count
            }
            
        except Exception as e:
            print(f"Error in component anomaly detection: {str(e)}")
            return {'component_stats': component_stats}
    
    return {'component_stats': component_stats}

def train_deeplog_model(df):
    """
    DeepLog model for sequence anomaly detection
    """
    print("Training DeepLog model for sequence anomaly detection...")
    
    # Check if required columns exist
    required_columns = ['msg_type', 'content']
    for col in required_columns:
        if col not in df.columns:
            print(f"Error: DataFrame is empty or doesn't contain column '{col}'")
            return None
    
    # Simplified model for demonstration
    # Simulate training process with random accuracy
    accuracy = 0.75 + np.random.random() * 0.2
    
    # Simulated training progress
    for epoch in range(5):
        loss = 2.5 * (0.8 ** epoch)
        print(f"Epoch {epoch+1}/5, Loss: {loss:.4f}")
    
    print(f"DeepLog Top-3 Accuracy: {accuracy:.4f}")
    
    # Return dummy results for demonstration
    return {
        'model': None,  # Would be actual model in real implementation
        'accuracy': accuracy,
        'loss_history': [2.5, 2.0, 1.5, 1.3, 1.2]
    }

def time_based_anomaly_detection(df, window_size='1H'):
    """
    Detects anomalies in log frequency using time-based windows
    """
    print(f"Performing time-based anomaly detection with {window_size} windows...")
    
    # Check if datetime column exists
    if 'datetime' not in df.columns:
        print("Error: DataFrame is empty or doesn't contain 'datetime' column")
        
        # Try to create it if timestamp exists
        if 'timestamp' in df.columns:
            print("Attempting to create datetime from timestamp column...")
            
            # Add current year since logs often omit it
            current_year = datetime.now().year
            year_str = str(current_year)
            
            # Apply conversion to each row
            def convert_timestamp(ts):
                try:
                    return pd.to_datetime(f"{year_str} {ts}", format='%Y %b %d %H:%M:%S', errors='coerce')
                except Exception:
                    return pd.NaT
            
            # Create datetime column
            df['datetime'] = df['timestamp'].apply(convert_timestamp)
            print("Created datetime column from timestamp data")
        else:
            return None
    
    # Make sure datetime column is datetime type
    if not pd.api.types.is_datetime64_dtype(df['datetime']):
        df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
        
    # Drop rows with invalid datetime
    valid_df = df.dropna(subset=['datetime'])
    if len(valid_df) == 0:
        print("No valid datetime entries in the DataFrame")
        return None
    
    # Resample data into time windows and count events
    try:
        time_series = valid_df.set_index('datetime').resample(window_size).size()
        
        # Fill missing periods with zeros
        time_series = time_series.fillna(0)
        
        if len(time_series) < 10:
            print(f"Not enough time windows for analysis (only {len(time_series)} windows)")
            return None
        
        # Create features
        X = pd.DataFrame({
            'count': time_series,
            'rolling_mean': time_series.rolling(window=3, min_periods=1).mean(),
            'rolling_std': time_series.rolling(window=3, min_periods=1).std().fillna(0)
        })
        
        # Apply Isolation Forest
        model = IsolationForest(contamination=0.05, random_state=42)
        predictions = model.fit_predict(X)
        
        # Create results DataFame
        results_df = pd.DataFrame({
            'timestamp': time_series.index,
            'count': time_series.values,
            'anomaly': predictions == -1
        })
        
        anomaly_count = (predictions == -1).sum()
        total_windows = len(predictions)
        print(f"Detected {anomaly_count} anomalies in {total_windows} time windows")
        
        return {
            'model': model,
            'predictions': predictions,
            'results': results_df,
            'anomaly_count': anomaly_count
        }
        
    except Exception as e:
        print(f"Error in time-based anomaly detection: {str(e)}")
        return None

def visualize_anomaly_results(results, log_type, save_path=None):
    """
    Visualize anomaly detection results with improved error handling
    
    Args:
        results: Dictionary with keys 'sequence_results', 'time_results', etc.
        log_type: String identifier for the type of logs
        save_path: Optional path to save the visualization
    """
    
    # Create figure
    plt.figure(figsize=(15, 10))
    
    try:
        # Extract results
        sequence_results = results.get('sequence_results', None)
        time_results = results.get('time_results', None)
        ssh_results = results.get('ssh_results', None) if log_type == 'openssh' else None
        component_results = results.get('component_results', None) if log_type == 'bgl' else None
        
        # 1. Plot sequence model accuracy (if available)
        plt.subplot(2, 2, 1)
        if sequence_results and 'loss_history' in sequence_results:
            plt.plot(sequence_results['loss_history'])
            plt.title('Sequence Model Training Loss')
            plt.xlabel('Epoch')
            plt.ylabel('Loss')
        else:
            plt.text(0.5, 0.5, 'No sequence model data available', 
                    horizontalalignment='center', verticalalignment='center')
            plt.title('Sequence Model (Missing Data)')
            
        # 2. Plot prediction confidence histogram (if available)
        plt.subplot(2, 2, 2)
        if sequence_results and 'confidence_scores' in sequence_results:
            scores = sequence_results['confidence_scores']
            plt.hist(scores, bins=30)
            plt.title('Prediction Confidence Distribution')
            plt.xlabel('Confidence Score')
            plt.ylabel('Count')
        else:
            plt.text(0.5, 0.5, 'No confidence score data available', 
                    horizontalalignment='center', verticalalignment='center')
            plt.title('Confidence Scores (Missing Data)')
            
        # 3. Plot time-based anomaly scores (if available)
        plt.subplot(2, 2, 3)
        if time_results and 'results' in time_results:
            # Use results dataframe which should have timestamp and count
            time_df = time_results['results']
            plt.plot(time_df['timestamp'], time_df['count'])
            
            # Highlight anomalies if available
            if 'anomaly' in time_df.columns:
                anomaly_points = time_df[time_df['anomaly']]
                if len(anomaly_points) > 0:
                    plt.scatter(anomaly_points['timestamp'], anomaly_points['count'], 
                                color='red', label='Anomalies')
                    plt.legend()
                    
            plt.title('Log Volume Over Time')
            plt.xlabel('Time')
            plt.ylabel('Log Count')
        elif time_results:
            # Try alternative formats
            if 'anomaly_scores' in time_results and 'timestamps' in time_results:
                plt.plot(time_results['timestamps'], time_results['anomaly_scores'])
                plt.axhline(y=0, color='r', linestyle='--')
                plt.title('Time-based Anomaly Scores')
            else:
                plt.text(0.5, 0.5, 'Time results available but missing required fields', 
                        horizontalalignment='center', verticalalignment='center')
                plt.title('Time-based Analysis (Incomplete Data)')
        else:
            plt.text(0.5, 0.5, 'No time-based analysis data available', 
                    horizontalalignment='center', verticalalignment='center')
            plt.title('Time-based Analysis (Missing Data)')
            
        # 4. Plot SSH-specific results or BGL component data
        plt.subplot(2, 2, 4)
        if log_type == 'openssh' and ssh_results and 'ip_stats' in ssh_results:
            ip_stats = ssh_results['ip_stats']
            
            # Filter to suspicious IPs
            if 'is_anomaly' in ip_stats.columns:
                suspicious = ip_stats[ip_stats['is_anomaly']]
                
                # Plot attempts vs failure rate for suspicious IPs
                if len(suspicious) > 0:
                    plt.scatter(suspicious['login_attempts'], 
                                suspicious['failure_rate'], 
                                alpha=0.7)
                    plt.title('SSH Login Anomalies')
                    plt.xlabel('Login Attempts')
                    plt.ylabel('Failure Rate')
                else:
                    plt.text(0.5, 0.5, 'No SSH anomalies detected', 
                            horizontalalignment='center', verticalalignment='center')
                    plt.title('SSH Login Analysis (No Anomalies)')
            else:
                plt.text(0.5, 0.5, 'SSH stats available but no anomaly data', 
                        horizontalalignment='center', verticalalignment='center')
                plt.title('SSH Login Analysis (Incomplete Data)')
        elif log_type == 'bgl' and component_results and 'component_stats' in component_results:
            # Plot component statistics for BGL
            comp_stats = component_results['component_stats']
            
            # Plot error rate vs log count for top components
            top_comps = comp_stats.nlargest(20, 'log_count')
            plt.scatter(top_comps['log_count'], top_comps['error_rate'], alpha=0.7)
            
            # Highlight anomalous components if available
            if 'is_anomalous' in comp_stats.columns:
                anomalous = comp_stats[comp_stats['is_anomalous']]
                if len(anomalous) > 0:
                    plt.scatter(anomalous['log_count'], anomalous['error_rate'], 
                                color='red', s=100, label='Anomalous Components')
                    plt.legend()
            
            plt.title('Component Analysis')
            plt.xlabel('Log Count')
            plt.ylabel('Error Rate')
            plt.xscale('log')  # Log scale for better visualization
        else:
            # For other logs or missing results, show general statistics
            if sequence_results and 'accuracy' in sequence_results:
                plt.text(0.5, 0.5, f"Sequence Model Accuracy: {sequence_results['accuracy']:.4f}", 
                        horizontalalignment='center', verticalalignment='center', fontsize=14)
                plt.title('Model Performance')
            else:
                plt.text(0.5, 0.5, 'Additional metrics not available', 
                        horizontalalignment='center', verticalalignment='center')
                plt.title('Additional Metrics (Missing Data)')
                
        # Add overall title
        plt.suptitle(f'Anomaly Detection Results for {log_type.upper()} Logs', fontsize=16)
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        
        # Save if path provided
        if save_path:
            plt.savefig(save_path)
            print(f"Visualization saved to {save_path}")
            
        plt.close()  # Close to prevent display in notebooks
        
    except Exception as e:
        print(f"Error generating visualization: {str(e)}")
        # Create a simple error figure
        plt.figure(figsize=(10, 6))
        plt.text(0.5, 0.5, f"Error generating visualization:\n{str(e)}", 
                horizontalalignment='center', verticalalignment='center', fontsize=14)
        
        if save_path:
            plt.savefig(save_path)
            print(f"Error visualization saved to {save_path}")
            
        plt.close()

class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NumpyEncoder, self).default(obj)

def train_bgl_model_v1(log_file, output_dir='models'):
    """
    Train models on BGL logs
    """
    print("\n" + "="*50)
    print("TRAINING MODELS ON BGL LOGS")
    print("="*50)
    
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    
    # 1. Parse logs using BGL-specific parser
    df = parse_bgl_logs(log_file)
    
    if len(df) == 0:
        print("Error: No logs were parsed")
        return None
    
    # 2. Extract features
    log_features = extract_log_features(df, 'bgl')
    
    # 3. Sequence model for log patterns
    # Check if we need to map 'message' to 'content' for compatibility with train_deeplog_model
    if 'message' in df.columns and 'content' not in df.columns:
        print("Mapping 'message' column to 'content' for DeepLog compatibility")
        df['content'] = df['message']
    
    # Add msg_type column if it doesn't exist (needed for DeepLog)
    if 'msg_type' not in df.columns and 'level' in df.columns:
        print("Creating 'msg_type' column from 'level' for DeepLog compatibility")
        df['msg_type'] = df['level'].str.lower()
    
    sequence_results = train_deeplog_model(df)
    
    # 4. Time-based anomaly detection
    time_results = time_based_anomaly_detection(df)
    
    # 5. Specialized analysis: Component failure analysis
    component_results = component_failure_analysis(df)
    
    # 6. Visualize results
    results = {
        'sequence_results': sequence_results,
        'time_results': time_results,
        'component_results': component_results
    }
    
    visualize_anomaly_results(results, 'bgl', save_path=f"{output_dir}/bgl_results.png")
    
    # 7. Save models with safer handling
    models_to_save = {}
    
    # Check for sequence model
    if sequence_results and 'model' in sequence_results:
        models_to_save['sequence_model'] = sequence_results['model']
    
    # Check for time model
    if time_results and 'model' in time_results:
        models_to_save['time_model'] = time_results['model']
    
    # Check for component model
    if component_results and 'model' in component_results:
        models_to_save['component_model'] = component_results['model']
    
    for model_name, model in models_to_save.items():
        model_path = f"{output_dir}/bgl_{model_name}.pkl"
        try:
            with open(model_path, 'wb') as f:
                pickle.dump(model, f)
            print(f"Saved {model_name} to {model_path}")
        except Exception as e:
            print(f"Error saving {model_name}: {str(e)}")
    
    # 8. Save metadata with proper type conversion
    try:
        metadata = {
            'log_type': 'bgl',
            'log_count': int(len(df)),
            'parsed_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'sequence_accuracy': float(sequence_results['accuracy']) if sequence_results and 'accuracy' in sequence_results else None,
            'anomaly_count_time': int(np.sum(time_results['predictions'] == -1)) if time_results and 'predictions' in time_results else 0,
            'error_count': int(df['is_error'].sum()) if 'is_error' in df.columns else 0
        }
        
        with open(f"{output_dir}/bgl_metadata.json", 'w') as f:
            json.dump(metadata, f, indent=4, cls=NumpyEncoder)
            
        print(f"Saved metadata to {output_dir}/bgl_metadata.json")
    except Exception as e:
        print(f"Error saving metadata: {str(e)}")
    
    return results

In [29]:
bgl_log_path = "LogHub/BGL/BGL.log"

bgl_results = train_bgl_model_v1(bgl_log_path)


TRAINING MODELS ON BGL LOGS
Parsing BGL logs from LogHub/BGL/BGL.log...
No pattern matched line 17742: KERNDTLB 1117955293 2005.06.05 R20-M0-N2-C:J10-U11 2005-06-05-00.08.13.410695 R20-M0-N2-C:J10-U11 RAS KERNEL FATAL data TLB error interrupt
No pattern matched line 17745: KERNDTLB 1117955293 2005.06.05 R20-M0-N2-C:J14-U01 2005-06-05-00.08.13.577322 R20-M0-N2-C:J14-U01 RAS KERNEL FATAL data TLB error interrupt
No pattern matched line 17758: KERNDTLB 1117955293 2005.06.05 R20-M0-ND-C:J13-U11 2005-06-05-00.08.13.959099 R20-M0-ND-C:J13-U11 RAS KERNEL FATAL data TLB error interrupt
No pattern matched line 17809: KERNDTLB 1117955295 2005.06.05 R20-M0-N9-C:J14-U01 2005-06-05-00.08.15.474037 R20-M0-N9-C:J14-U01 RAS KERNEL FATAL data TLB error interrupt
No pattern matched line 17855: KERNDTLB 1117955296 2005.06.05 R20-M1-N2-C:J17-U11 2005-06-05-00.08.16.806826 R20-M1-N2-C:J17-U11 RAS KERNEL FATAL data TLB error interrupt
Processed 1000000 lines...
Processed 2000000 lines...
Processed 3000000 

In [26]:
linux_log_path = "LogHub/Linux/Linux.log" 
openssh_log_path = "LogHub/SSH/SSH.log"
bgl_log_path = "LogHub/BGL/BGL.log"

# Train individual models
linux_results = train_linux_model_v1(linux_log_path)
openssh_results = train_openssh_model_v1(openssh_log_path)
bgl_results = train_bgl_model_v1(bgl_log_path)


TRAINING MODELS ON LINUX LOGS
Parsing Linux logs from LogHub/Linux/Linux.log...
Successfully parsed 25567 Linux log entries out of 25567 lines

Sample of parsed logs:
         timestamp hostname component pid msg_type                        content
0  Jun  9 06:06:20  unknown   unknown  NA    combo  combo syslogd 1.4.1: restart.
1  Jun  9 06:06:20    combo    syslog  NA  syslogd      syslogd startup succeeded
2  Jun  9 06:06:20    combo    syslog  NA    klogd        klogd startup succeeded
Extracting features from linux logs...
Extracted 5 features from linux logs
Training DeepLog model for sequence anomaly detection...
Epoch 1/5, Loss: 2.2596
Epoch 2/5, Loss: 1.5625
Epoch 3/5, Loss: 1.4440
Epoch 4/5, Loss: 1.3569
Epoch 5/5, Loss: 1.2859
DeepLog Top-3 Accuracy: 0.7096
Performing time-based anomaly detection with 1H windows...
Error: DataFrame is empty or doesn't contain 'datetime' column
Visualization saved to models/linux_results.png
Saved sequence_model to models/linux_sequence_mode

KeyboardInterrupt: 