# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from scipy.spatial.distance import euclidean
import logging
from datetime import datetime

# Send IDS monitoring messages (INFO and above) to the ids_log.log file
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='ids_log.log',
)

class IntrusionDetectionSystem:
    """Binary intrusion detector for semicolon-delimited traffic CSV files.

    ``train`` loads one "normal" CSV and several "attack" CSVs, fits a median
    imputer, a standard scaler and an XGBoost classifier, and plots evaluation
    metrics. ``monitor_traffic`` reuses the fitted objects to classify records
    loaded from further CSV files and logs one line per record.
    """

    def __init__(self):
        # Fitted artefacts; every one of them stays None until train() ran.
        self.clf = None
        self.scaler = None
        self.imputer = None
        # Ordered list of the feature names the artefacts were fitted on.
        self.feature_columns = None

    # Debugging function to check shape alignment
    def _debug_shape(self, df, name="DataFrame"):
        """Print and log the ``shape`` of *df* under the label *name*."""
        message = f"{name} shape: {df.shape}"
        print(message)
        logging.info(message)

    # Function to safely load CSV files
    def _safe_load_csv(self, filepath, description="data"):
        """Read a ``;``-delimited UTF-8 CSV file into a DataFrame.

        If pandas reports a parser error the file is read again with
        ``on_bad_lines='skip'``. Any other error is logged and re-raised.
        """
        try:
            df = pd.read_csv(filepath, delimiter=';', encoding='utf-8')
        except pd.errors.ParserError as e:
            logging.warning(f"Error loading {filepath}: {e}. Skipping problematic rows...")
            return pd.read_csv(filepath, on_bad_lines='skip', delimiter=';', encoding='utf-8')
        except Exception as e:
            logging.error(f"Failed to load {filepath}: {e}")
            raise
        logging.info(f"Successfully loaded {description} from {filepath}")
        return df

    # Load the normal file and every attack file into one labelled frame
    def _load_labelled_traffic(self, normal_path, attack_paths, purpose=""):
        """Load all CSV files and stack them with a ``label`` column.

        Rows from *normal_path* get label 0, rows from each entry of
        *attack_paths* get label 1. *purpose* is only inserted into the log
        descriptions (e.g. ``" for monitoring"``). *attack_paths* may be empty,
        in which case only the normal rows are returned.
        """
        normal = self._safe_load_csv(normal_path, f"normal traffic{purpose}")
        normal['label'] = 0
        frames = [normal]
        for attack_path in attack_paths:
            attack_df = self._safe_load_csv(
                attack_path,
                f"attack traffic{purpose} from {attack_path.split('/')[-1]}",
            )
            attack_df['label'] = 1
            frames.append(attack_df)
        return pd.concat(frames, ignore_index=True)

    # Function to clean mixed or improperly parsed columns
    def _clean_data_columns(self, df):
        """Coerce every object-dtype column of *df* to numeric, in place.

        Values that cannot be parsed become NaN. The same (mutated) frame is
        returned.
        """
        for col in df.columns:
            if df[col].dtype == 'object':
                df[col] = pd.to_numeric(df[col], errors='coerce')
        return df

    # Transformation function for normal traffic (simulating benign variation)
    def _transform_normal_data(self, data, noise_level=0.5, random_state=None):
        """Return *data* scaled element-wise by U(0.5, 1.5) plus N(0, noise_level) noise.

        Args:
            data: numeric array; the result has the same shape.
            noise_level: standard deviation of the additive Gaussian noise.
            random_state: optional seed. ``None`` keeps drawing from the global
                ``np.random`` state; any other value seeds a private
                ``RandomState`` so the output is reproducible.
        """
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        # Draw order (noise first, then scale factors) is kept so results under
        # a globally seeded np.random do not change.
        noise = rng.normal(0, noise_level, data.shape)
        scaled_data = data * rng.uniform(0.5, 1.5, data.shape)
        return scaled_data + noise

    # Log and plot how far the transformation moved each normal record
    def _report_transformation_distance(self, original, transformed):
        """Print, log and plot per-row Euclidean distances between two arrays.

        NaN and infinite entries are replaced by 0 before the distance is taken.
        """
        original_clean = np.nan_to_num(original, nan=0.0, posinf=0.0, neginf=0.0)
        transformed_clean = np.nan_to_num(transformed, nan=0.0, posinf=0.0, neginf=0.0)
        # Row-wise L2 norm of the difference; replaces a per-row Python loop.
        euclidean_distances = np.linalg.norm(original_clean - transformed_clean, axis=1)
        euclidean_mean = np.mean(euclidean_distances)
        euclidean_std = np.std(euclidean_distances)
        print(f"Euclidean Distance: Mean = {euclidean_mean}, Std Dev = {euclidean_std}")
        logging.info(f"Euclidean Distance: Mean = {euclidean_mean}, Std Dev = {euclidean_std}")

        # Plot Euclidean Distance
        plt.figure(figsize=(10, 6))
        sns.histplot(euclidean_distances, kde=True, bins=50, color='green', label='Euclidean Distance')
        plt.title('Euclidean Distance Before and After Transformation', fontsize=16)
        plt.xlabel('Euclidean Distance', fontsize=14)
        plt.ylabel('Frequency', fontsize=14)
        plt.legend(fontsize=12)
        plt.show()

    # Restrict/extend a frame to exactly the columns used during training
    def _align_features(self, df):
        """Return *df* re-indexed to ``self.feature_columns`` (same order).

        Training columns absent from *df* are added as all-NaN columns so the
        fitted imputer can fill them; extra columns are dropped.

        Raises:
            ValueError: if no feature columns were stored by training, or if
                none of them occurs in *df*.
        """
        if not self.feature_columns:
            raise ValueError("Feature columns not set during training.")
        missing = [col for col in self.feature_columns if col not in df.columns]
        if len(missing) == len(self.feature_columns):
            raise ValueError("No matching features found in new data compared to training data.")
        if missing:
            logging.warning(f"Features missing from new data (will be imputed): {missing}")
        return df.reindex(columns=self.feature_columns)

    # Impute then scale, keeping column names and row index
    def _preprocess(self, features, fit=False):
        """Run ``self.imputer`` and ``self.scaler`` over *features*.

        With ``fit=True`` both objects are fitted on *features* first. A
        DataFrame with the columns and index of *features* is returned so every
        later stage sees the same feature names.
        """
        imputed = self.imputer.fit_transform(features) if fit else self.imputer.transform(features)
        imputed_df = pd.DataFrame(imputed, columns=features.columns, index=features.index)
        scaled = self.scaler.fit_transform(imputed_df) if fit else self.scaler.transform(imputed_df)
        return pd.DataFrame(scaled, columns=features.columns, index=features.index)

    # Train the IDS model
    def train(self, normal_path, attack_paths, transform_seed=None):
        """Fit imputer, scaler and classifier from labelled CSV files.

        Args:
            normal_path: CSV file whose rows are labelled 0.
            attack_paths: non-empty sequence of CSV files whose rows are
                labelled 1.
            transform_seed: optional seed for the random transformation of the
                normal rows; ``None`` keeps the previous unseeded behaviour.

        Raises:
            ValueError: if *attack_paths* is empty or the training split has
                no column containing at least one numeric value.
        """
        if not attack_paths:
            raise ValueError("At least one attack dataset is required for training.")

        # 1-2. Load normal and attack datasets into one labelled frame
        data = self._load_labelled_traffic(normal_path, attack_paths)
        self._debug_shape(data, "Combined Data")

        # Clean data
        data = self._clean_data_columns(data)

        # 3. Transform normal traffic to introduce variation
        normal_features = data[data['label'] == 0].drop(columns=['label'])
        self._debug_shape(normal_features, "Normal Features Before Transformation")

        original_normal = normal_features.to_numpy()
        transformed_normal_features = self._transform_normal_data(
            original_normal, noise_level=0.5, random_state=transform_seed)
        transformed_normal = pd.DataFrame(transformed_normal_features, columns=normal_features.columns)
        transformed_normal['label'] = 0
        self._debug_shape(transformed_normal, "Transformed Normal Data")

        # Euclidean distances for transformation validation
        self._report_transformation_distance(original_normal, transformed_normal_features)

        # Combine transformed normal with attack data
        attack_data = data[data['label'] == 1]
        data_transformed = pd.concat([transformed_normal, attack_data], ignore_index=True)
        self._debug_shape(data_transformed, "Data Transformed After Transformation")

        # Ensure consistency in cleaning
        data_transformed = self._clean_data_columns(data_transformed)

        # Separate features and labels
        features_transformed = data_transformed.drop(columns=['label'])
        label_transformed = data_transformed['label'].astype(int)
        self._debug_shape(features_transformed, "Features Transformed")

        # 4. Split BEFORE fitting any preprocessing so that no statistic of the
        #    held-out rows (medians, means, variances) leaks into training.
        X_train, X_test, y_train, y_test = train_test_split(
            features_transformed, label_transformed,
            test_size=0.3, random_state=42, stratify=label_transformed)

        # 5. Handle missing values and scale, fitted on the training split only.
        #    Columns without a single value in the training split are dropped
        #    because a median cannot be computed for them.
        usable_columns = X_train.columns[X_train.notna().any()].tolist()
        if not usable_columns:
            raise ValueError("No usable numeric feature columns found in training data.")

        # Store feature columns for monitoring
        self.feature_columns = usable_columns
        self.imputer = SimpleImputer(strategy='median')
        self.scaler = StandardScaler()
        X_train = self._preprocess(X_train[usable_columns], fit=True)
        X_test = self._preprocess(X_test[usable_columns])
        self._debug_shape(X_train, "X_train")
        self._debug_shape(X_test, "X_test")

        print("Class distribution in y_train:")
        print(y_train.value_counts())

        # 6. Handle imbalance with SMOTE. 'auto' oversamples whichever class is
        #    the minority; a fixed {1: ...} target did nothing when the attack
        #    class was already the majority.
        smote = SMOTE(sampling_strategy='auto', random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
        self._debug_shape(X_train_resampled, "X_train_resampled")

        print("Class distribution after SMOTE:")
        print(y_train_resampled.value_counts())

        # 7. Train XGBoost classifier
        self.clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
        self.clf.fit(X_train_resampled, y_train_resampled)
        logging.info("IDS model training completed.")

        # 8. Evaluate the model
        self._evaluate_model(X_test, y_test)

    # Evaluation function
    def _evaluate_model(self, X_test, y_test):
        """Print/log accuracy and ROC AUC and plot confusion matrix, metrics and ROC."""
        y_pred = self.clf.predict(X_test)
        # Fix the label order so the matrix is always 2x2 and matches the
        # 'Benign'/'Intrusion' tick labels below.
        conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
        report = classification_report(y_test, y_pred, output_dict=True)
        accuracy = accuracy_score(y_test, y_pred)

        # ROC Curve
        y_pred_prob = self.clf.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
        roc_auc = auc(fpr, tpr)

        print(f"Accuracy: {accuracy:.2f}")
        logging.info(f"Model Accuracy: {accuracy:.2f}, ROC AUC: {roc_auc:.2f}")

        # Confusion Matrix Plot
        class_names = ['Benign', 'Intrusion']
        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names, yticklabels=class_names)
        plt.title('Confusion Matrix', fontsize=16)
        plt.xlabel('Predicted', fontsize=14)
        plt.ylabel('True', fontsize=14)
        plt.show()

        # Metrics Plot: drop the last report row and keep the first three
        # metric columns of the transposed report.
        metrics_df = pd.DataFrame(report).transpose().iloc[:-1, :3]
        ax = metrics_df.plot(kind='bar', figsize=(14, 8), color=['blue', 'green', 'red'])
        plt.title('Classification Metrics', fontsize=30)
        plt.xlabel('Metrics', fontsize=30)
        plt.ylabel('Score', fontsize=30)
        plt.yticks(fontsize=30)
        plt.legend(fontsize=30)
        tick_texts = [tick.get_text() for tick in ax.get_xticklabels()]
        ax.set_xticklabels(tick_texts, fontsize=30, rotation=0)
        plt.tight_layout()
        plt.show()

        # ROC Curve Plot
        plt.figure(figsize=(10, 6))
        plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='grey', linestyle='--')
        plt.title('Receiver Operating Characteristic (ROC) Curve', fontsize=16)
        plt.xlabel('False Positive Rate', fontsize=14)
        plt.ylabel('True Positive Rate', fontsize=14)
        plt.legend(fontsize=12)
        plt.show()

    # Real-time monitoring with combined normal and attack data
    def monitor_traffic(self, normal_path, attack_paths):
        """Classify every record of the given CSV files with the trained model.

        Each record is printed and logged (WARNING for predicted intrusions,
        INFO otherwise) together with its actual label; the number of predicted
        intrusions is printed and logged at the end.

        Args:
            normal_path: CSV file whose rows are labelled 0 ("Benign").
            attack_paths: sequence (possibly empty) of CSV files whose rows are
                labelled 1 ("Intrusion").

        Raises:
            ValueError: if the model has not been trained, or if the new data
                shares no feature column with the training data.
        """
        # Explicit None checks: do not depend on the truthiness of estimators.
        if any(part is None for part in (self.clf, self.scaler, self.imputer)):
            raise ValueError("Model not trained. Call train() first.")

        # Load and combine normal and attack data for monitoring
        new_data = self._load_labelled_traffic(normal_path, attack_paths, " for monitoring")
        self._debug_shape(new_data, "Combined Monitoring Data")

        # Clean the data
        new_data = self._clean_data_columns(new_data)

        # Ensure feature alignment using stored feature columns from training.
        # All training columns are kept (absent ones as NaN) because the fitted
        # imputer and scaler require exactly the columns they were fitted on.
        features = self._align_features(new_data)

        # Impute missing values and scale the features
        features_scaled = self._preprocess(features)

        # Make predictions
        predictions = self.clf.predict(features_scaled)
        probabilities = self.clf.predict_proba(features_scaled)[:, 1]

        # Counter for intrusions detected
        intrusion_count = 0

        # Log and print results with actual labels for comparison
        actual_labels = new_data['label'].values
        for idx, (pred, prob, actual) in enumerate(zip(predictions, probabilities, actual_labels)):
            timestamp = datetime.now()
            is_intrusion = pred == 1
            # Confidence is the probability of whichever class was predicted.
            confidence = prob if is_intrusion else 1 - prob
            pred_label = "Intrusion" if is_intrusion else "Benign"
            actual_label = "Intrusion" if actual == 1 else "Benign"
            message = (f"Packet {idx} at {timestamp}: Predicted {pred_label} "
                       f"(Confidence: {confidence:.2f}), Actual: {actual_label}")

            if is_intrusion:
                intrusion_count += 1
                logging.warning(message)
            else:
                logging.info(message)
            print(message)

        # Print total number of intrusions detected
        print(f"\nTotal number of intrusions detected: {intrusion_count}")
        logging.info(f"Total number of intrusions detected: {intrusion_count}")

# Main execution
if __name__ == "__main__":
    # Directory containing the normal-traffic CSV and the attack CSVs
    dataset_dir = "/Users/bodapati/Desktop/Eoc project/ICS Dataset for Smart Grid Anomaly Detection/ics-dataset-for-smart-grids/but-iec104-i"

    # Paths to datasets
    normal_path = f"{dataset_dir}/normal-traffic.csv"
    attack_paths = [
        f"{dataset_dir}/{csv_name}"
        for csv_name in (
            "switching-attack.csv",
            "connection-loss.csv",
            "dos-attack.csv",
            "injection-attack.csv",
            "rogue-device.csv",
            "scanning-attack.csv",
        )
    ]

    # Initialize IDS
    ids = IntrusionDetectionSystem()

    # Train the IDS model with all attack datasets
    ids.train(normal_path, attack_paths)

    # Perform real-time monitoring with combined normal and attack data
    ids.monitor_traffic(normal_path, attack_paths)