In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten
from tensorflow.keras.utils import to_categorical
import os

# Load and preprocess dataset
def load_data(dataset_path):
    # Load CSV
    df = pd.read_csv(os.path.join(dataset_path, 'Pre_train_D_1.csv'))  # Adjust file name as needed
    # Example: Combine multiple CSVs if needed
    # df = pd.concat([pd.read_csv(os.path.join(dataset_path, f)) for f in os.listdir(dataset_path) if f.endswith('.csv')])
    
    # Preprocess
    # Convert hex Arbitration_ID to int, handle non-hex values
    def safe_hex_to_int(x):
        try:
            return int(x, 16) if isinstance(x, str) else x
        except (ValueError, TypeError):
            return 0  # Default for invalid values
    
    df['Arbitration_ID'] = df['Arbitration_ID'].apply(safe_hex_to_int)
    
    # Convert Data (hex string) to array of integers, ensure fixed length (8 bytes)
    def hex_to_int_array(hex_str):
        try:
            if isinstance(hex_str, str) and hex_str.strip():
                # Remove spaces and convert hex to list of integers
                hex_str = hex_str.replace(' ', '')
                data = [int(hex_str[i:i+2], 16) for i in range(0, len(hex_str), 2)]
                # Pad with zeros or truncate to 8 bytes
                if len(data) < 8:
                    data.extend([0] * (8 - len(data)))
                return data[:8]  # Ensure exactly 8 bytes
            return [0] * 8  # Default for NaN, empty, or invalid
        except (ValueError, TypeError):
            return [0] * 8  # Fallback for any parsing errors
    
    df['Data'] = df['Data'].apply(hex_to_int_array)
    # Convert to numpy array, ensuring shape (n_samples, 8)
    data_array = np.array(df['Data'].tolist())  # Shape: (n_samples, 8)
    
    # Verify shape consistency
    if data_array.shape[1] != 8:
        raise ValueError(f"Data array has inconsistent shape: {data_array.shape}")
    
    # Normalize features
    scaler = StandardScaler()
    arbitration_id = scaler.fit_transform(df[['Arbitration_ID']].values)
    dlc = scaler.fit_transform(df[['DLC']].values)
    timestamp_diff = scaler.fit_transform(df['Timestamp'].diff().fillna(0).values.reshape(-1, 1))
    
    # Combine features: Timestamp diff, Arbitration_ID, DLC, Data (8 bytes)
    X = np.concatenate([timestamp_diff, arbitration_id, dlc, data_array], axis=1)  # Shape: (n_samples, 11)
    
    # Encode labels (Class: Normal/Attack)
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['Class'])
    y = to_categorical(y)  # One-hot encode for multi-class
    
    # Create sequences for LSTM (window of 20 messages)
    sequence_length = 20
    X_seq = []
    y_seq = []
    for i in range(len(X) - sequence_length):
        X_seq.append(X[i:i+sequence_length])
        y_seq.append(y[i+sequence_length-1])  # Label for last message in sequence
    
    return np.array(X_seq), np.array(y_seq), label_encoder

# Build CNN-LSTM model
def build_cnn_lstm_model(input_shape, num_classes):
    model = Sequential([
        # CNN layers
        Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=32, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        # LSTM layers
        LSTM(128, return_sequences=False),
        Dropout(0.3),
        # Dense layers
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Main execution
if __name__ == "__main__":
    # Dataset path
    dataset_path = "/Users/bodapati/Downloads/Car_Hacking_Challenge_Dataset_rev20Mar2021/0_Preliminary/0_Training"
    
    # Load and preprocess data
    X, y, label_encoder = load_data(dataset_path)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Build model
    input_shape = (X.shape[1], X.shape[2])  # (sequence_length, num_features)
    num_classes = y.shape[1]
    model = build_cnn_lstm_model(input_shape, num_classes)
    
    # Train model
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=1)
    
    # Evaluate model
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Accuracy: {accuracy:.4f}")
    
    # Save model
    model.save('cnn_lstm_car_hacking.h5')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m8064/8064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 4ms/step - accuracy: 0.9081 - loss: 0.2638 - val_accuracy: 0.9188 - val_loss: 0.2109
Epoch 2/10
[1m8064/8064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 3ms/step - accuracy: 0.9180 - loss: 0.2104 - val_accuracy: 0.9213 - val_loss: 0.2003
Epoch 3/10
[1m8064/8064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - accuracy: 0.9194 - loss: 0.2047 - val_accuracy: 0.9216 - val_loss: 0.1974
Epoch 4/10
[1m8064/8064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 3ms/step - accuracy: 0.9209 - loss: 0.1995 - val_accuracy: 0.9214 - val_loss: 0.1955
Epoch 5/10
[1m8064/8064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 4ms/step - accuracy: 0.9210 - loss: 0.1973 - val_accuracy: 0.9216 - val_loss: 0.1934
Epoch 6/10
[1m8064/8064[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 4ms/step - accuracy: 0.9219 - loss: 0.1956 - val_accuracy: 0.9218 - val_loss: 0.1942
Epoch 7/10



Test Accuracy: 0.9222
