In [None]:
# tabnet_can.py
import os
import random
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

# --------------------------
# Reproducibility
# --------------------------
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value passed unchanged to every generator (default 42).
    """
    # The three CPU-side seeders share the same call shape, so drive them
    # from a single loop, in the order random -> numpy -> torch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # CUDA generators are only seeded when a CUDA device is reported.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed once at import/cell-execution time so the run below is repeatable.
set_seed(42)

# ==========================
# Feature Engineering
# ==========================
def safe_hex_to_int(x):
    """Interpret ``x`` as a hexadecimal number, returning 0 on any failure.

    ``x`` is converted with ``str()`` before parsing, so non-string inputs
    are read by their textual form (e.g. the integer ``10`` parses as
    ``0x10``).

    Args:
        x: Value whose string form should be parsed in base 16.

    Returns:
        The parsed integer, or 0 if stringifying or parsing raised.
    """
    try:
        parsed = int(str(x), 16)
    except Exception:
        # Best-effort by design: unparseable input maps to 0.
        return 0
    else:
        return parsed

def create_features(df):
    """Derive numeric model features from raw CAN frame columns.

    Works on a copy of ``df``; the input frame is not modified.

    Columns read (each is optional):
        CAN_ID_hex: hexadecimal arbitration ID.
        Data_hex:   hexadecimal payload, optionally space separated.
        DLC:        data length code.

    Columns added:
        CAN_ID_numeric: ``CAN_ID_hex`` parsed as base 16 (0 when unparseable
            or when the column is missing).
        Data_clean: payload with spaces removed, right-padded with ``'0'``
            and truncated to exactly 16 hex characters.
        Byte_0 .. Byte_7: the eight payload bytes as integers (0 for any
            pair that is not valid hex).
        Data_mean, Data_std, Data_sum: row-wise statistics over the bytes.
        DLC_cat: ``DLC`` as an integer (0 when missing or non-numeric).
        CAN_Priority: 0 for IDs <= 255, 1 for IDs <= 1023, 2 otherwise.

    Args:
        df: DataFrame of CAN frames.

    Returns:
        A new DataFrame holding the original columns plus the ones above.
    """
    df = df.copy()

    # ``df.get(col, default)`` returns the bare default when the column is
    # absent, and a scalar has no ``.apply``/``.fillna`` -- so the fallback
    # has to be an explicit branch rather than a ``get`` default.
    if 'CAN_ID_hex' in df.columns:
        df['CAN_ID_numeric'] = df['CAN_ID_hex'].apply(safe_hex_to_int)
    else:
        df['CAN_ID_numeric'] = 0

    # Normalize Data_hex to a fixed 16-character hex string.
    if 'Data_hex' in df.columns:
        df['Data_clean'] = df['Data_hex'].fillna('').astype(str).str.replace(' ', '')
    else:
        df['Data_clean'] = ''
    df['Data_clean'] = df['Data_clean'].apply(lambda s: (s + '0'*16)[:16])

    # Data_clean is always 16 characters long, so every slice below is a full
    # two-character pair. Parsing goes through safe_hex_to_int so a stray
    # non-hex character yields 0 instead of aborting the whole run.
    byte_cols = [f'Byte_{i}' for i in range(8)]
    for i, col in enumerate(byte_cols):
        df[col] = df['Data_clean'].str[2*i:2*i + 2].apply(safe_hex_to_int)

    df['Data_mean'] = df[byte_cols].mean(axis=1)
    df['Data_std'] = df[byte_cols].std(axis=1).fillna(0)
    df['Data_sum'] = df[byte_cols].sum(axis=1)

    # Use the DLC value itself as the feature. Fitting a fresh LabelEncoder on
    # every frame assigns codes by the set of values present in *that* frame,
    # so the same DLC could map to different codes in train and test data.
    if 'DLC' in df.columns:
        df['DLC_cat'] = pd.to_numeric(df['DLC'], errors='coerce').fillna(0).astype(int)
    else:
        df['DLC_cat'] = 0

    # Bucket the arbitration ID: (-1, 255] -> 0, (255, 1023] -> 1, above -> 2.
    df['CAN_Priority'] = pd.cut(df['CAN_ID_numeric'],
                                bins=[-1, 255, 1023, np.inf],
                                labels=[0, 1, 2]).astype(int)
    return df

# ==========================
# Main pipeline
# ==========================
def derive_binary_labels(df):
    """Build a 0/1 label Series from the label columns of ``df``.

    ``Class`` takes precedence over ``SubClass``. The mapping depends on
    which values occur in ``Class``:
      * contains ``'R'``      -> 1 for ``'R'``, 0 otherwise
      * contains ``'Attack'`` -> 1 for ``'Attack'``, 0 otherwise
      * otherwise             -> 0 for ``'normal'`` (case-insensitive), else 1
    For ``SubClass``, missing values and ``'normal'`` map to 0, the rest to 1.

    Args:
        df: DataFrame that may carry a ``Class`` and/or ``SubClass`` column.

    Returns:
        A Series of 0/1 labels aligned with ``df``, or ``None`` when neither
        label column is present.
    """
    if 'Class' in df.columns:
        classes = df['Class']
        if 'R' in classes.values:
            # NOTE(review): 'R' is mapped to the positive class (1) exactly as
            # the script always did; confirm this polarity against the
            # dataset's label documentation before relying on this branch.
            return classes.apply(lambda x: 1 if x == 'R' else 0)
        if 'Attack' in classes.values:
            return classes.apply(lambda x: 1 if x == 'Attack' else 0)
        return classes.apply(lambda x: 0 if str(x).lower() == 'normal' else 1)
    if 'SubClass' in df.columns:
        return df['SubClass'].apply(
            lambda x: 0 if pd.isna(x) or str(x).lower() == 'normal' else 1
        )
    return None


if __name__ == "__main__":
    # Training files
    train_files = [
        "/Users/bodapati/Downloads/Car_Hacking_Challenge_Dataset_rev20Mar2021/0_Preliminary/0_Training/Pre_train_D_0.csv",
        "/Users/bodapati/Downloads/Car_Hacking_Challenge_Dataset_rev20Mar2021/0_Preliminary/0_Training/Pre_train_D_1.csv",
        "/Users/bodapati/Downloads/Car_Hacking_Challenge_Dataset_rev20Mar2021/0_Preliminary/0_Training/Pre_train_D_2.csv",
        "/Users/bodapati/Downloads/Car_Hacking_Challenge_Dataset_rev20Mar2021/0_Preliminary/0_Training/Pre_train_S_0.csv",
        "/Users/bodapati/Downloads/Car_Hacking_Challenge_Dataset_rev20Mar2021/0_Preliminary/0_Training/Pre_train_S_1.csv",
        "/Users/bodapati/Downloads/Car_Hacking_Challenge_Dataset_rev20Mar2021/0_Preliminary/0_Training/Pre_train_S_2.csv",
    ]

    test_files = [
        "/Users/bodapati/Downloads/Car_Hacking_Challenge_Dataset_rev20Mar2021/0_Preliminary/1_Submission/Pre_submit_D.csv",
        "/Users/bodapati/Downloads/Car_Hacking_Challenge_Dataset_rev20Mar2021/0_Preliminary/1_Submission/Pre_submit_S.csv",
    ]

    # Raw CSV column names -> names expected by create_features.
    column_aliases = {'Arbitration_ID': 'CAN_ID_hex', 'Data': 'Data_hex'}

    # ---------- load training data ----------
    train_df = pd.concat([pd.read_csv(f) for f in train_files], ignore_index=True)
    train_df.rename(columns=column_aliases, inplace=True)
    train_df = create_features(train_df)

    # ---------- binary labels ----------
    train_labels = derive_binary_labels(train_df)
    if train_labels is None:
        # Fabricating random labels would train and "validate" a model on
        # noise without any visible failure; stop instead.
        raise ValueError(
            "Training data has neither a 'Class' nor a 'SubClass' column; "
            "cannot derive Binary_Label."
        )
    train_df['Binary_Label'] = train_labels

    features = ['CAN_ID_numeric', 'Data_mean', 'Data_std', 'Data_sum'] + \
               [f'Byte_{i}' for i in range(8)] + ['DLC_cat', 'CAN_Priority']

    X_train, X_val, y_train, y_val = train_test_split(
        train_df[features], train_df['Binary_Label'],
        test_size=0.2, stratify=train_df['Binary_Label'], random_state=42
    )

    # Scale features. The scaler is fitted on the training split only; fresh
    # DataFrames are built from the result rather than assigning into a
    # slice, which is what raised pandas' SettingWithCopyWarning.
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train),
                           columns=features, index=X_train.index)
    X_val = pd.DataFrame(scaler.transform(X_val),
                         columns=features, index=X_val.index)

    # TabNet expects numpy
    X_train_np, y_train_np = X_train.values, y_train.values
    X_val_np, y_val_np = X_val.values, y_val.values

    # ---------- train TabNet ----------
    clf = TabNetClassifier(
        n_d=64, n_a=64, n_steps=5,
        gamma=1.5, n_independent=2, n_shared=2,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-3),
        scheduler_params={"step_size":20, "gamma":0.9},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type="entmax",  # "sparsemax" also works
        verbose=10,
        device_name="cuda" if torch.cuda.is_available() else "cpu"
    )

    clf.fit(
        X_train=X_train_np, y_train=y_train_np,
        eval_set=[(X_val_np, y_val_np)],
        eval_name=["val"],
        eval_metric=["accuracy"],
        max_epochs=50, patience=10,
        batch_size=1024, virtual_batch_size=128,
        num_workers=0, drop_last=False
    )

    # ---------- validation report ----------
    val_preds = clf.predict(X_val_np)
    print("\nValidation Report:")
    print(classification_report(y_val_np, val_preds, digits=4))
    print("Confusion Matrix:")
    print(confusion_matrix(y_val_np, val_preds))

    # ---------- test ----------
    test_df = pd.concat([pd.read_csv(f) for f in test_files], ignore_index=True)
    test_df.rename(columns=column_aliases, inplace=True)
    test_df = create_features(test_df)

    # Same label rules as for training; unlabeled test data is allowed and
    # gets an all-zero placeholder column that is never scored.
    test_labels = derive_binary_labels(test_df)
    labels_exist = test_labels is not None
    if labels_exist:
        test_df['Binary_Label'] = test_labels
    else:
        test_df['Binary_Label'] = np.zeros(len(test_df), dtype=int)

    # Reuse the scaler fitted on the training split.
    X_test = pd.DataFrame(scaler.transform(test_df[features]),
                          columns=features, index=test_df.index)
    y_test = test_df['Binary_Label']
    X_test_np, y_test_np = X_test.values, y_test.values

    preds_test = clf.predict(X_test_np)

    if labels_exist:
        print("\nTest Report:")
        print(classification_report(y_test_np, preds_test, digits=4))
        print("Confusion Matrix:")
        print(confusion_matrix(y_test_np, preds_test))
    else:
        print("Test predictions (no labels available):", preds_test[:20])




epoch 0  | loss: 0.08834 | val_accuracy: 0.94242 |  0:08:06s
epoch 10 | loss: 0.06061 | val_accuracy: 0.9874  |  1:30:28s
epoch 20 | loss: 0.05609 | val_accuracy: 0.98772 |  2:42:12s
epoch 30 | loss: 0.05419 | val_accuracy: 0.98806 |  8:54:23s

Early stopping occurred at epoch 31 with best_epoch = 21 and best_val_accuracy = 0.98811





Validation Report:
              precision    recall  f1-score   support

           0     0.9876    0.9996    0.9936    674549
           1     0.9948    0.8587    0.9217     59882

    accuracy                         0.9881    734431
   macro avg     0.9912    0.9292    0.9577    734431
weighted avg     0.9882    0.9881    0.9877    734431

Confusion Matrix:
[[674278    271]
 [  8460  51422]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[features] = scaler.transform(X_test[features])



Test Report:
              precision    recall  f1-score   support

           0     0.9703    0.9995    0.9847   3358210
           1     0.9940    0.7395    0.8480    393836

    accuracy                         0.9722   3752046
   macro avg     0.9822    0.8695    0.9164   3752046
weighted avg     0.9728    0.9722    0.9703   3752046

Confusion Matrix:
[[3356450    1760]
 [ 102612  291224]]
