In [176]:
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

In [177]:
# Load datasets

rt_df = pd.read_csv("dataset_lanl/rt_mod.csv",index_col=False)
auth_df = pd.read_csv("dataset_lanl/auth_mod.csv",index_col=False)
df = pd.concat([rt_df,auth_df]).sort_values(by='time')
print(df['label'].value_counts())

label
1    10000
0    10000
Name: count, dtype: int64


In [None]:
# Encoding
low_card_cols = ['authentication_type', 'logon_type', 'authentication_orientation', 'success_failure']
df = pd.get_dummies(df, columns=low_card_cols)


label_cols = ['src_user', 'src_domain', 'source_computer', 'destination_computer']
for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

print(df)

          time  src_user  src_domain  source_computer  destination_computer  \
763  -1.927975      2172         121             1627                   578   
766  -1.927975      2127         121             1627                   578   
765  -1.927975      1647         121             1627                   578   
764  -1.927975      2153         121             1627                   578   
767  -1.927975      2164         121             1627                   578   
...        ...       ...         ...              ...                   ...   
9993  3.313168      2140         121             1627                   578   
9994  3.313168      1626         121             1627                   578   
9995  3.313168      2126         121             1627                   578   
9996  3.313168      1438         121             1627                   578   
9997  3.313168      1412         121             1627                   578   

      label  authentication_type_Kerberos  \
763   

In [179]:
X = df.drop(columns=['label'])
y = df['label'].astype(float)
X = X.apply(lambda col: col.astype(int) if col.dtype == 'bool' else col)

X_numpy = X.values
y_numpy = y.values

X_tensor = torch.tensor(X_numpy,dtype=torch.float32)
y_tensor = torch.tensor(y_numpy,dtype=torch.float32)

print(f"X_tensor {X_tensor.shape}")
print(f"y_tensor {y_tensor.shape}")

X_tensor torch.Size([20000, 34])
y_tensor torch.Size([20000])


In [180]:
sequence_length = 10
num_samples = X_tensor.shape[0] // sequence_length
X_tensor_seq = X_tensor[:num_samples * sequence_length].reshape(num_samples, sequence_length, -1)
y_tensor_seq = y_tensor[:num_samples * sequence_length].reshape(num_samples, sequence_length)
y_tensor_seq = y_tensor_seq[:, -1]  # shape: (num_samples,)

print("X_tensor_seq",X_tensor_seq.shape)
print("y_tensor_seq",y_tensor_seq.shape)
'''X_train, X_test, y_train, y_test = train_test_split(X_tensor_seq, y_tensor_seq, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)'''

# Assume X_tensor_seq is in order
num_samples = len(X_tensor_seq)

train_end = int(0.72 * num_samples)  # 80% train, but 10% for val comes from here
val_end = int(0.8 * num_samples)

X_train = X_tensor_seq[:train_end]
y_train = y_tensor_seq[:train_end]

X_val = X_tensor_seq[train_end:val_end]
y_val = y_tensor_seq[train_end:val_end]

X_test = X_tensor_seq[val_end:]
y_test = y_tensor_seq[val_end:]

print("Training data:", X_train.shape)
print("Validation data:", X_val.shape)
print("Testing data:", X_test.shape)

# Create datasets
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

X_tensor_seq torch.Size([2000, 10, 34])
y_tensor_seq torch.Size([2000])
Training data: torch.Size([1440, 10, 34])
Validation data: torch.Size([160, 10, 34])
Testing data: torch.Size([400, 10, 34])


In [181]:
class CNN_LSTM(nn.Module):
    def __init__(self, input_size, cnn_out_channels=64, lstm_hidden=128, use_pooling=True):
        super().__init__()
        self.use_pooling = use_pooling
        self.conv = nn.Conv1d(in_channels=input_size, out_channels=cnn_out_channels, kernel_size=3)
        self.bn = nn.BatchNorm1d(cnn_out_channels)  # Helps generalization

        if use_pooling:
            self.pool = nn.MaxPool1d(kernel_size=2)
        else:
            self.pool = nn.Identity()  # If you want to skip pooling

        self.lstm = nn.LSTM(input_size=cnn_out_channels, hidden_size=lstm_hidden, 
                            batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.7)
        self.fc1 = nn.Linear(lstm_hidden * 2, 64)  # *2 because LSTM is bidirectional
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        # x shape: (batch, time, features)
        x = x.permute(0, 2, 1)  # (batch, features, time)
        x = self.bn(F.relu(self.conv(x)))  # Conv1D + BatchNorm + ReLU
        x = self.pool(x)  # Pool if enabled
        x = x.permute(0, 2, 1)  # (batch, time, features)
        x, _ = self.lstm(x)  # LSTM output: (batch, time, hidden*2)
        x = self.dropout(x)
        x = x[:, -1, :]  # Use the last time step
        x = F.relu(self.fc1(x))
        return self.fc2(x)  # Don't apply sigmoid here!


In [None]:
model = CNN_LSTM(input_size=X_tensor_seq.shape[2])

criterion = nn.BCEWithLogitsLoss()  # Binary classification
optimizer = optim.Adam(model.parameters(), lr=0.0001)

num_epochs = 25

print(y.shape)

for epoch in range(num_epochs):
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_X).squeeze(1)  # Shape: (batch,)
        loss = criterion(output, batch_y.float())
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

    # Validation step
    model.eval()
    with torch.no_grad():
        val_loss = 0.0
        correct = 0
        total = 0
        for batch_X, batch_y in val_loader:
            output = model(batch_X).squeeze(1)
            loss = criterion(output, batch_y.float())
            val_loss += loss.item()

            probs = torch.sigmoid(output)              # <-- This is crucial
            predicted = (probs > 0.5).float()
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

        val_loss /= len(val_loader)
        val_accuracy = correct / total
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

    
    model.train()


(20000,)


Epoch 1/50, Loss: 0.5816
Epoch 1/50, Validation Loss: 0.6047, Accuracy: 0.8562
Epoch 2/50, Loss: 0.4688
Epoch 2/50, Validation Loss: 0.4783, Accuracy: 0.9500
Epoch 3/50, Loss: 0.3343
Epoch 3/50, Validation Loss: 0.3453, Accuracy: 0.9688
Epoch 4/50, Loss: 0.2544
Epoch 4/50, Validation Loss: 0.2418, Accuracy: 0.9750
Epoch 5/50, Loss: 0.1913
Epoch 5/50, Validation Loss: 0.1675, Accuracy: 0.9750
Epoch 6/50, Loss: 0.1288
Epoch 6/50, Validation Loss: 0.1282, Accuracy: 0.9688
Epoch 7/50, Loss: 0.1046
Epoch 7/50, Validation Loss: 0.1111, Accuracy: 0.9688
Epoch 8/50, Loss: 0.0604
Epoch 8/50, Validation Loss: 0.1032, Accuracy: 0.9688
Epoch 9/50, Loss: 0.0512
Epoch 9/50, Validation Loss: 0.0987, Accuracy: 0.9750
Epoch 10/50, Loss: 0.0379
Epoch 10/50, Validation Loss: 0.0972, Accuracy: 0.9750
Epoch 11/50, Loss: 0.0482
Epoch 11/50, Validation Loss: 0.0970, Accuracy: 0.9750
Epoch 12/50, Loss: 0.0294
Epoch 12/50, Validation Loss: 0.0962, Accuracy: 0.9750
Epoch 13/50, Loss: 0.0191
Epoch 13/50, Validat

In [183]:
# Final testing on the test dataset
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    test_loss = 0.0
    correct = 0
    total = 0
    for batch_X, batch_y in test_loader:
        output = model(batch_X)
        loss = criterion(output, batch_y.float().unsqueeze(1))
        test_loss += loss.item()

        # Calculate accuracy
        predicted = (output > 0.5).float()  # assuming binary classification
        correct += (predicted == batch_y).sum().item()
        total += batch_y.size(0)

    test_loss /= len(test_loader)
    test_accuracy = correct / total
    print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


Test Loss: 0.1857, Test Accuracy: 26.5800
