In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset, DataLoader

# Select the compute device: CUDA when a GPU is visible, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load Data
def load_data(file_path):
    """Read a CSV file and return its queries and labels as plain lists.

    Args:
        file_path: Location of a CSV file that has 'Query' and 'Label' columns.

    Returns:
        A ``(queries, labels)`` tuple of Python lists, both in row order.
    """
    frame = pd.read_csv(file_path)
    # Pull the two columns in the same order the caller unpacks them
    queries, labels = (frame[column].tolist() for column in ('Query', 'Label'))
    return queries, labels

# Read the two CSV splits; each call returns (queries, labels)
train_queries, train_labels = load_data(file_path='trainingdata.csv')
test_queries, test_labels = load_data(file_path='testingdata.csv')

# Pretrained tokenizer and encoder; the encoder is moved onto the chosen device
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model = bert_model.to(device)

# Feature Extraction using BERT
def extract_features(queries, batch_size=16):
    """Embed each query as the first-token ([CLS]) hidden state of BERT.

    The queries are encoded in slices of ``batch_size`` so that the whole
    corpus is never tokenized and pushed through the model in one forward
    pass, which exhausts memory on anything but tiny datasets.

    Args:
        queries: Sequence of query strings.
        batch_size: Number of queries encoded per forward pass (must be > 0).

    Returns:
        A 2-D numpy array with one row per query. For empty input an array
        of shape ``(0, bert_model.config.hidden_size)`` is returned.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if len(queries) == 0:
        # Nothing to encode; keep the column count consistent with real output
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    bert_model.eval()  # make sure dropout is disabled while extracting features
    features = []
    with torch.no_grad():
        for start in range(0, len(queries), batch_size):
            batch = queries[start:start + batch_size]
            # Padding is applied per batch; the attention mask hides the padded slots
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            outputs = bert_model(input_ids, attention_mask=attention_mask)
            # Position 0 of every sequence holds the CLS token embedding
            features.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    return np.vstack(features)

# Embed the training split first, then the test split, with BERT
train_features, test_features = (
    extract_features(split) for split in (train_queries, test_queries)
)

# Scale every feature using the min/max observed on the training split only
scaler = MinMaxScaler().fit(train_features)
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

# Labels become numpy arrays so they can be compared element-wise
train_labels, test_labels = np.array(train_labels), np.array(test_labels)

# The autoencoder is trained only on rows labelled 0 (non-injected queries)
normal_queries = train_features[(train_labels == 0)]

# Custom Dataset Class
class SQLDataset(Dataset):
    """Map-style dataset that serves one float32 feature row per index."""

    def __init__(self, features) -> None:
        # Convert the full feature matrix to a float32 tensor once, up front
        self.features = torch.tensor(features, dtype=torch.float32)

    def __len__(self) -> int:
        """Return the number of rows held by the dataset."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the feature tensor stored at position ``idx``."""
        return self.features[idx]

# Wrap the benign feature rows in a dataset and iterate them in shuffled mini-batches
batch_size = 32
train_dataset = SQLDataset(features=normal_queries)
train_loader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=batch_size,
)

# Autoencoder Model
class Autoencoder(nn.Module):
    """Fully connected autoencoder: input_dim -> 128 -> 64 -> 32 -> 64 -> 128 -> input_dim."""

    def __init__(self, input_dim):
        super().__init__()
        # Encoder layers are created before decoder layers, in forward order
        encoder_layers = [
            nn.Linear(input_dim, 128), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
        ]
        # The decoder mirrors the encoder; its last layer has no activation
        decoder_layers = [
            nn.Linear(32, 64), nn.ReLU(),
            nn.Linear(64, 128), nn.ReLU(),
            nn.Linear(128, input_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Compress ``x`` to the 32-unit code and return its reconstruction."""
        return self.decoder(self.encoder(x))

# Build the model to match the feature width, then set up the loss and optimizer
input_dim = train_features.shape[-1]
autoencoder = Autoencoder(input_dim)
autoencoder = autoencoder.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=autoencoder.parameters(), lr=1e-3)

# Train Autoencoder
epochs = 50
num_batches = len(train_loader)
for epoch in range(epochs):
    autoencoder.train()  # be explicit about the mode before every epoch
    total_loss = 0.0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        reconstructed = autoencoder(batch)
        # The autoencoder's target is its own input
        loss = criterion(reconstructed, batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean per-batch loss: a raw sum grows with the number of
    # batches and cannot be compared across dataset or batch sizes.
    # max(..., 1) avoids a ZeroDivisionError when the loader yields nothing.
    epoch_loss = total_loss / max(num_batches, 1)
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.6f}")

# Evaluate on Test Data
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

autoencoder.eval()  # inference mode for every forward pass below

normal_features_tensor = torch.tensor(normal_queries, dtype=torch.float32).to(device)
test_features_tensor = torch.tensor(test_features, dtype=torch.float32).to(device)
with torch.no_grad():
    reconstructed_normal = autoencoder(normal_features_tensor)
    reconstructed_features = autoencoder(test_features_tensor)

# Per-sample reconstruction error: squared error averaged over the feature axis
normal_reconstruction_loss = torch.mean(
    (normal_features_tensor - reconstructed_normal) ** 2, dim=1
).cpu().numpy()
reconstruction_loss = torch.mean(
    (test_features_tensor - reconstructed_features) ** 2, dim=1
).cpu().numpy()

# Set Threshold for Anomaly Detection
# The cut-off is calibrated on the benign training rows only. Taking a
# percentile of the test losses themselves would flag a fixed ~5% of the test
# set no matter what the model learned, and would leak test data into the
# decision rule.
threshold = np.percentile(normal_reconstruction_loss, 95)  # 95th percentile of benign errors

# Predict Labels: 1 = anomalous (error above the cut-off), 0 = normal
predicted_labels = (reconstruction_loss > threshold).astype(int)

# Evaluation
accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels)
recall = recall_score(test_labels, predicted_labels)
f1 = f1_score(test_labels, predicted_labels)

print(f"Accuracy: {accuracy:.4f}")

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Run on the GPU when CUDA is usable; fall back to the CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load Data
def load_data(file_path):
    """Load a labelled query CSV.

    Args:
        file_path: Path to a CSV file with 'Query' and 'Label' columns.

    Returns:
        ``(queries, labels)`` where ``queries`` is a Python list and
        ``labels`` is a numpy array, both in row order.
    """
    table = pd.read_csv(file_path)
    query_list = table['Query'].tolist()
    label_array = np.array(table['Label'])
    return query_list, label_array

# Training and test splits: a list of query strings plus a label array each
train_queries, train_labels = load_data(file_path='trainingdata.csv')
test_queries, test_labels = load_data(file_path='testingdata.csv')

# Load Tokenizer & BERT Model, then place the encoder on the selected device
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model = bert_model.to(device)

# Feature Extraction with BERT (Batch-wise Processing to Avoid Memory Overflow)
def extract_features(queries, batch_size=16, max_length=64):
    """Embed each query as the first-token ([CLS]) hidden state of BERT.

    Queries are encoded ``batch_size`` at a time to bound memory use.

    Args:
        queries: Sequence of query strings.
        batch_size: Number of queries per forward pass (must be > 0).
        max_length: Token limit handed to the tokenizer; longer queries are
            truncated to this length.

    Returns:
        A 2-D numpy array with one row per query. For empty input an array
        of shape ``(0, bert_model.config.hidden_size)`` is returned instead
        of letting ``np.vstack`` fail on an empty list.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if len(queries) == 0:
        # Nothing to encode; keep the column count consistent with real output
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    features = []
    bert_model.eval()

    with torch.no_grad():
        for start in range(0, len(queries), batch_size):
            batch = queries[start:start + batch_size]
            # Padding is applied per batch; the attention mask hides the padded slots
            inputs = tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=max_length,
            )
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            outputs = bert_model(input_ids, attention_mask=attention_mask)
            # Position 0 of every sequence holds the CLS token embedding
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            features.append(cls_embeddings)

    return np.vstack(features)

# Embed both splits with BERT (training split first)
print("Extracting BERT Features...")
train_features = extract_features(train_queries)
test_features = extract_features(test_queries)

# Min-max scaling: statistics come from the training split and are reused for the test split
scaler = MinMaxScaler()
scaler.fit(train_features)
train_features, test_features = (
    scaler.transform(train_features),
    scaler.transform(test_features),
)

# Keep only the rows labelled 0 (non-injection) for autoencoder training
normal_queries = train_features[(train_labels == 0)]

# Custom Dataset Class
class SQLDataset(Dataset):
    """Dataset wrapper that stores a feature matrix as one float32 tensor.

    Indexing returns the feature tensor at that position; there are no
    targets because the autoencoder reconstructs its own input.
    """

    def __init__(self, features) -> None:
        # One up-front conversion; items are later served by plain indexing
        self.features = torch.tensor(features, dtype=torch.float32)

    def __len__(self) -> int:
        # Length of the first tensor dimension, i.e. the number of samples
        return len(self.features)

    def __getitem__(self, idx):
        # Delegates directly to tensor indexing
        return self.features[idx]

# DataLoader
batch_size = 32
train_dataset = SQLDataset(normal_queries)
# The autoencoder in this cell contains BatchNorm1d layers, which cannot compute
# batch statistics from a single sample in training mode. When the dataset size
# leaves a trailing batch of exactly one row, drop that batch; because the data
# is reshuffled every epoch, a different row is left out each time.
drop_single_tail = len(train_dataset) % batch_size == 1
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=drop_single_tail,
)

# Optimized Autoencoder Model
class Autoencoder(nn.Module):
    """Autoencoder with batch normalisation and a 32-unit bottleneck.

    Encoder: input_dim -> 128 -> 64 -> 32, with BatchNorm1d after the first two
    linear layers and Dropout(0.2) after the first activation.
    Decoder: 32 -> 64 -> 128 -> input_dim, with BatchNorm1d after the first two
    linear layers and no activation on the output.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layers are instantiated encoder-first, in the order they are applied
        encoder_layers = [
            nn.Linear(input_dim, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(128, 64), nn.BatchNorm1d(64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
        ]
        decoder_layers = [
            nn.Linear(32, 64), nn.BatchNorm1d(64), nn.ReLU(),
            nn.Linear(64, 128), nn.BatchNorm1d(128), nn.ReLU(),
            nn.Linear(128, input_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the bottleneck and return the decoded reconstruction."""
        latent = self.encoder(x)
        return self.decoder(latent)

# Size the autoencoder to the feature width and place it on the device
input_dim = train_features.shape[-1]
autoencoder = Autoencoder(input_dim)
autoencoder = autoencoder.to(device)
# Smooth L1 (Huber-style) reconstruction loss, optimised with Adam
criterion = nn.SmoothL1Loss()
optimizer = optim.Adam(params=autoencoder.parameters(), lr=1e-3)

# Fit the autoencoder to the benign feature rows
epochs = 50
print("Training Autoencoder...")
for epoch in range(epochs):
    # Training mode: dropout is active and BatchNorm uses per-batch statistics
    autoencoder.train()
    total_loss = 0

    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        # Forward pass; the target of the reconstruction is the input itself
        reconstructed = autoencoder(batch)
        loss = criterion(reconstructed, batch)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss = total_loss + loss.item()

    # Log the loss averaged over the batches of this epoch
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss / len(train_loader):.6f}")

# Evaluate on Test Data
autoencoder.eval()  # dropout off, BatchNorm uses its running statistics
test_features_tensor = torch.tensor(test_features, dtype=torch.float32).to(device)
normal_features_tensor = torch.tensor(normal_queries, dtype=torch.float32).to(device)

with torch.no_grad():
    reconstructed_features = autoencoder(test_features_tensor)
    reconstructed_normal = autoencoder(normal_features_tensor)

# Compute Reconstruction Loss: per-sample mean squared error over the feature axis
reconstruction_loss = torch.mean(
    (test_features_tensor - reconstructed_features) ** 2, dim=1
).cpu().numpy()
normal_reconstruction_loss = torch.mean(
    (normal_features_tensor - reconstructed_normal) ** 2, dim=1
).cpu().numpy()

# Compute Threshold from the benign training errors only, so that the decision
# rule does not depend on the test set it is evaluated on.
mean_loss = np.mean(normal_reconstruction_loss)
std_loss = np.std(normal_reconstruction_loss)
# mean + 2 standard deviations: if the benign errors were Gaussian, roughly
# 97.7% of them would fall below this cut-off
threshold = mean_loss + (2 * std_loss)

# Predict Labels: 1 = anomalous (error above the cut-off), 0 = normal
predicted_labels = (reconstruction_loss > threshold).astype(int)

# Evaluation Metrics
accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels)
recall = recall_score(test_labels, predicted_labels)
f1 = f1_score(test_labels, predicted_labels)
roc_auc = roc_auc_score(test_labels, reconstruction_loss)  # threshold-free: ranks by raw error

# Print Metrics
print("\n==== Model Evaluation ====")
print(f"Accuracy    : {accuracy:.4f}")
print(f"Precision   : {precision:.4f}")
print(f"Recall      : {recall:.4f}")
print(f"F1 Score    : {f1:.4f}")
print(f"ROC-AUC     : {roc_auc:.4f}")
print(f"Anomaly Threshold: {threshold:.6f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Extracting BERT Features...


KeyboardInterrupt: 