# Model Training

## Setting up datasets

In [None]:
import os
import pandas as pd

# Put the path to the directories that contain your processed data.
# Each directory must contain a "high_pr" and a "low_pr" sub-directory.
dirs = [
    "sensor_pcap_classification/Data/Processed/BackdoorSensor",
    "sensor_pcap_classification/Data/Processed/CO2Sensor",
    "sensor_pcap_classification/Data/Processed/NoiseSensor",
    "sensor_pcap_classification/Data/Processed/TempHumiditySensor",
    "sensor_pcap_classification/Data/Processed/VibrationSensor"]

# Sub-directory name -> label written to the "Classification" column.
# Insertion order is kept, so "high" rows precede "low" rows for every sensor.
priority_subdirs = {"high_pr": "high", "low_pr": "low"}

image_files = []
classifications = []
sensor_types = []

# Walk every sensor directory and record one row per image file.
# (The loop variable is not called `dir`, which would shadow the builtin.)
for sensor_dir in dirs:
    sensor_type = os.path.basename(sensor_dir)  # Sensor type is the last path component
    for subdir_name, label in priority_subdirs.items():
        priority_dir = os.path.join(sensor_dir, subdir_name)
        # os.listdir returns entries in arbitrary order; sorting makes the row
        # order, and therefore any seeded sampling of this table, reproducible.
        for file_name in sorted(os.listdir(priority_dir)):
            file_path = os.path.join(priority_dir, file_name)
            # Skip anything that is not a regular file (e.g. a
            # ".ipynb_checkpoints" directory created by Jupyter).
            if not os.path.isfile(file_path):
                continue
            image_files.append(file_path)
            classifications.append(label)
            sensor_types.append(sensor_type)

# Create the DataFrame: one row per image with its label and sensor type
image_dataset = pd.DataFrame({
    "Image File": image_files,
    "Classification": classifications,
    "Sensor Type": sensor_types
})

image_dataset


In [None]:
# Persist the full dataset table as CSV, then read it back (first CSV column
# is the saved index) under the name `image_dataset_all`.
csv_file_path = "sensor_pcap_classification/Classification_model/image_dataset_all.csv"
image_dataset.to_csv(path_or_buf=csv_file_path)
image_dataset_all = pd.read_csv(filepath_or_buffer=csv_file_path, index_col=0)
image_dataset_all

A few trial runs showed that the whole dataset is not needed to get good results, so we train on a sample of the larger dataset instead.

In [None]:
import pandas as pd

# Group the full dataset by "Sensor Type" and "Classification".
# `image_dataset_all` (the full table reloaded from CSV) is used instead of
# `image_dataset` because a later cell rebinds `image_dataset` to the sampled
# subset; grouping that name would make re-running this cell sample from the
# already-sampled data.
grouped = image_dataset_all.groupby(["Sensor Type", "Classification"])

# Function to sample up to `n` (default 500) rows from each group
def sample_rows(group, n=500, random_state=42):
    """Return a reproducible random sample of at most ``n`` rows of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to sample from.
    n : int, default 500
        Maximum number of rows to return.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample``.

    Returns
    -------
    pandas.DataFrame
        ``n`` sampled rows, or every row of ``group`` (in sampled order) when
        the group holds fewer than ``n`` rows.
    """
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at the group size.
    return group.sample(n=min(n, len(group)), random_state=random_state)

# Draw the sample for every (sensor type, classification) group, then stack
# the pieces into one DataFrame with a fresh 0..N-1 index.
sampled_groups = [sample_rows(frame) for _key, frame in grouped]
sampled_dataset = pd.concat(sampled_groups, ignore_index=True)


In [None]:
# Write the sampled subset to disk, then reload it as the working
# `image_dataset` (saved index read back from the first column, then
# replaced by a fresh 0..N-1 index).
csv_file_path = "sensor_pcap_classification/Classification_model/image_dataset_training.csv"
sampled_dataset.to_csv(csv_file_path)
image_dataset = pd.read_csv(csv_file_path, index_col=0).reset_index(drop=True)
image_dataset

## Data preparation for training

In [None]:
!pip install timm

In [None]:
import os
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from timm import create_model
from PIL import Image
import pandas as pd
from sklearn.model_selection import train_test_split

# Train on the GPU when PyTorch can see one, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Image preprocessing: convert to a tensor, then normalise each channel
# NOTE(review): the mean/std values look like the usual ImageNet statistics - confirm
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocessing_steps)

# Work on the dataset table prepared earlier in the notebook
df = image_dataset

# Partition the rows by their priority label
is_high = df["Classification"] == "high"
is_low = df["Classification"] == "low"
high_classified = df[is_high]
low_classified = df[is_low]

# For each label: hold out 20% of the rows, then halve the hold-out into
# validation and test (80/10/10 overall), stratifying on sensor type each time
train_high, valtest_high = train_test_split(
    high_classified,
    test_size=0.2,
    random_state=42,
    stratify=high_classified["Sensor Type"],
)
val_high, test_high = train_test_split(
    valtest_high,
    test_size=0.5,
    random_state=42,
    stratify=valtest_high["Sensor Type"],
)

train_low, valtest_low = train_test_split(
    low_classified,
    test_size=0.2,
    random_state=42,
    stratify=low_classified["Sensor Type"],
)
val_low, test_low = train_test_split(
    valtest_low,
    test_size=0.5,
    random_state=42,
    stratify=valtest_low["Sensor Type"],
)

# Recombine the high and low parts of each split (high rows first)
train_df = pd.concat([train_high, train_low], ignore_index=True)
val_df = pd.concat([val_high, val_low], ignore_index=True)
test_df = pd.concat([test_high, test_low], ignore_index=True)

# Define the custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    """Dataset of labelled images described by a DataFrame.

    Every row must provide an "Image File" column (path of the image) and a
    "Classification" column. ``__getitem__`` returns ``(image, label)`` where
    ``label`` is 1 for "high" and 0 for any other value.
    """

    def __init__(self, dataframe, transform=None):
        """Store the table of samples and the optional per-image transform."""
        self.data = dataframe
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the table."""
        return len(self.data)

    def __getitem__(self, index):
        """Load the image at positional ``index`` and return ``(image, label)``."""
        # Look the row up once instead of once per column.
        row = self.data.iloc[index]

        # Image.open is lazy and keeps the file open; the context manager
        # closes it as soon as convert() has decoded the pixels.
        # convert("RGB") guarantees three channels, so greyscale or RGBA files
        # do not break transforms or models that expect 3-channel input.
        with Image.open(row["Image File"]) as raw_image:
            image = raw_image.convert("RGB")

        if self.transform:
            image = self.transform(image)

        label = 1 if row["Classification"] == "high" else 0

        return image, label

# Wrap each split in a CustomDataset that shares the same preprocessing
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_df, transform=transform)
    for split_df in (train_df, val_df, test_df)
)

# Batch the datasets; only the training loader shuffles its samples
batch_size = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


## Model Training

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np

# Seed PyTorch's global RNG before the model is built and the loader is
# iterated, so weight initialisation and shuffle order are repeatable between
# runs (same seed as the random_state=42 used for the data splits).
torch.manual_seed(42)

# Define the model
model_name = "resnet18"  # Replace with the desired model architecture
num_classes = 2
model = create_model(model_name, pretrained=True, num_classes=num_classes).to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Where the best-performing weights are written
checkpoint_path = "sensor_pcap_classification/Classification_model/main_pcap_classification_model.pth"

# Training loop
num_epochs = 5
# Start below any attainable accuracy so the first epoch always writes a
# checkpoint; with 0.0 and a strict ">" a run that never exceeds 0 accuracy
# would leave no checkpoint (or a stale one from an earlier run) on disk.
best_accuracy = float("-inf")

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    with tqdm(total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch") as pbar:
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # loss.item() is the batch mean; weight it by the batch size so the
            # epoch average is correct even when the last batch is smaller.
            train_loss += loss.item() * images.size(0)

            pbar.set_postfix(loss=loss.item())
            pbar.update()

    # Calculate average train loss per sample
    train_loss = train_loss / len(train_dataset)

    # Evaluation on the validation set
    model.eval()
    val_predictions = []
    val_targets = []

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)

            outputs = model(images)
            # Predicted class = index of the largest logit
            predicted = outputs.argmax(dim=1)

            val_predictions.extend(predicted.cpu().numpy())
            val_targets.extend(labels.cpu().numpy())

    # Accuracy and F1 over the whole validation set, computed once per epoch
    val_accuracy = accuracy_score(val_targets, val_predictions)
    val_f1 = f1_score(val_targets, val_predictions)

    # Print training progress
    print(f"Epoch [{epoch+1}/{num_epochs}] - Train Loss: {train_loss:.4f} - Val Accuracy: {val_accuracy:.4f} - Val F1 Score: {val_f1:.4f}")

    # Save the model with the best validation accuracy
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        torch.save(model.state_dict(), checkpoint_path)


## Model Evaluation

In [None]:
# Rebuild the architecture and load the best checkpoint saved during training.
# map_location=device lets a checkpoint written on a GPU be loaded on a
# CPU-only machine (and vice versa).
model = create_model(model_name, pretrained=False, num_classes=num_classes).to(device)
model.load_state_dict(
    torch.load(
        "sensor_pcap_classification/Classification_model/main_pcap_classification_model.pth",
        map_location=device,
    )
)

# Evaluation on the validation set
model.eval()
val_predictions = []
val_targets = []

with torch.no_grad():
    for images, labels in val_loader:
        images = images.to(device)

        outputs = model(images)
        # Predicted class = index of the largest logit
        predicted = outputs.argmax(dim=1)

        val_predictions.extend(predicted.cpu().numpy())
        val_targets.extend(labels.cpu().numpy())

# Calculate validation accuracy
val_accuracy = accuracy_score(val_targets, val_predictions)

# Calculate F1 score
val_f1 = f1_score(val_targets, val_predictions)

# Calculate and print the confusion matrix
confusion = confusion_matrix(val_targets, val_predictions)
print("Confusion Matrix:")
print(confusion)
print("Validation Accuracy:", val_accuracy)
print("Validation F1 Score:", val_f1)


## Time for inference

In [None]:
import time
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from timm import create_model
from PIL import Image
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix  # Add these import statements

# Define the model architecture and number of classes
model_name = 'resnet18'
num_classes = 2

# Set the device for inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the architecture and load the checkpoint saved during training.
# map_location=device lets a checkpoint written on a GPU be loaded on a
# CPU-only machine (and vice versa).
model = create_model(model_name, pretrained=False, num_classes=num_classes).to(device)
model.load_state_dict(
    torch.load(
        "sensor_pcap_classification/Classification_model/main_pcap_classification_model.pth",
        map_location=device,
    )
)

# Evaluation on the validation set
model.eval()
val_predictions = []
val_targets = []
total_inference_time = 0.0

# CUDA kernels are launched asynchronously: without synchronising, the timer
# would stop before the GPU has finished and under-report the inference time.
use_cuda = device.type == "cuda"

with torch.no_grad():
    for images, labels in val_loader:
        images = images.to(device)

        if use_cuda:
            torch.cuda.synchronize()  # make sure the host-to-device copy is done
        start_time = time.perf_counter()  # monotonic, high-resolution clock

        outputs = model(images)
        # Predicted class = index of the largest logit
        predicted = outputs.argmax(dim=1)

        if use_cuda:
            torch.cuda.synchronize()  # wait for the forward pass to finish
        end_time = time.perf_counter()

        total_inference_time += (end_time - start_time)

        val_predictions.extend(predicted.cpu().numpy())
        val_targets.extend(labels.cpu().numpy())

# Calculate validation accuracy
val_accuracy = accuracy_score(val_targets, val_predictions)

# Calculate F1 score
val_f1 = f1_score(val_targets, val_predictions)

# Calculate and print the confusion matrix
confusion = confusion_matrix(val_targets, val_predictions)
print("Confusion Matrix:")
print(confusion)
print("Validation Accuracy:", val_accuracy)
print("Validation F1 Score:", val_f1)

# Average inference time per pcap: total forward-pass time over all batches
# divided by the number of samples (pcaps) in the validation set
num_pcaps = len(val_loader.dataset)
average_inference_time_per_pcap = total_inference_time / num_pcaps
print("Average Inference Time per PCAP:", average_inference_time_per_pcap, "seconds")
