In [None]:
import os
import pyshark
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import zipfile
import shutil
import py7zr

In [None]:
# Root folder of the USTC-TFC2016 dataset. The value looks like a placeholder:
# point it at the local copy of the dataset before running the notebook.
DATASET_DIR: str = "path/to/USTC-TFC2016"

# unzip and extract .7z files
def extract_7z_files(dataset_dir):
    """
    Unpack every .7z archive located directly inside ``dataset_dir``.

    Only the top level of ``dataset_dir`` is listed; sub-directories are not
    searched. An archive ``<name>.7z`` is extracted into
    ``<dataset_dir>/<name>``, and one "Extracted: ..." line is printed per
    archive.

    Args:
        dataset_dir: Directory to scan for ``.7z`` files.

    Returns:
        None.
    """
    suffix = ".7z"
    archive_names = [entry for entry in os.listdir(dataset_dir) if entry.endswith(suffix)]

    for archive_name in archive_names:
        source_path = os.path.join(dataset_dir, archive_name)
        # Destination folder is the archive name with the ".7z" suffix removed.
        target_dir = os.path.join(dataset_dir, archive_name[:-len(suffix)])
        with py7zr.SevenZipFile(source_path, mode='r') as archive:
            archive.extractall(path=target_dir)
        print(f"Extracted: {archive_name}")

def extract_features(pcap_file):
    """
    Extract basic features from a .pcap file.

    Features include packet count, average packet size, and protocol
    distribution.

    Args:
        pcap_file: Path of the capture file; passed to ``pyshark.FileCapture``.

    Returns:
        dict with the keys ``'packet_count'`` and ``'avg_packet_size'``
        (0 when the capture holds no packets), plus one key per distinct
        ``packet.highest_layer`` value mapping to the number of packets that
        reported it.

    Raises:
        Any exception raised while opening or iterating the capture, or by
        ``int(packet.length)``. The capture is closed before the exception
        propagates.
    """
    cap = pyshark.FileCapture(pcap_file)
    packet_count = 0
    total_packet_size = 0
    protocol_counts = {}

    # try/finally guarantees close() runs even when a packet fails to parse;
    # otherwise every failing file would leave its capture open.
    try:
        for packet in cap:
            packet_count += 1
            total_packet_size += int(packet.length)
            protocol = packet.highest_layer
            protocol_counts[protocol] = protocol_counts.get(protocol, 0) + 1
    finally:
        cap.close()

    # Guard against division by zero for an empty capture.
    avg_packet_size = total_packet_size / packet_count if packet_count > 0 else 0

    # Protocol counts are spread last, so a protocol whose name equals one of
    # the fixed keys would overwrite it.
    features = {
        'packet_count': packet_count,
        'avg_packet_size': avg_packet_size,
        **protocol_counts
    }
    return features

In [None]:
# Unpack any .7z archives sitting directly in DATASET_DIR before scanning.
extract_7z_files(DATASET_DIR)

# parse benign and malignant directories
benign_dir = os.path.join(DATASET_DIR, "benign")
malignant_dir = os.path.join(DATASET_DIR, "malignant")

# feature extraction: one feature dict and one integer label per capture file
data = []
labels = []

for label, category_dir in enumerate([benign_dir, malignant_dir]):
    if not os.path.isdir(category_dir):
        # os.walk() yields nothing for a missing directory, which would
        # otherwise produce an empty or single-class dataset without any hint.
        print(f"Warning: category directory not found: {category_dir}")
        continue

    for root, dirs, files in os.walk(category_dir):
        # Sorting (in place for dirs, so os.walk descends in that order) makes
        # the row order independent of filesystem listing order, which keeps
        # the later seeded train/test split reproducible.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".pcap"):
                continue
            pcap_path = os.path.join(root, file)
            try:
                features = extract_features(pcap_path)
            except Exception as e:
                # Best effort: report the unreadable capture and keep going.
                print(f"Error processing {pcap_path}: {e}")
                continue
            data.append(features)
            labels.append(label)  # 0 for benign, 1 for malignant

In [None]:
# One row per capture. A protocol that never appeared in a capture has no key
# in that capture's feature dict and shows up as NaN; treat it as a zero count.
df = pd.DataFrame(data).fillna(0)
df['label'] = labels

In [None]:
# Separate the target column from the feature matrix.
y = df['label']
X = df.drop('label', axis=1)

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a plain logistic-regression baseline on the training split.
# max_iter is raised to 1000 iterations for the solver.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# labels/target_names spell out the 0/1 encoding used when the rows were
# labelled (0 = benign, 1 = malignant), so the report rows are readable.
report = classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=["benign", "malignant"],
)
# sep="" stops print() from inserting a space in front of the report, which
# would push its header row out of alignment with the rows below it.
print("\nClassification Report:\n", report, sep="")