In [12]:
import pyshark
import nest_asyncio

# The Jupyter kernel already runs an asyncio event loop, and pyshark needs to
# drive its own. nest_asyncio.apply() patches asyncio so that loops may be
# nested; it is applied unconditionally and is safe to call more than once.
nest_asyncio.apply()

# Capture file analysed in this notebook; change it here to inspect another run.
PCAP_PATH = './Data/Physical_Interaction/Topology_B/physicalInteraction7.pcapng'

# NOTE(review): the capture object is never closed in the visible cells;
# presumably `packets.close()` should be called once the packets have been
# consumed -- confirm against the pyshark version in use.
packets = pyshark.FileCapture(PCAP_PATH)


In [24]:
import pandas as pd
import os
from glob import glob
import ast

file_path = './Data/'


def load_csv_frames(root):
    """Load every CSV below ``root`` into its own DataFrame.

    Files are visited in sorted path order so that the result does not depend
    on the (arbitrary) order in which the filesystem lists them.

    For each file:
      * if it has a 'Packet Number' column and is not the first file loaded,
        the original values are kept in 'Old_Packet Number' and
        'Packet Number' is shifted by the highest packet number seen so far,
        so that numbers do not restart at every file;
      * a 'File Source' column holding the file's base name is added.

    Parameters
    ----------
    root : str
        Directory that is searched recursively for ``*.csv`` files.

    Returns
    -------
    (list[str], list[pandas.DataFrame])
        The sorted file paths and the matching DataFrames, in the same order.
        Both lists are empty when no CSV file is found.
    """
    paths = sorted(glob(os.path.join(root, '**', '*.csv'), recursive=True))

    frames = []
    # Highest (already shifted) packet number over all frames loaded so far;
    # None until a frame with a non-empty 'Packet Number' column was seen.
    last_packet_number = None

    for path in paths:
        print(path)
        df = pd.read_csv(path)

        if 'Packet Number' in df.columns:
            if frames:
                df['Old_Packet Number'] = df['Packet Number']
                if last_packet_number is not None:
                    df['Packet Number'] += last_packet_number

            # Keep a running maximum instead of rescanning every earlier
            # frame; an empty column yields NaN and must not poison it.
            highest = df['Packet Number'].max()
            if pd.notna(highest) and (last_packet_number is None or highest > last_packet_number):
                last_packet_number = highest

        df['File Source'] = os.path.basename(path)
        frames.append(df)

    return paths, frames


# csv_files: every CSV found below file_path; dataframes: one frame per file.
csv_files, dataframes = load_csv_frames(file_path)

# Concatenate all DataFrames at once. pd.concat raises on an empty list, so
# the "nothing found" case is reported explicitly instead.
if dataframes:
    data = pd.concat(dataframes, ignore_index=True)
else:
    print(f"No CSV files found under {file_path!r}; 'data' is empty.")
    data = pd.DataFrame()


./Data/Scenario/Topology_A/groundtruth/scenario1_groundtruth.csv
./Data/Scenario/Topology_A/groundtruth/scenario2_groundtruth.csv
./Data/Scenario/Topology_B/groundtruth/scenario1_groundtruth.csv
./Data/Physical_Interaction/Topology_A/groundtruth/physicalInteraction3_groundtruth.csv
./Data/Physical_Interaction/Topology_A/groundtruth/physicalInteraction1_groundtruth.csv
./Data/Physical_Interaction/Topology_A/groundtruth/physicalInteraction2_groundtruth.csv
./Data/Physical_Interaction/Topology_B/groundtruth/physicalInteraction3_groundtruth.csv
./Data/Physical_Interaction/Topology_B/groundtruth/physicalInteraction4_groundtruth.csv
./Data/Physical_Interaction/Topology_B/groundtruth/physicalInteraction7_groundtruth.csv
./Data/Physical_Interaction/Topology_B/groundtruth/physicalInteraction1_groundtruth.csv
./Data/Physical_Interaction/Topology_B/groundtruth/physicalInteraction5_groundtruth.csv
./Data/Physical_Interaction/Topology_B/groundtruth/physicalInteraction6_groundtruth.csv
./Data/Physic

In [25]:
# For every ZigBee column with fewer than 100 distinct values, print its
# describe() summary (count, number of unique values, most common value and
# its frequency), followed by one line per value showing how many rows carry
# that value and what share of those rows are human commands.

# Test the column name first: it is cheap, whereas nunique() scans the column.
zigbee_columns = [
    column for column in data.columns
    if isinstance(column, str) and "ZigBee" in column and data[column].nunique() < 100
]

# The human-command subset does not depend on the column, so build it once
# (and only if there is something to report on).
human_command_rows = data[data['Human Command'] == 1] if zigbee_columns else None

for column in zigbee_columns:
    print(f'Column: {column}')
    print(data[column].describe())
    print()

    value_counts = data[column].value_counts()
    human_command_counts = human_command_rows[column].value_counts()

    for value, count in value_counts.items():
        # Values never seen in a human-command row are absent from the
        # second tally, hence the default of 0.
        human_command_count = human_command_counts.get(value, 0)
        percentage = (human_command_count / count) * 100
        print(f'Value: {value}, Count: {count}, Human Command Count: {human_command_count}, Percentage: {percentage:.2f}%')
    print()
            
        

Column: Device Name ZigBee
count      211025
unique         23
top       Unknown
freq       110771
Name: Device Name ZigBee, dtype: object

Value: Unknown, Count: 110771, Human Command Count: 8, Percentage: 0.01%
Value: Coordinator, Count: 49504, Human Command Count: 3323, Percentage: 6.71%
Value: Ledvance Bulb, Count: 7296, Human Command Count: 106, Percentage: 1.45%
Value: Power Plug 1, Count: 6703, Human Command Count: 17, Percentage: 0.25%
Value: Smart Socket, Count: 6588, Human Command Count: 20, Percentage: 0.30%
Value: Power Plug 2, Count: 5740, Human Command Count: 20, Percentage: 0.35%
Value: Philips Lamp 1, Count: 4368, Human Command Count: 61, Percentage: 1.40%
Value: Moes Bulb, Count: 4181, Human Command Count: 42, Percentage: 1.00%
Value: Philips Motion, Count: 3890, Human Command Count: 19, Percentage: 0.49%
Value: Philips Lamp 3, Count: 2756, Human Command Count: 8, Percentage: 0.29%
Value: Ledvance Z3 Plug, Count: 2734, Human Command Count: 48, Percentage: 1.76%
Value: 

In [4]:

# Drop rows with missing data in specific columns
data = data.dropna(subset=['Command String'])

# Parse every 'Command String' cell into a dict, one row at a time, so that a
# single malformed row no longer discards the expansion for all other rows.
# Rows that cannot be parsed (or do not evaluate to a dict) contribute an
# empty dict, i.e. NaN in every expanded column.
converted_list = []
unparsable_rows = 0
for command in data['Command String']:
    try:
        parsed = ast.literal_eval(command)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if not isinstance(parsed, dict):
        unparsable_rows += 1
        parsed = {}
    converted_list.append(parsed)

if unparsable_rows:
    print(f"Error in parsing 'Command String' column. "
          f"{unparsable_rows} of {len(converted_list)} row(s) could not be parsed.")

new_columns = pd.DataFrame(converted_list, index=data.index)  # Align with the original index

# DataFrame.join raises ValueError when both sides share a column name, which
# is what happens when this cell is run a second time. Keep the columns that
# are already in `data` and say so, rather than letting that error pass for a
# parsing failure.
already_present = new_columns.columns.intersection(data.columns)
if len(already_present):
    print(f"Columns already present in data, not added again: {list(already_present)}")
    new_columns = new_columns.drop(columns=already_present)

# Extend `data` with the new columns
data = data.join(new_columns)

# Use 'Packet Number' as the index if it exists. set_index replaces the
# current index, so no reset_index is needed beforehand.
if 'Packet Number' in data.columns:
    data = data.set_index('Packet Number')

print("Data processing complete.")

Data processing complete.
