# Experiment 1: Cross-Device Entropy Comparison Study

This notebook analyzes entropy metrics across different IoT devices to identify security and randomness differences in update traffic.

**Devices analyzed**: D-Link, Eufy, Sony TV, Tapo, Xiaomi

**Metrics**: Shannon entropy, Rényi entropy, Tsallis entropy

## Setup: Import Libraries and Configure

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Plot appearance shared by every figure below.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 10})

# Input and output locations, resolved relative to the user's home directory.
ENTROPY_DIR = os.path.expanduser("~/update_traffic/controlled/entropy")
OUTPUT_DIR = os.path.expanduser("~/update_traffic/analysis_output")
FIGURES_DIR, DATA_DIR = (
    os.path.join(OUTPUT_DIR, subdir) for subdir in ("figures", "data")
)

# Make sure both output folders exist before anything is written to them.
for out_dir in (FIGURES_DIR, DATA_DIR):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Devices to process and the entropy columns expected in each CSV.
DEVICES = ['dlink', 'eufy', 'sony_tv', 'tapo', 'xiaomi']
METRICS = ['entropy_shannon', 'entropy_renyi', 'entropy_tsallis']

print(
    f"Output directory: {OUTPUT_DIR}",
    f"Data directory: {DATA_DIR}",
    f"Entropy directory: {ENTROPY_DIR}",
    sep="\n",
)

## Step 1: Data Loading Functions

In [None]:
def extract_entropy_value(x):
    """Extract the entropy value from a tuple string like '(1, 0.82769)'.

    The second comma-separated field is parsed as a float. Whitespace
    around the whole string (e.g. ' (1, 0.5) ') is tolerated.

    Args:
        x: Cell value, expected to be a string of the form '(a, b)'.

    Returns:
        The second field as a float, or ``np.nan`` when ``x`` is not a
        string, has fewer than two fields, or the field is not numeric.
    """
    if not isinstance(x, str):
        return np.nan

    # Strip whitespace first; otherwise padded cells keep their parentheses
    # and the float conversion below fails.
    parts = x.strip().strip('()').split(',')
    if len(parts) < 2:
        return np.nan

    try:
        return float(parts[1].strip())
    except ValueError:
        # Only a malformed number is expected here; anything else is a bug
        # and should surface instead of being silently turned into NaN.
        return np.nan

def load_entropy_data(device_name):
    """Load the entropy CSV for one device, preferring the formatted file.

    Looks in ENTROPY_DIR for ``<device_name>_formatted.csv`` first and for
    ``<device_name>.csv`` second.

    * Formatted file: every METRICS column that is present is converted
      with ``extract_entropy_value``.
    * Raw file: only the three entropy columns are kept, coerced to
      numbers, and negative values are replaced with NaN.

    A read/parse failure is printed and the next candidate is tried.

    Returns:
        ``(DataFrame, "formatted")``, ``(DataFrame, "raw")``, or
        ``(None, None)`` when neither file could be loaded.
    """
    formatted_csv = os.path.join(ENTROPY_DIR, f"{device_name}_formatted.csv")
    raw_csv = os.path.join(ENTROPY_DIR, f"{device_name}.csv")

    # First choice: the formatted export with tuple-encoded entropy cells.
    if os.path.exists(formatted_csv):
        try:
            frame = pd.read_csv(formatted_csv)
            present = [name for name in METRICS if name in frame.columns]
            for name in present:
                frame[name] = frame[name].apply(extract_entropy_value)
            return frame, "formatted"
        except Exception as e:
            print(f"Error reading formatted {device_name}: {e}")

    # Second choice: the raw export with plain numeric entropy columns.
    if os.path.exists(raw_csv):
        try:
            frame = pd.read_csv(raw_csv)
            frame = frame[['entropy_shannon', 'entropy_renyi', 'entropy_tsallis']].copy()
            for name in METRICS:
                numeric = pd.to_numeric(frame[name], errors='coerce')
                frame[name] = numeric
                # Negative entries are blanked out rather than dropped so
                # the row count is unchanged.
                frame[name] = frame[name].mask(frame[name] < 0, np.nan)
            return frame, "raw"
        except Exception as e:
            print(f"Error reading raw {device_name}: {e}")

    return None, None

print("Loading functions defined.")

## Step 2: Load Data for All Devices

In [None]:
print("Loading entropy data for all devices...\n")

# device_data maps device name -> its DataFrame; all_entropy collects the
# same frames for concatenation into one combined table.
device_data = {}
all_entropy = []

for device in DEVICES:
    df, source = load_entropy_data(device)
    if df is None:
        print(f"✗ {device:12} - Not found")
        continue

    device_data[device] = df
    print(f"✓ {device:12} - {len(df):5} packets from {source} source")
    # Tag every row with its device so the combined frame stays attributable.
    df['device'] = device
    all_entropy.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail here with a message that says what to check.
if not all_entropy:
    raise ValueError(
        f"No entropy data could be loaded for any device from {ENTROPY_DIR}; "
        "check that the directory exists and contains the device CSV files."
    )

# Combine all data
combined_df = pd.concat(all_entropy, ignore_index=True)
print(f"\nTotal packets across all devices: {len(combined_df):,}")

## Step 3: Compute Statistics

In [None]:
def compute_statistics(df, device_name, metrics=None):
    """Compute summary statistics of each entropy metric for one device.

    Args:
        df: DataFrame with one column per entropy metric.
        device_name: Label stored under the ``'device'`` key of the result.
        metrics: Column names to summarise. Defaults to the module-level
            ``METRICS`` list when omitted.

    Returns:
        A dict containing ``'device'`` plus, for every requested metric
        column present in ``df``, the keys ``<short>_mean``, ``_std``,
        ``_min``, ``_max``, ``_median``, ``_q25``, ``_q75`` and ``_count``.
        ``<short>`` is the second underscore-separated part of the column
        name ('shannon' for 'entropy_shannon'), or the whole name when it
        contains no underscore. Metrics missing from ``df`` are skipped.
    """
    if metrics is None:
        metrics = METRICS

    stats_dict = {'device': device_name}

    for metric in metrics:
        if metric not in df.columns:
            continue

        values = pd.to_numeric(df[metric], errors='coerce')
        # The comparison is False for NaN, so this keeps only valid,
        # non-negative entropy values.
        values = values[values >= 0]

        name_parts = metric.split('_')
        short_name = name_parts[1] if len(name_parts) > 1 else metric

        # Insertion order here determines the column order of the stats table.
        summary = {
            'mean': values.mean(),
            'std': values.std(),
            'min': values.min(),
            'max': values.max(),
            'median': values.median(),
            'q25': values.quantile(0.25),
            'q75': values.quantile(0.75),
            'count': values.count(),
        }
        for stat_name, stat_value in summary.items():
            stats_dict[f'{short_name}_{stat_name}'] = stat_value

    return stats_dict

print("Computing entropy statistics per device...\n")

# One statistics dict per device that actually loaded.
# NOTE: the per-device result must never be bound to the name `stats`;
# that would replace the `scipy.stats` module imported at the top of the
# notebook and break the `stats.f_oneway` call in the ANOVA step.
stats_list = [
    compute_statistics(device_data[device], device)
    for device in DEVICES
    if device in device_data
]

stats_df = pd.DataFrame(stats_list)
stats_df.to_csv(os.path.join(DATA_DIR, "entropy_statistics.csv"), index=False)

# Display summary
summary = stats_df[['device', 'shannon_mean', 'renyi_mean', 'tsallis_mean', 'shannon_std']]
print("Summary Statistics (Mean Values):")
print(summary.to_string(index=False))
print(f"\n✓ Statistics saved to: {DATA_DIR}/entropy_statistics.csv")

## Step 4: Statistical Testing (ANOVA)

In [None]:
print("Performing ANOVA (Analysis of Variance)...\n")

# Build one array of valid Shannon entropies (no NaN, no negatives) for
# each device that was loaded.
shannon_groups = []
for name in DEVICES:
    if name not in device_data:
        continue
    samples = device_data[name]['entropy_shannon'].dropna().values
    shannon_groups.append(samples[samples >= 0])

# One-way ANOVA across the per-device groups.
f_stat, p_value = stats.f_oneway(*shannon_groups)

# Significance is judged at the 0.05 level.
is_significant = p_value < 0.05
verdict = '✓ YES (p < 0.05)' if is_significant else '✗ NO (p >= 0.05)'
wording = 'significantly different' if is_significant else 'NOT significantly different'

print("Shannon Entropy ANOVA Results:")
print(f"  F-statistic: {f_stat:.4f}")
print(f"  P-value: {p_value:.2e}")
print(f"  Significant difference: {verdict}")
print(f"\nInterpretation: Device entropy values are {wording}")

## Step 5: Visualization - Box Plots

In [None]:
# Box plots: one panel per entropy metric, one box per loaded device.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# The set of loaded devices (and so the tick labels) is the same for every panel.
loaded_devices = [d for d in DEVICES if d in device_data]
labels = [d.capitalize() for d in loaded_devices]

for panel, metric in zip(axes, ['entropy_shannon', 'entropy_renyi', 'entropy_tsallis']):
    name = metric.split('_')[1].capitalize()

    # Valid (non-NaN, non-negative) values of this metric for each device.
    plot_data = []
    for d in loaded_devices:
        series = device_data[d][metric].dropna()
        plot_data.append(series[series >= 0])

    bp = panel.boxplot(plot_data, labels=labels, patch_artist=True)
    for box in bp['boxes']:
        box.set_facecolor('lightblue')

    panel.set_ylabel('Entropy Value', fontsize=11)
    panel.set_title(f'{name} Entropy Distribution', fontsize=12, fontweight='bold')
    panel.grid(True, alpha=0.3, axis='y')
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, "entropy_boxplots.pdf"), dpi=300, bbox_inches='tight')
print(f"✓ Saved: {FIGURES_DIR}/entropy_boxplots.pdf")
plt.show()

## Step 6: Visualization - Distributions

In [None]:
# Histogram grid: rows are entropy metrics, columns are devices.
fig, axes = plt.subplots(3, 5, figsize=(16, 10))

for row_idx, metric in enumerate(METRICS):
    name = metric.split('_')[1].capitalize()
    for col_idx, device in enumerate(DEVICES):
        if device not in device_data:
            # Nothing loaded for this device: its panel is left blank.
            continue

        panel = axes[row_idx, col_idx]
        series = device_data[device][metric].dropna()
        series = series[series >= 0]

        panel.hist(series, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
        panel.set_title(f'{device.upper()}\n{name}', fontsize=10, fontweight='bold')
        panel.set_xlabel('Entropy', fontsize=9)
        # Only the left-most column carries the y-axis label.
        if col_idx == 0:
            panel.set_ylabel('Frequency', fontsize=9)
        panel.grid(True, alpha=0.3, axis='y')

plt.suptitle('Entropy Distributions Across Devices', fontsize=14, fontweight='bold', y=0.995)
plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, "entropy_distributions.pdf"), dpi=300, bbox_inches='tight')
print(f"✓ Saved: {FIGURES_DIR}/entropy_distributions.pdf")
plt.show()

## Step 7: Visualization - Heatmap

In [None]:
# Heatmap of the per-device mean of each entropy metric.
# Maps the statistics column to the label shown on the heatmap axis.
mean_labels = {
    'shannon_mean': 'Shannon',
    'renyi_mean': 'Rényi',
    'tsallis_mean': 'Tsallis',
}
heatmap_data = (
    stats_df[['device', *mean_labels]]
    .set_index('device')
    .rename(columns=mean_labels)
)

plt.figure(figsize=(8, 5))
# Transposed so metrics run down the rows and devices across the columns.
sns.heatmap(
    heatmap_data.T,
    annot=True,
    fmt='.3f',
    cmap='RdYlGn',
    cbar_kws={'label': 'Entropy Value'},
    linewidths=1,
    linecolor='gray',
)
plt.title('Mean Entropy Values by Device', fontsize=12, fontweight='bold')
plt.xlabel('Device', fontsize=11)
plt.ylabel('Entropy Metric', fontsize=11)
plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, "entropy_heatmap.pdf"), dpi=300, bbox_inches='tight')
print(f"✓ Saved: {FIGURES_DIR}/entropy_heatmap.pdf")
plt.show()

## Step 8: Visualization - Violin Plots

In [None]:
# Violin plot of the Shannon entropy distribution per device.
fig, ax = plt.subplots(figsize=(12, 6))

violin_data = []
for device in DEVICES:
    if device not in device_data:
        continue

    shannon = device_data[device]['entropy_shannon'].dropna()
    shannon = shannon[shannon >= 0]
    device_label = device.capitalize()

    # Use at most the first 200 valid samples per device to keep the plot clear.
    violin_data.extend(
        {'Device': device_label, 'Shannon Entropy': value}
        for value in shannon.iloc[:200]
    )

violin_df = pd.DataFrame(violin_data)
sns.violinplot(data=violin_df, x='Device', y='Shannon Entropy', ax=ax, palette='Set2')
ax.set_ylabel('Shannon Entropy', fontsize=11)
ax.set_xlabel('Device', fontsize=11)
ax.set_title('Shannon Entropy Distribution (Violin Plot)', fontsize=12, fontweight='bold')
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, "entropy_violins.pdf"), dpi=300, bbox_inches='tight')
print(f"✓ Saved: {FIGURES_DIR}/entropy_violins.pdf")
plt.show()

## Step 9: Summary Report

In [None]:
# Final text report: data volume, ANOVA outcome, device ranking and outputs.
print("\n" + "="*70)
print("CROSS-DEVICE ENTROPY ANALYSIS - SUMMARY REPORT")
print("="*70)

print("\nKey Findings:")
# Count the devices whose data actually loaded. DEVICES is the configured
# list and also contains devices whose files were missing or unreadable.
print(f"  • Total devices analyzed: {len(device_data)}")
print(f"  • Total packets processed: {len(combined_df):,}")
print(f"  • Entropy metrics: {', '.join([m.split('_')[1].title() for m in METRICS])}")

print("\nStatistical Results (ANOVA on Shannon Entropy):")
print(f"  • F-statistic: {f_stat:.4f}")
print(f"  • P-value: {p_value:.2e}")
print(f"  • Significant differences: {'YES' if p_value < 0.05 else 'NO'}")

print("\nDevice Rankings (by Shannon Entropy Mean):")
# Highest mean first; the index is reset so idx + 1 is the rank.
ranked = stats_df[['device', 'shannon_mean', 'shannon_std']].sort_values('shannon_mean', ascending=False).reset_index(drop=True)
for idx, row in ranked.iterrows():
    print(f"  {idx+1}. {row['device'].capitalize():12} - Mean: {row['shannon_mean']:.4f} ± {row['shannon_std']:.4f}")

print("\nInterpretation:")
print("  • Higher entropy → More randomness/encryption")
print("  • Lower entropy → More structured patterns/plaintext")

print("\nOutputs Generated:")
print(f"  Data: {DATA_DIR}/entropy_statistics.csv")
print("  Figures:")
print("    - entropy_boxplots.pdf")
print("    - entropy_distributions.pdf")
print("    - entropy_heatmap.pdf")
print("    - entropy_violins.pdf")
print("\n" + "="*70)

## Detailed Statistics Table

In [None]:
# Full statistics table, with every column except 'device' rounded to
# four decimals. `assign` returns a new frame, so stats_df is untouched.
rounded_columns = {
    column: stats_df[column].round(4)
    for column in stats_df.columns
    if column != 'device'
}
display_stats = stats_df.assign(**rounded_columns)

print("Full Entropy Statistics:")
print(display_stats.to_string(index=False))