In [30]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from typing import List, Dict, Any
import json

class NetworkAnalysisVisualizer:
    def __init__(self):
        self.data = {}
        self.metrics = [
            'totalTransferTime', 'downloadSpeed', 'dnsLookup',
            'tcpConnection', 'tlsHandshake', 'timeToFirstByte'
        ]
        self.colors = px.colors.qualitative.Set3

    def load_data(self, file_paths: List[str]) -> None:
        """Load data from multiple JSON files."""
        for file_path in file_paths:
            with open(file_path, 'r') as f:
                data = json.load(f)
                tool_name = data[0].get('toolName', file_path.split('/')[-1].replace('.json', ''))
                self.data[tool_name] = data

    def _prepare_transfer_df(self, filter_mode='all') -> pd.DataFrame:
        """Convert nested JSON data to flat DataFrame."""
        rows = []
        for tool, measurements in self.data.items():
            for measurement in measurements:
                for transfer in measurement['fileTransfers']:
                    if filter_mode == 'success' and not (transfer['transferSuccess'] and 
                                                       transfer['hashMatch'] and 
                                                       transfer['sizeMatch']):
                        continue
                    if filter_mode == 'partial' and not (transfer['transferSuccess'] and 
                                                       (not transfer['hashMatch'] or 
                                                        not transfer['sizeMatch'])):
                        continue
                    if filter_mode == 'failure' and transfer['transferSuccess']:
                        continue
                    
                    row = {
                        'tool': tool,
                        'fileSize': transfer.get('fileSize', 0),
                        **{metric: transfer.get(metric, 0) for metric in self.metrics}
                    }
                    rows.append(row)
        
        return pd.DataFrame(rows)

    def violin_plot(self, metric: str, filter_mode='all') -> go.Figure:
        """Create violin plot for timing distribution."""
        df = self._prepare_transfer_df(filter_mode)
        fig = go.Figure()
        
        num_colors = len(self.colors)
        
        for idx, tool in enumerate(df['tool'].unique()):
            tool_data = df[df['tool'] == tool][metric]
            fig.add_trace(go.Violin(
                x=[tool] * len(tool_data),
                y=tool_data,
                name=tool,
                box_visible=True,
                meanline_visible=True,
                fillcolor=self.colors[idx % num_colors],  # Cycle through colors
                line_color='black'
            ))
        
        fig.update_layout(
            title=f'Distribution of {metric} by Tool',
            xaxis_title='Tool',
            yaxis_title=metric,
            showlegend=False
        )
        return fig
    def cdf_plot(self, metric: str, filter_mode='all') -> go.Figure:
        """Create CDF plot for timing/latency analysis."""
        df = self._prepare_transfer_df(filter_mode)
        fig = go.Figure()
        
        for idx, tool in enumerate(df['tool'].unique()):
            tool_data = df[df['tool'] == tool][metric].sort_values()
            cum_prob = np.linspace(0, 1, len(tool_data))
            
            fig.add_trace(go.Scatter(
                x=tool_data,
                y=cum_prob,
                name=tool,
                line=dict(color=self.colors[idx])
            ))
        
        fig.update_layout(
            title=f'Cumulative Distribution Function of {metric}',
            xaxis_title=metric,
            yaxis_title='Cumulative Probability',
            showlegend=True
        )
        return fig

    def scatter_with_trendline(self, y_metric: str, filter_mode='all') -> go.Figure:
        """Create scatter plot with trendlines for file size vs. metric."""
        df = self._prepare_transfer_df(filter_mode)
        fig = go.Figure()
        
        for idx, tool in enumerate(df['tool'].unique()):
            tool_df = df[df['tool'] == tool]
            
            # Add scatter points
            fig.add_trace(go.Scatter(
                x=tool_df['fileSize'],
                y=tool_df[y_metric],
                mode='markers',
                name=f'{tool} (data)',
                marker=dict(color=self.colors[idx])
            ))
            
            # Add trendline
            z = np.polyfit(tool_df['fileSize'], tool_df[y_metric], 1)
            p = np.poly1d(z)
            x_range = np.linspace(tool_df['fileSize'].min(), tool_df['fileSize'].max(), 100)
            
            fig.add_trace(go.Scatter(
                x=x_range,
                y=p(x_range),
                mode='lines',
                name=f'{tool} (trend)',
                line=dict(color=self.colors[idx], dash='dash')
            ))
        
        fig.update_layout(
            title=f'File Size vs {y_metric}',
            xaxis_title='File Size (bytes)',
            yaxis_title=y_metric,
            showlegend=True
        )
        return fig

    def box_plot_transfer_time(self, filter_mode='all') -> go.Figure:
        """Create box plots for transfer time for each individual measurement per tool."""
        df = self._prepare_transfer_df(filter_mode)
        
        fig = go.Figure()
        
        for idx, tool in enumerate(df['tool'].unique()):
            tool_df = df[df['tool'] == tool]
            
            fig.add_trace(go.Box(
                x=tool_df['measurementNumber'],
                y=tool_df['totalTransferTime'],
                name=tool,
                marker_color=self.colors[idx]
            ))
        
        fig.update_layout(
            title='Transfer Time Distribution by Measurement and Tool',
            xaxis_title='Measurement Number',
            yaxis_title='Transfer Time (ms)',
            boxmode='group'
        )
        return fig


    def box_plot_by_size(self, metric: str, size_bins=5, filter_mode='all') -> go.Figure:
        """Create box plots grouped by file size bins."""
        df = self._prepare_transfer_df(filter_mode)
        df['size_group'] = pd.qcut(df['fileSize'], q=size_bins, labels=[f'Q{i+1}' for i in range(size_bins)])
        
        fig = go.Figure()
        
        for idx, tool in enumerate(df['tool'].unique()):
            tool_df = df[df['tool'] == tool]
            
            fig.add_trace(go.Box(
                x=tool_df['size_group'],
                y=tool_df[metric],
                name=tool,
                marker_color=self.colors[idx]
            ))
        
        fig.update_layout(
            title=f'{metric} Distribution by File Size Quantiles',
            xaxis_title='File Size Group',
            yaxis_title=metric,
            boxmode='group'
        )
        return fig

    def stacked_time_components(self, filter_mode='all') -> go.Figure:
        """Create stacked bar chart for time component breakdown."""
        df = self._prepare_transfer_df(filter_mode)
        time_components = ['dnsLookup', 'tcpConnection', 'tlsHandshake', 'timeToFirstByte']
        
        fig = go.Figure()
        
        for idx, component in enumerate(time_components):
            fig.add_trace(go.Bar(
                name=component,
                x=df['tool'].unique(),
                y=[df[df['tool'] == tool][component].mean() for tool in df['tool'].unique()],
                marker_color=self.colors[idx]
            ))
        
        fig.update_layout(
            title='Average Time Components by Tool',
            xaxis_title='Tool',
            yaxis_title='Time (ms)',
            barmode='stack'
        )
        return fig

    def success_rate_heatmap(self) -> go.Figure:
        """Create heatmap of success rates by file size and tool."""
        df = self._prepare_transfer_df('all')
        df['size_group'] = pd.qcut(df['fileSize'], q=5, labels=['Very Small', 'Small', 'Medium', 'Large', 'Very Large'])
        
        success_matrix = pd.pivot_table(
            df,
            values='transferSuccess',
            index='tool',
            columns='size_group',
            aggfunc='mean'
        )
        
        fig = go.Figure(data=go.Heatmap(
            z=success_matrix.values,
            x=success_matrix.columns,
            y=success_matrix.index,
            colorscale='RdYlGn',
            text=np.round(success_matrix.values * 100, 1),
            texttemplate='%{text}%',
            textfont={"size": 10},
            hoverongaps=False
        ))
        
        fig.update_layout(
            title='Transfer Success Rate by Tool and File Size',
            xaxis_title='File Size Group',
            yaxis_title='Tool'
        )
        return fig

    def error_breakdown(self) -> go.Figure:
        """Create bar chart for error counts by stage and tool."""
        df = self._prepare_transfer_df('all')
        error_types = ['tunnelError', 'diagnosticsError', 'measurementError', 'cleanupError']
        
        fig = go.Figure()
        
        for idx, error_type in enumerate(error_types):
            fig.add_trace(go.Bar(
                name=error_type,
                x=df['tool'].unique(),
                y=[df[df['tool'] == tool][error_type].sum() for tool in df['tool'].unique()],
                marker_color=self.colors[idx]
            ))
        
        fig.update_layout(
            title='Error Counts by Stage and Tool',
            xaxis_title='Tool',
            yaxis_title='Error Count',
            barmode='group'
        )
        return fig

# Import necessary libraries
import os
import json
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from typing import List

# Initialize the visualizer
visualizer = NetworkAnalysisVisualizer()

# Function to load JSON files from a specified directory and load into the visualizer
def load_all_json_from_directory(directory_path: str) -> None:
    """Load all JSON files from the specified directory and load them into the visualizer."""
    file_paths = [os.path.join(directory_path, f) for f in os.listdir(directory_path) if f.endswith('.json')]
    visualizer.load_data(file_paths)

# Specify the directory containing JSON files
directory_path = './all-graphed'  # Change this to your actual directory path

# Load the data
load_all_json_from_directory(directory_path)

# Generate and display example visualizations
# You can comment/uncomment the figures you wish to generate

# Violin plot for 'totalTransferTime'
violin_fig = visualizer.violin_plot('totalTransferTime')
violin_fig.show()

# CDF plot for 'downloadSpeed'
cdf_fig = visualizer.cdf_plot('downloadSpeed')
cdf_fig.show()

# Scatter plot with trendline for 'downloadSpeed' vs 'fileSize'
scatter_fig = visualizer.scatter_with_trendline('downloadSpeed')
scatter_fig.show()

# Box plot by file size bins for 'dnsLookup'
# box_fig = visualizer.box_plot_by_size('dnsLookup')
# box_fig.show()

# Stacked time components bar chart
stacked_fig = visualizer.stacked_time_components()
stacked_fig.show()

# Success rate heatmap
heatmap_fig = visualizer.success_rate_heatmap()
heatmap_fig.show()

# Error breakdown bar chart
error_fig = visualizer.error_breakdown()
error_fig.show()


IndexError: list index out of range

In [19]:
import os
import json
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from typing import List

# Initialize the visualizer
visualizer = NetworkAnalysisVisualizer()

# Function to load JSON files from a specified directory and load into the visualizer
def load_all_json_from_directory(directory_path: str) -> None:
    """Load all JSON files from the specified directory and load them into the visualizer."""
    file_paths = [os.path.join(directory_path, f) for f in os.listdir(directory_path) if f.endswith('.json')]
    visualizer.load_data(file_paths)

# Specify the directory containing JSON files
directory_path = './all-24-11-05-17-50-43'  # Change this to your actual directory path

# Load the data
load_all_json_from_directory(directory_path)

# Generate and display example visualizations
# You can comment/uncomment the figures you wish to generate

# Violin plot for 'totalTransferTime'
violin_fig = visualizer.violin_plot('totalTransferTime')
violin_fig.show()

# CDF plot for 'downloadSpeed'
cdf_fig = visualizer.cdf_plot('downloadSpeed')
cdf_fig.show()

# Scatter plot with trendline for 'downloadSpeed' vs 'fileSize'
scatter_fig = visualizer.scatter_with_trendline('downloadSpeed')
scatter_fig.show()

# Box plot by file size bins for 'dnsLookup'
box_fig = visualizer.box_plot_by_size('dnsLookup')
box_fig.show()

# Stacked time components bar chart
stacked_fig = visualizer.stacked_time_components()
stacked_fig.show()

# Success rate heatmap
heatmap_fig = visualizer.success_rate_heatmap()
heatmap_fig.show()

# Error breakdown bar chart
error_fig = visualizer.error_breakdown()
error_fig.show()

# New method to handle duplicate bin edges
def box_plot_by_size(self, metric: str, size_bins=5, filter_mode='all') -> go.Figure:
    """Create box plots grouped by file size bins."""
    df = self._prepare_transfer_df(filter_mode)
    df['size_group'] = pd.qcut(df['fileSize'], q=size_bins, labels=[f'Q{i+1}' for i in range(size_bins)], duplicates='drop')
    
    fig = go.Figure()
    
    for idx, tool in enumerate(df['tool'].unique()):
        tool_df = df[df['tool'] == tool]
        
        fig.add_trace(go.Box(
            x=tool_df['size_group'],
            y=tool_df[metric],
            name=tool,
            marker_color=self.colors[idx]
        ))
    
    fig.update_layout(
        title=f'{metric} Distribution by File Size Quantiles',
        xaxis_title='File Size Group',
        yaxis_title=metric,
        boxmode='group'
    )
    return fig

# New method to plot partial successes, only successes, and full failures
def plot_transfer_outcomes(self) -> go.Figure:
    """Create a bar chart showing the count of partial successes, only successes, and full failures."""
    df = self._prepare_transfer_df('all')
    df['outcome'] = df.apply(lambda row: 'Success' if row['transferSuccess'] and row['hashMatch'] and row['sizeMatch'] else ('Partial Success' if row['transferSuccess'] else 'Failure'), axis=1)
    
    outcome_counts = df.groupby(['tool', 'outcome']).size().unstack(fill_value=0)
    
    fig = go.Figure()
    
    for outcome in outcome_counts.columns:
        fig.add_trace(go.Bar(
            name=outcome,
            x=outcome_counts.index,
            y=outcome_counts[outcome],
            marker_color=self.colors[outcome_counts.columns.get_loc(outcome)]
        ))
    
    fig.update_layout(
        title='Transfer Outcomes by Tool',
        xaxis_title='Tool',
        yaxis_title='Count',
        barmode='stack'
    )
    return fig

# Example usage of the new method
outcome_fig = visualizer.plot_transfer_outcomes()
outcome_fig.show()

FileNotFoundError: [Errno 2] No such file or directory: './all-24-11-05-17-50-43'

In [13]:
transfer_time_fig = visualizer.box_plot_transfer_time()
transfer_time_fig.show()

KeyError: "'measurementNumber' column is missing from the DataFrame."