# Function Lib

In [None]:

def _find_header(file_path, max_lines=20):
    """Locate the column-header line among the first ``max_lines`` lines.

    Returns a ``(row_index, stripped_line)`` tuple. The preferred header is the
    first line containing 'Index' together with 'Second' or 'Time'; otherwise
    the last non-empty line of the inspected window is used.

    Raises ValueError when every inspected line is empty.
    """
    # utf-8-sig drops a leading BOM so the first column name is not polluted;
    # errors='replace' keeps header sniffing from failing on stray bytes.
    with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
        header_lines = [f.readline() for _ in range(max_lines)]

    for i, line in enumerate(header_lines):
        if 'Index' in line and ('Second' in line or 'Time' in line):
            print(f"Found header at line {i}: {line.strip()}")
            return i, line.strip()

    # Fallback: the last non-empty line within the inspected window.
    for i in reversed(range(len(header_lines))):
        content = header_lines[i].strip()
        if content:
            print(f"Using line {i} as header: {content}")
            return i, content

    raise ValueError(
        f"Could not find a header: the first {max_lines} lines of {file_path!r} are empty"
    )


def _pick_column(columns, candidates, exclude=None):
    """Return the first column whose name contains any candidate substring.

    ``exclude`` names a column that must not be returned. Returns None when
    nothing matches.
    """
    for col in columns:
        if col == exclude:
            continue
        if any(candidate in col for candidate in candidates):
            return col
    return None


def _build_line_figure(x, y, reading_col, trace_name, title, xaxis_title,
                       yaxis_title, mode='lines', line_width=2, marker=None):
    """Build the 1200x600 Plotly line figure shared by all downsampling modes."""
    trace_kwargs = dict(
        x=x,
        y=y,
        mode=mode,
        line=dict(width=line_width),
        name=trace_name,
        hovertemplate=f'<b>Time:</b> %{{x:.2f}} minutes<br><b>{reading_col}:</b> %{{y:.4f}}<extra></extra>'
    )
    if marker is not None:
        trace_kwargs['marker'] = marker

    fig = go.Figure()
    fig.add_trace(go.Scatter(**trace_kwargs))
    fig.update_layout(
        title=title,
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title,
        width=1200,
        height=600,
        showlegend=True,
        hovermode='x unified'
    )
    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
    return fig


def plot_large_dataset(file_path, chunk_size=1000000, downsample_method='uniform', n_points=10000):
    """
    Plot Reading vs Seconds from a large dataset by processing it in chunks and downsampling.

    The file is scanned once to find the time range and row count, then read a
    second time to build the downsampled series. The figure is written to an
    HTML file in the current working directory, a PNG export is attempted, and
    the figure is shown.

    Parameters:
    -----------
    file_path : str
        Path to the data file (comma-separated, header within the first 20 lines)
    chunk_size : int
        Number of rows to read in each chunk
    downsample_method : str
        Method for downsampling: 'uniform' (every k-th row), 'mean' (one
        average per chunk), or 'bin' (average per equal-width time bin)
    n_points : int
        Target number of points to plot (number of bins for 'bin')

    Returns:
    --------
    tuple
        ``(output_file_html, fig)``: the HTML file name and the Plotly figure

    Raises:
    -------
    ValueError
        If ``downsample_method`` is not supported, ``n_points`` or
        ``chunk_size`` is not positive, no header or second column can be
        found, or the file has no usable data rows.
    """
    # Validate up front so a typo fails immediately instead of after two full
    # passes over a multi-gigabyte file.
    supported_methods = ('uniform', 'mean', 'bin')
    if downsample_method not in supported_methods:
        raise ValueError(
            f"Unsupported downsample_method {downsample_method!r}; "
            f"expected one of {supported_methods}"
        )
    if n_points < 1:
        raise ValueError(f"n_points must be a positive integer, got {n_points!r}")
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    print(f"Starting to process dataset: {Path(file_path).name}")
    start_time = time.time()

    # First, determine the header structure and column names
    print("Examining file structure...")
    header_row, header_content = _find_header(file_path)

    columns = [col.strip() for col in header_content.split(',')]
    print(f"Detected columns: {columns}")

    # Time column: name match first, then 'Index', then the first column.
    time_col = _pick_column(columns, ['Seconds', 'Time', 'seconds', 'time'])
    if time_col is None:
        time_col = 'Index' if 'Index' in columns else columns[0]
        print(f"Using '{time_col}' as time column")

    # Reading column: name match first (never the time column itself), then
    # the first column that differs from the time column.
    reading_col = _pick_column(
        columns, ['Reading', 'Value', 'reading', 'value', 'Data'], exclude=time_col
    )
    if reading_col is None:
        reading_col = next((col for col in columns if col != time_col), None)
        if reading_col is None:
            raise ValueError(
                f"Need at least two distinct columns to plot, found only: {columns}"
            )
        print(f"Using '{reading_col}' as reading column")

    print(f"Selected columns: Time = '{time_col}', Reading = '{reading_col}'")

    def read_chunks():
        # skiprows=header_row drops the preamble so the header line comes first.
        return pd.read_csv(file_path, skiprows=header_row, chunksize=chunk_size,
                           usecols=[time_col, reading_col])

    # Pass 1: find the time range and the real number of data rows.
    min_seconds = float('inf')
    max_seconds = float('-inf')

    print("Scanning file for time range...")
    chunks_read = 0
    total_rows = 0

    for chunk in read_chunks():
        min_seconds = min(min_seconds, chunk[time_col].min())
        max_seconds = max(max_seconds, chunk[time_col].max())
        chunks_read += 1
        total_rows += len(chunk)
        print(f"Scanned chunk {chunks_read}, total rows: {total_rows}")

    if total_rows == 0 or not (np.isfinite(min_seconds) and np.isfinite(max_seconds)):
        raise ValueError(f"No usable data rows found in {file_path!r}")

    print(f"Time range: {min_seconds/60:.2f} to {max_seconds/60:.2f} minutes ({min_seconds:.2f} to {max_seconds:.2f} seconds)")

    if downsample_method == 'bin':
        num_bins = n_points
        bin_width = (max_seconds - min_seconds) / num_bins

        bin_counts = np.zeros(num_bins)
        bin_sums = np.zeros(num_bins)
        bin_centers = min_seconds + (np.arange(num_bins) + 0.5) * bin_width

        print("Processing chunks for binning...")
        chunks_read = 0

        for chunk in read_chunks():
            # Rows with a missing time or reading cannot contribute to a mean.
            chunk = chunk.dropna(subset=[time_col, reading_col])
            times = chunk[time_col].to_numpy(dtype=float)
            readings = chunk[reading_col].to_numpy(dtype=float)

            if bin_width > 0:
                bin_indices = np.floor((times - min_seconds) / bin_width).astype(int)
                # The maximum timestamp lands exactly on the upper edge; fold
                # it into the last bin.
                bin_indices = np.clip(bin_indices, 0, num_bins - 1)
            else:
                # All timestamps are identical: everything goes in the first bin.
                bin_indices = np.zeros(len(times), dtype=int)

            # One vectorised pass instead of a Python loop over every bin.
            bin_counts += np.bincount(bin_indices, minlength=num_bins)
            bin_sums += np.bincount(bin_indices, weights=readings, minlength=num_bins)

            chunks_read += 1
            print(f"Processed chunk {chunks_read} for binning")

        # Empty bins keep a mean of 0, as before.
        bin_means = np.divide(bin_sums, bin_counts,
                              out=np.zeros(num_bins), where=bin_counts > 0)

        fig = _build_line_figure(
            x=bin_centers / 60.0,
            y=bin_means,
            reading_col=reading_col,
            trace_name=f'{reading_col} (Binned Average)',
            title=f'{reading_col} vs {time_col} - Binned Average ({n_points} bins)',
            xaxis_title='Minutes',
            yaxis_title=f'{reading_col} (Average)',
        )

    elif downsample_method == 'uniform':
        # Derive the stride from the row count measured in pass 1 so that
        # roughly n_points rows are kept whatever the file size.
        skip_factor = max(1, total_rows // n_points)
        print(f"Using uniform sampling with skip factor: {skip_factor}")

        first_data_row = header_row + 1

        def skip_line(line_number):
            if line_number < header_row:
                return True   # preamble above the header
            if line_number == header_row:
                return False  # the header itself must always be read
            return (line_number - first_data_row) % skip_factor != 0

        sampled_data = pd.read_csv(file_path, skiprows=skip_line,
                                   usecols=[time_col, reading_col])

        print(f"Sampled {len(sampled_data)} points from dataset")

        # Convert seconds to minutes for plotting
        sampled_data['time_minutes'] = sampled_data[time_col] / 60.0

        fig = _build_line_figure(
            x=sampled_data['time_minutes'],
            y=sampled_data[reading_col],
            reading_col=reading_col,
            trace_name=f'{reading_col} (Uniform Sample)',
            title=f'{reading_col} vs {time_col} - Uniform Sampling (approx. {n_points} points)',
            xaxis_title=f'{time_col} (minutes)',
            yaxis_title=reading_col,
            line_width=1,
        )

    else:  # 'mean' — one averaged point per chunk
        chunk_means = []
        chunk_times = []

        print("Processing chunks for mean values...")
        chunks_read = 0

        for chunk in read_chunks():
            chunk_means.append(chunk[reading_col].mean())
            chunk_times.append(chunk[time_col].mean())

            chunks_read += 1
            print(f"Processed chunk {chunks_read} for means")

        fig = _build_line_figure(
            x=[t / 60.0 for t in chunk_times],
            y=chunk_means,
            reading_col=reading_col,
            trace_name=f'{reading_col} (Chunk Averages)',
            title=f'{reading_col} vs {time_col} - Chunk Averages ({chunks_read} chunks)',
            xaxis_title=f'{time_col} (minutes)',
            yaxis_title=f'{reading_col} (Average)',
            mode='lines+markers',
            marker=dict(size=6),
        )

    end_time = time.time()
    print(f"Processing completed in {end_time - start_time:.2f} seconds")

    # Output names are derived from the column names and the method.
    output_file_html = f"{reading_col}_vs_{time_col}_minutes_{downsample_method}.html"
    output_file_png = f"{reading_col}_vs_{time_col}_minutes_{downsample_method}.png"

    # Save as interactive HTML
    fig.write_html(output_file_html)
    print(f"Interactive plot saved as {output_file_html}")

    # PNG export is best-effort: it needs the optional kaleido package, so a
    # failure is reported and the function carries on.
    try:
        fig.write_image(output_file_png, width=1200, height=600, scale=2)
        print(f"Static plot saved as {output_file_png}")
    except Exception as e:
        print(f"Could not save PNG (install kaleido for PNG export): {e}")

    # Show the plot
    fig.show()

    return output_file_html, fig


In [None]:
from tqdm.notebook import tqdm  # For progress bars
from utils import dataset_analyze_rasp_ff, open_file_nf1, seconds_to_duration
import os
import plotly.express as px
import plotly.graph_objects as go
import time
from pathlib import Path
import pandas as pd
import numpy as np
import os

# Keep results accumulated by earlier runs of this cell; start empty otherwise.
if 'result_df' not in globals():
    result_df = pd.DataFrame()

# File names already present in result_df, used to skip re-processing.
# NOTE(review): assumes dataset_analyze_rasp_ff stores the bare file name in a
# 'File name' column — confirm against utils.
if not result_df.empty and 'File name' in result_df.columns:
    result_filenames = set(result_df['File name'])
else:
    result_filenames = set()

unique_filenames = set()

# Define paths
base_dir = './data/Experiment_Data'
reels_dir = os.path.join(base_dir, 'SIR_Experiment', 'Reels')
voice_dir = os.path.join(base_dir, 'SIR_Experiment', 'Voice call')
pubg_dir = os.path.join(base_dir, 'SIR_Experiment', 'pubg')
streaming_dir = os.path.join(base_dir, 'Video straming', 'db')


def _list_csv_files(directory):
    """Return the paths of the ``.csv`` files directly inside ``directory``.

    Names are sorted because ``os.listdir`` returns entries in arbitrary
    order; sorting keeps the processing order reproducible.
    """
    return [os.path.join(directory, f)
            for f in sorted(os.listdir(directory))
            if f.endswith('.csv')]


# Automatically collect all CSV files from every experiment folder
reels_files = _list_csv_files(reels_dir)
voice_files = _list_csv_files(voice_dir)
pubg_files = _list_csv_files(pubg_dir)
streaming_files = _list_csv_files(streaming_dir)

# Combine lists
file_list = reels_files + voice_files + pubg_files + streaming_files

# Process files
files_passed = 0
skipped = 0
duplicates_count = 0
problematic_files = []

print(len(file_list), "files total")  # Print total files

# result_df is deliberately NOT reset here: the skip check below relies on the
# rows from earlier runs, and clearing the frame would drop the results of
# every skipped file.
for file_path in file_list:
    file_name = os.path.basename(file_path)

    # Skip if already processed
    if file_name in result_filenames:
        print(f"{file_name} skipped because already processed")
        skipped += 1
        continue

    # Broad catch on purpose: one bad recording must not abort the batch.
    # Failures are reported and collected in problematic_files.
    try:
        result_df = dataset_analyze_rasp_ff(file_path, result_df=result_df)
    except Exception as e:
        print(f"❌ Error with {file_name}: {e}")
        problematic_files.append(file_name)
    else:
        # Count a file only after it was analysed successfully.
        files_passed += 1
        print(f"{file_name} passed. Count: {files_passed}")

print(f"\n✅ Done. {files_passed} files processed, {skipped} skipped (already in result_df).")
if problematic_files:
    print("⚠️ Problematic files:", problematic_files)


In [None]:
# import pandas as pd
# import numpy as np
# import plotly.express as px
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots
# import os

# Recording to visualise
file_name = './data/Experiment_Data/SIR_Experiment/Reels/1_5_6pro_LTE_insta_stat_64sps.csv'

# Parse the recording; the loader returns the data frame plus four summary values.
(df1, sps_mean, sps_count_mean,
 duration, energy_org) = open_file_nf_6pro_3ch_rasp_ff(file_name)

duration_minutes = duration / 60
print("File loaded successfully!")
print(f"Duration: {duration:.2f} seconds ({duration_minutes:.2f} minutes)")
print(f"Data points: {len(df1)}")
print(f"SPS Mean: {sps_mean:.2f}")

# Recompute P_RF as the sum of the P_BB and P_PA columns.
# NOTE(review): the loader presumably already provides P_RF — confirm.
df1['P_RF'] = df1['P_BB'] + df1['P_PA']

# Smooth every power channel with the exponential moving average helper.
# NOTE(review): the meaning of the argument 30 is defined by that helper — confirm.
power_columns = ['P_BAT', 'P_BB', 'P_PA', 'P_RF']
smoothed_columns = [f'{column}_smooth' for column in power_columns]
with pd.option_context("mode.copy_on_write", True):
    for raw_column, smooth_column in zip(power_columns, smoothed_columns):
        df1[smooth_column] = apply_exponential_moving_average(df1[raw_column], 30)

# Time axis in minutes, derived from time_sec_abs
df1['time_minutes'] = df1['time_sec_abs'] / 60

# Plot 1: smoothed power channels over time
plot_title = f'Power Consumption - {os.path.basename(file_name)}'
axis_labels = {
    'value': 'Power (W)',
    'time_minutes': 'Time (minutes)',
    'variable': 'Power Type',
}
fig = px.line(
    df1,
    x='time_minutes',
    y=smoothed_columns,
    labels=axis_labels,
    color_discrete_sequence=['black', 'steelblue', 'red', 'green'],
    height=600,
    width=1500,
    title=plot_title,
)

# Overlay useful_data on a second y-axis
useful_data_trace = go.Scatter(
    x=df1['time_minutes'],
    y=df1['useful_data'],
    mode='lines',
    name='useful_data',
    line=dict(color='orange', width=2),
    yaxis='y2',
)
fig.add_trace(useful_data_trace)

# Axis, legend and font configuration, including the secondary y-axis
legend_style = dict(
    x=0.770,
    y=1.0,
    traceorder="normal",
    bgcolor='rgba(0,0,0,0)',
    font=dict(family="Times New Roman", size=20, color="black"),
)
fig.update_layout(
    xaxis=dict(title='<b>Time (minutes)</b>', tickformat='.2f'),
    yaxis=dict(title='<b>Power (W)</b>', side='left'),
    yaxis2=dict(
        title='<b>Useful Data</b>',
        side='right',
        overlaying='y',
        range=[0, 1.2],
    ),
    xaxis_rangeslider_visible=True,
    legend=legend_style,
    font_family="Times New Roman",
    font_color="black",
    font_size=25,
)

# Keep the range slider thin
fig.update_xaxes(rangeslider_thickness=0.03)


def _bold_legend_label(trace):
    """Strip the '_smooth' suffix from the trace name and wrap it in bold tags."""
    trace.update(name='<b>' + trace.name.replace('_smooth', '') + '</b>')


fig.for_each_trace(_bold_legend_label)

# Display the figure, then save it next to the notebook as HTML
fig.show()
output_html = os.path.basename(file_name).replace('.csv', '_power_plot.html')
fig.write_html(output_html)


In [None]:
import pandas as pd

# result_df is expected to exist already; to reload it from disk instead:
# result_df = pd.read_csv("result_df.csv")

# Build scenario_id by joining the stripped string form of the four
# identifying columns with underscores.
id_columns = ['Device', 'RAN Technology', 'Platform', 'Condition']
id_parts = [result_df[column].astype(str).str.strip() for column in id_columns]
scenario_id = id_parts[0]
for part in id_parts[1:]:
    scenario_id = scenario_id + "_" + part
result_df['scenario_id'] = scenario_id

# Force the energy columns to numeric; unparseable values become NaN.
energy_cols = ['E_RF Jm', 'E_BAT Jm', 'E_BB Jm', 'E_PA Jm']
for column in energy_cols:
    result_df[column] = pd.to_numeric(result_df[column], errors='coerce')

# Mean of each energy column per scenario
scenario_means = result_df.groupby('scenario_id')[energy_cols].mean()
scenario_summary_df = scenario_means.reset_index()

# Replace the space in each energy column name with an underscore
scenario_summary_df = scenario_summary_df.rename(
    columns={column: column.replace(' ', '_') for column in energy_cols}
)

# Write the summary where the website server reads it
scenario_summary_df.to_csv("./website/server/scenario_summary_df.csv", index=False)

# Show preview in notebook
scenario_summary_df.head(10)
