# Import Libraries

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import numpy as np
import pandas as pd
import colorsys
import psutil
import random
import tqdm
import yaml
import glob
import os
import gc
import re

from datetime import datetime, timedelta
from multiprocessing import Pool
from IPython.display import clear_output

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import plotly.io as pio
# Select the default Plotly renderer for this notebook; "iframe" is the active choice and the
# two "notebook*" alternatives are kept commented out for quick switching.
# pio.renderers.default = "notebook"
# pio.renderers.default = "notebook_connected"
pio.renderers.default = "iframe"

import plotly.graph_objects as go
import plotly.express as px

from plotly.subplots import make_subplots

In [4]:
from src.utils import get_root_directory, float_to_int

In [5]:
# Get root directory of the project (helper imported from src.utils)
# NOTE(review): the commented-out cells further down build paths relative to the notebook
# ("../data/...") rather than from root_dir — confirm which convention is intended.
root_dir = get_root_directory()

In [6]:
# import jupyterlab
# print(jupyterlab.__version__)

# !pip install jupyterlab==3.6 
# !pip install --upgrade jupyterlab
# print(jupyterlab.__version__)

In [7]:
# Show DataFrames without any truncation (None removes the corresponding limit)
pd.options.display.max_colwidth = None   # Full content of every cell
pd.options.display.max_columns = None    # Every column
pd.options.display.width = None          # No fixed limit on the overall display width

pd.options.display.max_rows = None       # Every row

In [8]:
# Report the machine's CPU load before running the heavier cells below.
# Each cpu_percent call is given interval=1, so the two calls sample one after the other
# (this cell takes roughly two seconds) and the two readings cover different time windows.

# Get the current CPU usage as a percentage
cpu_usage = psutil.cpu_percent(interval=1)  # Interval of 1 second
print(f"Current CPU usage: {cpu_usage}%")

# Get the per-core usage
cpu_per_core = psutil.cpu_percent(interval=1, percpu=True)
print(f"CPU usage per core: {cpu_per_core}")

# Get the total number of cores
# NOTE(review): psutil.cpu_count() presumably reports logical CPUs by default — confirm against
# the psutil docs if the physical core count is what is wanted here.
cpu_cores = psutil.cpu_count()
print(f"Total CPU cores: {cpu_cores}")

Current CPU usage: 7.9%
CPU usage per core: [9.3, 25.3, 7.1, 10.0, 8.2, 9.2, 8.1, 7.1, 10.1, 7.1, 8.0, 5.0]
Total CPU cores: 12


# Trend of Performance Measures

## FDOT D5

In [9]:
# Configuration: signal IDs covered by this notebook (kept as strings, in ascending order)
signal_ids = [
    "1285",
    "1290",
    "1300",
    "1315",
    "1325",
    "1330",
    "1455",
    "1470",
    "1490",
    "1500",
    "1555",
    "1707",
    "1725",
    "1790",
    "1795",
    "1960",
    "2055",
    "2485",
    "2665",
    # "D5I-3000"  (currently excluded)
]

In [10]:
# Configuration: the single signal examined by the per-signal cells (one of the entries in signal_ids).
# `date` is only referenced by the commented-out single-day cells, so it stays disabled here.
signal_id = "1500"
# date = "2024-06-01"

In [11]:
def load_data(dirpath: str, signal_id: str) -> pd.DataFrame:
    """
    Load every pickled pandas object found under ``{dirpath}/{signal_id}/`` and stack them row-wise.

    :param dirpath: Base directory; files are looked up with the pattern ``{dirpath}/{signal_id}/*``.
    :param signal_id: Name of the sub-directory (signal) whose files are loaded.
    :return: Concatenation of all loaded objects (axis=0) with a fresh 0..n-1 index.
    :raises ValueError: If no file matches the pattern.
    """
    pattern = f"{dirpath}/{signal_id}/*"

    # glob yields matches in arbitrary (filesystem) order; sort so the row order of the
    # result is reproducible across runs and machines.
    filepaths = sorted(glob.glob(pattern))

    if not filepaths:
        # Fail with a message that names the missing location instead of pandas'
        # generic "No objects to concatenate".
        raise ValueError(f"No files found matching '{pattern}'")

    # NOTE: unpickling can execute arbitrary code — only point this at trusted files.
    data = [pd.read_pickle(filepath) for filepath in filepaths]

    df_id = pd.concat(data, axis=0, ignore_index=True)

    return df_id

In [12]:
def generate_shades(base_color, num_shades, lightness_step=0.25, saturation_step=0.2):
    """
    Generate progressively darker, less saturated shades of a base color as RGBA strings.

    Shade ``i`` keeps the hue of the base color and scales its HLS lightness by
    ``1 - lightness_step * i`` and its saturation by ``1 - saturation_step * i`` (both clamped
    to 0-1), so shade 0 is the base color itself. With the default steps, lightness reaches 0
    at ``i = 4``; shades from the fifth onward are therefore black.

    :param base_color: Sequence (R, G, B) in 0-255 scale (only the first three items are read).
    :param num_shades: Number of color variations to generate; values <= 0 yield an empty list.
    :param lightness_step: Fraction of the base lightness removed per step (default 0.25).
    :param saturation_step: Fraction of the base saturation removed per step (default 0.2).
    :return: List of ``"rgba(R, G, B, 1)"`` strings, one per shade.
    """
    base_color = [x / 255 for x in base_color]  # Normalize to 0-1 scale (RGB)
    # colorsys works in HLS order: (hue, lightness, saturation)
    base_h, base_l, base_s = colorsys.rgb_to_hls(base_color[0], base_color[1], base_color[2])

    shades = []
    for i in range(num_shades):
        lightness = max(0, min(1, base_l * (1 - lightness_step * i)))
        saturation = max(0, min(1, base_s * (1 - saturation_step * i)))

        r, g, b = colorsys.hls_to_rgb(base_h, lightness, saturation)

        # Round rather than truncate: the RGB -> HLS -> RGB round trip can land a hair below
        # the exact value (e.g. 173.9999…), which int() would turn into an off-by-one channel.
        shades.append(f"rgba({round(r * 255)}, {round(g * 255)}, {round(b * 255)}, 1)")

    return shades

### Traffic Profile

#### Cycle

In [13]:
# df_vehicle_cycle_profile_id = (
#     pd.read_pickle(f"../data/production/atspm/fdot_d5/feature_extraction/signal_profile/cycle/vehicle_signal/{signal_id}/{date}.pkl")
# )

# print(df_vehicle_cycle_profile_id.shape)
# # df_vehicle_cycle_profile_id.tail(1)

In [14]:
# # Filter data
# start_time = "2024-06-01 00:00:00"; end_time = "2024-06-01 00:15:00"

# proc_df_vehicle_cycle_profile_id = (
#     df_vehicle_cycle_profile_id.query("cycleBegin >= @start_time and cycleEnd <= @end_time")
# )

# # Define a colormap for signal types
# dict_colormap = {"green": "#27ae60", "yellow": "#f1c40f", "redClearance": "#ec7063", "red": "#cb4335"}
# phase_no = 2

# # Flatten data for timeline visualization
# data = []
# for i, row in proc_df_vehicle_cycle_profile_id.iterrows():
#     for key, color in dict_colormap.items():
#         if not isinstance(row[f"{key}Phase{phase_no}"], list):
#             continue
#         for start, end in row[f"{key}Phase{phase_no}"]:
#             data.append({
#                 "signalType": key,
#                 "startTime": start,
#                 "endTime": end,
#                 "color": color
#             })

# # Create DataFrame for timeline
# df = pd.DataFrame(data)

# # Add a constant 'row' column to keep everything on the same line
# df["row"] = "allCycles"

# # Plot timeline
# fig = px.timeline(
#     df,
#     x_start="startTime",
#     x_end="endTime",
#     y="row",
#     color="signalType",
#     color_discrete_map=dict_colormap,
#     title=f"Signal Timing of Phase {phase_no} with Time",
#     labels={"row": "Cycle", "signalType": "Signal Phase"}
# )

# fig.update_layout(
#     xaxis=dict(
#         title="Time",
#         titlefont=dict(size=14),  # X-axis title font size
#         tickfont=dict(size=13),  # X-axis tick label font size
#         tickformat="%H:%M:%S",  # Format time on the x-axis
#         range=[start_time, end_time],  # Define range
#         gridcolor="gray",  # Custom grid line color
#         layer="above traces",  # Bring grid lines on top
#     ),
#     yaxis=dict(
#         title="Signal Phase",
#         titlefont=dict(size=14),  # Y-axis title font size
#         tickfont=dict(size=13),  # Y-axis tick label font size
#         gridcolor="gray",  # Custom grid line color
#         layer="above traces",  # Bring y-axis grid lines on top
#     ),
#     title=dict(
#         text=f"Signal Timing of Phase {phase_no} with Time",
#         font=dict(size=16),  # Title font size
#     ),
#     legend=dict(
#         font=dict(size=13),  # Legend font size
#         title=dict(font=dict(size=14)),  # Legend title font size
#     ),
#     height=250,  # Adjust figure height
#     font=dict(size=12)  # Default font size for all unspecified elements
# )

# fig.show()

In [15]:
# # Filter data
# start_time = "2024-06-01 00:00:00"; end_time = "2024-06-01 00:15:00"

# proc_df_vehicle_cycle_profile_id = (
#     df_vehicle_cycle_profile_id.query("cycleBegin >= @start_time and cycleEnd <= @end_time")
# )


# # Define a colormap for signal types
# dict_colormap = {"green": "#27ae60", "yellow": "#f1c40f", "redClearance": "#ec7063", "red": "#cb4335"}

# # Define phase numbers
# phase_nos = [1, 2, 3, 4, 5, 6, 7, 8]

# # Create subplots
# fig = make_subplots(
#     rows=len(phase_nos),
#     cols=1,
#     shared_xaxes=False,
#     subplot_titles=[f"Phase {phase_no}" for phase_no in phase_nos]
# )

# # Track which legends have already been added
# added_legends = set()

# # Loop through each phase and create a timeline for each
# for idx, phase_no in enumerate(phase_nos, start=1):
#     # Flatten data for the current phase
#     data = []
#     for _, row in proc_df_vehicle_cycle_profile_id.iterrows():
#         for key, color in dict_colormap.items():
#             signal_column = f"{key}Phase{phase_no}"
#             if signal_column not in row or not isinstance(row[signal_column], list):
#                 continue
#             for start, end in row[signal_column]:
#                 data.append({
#                     "signalType": key,
#                     "startTime": pd.to_datetime(start),
#                     "endTime": pd.to_datetime(end),
#                 })

#     # Convert to DataFrame
#     if data:
#         df = pd.DataFrame(data)

#         # Add a constant 'row' column to keep everything on the same line
#         df["row"] = "allCycles"

#         # Create a timeline plot for this phase
#         traces = px.timeline(
#             df,
#             x_start="startTime",
#             x_end="endTime",
#             y="row",  # Keep all bars on the same row
#             color="signalType",
#             color_discrete_map=dict_colormap,
#         )

#         # Add traces from the timeline plot to the subplot
#         for trace in traces.data:
#             trace.showlegend = trace.name not in added_legends  # Add legend only for the first occurrence
#             added_legends.add(trace.name)
#             fig.add_trace(trace, row=idx, col=1)

# # Update layout for all subplots without y-axis labels
# layout_updates = {}
# for i in range(1, len(phase_nos) + 1):
#     layout_updates[f"xaxis{i}"] = dict(
#         title="Time",
#         tickformat="%H:%M:%S",  # Format time ticks as HH:MM:SS
#         type="date",  # Ensure x-axis is treated as datetime
#         gridcolor="gray",  # Custom grid line color
#         layer="above traces",  # Bring grid lines on top
#     )
#     layout_updates[f"yaxis{i}"] = dict(
#         showticklabels=False,  # Hide y-axis ticks
#     )

# # Apply layout updates and set overlay mode
# fig.update_layout(
#     height=150 * len(phase_nos),  # Dynamic height based on number of subplots
#     title="Signal Timing for Multiple Phases",
#     legend=dict(
#         title="Signal Type",
#         font=dict(size=12),
#     ),
#     barmode="overlay",  # Set overlay mode
#     **layout_updates
# )

# fig.show()

### SPaT

#### Cycle

In [16]:
# df_spat_id = (
#     pd.read_pickle(f"../data/production/atspm/fdot_d5/feature_extraction/feature/cycle/vehicle_signal/spat/{signal_id}/{date}.pkl")
# )

# # print(df_spat_id.shape)
# # df_spat_id.head(1)

In [17]:
# # Filter data
# start_time = "2024-06-01 00:00:00"; end_time = "2024-06-01 11:59:00"

# proc_df_spat_id = (
#     df_spat_id.query("cycleBegin >= @start_time and cycleEnd <= @end_time")
# )

# dict_colormap = {"green": "#27ae60", "yellow": "#f1c40f", "redClearance": "#ec7063", "red": "#cb4335"}

# # Flatten data
# data = []
# for _, row in proc_df_spat_id.iterrows():
#     for key, _ in dict_colormap.items():
#         cycle_begin = row["cycleBegin"]
#         cycle_length = row["cycleLength"]
#         data.append({
#             "cycleBegin": cycle_begin,
#             "cycleLength": cycle_length,
#         })

# # Create DataFrame for timeline
# df = pd.DataFrame(data)

# # Create a smoothed line plot
# fig = px.line(
#     proc_df_spat_id,
#     x="cycleBegin",
#     y="cycleLength",
#     title="Trend of Cycle Length",
# )

# # Smooth the lines
# fig.update_traces(
#     # mode="lines+markers",
#     line_shape="spline",  # Use spline for smooth curves
#     # marker=dict(size=6)  # Adjust marker size for better visibility
# )

# # Update layout for better readability
# fig.update_layout(
#     xaxis=dict(
#         title="Time",
#         tickformat="%H:%M:%S",  # Format ticks as HH:MM:SS
#     ),
#     yaxis=dict(
#         title="Cycle Length (Sec)",
#     ),
#     height=500  # Adjust figure height
# )

# fig.show()

In [18]:
# # Filter data
# start_time = "2024-06-01 00:00:00"; end_time = "2024-06-01 11:59:00"

# proc_df_spat_id = (
#     df_spat_id.query("cycleBegin >= @start_time and cycleEnd <= @end_time")
# )

# # Define a colormap for signal types
# dict_colormap = {"green": "#27ae60", "yellow": "#f1c40f", "redClearance": "#ec7063", "red": "#cb4335"}
# phase_no = 2

# # Flatten data
# data = []
# for _, row in proc_df_spat_id.iterrows():
#     for key, _ in dict_colormap.items():
#         cycle_begin = row["cycleBegin"]
#         duration = row[f"{key}DurationPhase{phase_no}"]
#         data.append({
#             "signalType": key,
#             "cycleBegin": cycle_begin,
#             "duration": duration,
#         })

# # Create DataFrame for timeline
# df = pd.DataFrame(data)

# # Create a smoothed line plot
# fig = px.line(
#     df,
#     x="cycleBegin",
#     y="duration",
#     color="signalType",
#     line_group="signalType",
#     color_discrete_map=dict_colormap,
#     title=f"Trend of Signal Duration for Phase {phase_no}",
# )

# # Smooth the lines
# fig.update_traces(
#     # mode="lines+markers",
#     line_shape="spline",  # Use spline for smooth curves
#     # marker=dict(size=6)  # Adjust marker size for better visibility
# )

# # Update layout for better readability
# fig.update_layout(
#     xaxis=dict(
#         title="Time",
#         tickformat="%H:%M:%S",  # Format ticks as HH:MM:SS
#     ),
#     yaxis=dict(
#         title="Signal Duration (Sec)",
#     ),
#     height=500  # Adjust figure height
# )

# fig.show()

#### Hourly

In [19]:
# df_spat_id = (
#     load_data(
#         dirpath="../data/production/atspm/fdot_d5/feature_extraction/feature/cycle/vehicle_signal/spat", 
#         signal_id=signal_id)
# )

# print(df_spat_id.shape)
# df_spat_id.head(1)

In [20]:
# df_spat_id["hour"] = df_spat_id["cycleBegin"].dt.hour 

# df_spat_id_hourly = (
#     df_spat_id
#     .groupby(["signalID", "date", "hour"])
#     .agg(
#         cycleLengthAvg=("cycleLength", "mean")
#     )
#     .reset_index()
# )

# # Transparency for confidence intervals
# ci_transparency = 0.25

# # Assuming df_spat_id_hourly exists
# proc_df_spat_id_hourly = df_spat_id_hourly.copy()

# # Combine 'date' and 'hour' into a single datetime column
# proc_df_spat_id_hourly["datetime"] = pd.to_datetime(proc_df_spat_id_hourly["date"]) + pd.to_timedelta(proc_df_spat_id_hourly["hour"], unit="h")

# # Group data by hour and calculate mean, std, and count
# df = (
#     proc_df_spat_id_hourly.groupby(["hour"])
#     .agg(
#         cycleLengthAvg=("cycleLengthAvg", "mean"),
#         cycleLengthStd=("cycleLengthAvg", "std"),
#         count=("cycleLengthAvg", "count")
#     )
#     .reset_index()
# )

# # Calculate confidence intervals
# z = 1.96  # 95% confidence
# df["upperBound"] = df["cycleLengthAvg"] + z * (df["cycleLengthStd"] / np.sqrt(df["count"]))
# df["lowerBound"] = df["cycleLengthAvg"] - z * (df["cycleLengthStd"] / np.sqrt(df["count"]))

# # Create the figure
# fig = go.Figure()

# # Add line trace for the average cycle length
# fig.add_trace(
#     go.Scatter(
#         x=df["hour"],
#         y=df["cycleLengthAvg"],
#         mode="lines+markers",
#         marker=dict(color="blue", size=8),  # Marker size and color
#         line=dict(color="blue", width=3),  # Line width and color
#         name="Avg. Cycle Length",
#         showlegend=False
#     )
# )

# # Add shaded confidence interval
# fig.add_trace(
#     go.Scatter(
#         x=pd.concat([df["hour"], df["hour"][::-1]]),
#         y=pd.concat([df["upperBound"], df["lowerBound"][::-1]]),
#         fill="toself",
#         fillcolor="rgba(0, 0, 255, 0.25)",  # Semi-transparent blue for CI
#         line=dict(color="rgba(0,0,0,0)"),  # No border
#         hoverinfo="skip",
#         name="95% Confidence Interval",
#         # showlegend=False  # Hide CI in legend
#     )
# )

# # Smooth the lines
# fig.update_traces(
#     # mode="lines+markers",
#     line_shape="spline",  # Use spline for smooth curves
#     # marker=dict(size=6)  # Adjust marker size for better visibility
# )

# # Update layout
# fig.update_layout(
#     height=600,
#     width=1400,
#     # title="24-Hour Trends of Average Cycle Length (Sec) with Confidence Intervals",
#     # title_x=0.5,
#     # title_y=0.9,
#     font=dict(size=16),  # General font size
#     # legend=dict(
#     #     orientation="h",
#     #     x=0.5,
#     #     xanchor="center",
#     #     y=-0.2,
#     #     font=dict(size=14),  # Legend font size
#     # ),
#     margin=dict(r=25, t=25, l=50, b=50)  # Margins for the plot
#     # margin=dict(l=50, r=50, t=100, b=50)  # Margins for the plot
# )

# fig.update_layout(
#     legend=dict(
#         orientation="h",        # Horizontal legend
#         x=0.5,                  # Centered horizontally
#         y=-0.15,                 # Move below the plot (adjust as needed)
#         xanchor="center",        # Align horizontally from the center
#         yanchor="top",           # Align vertically from the top
#         font=dict(size=16),
#     )
# )

# # Update x-axis and y-axis
# fig.update_xaxes(
#     title="Hour of Day",
#     tickvals=list(range(24)),  # Explicitly set x-axis tick values
#     ticktext=[f"{hour}" for hour in range(24)],  # Set tick labels to the plain hour number (0-23)
#     title_font=dict(size=16),  # X-axis title font size
#     tickfont=dict(size=14),  # X-axis tick label font size
    
# )

# fig.update_yaxes(
#     title="Avg. Cycle Length (Sec)",
#     title_font=dict(size=16),  # Y-axis title font size
#     tickfont=dict(size=14),  # Y-axis tick label font size
# )

# # Export the Plotly figure as a high-resolution image
# fig.write_image("../reports/3.2.png", width=1400, height=600, scale=2)

# # Show the plot
# fig.show()

In [21]:
# df_spat_id_hourly = (
#     load_data(
#         dirpath="../data/production/atspm/fdot_d5/feature_extraction/feature/hourly/vehicle_signal/spat", 
#         signal_id=signal_id)
# )

# print(df_spat_id_hourly.shape)
# df_spat_id_hourly.head(1)

In [22]:
# # Define a colormap for signal types using valid color formats
# dict_colormap = {
#     "green": (39, 174, 96),
#     "yellow": (241, 196, 15),
#     "redClearance": (236, 112, 99),
#     "red": (203, 67, 53)
# }

# # Phase to analyze
# # phase_nos = [5, 1, 7, 3]
# phase_nos = [6, 2, 8, 4] 

# # Generate shades for each signal type based on the number of phases
# dict_colormap = {
#     signal_type: generate_shades(base_color, len(phase_nos))
#     for signal_type, base_color in dict_colormap.items()
# }

# dict_label = {
#     "green": "Green",       
#     "yellow": "Yellow",  
#     "redClearance": "Red Clearance", 
#     "red": "Red"    
# }

# # Transparency for confidence intervals
# ci_transparency = 0.25

# columns = [
#     f"{signal_type}DurationPhase{phase_no}Avg" 
#     for phase_no in phase_nos
#     for signal_type in dict_label.keys()
# ]

# # Group data by hour, date, and signal type to calculate stats
# proc_df_spat_id_hourly = pd.melt(
#     df_spat_id_hourly,
#     id_vars=["signalID", "date", "hour"],
#     value_vars=columns,
#     var_name="feature",
#     value_name="durationAvg"
# )

# # Combine 'date' and 'hour' into a single datetime column
# proc_df_spat_id_hourly["datetime"] = pd.to_datetime(proc_df_spat_id_hourly["date"]) + pd.to_timedelta(proc_df_spat_id_hourly["hour"], unit="h")

# df = (
#     proc_df_spat_id_hourly.groupby(["hour", "feature"])
#     .agg(
#         durationAvg=("durationAvg", "mean"),
#         durationStd=("durationAvg", "std"),
#         count=("durationAvg", "count")
#     )
#     .reset_index()
# )

# # Calculate confidence intervals
# z = 1.96  # 95% confidence
# df["upperBound"] = df["durationAvg"] + z * (df["durationStd"] / np.sqrt(df["count"]))
# df["lowerBound"] = df["durationAvg"] - z * (df["durationStd"] / np.sqrt(df["count"]))

# df["signalType"] = df["feature"].str.extract(r"([a-zA-Z]+)Duration")
# df["phaseNo"] = df["feature"].str.extract(r"(\d)").astype(int)

# # dict_phase_map = {
# #     1: "left", 6: "thru",
# #     5: "left", 2: "thru",
# #     3: "left", 8: "thru",
# #     7: "left", 4: "thru",
# # }

# # df["phaseType"] = df["phaseNo"].map(dict_phase_map)

# # Create a subplot grid
# fig = make_subplots(
#     rows=2, cols=2,
#     subplot_titles=[f"{dict_label[key]}" for key in dict_label.keys()],
#     shared_xaxes=False,
#     shared_yaxes=False,
#     vertical_spacing=0.175,
#     horizontal_spacing=0.075
# )

# # Explicitly update subplot title font sizes and styles
# for annotation in fig['layout']['annotations']:
#     annotation['font'] = dict(size=16, color="black")  # Adjust font size and color of subplot titles

# # Iterate over signal types and add traces with CIs
# for idx, (key, colors) in enumerate(dict_colormap.items()):
#     # Filter data for the current signal type
#     proc_df = df[df["signalType"] == key]

#     # Iterate over phase nos and add traces
#     for phase_idx, phase_no in enumerate(phase_nos):
#         proc_df_phase = proc_df[proc_df["phaseNo"] == phase_no]

#         # # Only show legend for the first occurrence of each signal type
#         # if not legend_shown[signal_type]:
#         #     show_legend = True
#         #     legend_shown[signal_type] = True  # Mark as shown
#         # else:
#         #     show_legend = False

#         # Add line trace for the average
#         fig.add_trace(
#             go.Scatter(
#                 x=proc_df_phase["hour"],
#                 y=proc_df_phase["durationAvg"],
#                 mode="lines+markers",
#                 marker=dict(color=colors[phase_idx], size=1),  # Increased marker size
#                 line=dict(color=colors[phase_idx], width=3),  # Increased line width
#                 name=f"Phase {phase_no}",
#                 legendgroup=key,
#                 # showlegend=False,
#             ),
#             row=(idx // 2) + 1, col=(idx % 2) + 1
#         )
    
#         # # Add shaded CI region
#         # fig.add_trace(
#         #     go.Scatter(
#         #         x=pd.concat([proc_df_phase["hour"], proc_df_phase["hour"][::-1]]),
#         #         y=pd.concat([proc_df_phase["upperBound"], proc_df_phase["lowerBound"][::-1]]),
#         #         fill="toself",
#         #         fillcolor=color.replace("1)", f"{ci_transparency})"),  # Add transparency to the solid color
#         #         line=dict(color="rgba(0,0,0,0)"),  # No border
#         #         hoverinfo="skip",
#         #         name=f"{key} - CI",
#         #         legendgroup=key,
#         #         showlegend=False,
#         #     ),
#         #     row=(idx // 2) + 1, col=(idx % 2) + 1
#         # )

#     # # Add a custom annotation as a pseudo-legend inside the subplot
#     # fig.add_annotation(
#     #     text="Phases: " + ", ".join([str(p) for p in phase_nos]),
#     #     xref=f"x{idx + 1}",
#     #     yref=f"y{idx + 1}",
#     #     x=23,  # Place near the right side of the subplot
#     #     y=max(proc_df["durationAvg"].fillna(0)) * 0.9,  # Adjust based on data range
#     #     showarrow=False,
#     #     font=dict(size=12, color="black"),
#     #     bgcolor="rgba(255, 255, 255, 0.7)",
#     #     bordercolor="black",
#     #     borderwidth=1,
#     # )
    
# # Smooth the lines
# fig.update_traces(
#     # mode="lines+markers",
#     line_shape="spline",  # Use spline for smooth curves
#     # marker=dict(size=6)  # Adjust marker size for better visibility
# )

# # Update layout
# fig.update_layout(
#     height=800,
#     width=1400,
#     # title_text=f"24-Hour Trends of Average Signal Duration (Sec) of Phase {phase_no} with Confidence Intervals",
#     # title_x=0.5,
#     # title_y=0.95,
#     # font=dict(size=16),  # General font size for the entire figure
#     margin=dict(l=50, r=25, t=25, b=50)  # Margins for the plot
# )

# # for signal_type in dict_label.keys():
# #     fig.update_layout(
# #         legend=dict(
# #             title="Phase No",
# #             x=1,
# #             y=1,
# #             xanchor='right',
# #             yanchor='top',
# #             bgcolor='rgba(255, 255, 255, 0.7)',
# #             bordercolor='black',
# #             borderwidth=1,
# #             font=dict(size=14),
# #             itemclick="toggleothers",  # Interactive toggling
# #             itemdoubleclick="toggle",  # Interactive toggling
# #         ),
# #     )

# # Update x-axis and y-axis labels with adjusted font sizes
# fig.update_xaxes(
#     title_text="Hour of Day",
#     tickformat="%H:%M",  # Format ticks as HH:MM
#     title_font=dict(size=16),  # Increased x-axis title font size
#     tickfont=dict(size=15),  # Increased x-axis tick label font size
# )
# fig.update_yaxes(
#     title_text="Avg. Signal Duration (Sec)",
#     title_font=dict(size=16),  # Increased y-axis title font size
#     tickfont=dict(size=15),  # Increased y-axis tick label font size
# )

# # Export the Plotly figure as a high-resolution image
# fig.write_image("../reports/3.3(b).png", width=1400, height=800, scale=2)

# # Show the plot
# fig.show()

In [23]:
# # Define a colormap for signal types using valid color formats
# dict_colormap = {
#     "green": "rgba(39, 174, 96, 1)",       # Solid green
#     "yellow": "rgba(241, 196, 15, 1)",    # Solid yellow
#     "redClearance": "rgba(236, 112, 99, 1)",  # Solid red-clearance
#     "red": "rgba(203, 67, 53, 1)"         # Solid red
# }

# # Transparency for confidence intervals
# ci_transparency = 0.25

# # Phase to analyze
# phase_no = 2

# # Assuming df_spat_id_hourly exists
# proc_df_spat_id_hourly = (
#     pd.melt(
#         df_spat_id_hourly, 
#         id_vars=["signalID", "date", "hour"], 
#         value_vars=[f"{key}DurationPhase{phase_no}Avg" for key in dict_colormap.keys()], 
#         var_name="featureName", 
#         value_name="durationAvg"
#     )
# )

# # Combine 'date' and 'hour' into a single datetime column
# proc_df_spat_id_hourly["datetime"] = pd.to_datetime(proc_df_spat_id_hourly["date"]) + pd.to_timedelta(proc_df_spat_id_hourly["hour"], unit="h")

# # Group data by hour and featureName
# df = (
#     proc_df_spat_id_hourly.groupby(["hour", "featureName"])
#     .agg(
#         durationAvg=("durationAvg", "mean"),
#         durationStd=("durationAvg", "std"),
#         count=("durationAvg", "count")
#     )
#     .reset_index()
# )

# # Calculate confidence intervals
# z = 1.96  # 95% confidence
# df["upperBound"] = df["durationAvg"] + z * (df["durationStd"] / np.sqrt(df["count"]))
# df["lowerBound"] = df["durationAvg"] - z * (df["durationStd"] / np.sqrt(df["count"]))

# # Create a single plot
# fig = go.Figure()

# # Add traces for each signal type
# for key, color in dict_colormap.items():
#     # Filter data for the current signal type
#     proc_df = df[df["featureName"] == f"{key}DurationPhase{phase_no}Avg"]

#     # Add line trace for the average
#     fig.add_trace(
#         go.Scatter(
#             x=proc_df["hour"],
#             y=proc_df["durationAvg"],
#             mode="lines+markers",
#             marker=dict(color=color, size=8),  # Marker size and color
#             line=dict(color=color, width=3, shape="spline"),  # Smooth line
#             name=f"{key} - Avg",
#             legendgroup=key,
#         )
#     )

#     # Add shaded CI region
#     fig.add_trace(
#         go.Scatter(
#             x=pd.concat([proc_df["hour"], proc_df["hour"][::-1]]),
#             y=pd.concat([proc_df["upperBound"], proc_df["lowerBound"][::-1]]),
#             fill="toself",
#             fillcolor=color.replace("1)", f"{ci_transparency})"),  # Add transparency to the solid color
#             line=dict(color="rgba(0,0,0,0)"),  # No border
#             hoverinfo="skip",
#             name=f"{key} - CI",
#             legendgroup=key,
#             showlegend=False,  # Hide CI in the legend
#         )
#     )

# # Update layout
# fig.update_layout(
#     height=700,
#     width=1400,
#     title=f"24-Hour Trends of Average Signal Duration (Sec) of Phase {phase_no} with Confidence Intervals",
#     title_x=0.5,
#     title_y=0.95,
#     font=dict(size=15),  # General font size
#     legend=dict(
#         title="Signal Type",
#         orientation="h",
#         x=0.5,
#         xanchor="center",
#         y=-0.2,
#         font=dict(size=14),  # Increased font size for legend
#         itemclick="toggleothers",  # Interactive toggling
#         itemdoubleclick="toggle",  # Interactive toggling
#     ),
#     margin=dict(l=50, r=50, t=100, b=50)  # Margins for the plot
# )


# # Update x-axis and y-axis
# fig.update_xaxes(
#     title="Hour of Day",
#     tickformat="%H:%M",  # Format ticks as HH:MM
#     title_font=dict(size=16),  # X-axis title font size
#     tickfont=dict(size=14),  # X-axis tick label font size
# )
# fig.update_yaxes(
#     title="Avg. Signal Duration (Sec)",
#     title_font=dict(size=16),  # Y-axis title font size
#     tickfont=dict(size=14),  # Y-axis tick label font size
# )

# # Show the plot
# fig.show()

### Vehicle Traffic Features

#### Volume

##### Hourly

In [24]:
# df_volume_id_hourly = (
#     load_data(
#         dirpath="../data/production/atspm/fdot_d5/feature_extraction/feature/hourly/vehicle_traffic/volume",
#         signal_id=signal_id
#     )
# )

# print(df_volume_id_hourly.shape)
# df_volume_id_hourly.head(1)

In [25]:
# # Define a colormap for signal types
# dict_colormap = {
#     "green": "rgba(39, 174, 96, 1)",       # Base green
#     "yellow": "rgba(241, 196, 15, 1)",    # Base yellow
#     "redClearance": "rgba(236, 112, 99, 1)",  # Base red-clearance
#     "red": "rgba(203, 67, 53, 1)"         # Base red
# }

# # Transparency for confidence intervals
# ci_transparency = 0.25

# phase_no = 2

# # Define signal types and associated columns
# dict_columns = {
#     "green": [
#         column for column in df_volume_id_hourly.columns if f"{phase_no}" in column and "green" in column
#     ],
#     "yellow": [
#         column for column in df_volume_id_hourly.columns if f"{phase_no}" in column and "yellow" in column
#     ],
#     "redClearance": [
#         column for column in df_volume_id_hourly.columns if f"{phase_no}" in column and "redClearance" in column
#     ],
#     "red": [
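#         # NOTE(review): "red" is a substring of "redClearance", so this filter also selects the
#         # redClearance columns (they then appear twice in value_vars below); likewise
#         # f"{phase_no}" in column matches any column name containing that digit.
#         # Tighten both checks before re-enabling this cell.
#         column for column in df_volume_id_hourly.columns if f"{phase_no}" in column and "red" in column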
#     ]
# }

# # Group data by hour, date, and signal type to calculate stats
# proc_df_volume_id_hourly = pd.melt(
#     df_volume_id_hourly,
#     id_vars=["signalID", "hour", "date"],
#     value_vars=[col for cols in dict_columns.values() for col in cols],
#     var_name="feature",
#     value_name="volume"
# )

# # # Parse signal type and lane type from the feature column
# # proc_df_volume_id_hourly["signalType"] = proc_df_volume_id_hourly["feature"].str.extract(r"([a-zA-Z]+)Volume")
# # proc_df_volume_id_hourly["laneType"] = proc_df_volume_id_hourly["feature"].str.extract(r"(T|TR)")

# # Aggregate to compute mean, std, and count for each hour and signal type
# df = (
#     proc_df_volume_id_hourly.groupby(["hour", "feature"])
#     .agg(
#         volumeAvg=("volume", "mean"),
#         volumeStd=("volume", "std"),
#         count=("volume", "count")
#     )
#     .reset_index()
# )

# # Calculate confidence intervals
# z = 1.96  # 95% confidence
# df["upperBound"] = df["volumeAvg"] + z * (df["volumeStd"] / np.sqrt(df["count"]))
# df["lowerBound"] = df["volumeAvg"] - z * (df["volumeStd"] / np.sqrt(df["count"]))

# # Parse signal type and lane type from the feature column
# df["signalType"] = df["feature"].str.extract(r"([a-zA-Z]+)Volume")
# df["laneType"] = df["feature"].str.extract(r"(TR|T)")

# # Create subplots: One subplot per signal type
# fig = make_subplots(
#     rows=len(dict_colormap), cols=1,
#     subplot_titles=[f"Signal Type: {signal_type}" for signal_type in dict_colormap.keys()],
#     shared_xaxes=False,
#     vertical_spacing=0.075
# )

# # Transparency for confidence intervals
# ci_fill_transparency = "0.2"  # Lighter transparency for CI regions

# # Iterate over signal types and create subplots
# for idx, signal_type in enumerate(dict_colormap.keys()):
#     row = idx + 1

#     # Filter data for the current signal type
#     proc_df = df[df["signalType"] == signal_type]

#     # Get unique lane types dynamically
#     lane_types = proc_df["laneType"].unique()

#     # Generate colors dynamically for each lane type
#     dict_colormap_lanes = {
#         lane_type: dict_colormap[signal_type]
#         for lane_type in lane_types
#     }

#     # Iterate over lane types and add traces
#     for lane_type, color in dict_colormap_lanes.items():
#         proc_df_lane = proc_df[proc_df["laneType"] == lane_type]
#         proc_df_lane = float_to_int(proc_df_lane)

#         # Add line trace for the average
#         fig.add_trace(
#             go.Scatter(
#                 x=proc_df_lane["hour"],
#                 y=proc_df_lane["volumeAvg"],
#                 mode="lines+markers",
#                 marker=dict(color=color, size=8),
#                 line=dict(color=color, width=3),
#                 name=f"{signal_type} ({lane_type})",
#             ),
#             row=row, col=1
#         )

#         # Add shaded CI region with lighter transparency
#         fig.add_trace(
#             go.Scatter(
#                 x=pd.concat([proc_df_lane["hour"], proc_df_lane["hour"][::-1]]),
#                 y=pd.concat([proc_df_lane["upperBound"], proc_df_lane["lowerBound"][::-1]]),
#                 fill="toself",
#                 fillcolor=color.replace("1)", f"{ci_fill_transparency})"),  # Lighter fill for CI
#                 line=dict(color="rgba(0,0,0,0)"),  # No border
#                 hoverinfo="skip",
#                 name=f"{signal_type} ({lane_type})",
#                 showlegend=False,  # Don't show CI in the legend
#             ),
#             row=row, col=1
#         )

#         # Add lane type annotation at the end of the line
#         if not proc_df_lane.empty:
#             fig.add_annotation(
#                 x=proc_df_lane["hour"].iloc[-1],  # Last x-coordinate
#                 y=proc_df_lane["volumeAvg"].iloc[-1],  # Last y-coordinate
#                 # text=f"<b>{laneType}</b>",  # Bold text
#                 text=f"{lane_type}",
#                 showarrow=False,
#                 font=dict(size=15, color="black"),  # Black font and increased size
#                 xanchor="left",
#                 yanchor="middle",
#                 row=row,
#                 col=1
#             )


# # Update layout
# fig.update_layout(
#     height=1700,  # Adjust height for readability
#     width=1400,
#     title=f"Hourly Volume Trends for Phase {phase_no} by Signal Type with Confidence Intervals",
#     title_x=0.5,
#     font=dict(size=17),
#     legend=dict(
#         title="Lane Type",
#         orientation="h",
#         x=0.5,
#         xanchor="center",
#         y=-0.1,
#         font=dict(size=14)
#     ),
#     margin=dict(l=50, r=50, t=100, b=50)  # Margins for better spacing
# )

# # Update axes for each subplot
# for i in range(1, len(dict_colormap) + 1):  # Iterate over rows
#     fig.update_xaxes(
#         title_text="Hour of Day",
#         tickmode="array",
#         # tickvals=list(range(24)),
#         # ticktext=[f"{hour}:00" for hour in range(24)],
#         title_font=dict(size=16),
#         tickfont=dict(size=16),
#         row=i, col=1
#     )
#     fig.update_yaxes(
#         title_text="Volume",
#         title_font=dict(size=16),
#         tickfont=dict(size=16),
#         row=i, col=1
#     )

# # Show the plot
# fig.show()

In [26]:
# columns = [
#     f"{volume_type}Phase{phase_no}" 
#     # for phase_no in [1, 2, 3, 4, 5, 6, 7, 8] 
#     for phase_no in [2, 4, 6, 8] 
#     for volume_type in ["volume", "greenVolume", "yellowVolume", "redClearanceVolume", "redVolume"]
# ]

# for column in columns:
#     proc_columns = df_volume_id_hourly.columns.tolist()
#     proc_columns = [proc_column for proc_column in proc_columns if column in proc_column]

#     df_volume_id_hourly[column] = (
#         df_volume_id_hourly[proc_columns].apply(lambda row: row.sum(), axis=1)
#     )

In [27]:
# columns = [
#     f"{volume_type}Phase{phase_no}" 
#     # for phase_no in [1, 2, 3, 4, 5, 6, 7, 8] 
#     for phase_no in [2, 4, 6, 8] 
#     for volume_type in ["greenVolume", "yellowVolume", "redClearanceVolume", "redVolume"]
# ]

# # Group data by hour, date, and signal type to calculate stats
# proc_df_volume_id_hourly = pd.melt(
#     df_volume_id_hourly,
#     id_vars=["signalID", "hour", "date"],
#     value_vars=columns,
#     var_name="feature",
#     value_name="volume"
# )

In [28]:
# # Aggregate to compute mean, std, and count for each hour and signal type
# df = (
#     proc_df_volume_id_hourly.groupby(["hour", "feature"])
#     .agg(
#         volumeAvg=("volume", "mean"),
#         volumeStd=("volume", "std"),
#         count=("volume", "count")
#     )
#     .reset_index()
# )

# # Calculate confidence intervals
# z = 1.96  # 95% confidence
# df["upperBound"] = df["volumeAvg"] + z * (df["volumeStd"] / np.sqrt(df["count"]))
# df["lowerBound"] = df["volumeAvg"] - z * (df["volumeStd"] / np.sqrt(df["count"]))

# # Parse signal type and lane type from the feature column
# df["signalType"] = df["feature"].str.extract(r"([a-zA-Z]+)Volume")
# df["phaseNo"] = df["feature"].str.extract(r"(\d)").astype(int)

# dict_approach_map = {
#     1: "major1", 6: "major1",
#     5: "major2", 2: "major2",
#     3: "major3", 8: "major3",
#     7: "major4", 4: "major4",
# }

# df["approachType"] = df["phaseNo"].map(dict_approach_map)

# dict_approach_dir = {
#     2: "Westbound (Major Road)",
#     6: "Eastbound (Major Road)",
#     4: "Northbound (Minor Road)",
#     8: "Southbound (Minor Road)"
# }

# # Define a colormap for signal types using valid color formats
# dict_colormap = {
#     "green": "rgba(39, 174, 96, 1)",       # Solid green
#     # "yellow": "rgba(241, 196, 15, 1)",    # Solid yellow
#     # "redClearance": "rgba(236, 112, 99, 1)",  # Solid red-clearance
#     "red": "rgba(203, 67, 53, 1)"         # Solid red
# }

# dict_label = {
#     "green": "Green",       
#     # "yellow": "Yellow",  
#     # "redClearance": "Red Clearance", 
#     "red": "Red"    
# }

# # Transparency for confidence intervals
# ci_fill_transparency = 0.25

# # # Approach Type to analyze
# # approach_type = "major2"

# # phase_nos = df[df["approachType"] == approach_type]["phaseNo"].unique().tolist()
# phase_nos = [2, 6, 4, 8]

# # Create a 2x2 grid for subplots
# fig = make_subplots(
#     rows=2, cols=2,
#     subplot_titles=[f"{dict_approach_dir[phase_no]}" for phase_no in phase_nos],
#     shared_xaxes=False, shared_yaxes=False,
#     vertical_spacing=0.15
# )

# # Track which signal types have been added to the legend
# legend_shown = {"green": False, "red": False}

# # Iterate over phases and add traces
# row_col_mapping = [(1, 1), (1, 2), (2, 1), (2, 2)]  # Map phases to subplot positions
# for idx, (phase_no, (row, col)) in enumerate(zip(phase_nos, row_col_mapping)):
#     proc_df = df[df["phaseNo"] == phase_no]

#     # Iterate over signal types and add traces
#     for signal_type, color in dict_colormap.items():
#         proc_df_signal = proc_df[proc_df["signalType"] == signal_type]
#         proc_df_signal = float_to_int(proc_df_signal)

#         # Only show legend for the first occurrence of each signal type
#         if not legend_shown[signal_type]:
#             show_legend = True
#             legend_shown[signal_type] = True  # Mark as shown
#         else:
#             show_legend = False

#         # Add line trace for the average
#         fig.add_trace(
#             go.Scatter(
#                 x=proc_df_signal["hour"],
#                 y=proc_df_signal["volumeAvg"],
#                 mode="lines+markers",
#                 marker=dict(color=color, size=8),
#                 line=dict(color=color, width=3),
#                 # name=f"{dict_label[signal_type]}",
#                 showlegend=False
#             ),
#             row=row, col=col
#         )

#         # Add shaded CI region with lighter transparency
#         fig.add_trace(
#             go.Scatter(
#                 x=pd.concat([proc_df_signal["hour"], proc_df_signal["hour"][::-1]]),
#                 y=pd.concat([proc_df_signal["upperBound"], proc_df_signal["lowerBound"][::-1]]),
#                 fill="toself",
#                 fillcolor=color.replace("1)", f"{ci_fill_transparency})"),  # Lighter fill for CI
#                 line=dict(color="rgba(0,0,0,0)"),  # No border
#                 hoverinfo="skip",
#                 name=f"{dict_label[signal_type]} - 95% Confidence Interval",
#                 showlegend=show_legend
#             ),
#             row=row, col=col
#         )

# # # Update layout
# # fig.update_layout(
# #     # title="Proportion of Cycles Recommended with PR by Hour with 95% Confidence Intervals",
# #     height=800,
# #     width=1400,
# #     showlegend=False,
# #     xaxis_title="Hour of Day",
# #     font=dict(size=14),
# #     margin=dict(r=25, t=50, l=50, b=50)  # Margins for the plot
# # )

# # Update layout with legend positioned at the bottom
# fig.update_layout(
#     height=800,
#     width=1400,
#     showlegend=True,  # Ensure legend is shown
#     xaxis_title="Hour of Day",
#     font=dict(size=14),
#     margin=dict(r=25, t=50, l=50, b=100),  # Increase bottom margin for legend space
#     legend=dict(
#         orientation="h",  # Horizontal legend layout
#         x=0.5,  # Centered horizontally
#         y=-0.1,  # Position below the plot
#         xanchor="center",
#         yanchor="top",
#         font=dict(size=16),
#         bgcolor="rgba(255,255,255,0.8)",  # Add background for better readability
#     )
# )

# # Update axis labels for shared x/y axes
# fig.update_xaxes(
#     title_text="Hour of Day",
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Update y-axes for shared configuration
# fig.update_yaxes(
#     title_text="Vehicle Volume", 
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Export the Plotly figure as a high-resolution image
# fig.write_image("../reports/3.4.png", width=1400, height=800, scale=2)

# # Show plot
# fig.show()

#### Platoon Ratio

##### Hourly

In [29]:
# df_spat_id = (
#     load_data(
#         dirpath="../data/production/atspm/fdot_d5/feature_extraction/feature/cycle/vehicle_signal/spat", 
#         signal_id=signal_id)
# )

# df_spat_id["hour"] = df_spat_id["cycleBegin"].dt.hour 

# phase_nos = list(set([int(column[-1]) for column in df_spat_id.columns if "Duration" in column]))

# df_spat_id_hourly = (
#     df_spat_id
#     .groupby(["signalID", "date", "hour"])
#     .agg(
#         cycleLengthAvg=("cycleLength", "mean"),
#         **{
#             f"greenDurationPhase{phase_no}Avg": (f"greenDurationPhase{phase_no}", "mean")
#             for phase_no in phase_nos
#         }
#     )
#     .reset_index()
# )

In [30]:
# df_volume_id_hourly = (
#     load_data(
#         dirpath="../data/production/atspm/fdot_d5/feature_extraction/feature/hourly/vehicle_traffic/volume",
#         signal_id=signal_id
#     )
# )

# # print(df_volume_id_hourly.shape)
# # df_volume_id_hourly.head(1)

# columns = [
#     f"{signal_type}VolumePhase{phase_no}" 
#     for phase_no in [1, 2, 3, 4, 5, 6, 7, 8] 
#     # for phase_no in [2, 4, 6, 8] 
#     for signal_type in ["green", "yellow", "redClearance", "red"]
# ]

# for column in columns:
#     proc_columns = df_volume_id_hourly.columns.tolist()
#     proc_columns = [proc_column for proc_column in proc_columns if column in proc_column]

#     df_volume_id_hourly[column] = (
#         df_volume_id_hourly[proc_columns].apply(lambda row: row.sum(), axis=1)
#     )
#     if int(column[-1]) % 2 != 0:
#         df_volume_id_hourly = df_volume_id_hourly.drop(columns=column)
        
#     df_volume_id_hourly = df_volume_id_hourly.drop(columns=proc_columns)

# columns_volume = [
#     f"volumePhase{phase_no}" 
#     for phase_no in [1, 2, 3, 4, 5, 6, 7, 8] 
# ]
# for column in columns_volume:
#     proc_columns = df_volume_id_hourly.columns.tolist()
#     proc_columns = [proc_column for proc_column in proc_columns if column in proc_column]
                   
#     df_volume_id_hourly = df_volume_id_hourly.drop(columns=proc_columns)

In [31]:
# df_platoon_ratio_id_hourly = df_volume_id_hourly.copy()

# for phase_no in [2, 4, 6, 8]:
#     columns = [
#         f"{signal_type}VolumePhase{phase_no}" for signal_type in ["green", "yellow", "redClearance", "red"]
#     ]
    
#     # df_platoon_ratio_id_hourly = df_platoon_ratio_id_hourly.copy()
#     df_platoon_ratio_id_hourly[f"platoonRatioPhase{phase_no}"] = (
#         (
#             (
#                 df_platoon_ratio_id_hourly[f"greenVolumePhase{phase_no}"] 
#                 /  
#                 df_platoon_ratio_id_hourly[columns].apply(lambda row: row.sum(), axis=1)
#             )
#             *
#             (
#                 df_spat_id_hourly["cycleLengthAvg"]
#                 /
#                 df_spat_id_hourly[f"greenDurationPhase{phase_no}Avg"]
#             )
#         )
#         .round(2)    
#     )
    
#     df_platoon_ratio_id_hourly = df_platoon_ratio_id_hourly.drop(columns=columns)

# df_platoon_ratio_id_hourly = (
#     pd.melt(df_platoon_ratio_id_hourly, 
#             id_vars=["signalID", "date", "hour"], 
#             value_vars=[f"platoonRatioPhase{phase_no}" for phase_no in [2, 4, 6, 8]], 
#             var_name="phaseNo", value_name="platoonRatio")
# )
# df_platoon_ratio_id_hourly["phaseNo"] = df_platoon_ratio_id_hourly["phaseNo"].str.extract(r"(\d)").astype(int)

In [32]:
# # Aggregate to compute mean, std, and count for each hour and signal type
# df = (
#     df_platoon_ratio_id_hourly.groupby(["hour", "phaseNo"])
#     .agg(
#         platoonRatioAvg=("platoonRatio", "mean"),
#         platoonRatioStd=("platoonRatio", "std"),
#         count=("platoonRatio", "count")
#     )
#     .reset_index()
# )

# # Calculate confidence intervals
# z = 1.96  # 95% confidence
# df["upperBound"] = df["platoonRatioAvg"] + z * (df["platoonRatioStd"] / np.sqrt(df["count"]))
# df["lowerBound"] = df["platoonRatioAvg"] - z * (df["platoonRatioStd"] / np.sqrt(df["count"]))

# df["lowerBound"] = df["lowerBound"].clip(lower=0)

# dict_approach_map = {
#     1: "major1", 6: "major1",
#     5: "major2", 2: "major2",
#     3: "major3", 8: "major3",
#     7: "major4", 4: "major4",
# }

# df["approachType"] = df["phaseNo"].map(dict_approach_map)

# dict_approach_dir = {
#     2: "Westbound (Major Road)",
#     6: "Eastbound (Major Road)",
#     4: "Northbound (Minor Road)",
#     8: "Southbound (Minor Road)"
# }

# # Transparency for confidence intervals
# ci_transparency = 0.25

# # # Approach Type to analyze
# # approach_type = "major2"

# # phase_nos = df[df["approachType"] == approach_type]["phaseNo"].unique().tolist()
# phase_nos = [2, 6, 4, 8]

# # Create a 2x2 grid for subplots
# fig = make_subplots(
#     rows=2, cols=2,
#     subplot_titles=[f"{dict_approach_dir[phase_no]}" for phase_no in phase_nos],
#     shared_xaxes=False, shared_yaxes=False,
#     vertical_spacing=0.15
# )

# color = "rgba(0, 51, 153, 1)"

# # Iterate over phases and add traces
# row_col_mapping = [(1, 1), (1, 2), (2, 1), (2, 2)]  # Map phases to subplot positions
# for idx, (phase_no, (row, col)) in enumerate(zip(phase_nos, row_col_mapping)):
#     proc_df = df[df["phaseNo"] == phase_no]

#     # Add line trace for the average
#     fig.add_trace(
#         go.Scatter(
#             x=proc_df["hour"],
#             y=proc_df["platoonRatioAvg"],
#             mode="lines+markers",
#             marker=dict(color=color, size=8),
#             line=dict(color=color, width=3),
#             showlegend=False
#         ),
#         row=row, col=col
#     )

#     # Add shaded CI region with lighter transparency
#     fig.add_trace(
#         go.Scatter(
#             x=pd.concat([proc_df["hour"], proc_df["hour"][::-1]]),
#             y=pd.concat([proc_df["upperBound"], proc_df["lowerBound"][::-1]]),
#             fill="toself",
#             fillcolor=color.replace("1)", f"{ci_transparency})"),  # Lighter fill for CI
#             line=dict(color="rgba(0,0,0,0)"),  # No border
#             hoverinfo="skip",
#             name="95% Confidence Interval",
#             showlegend=(idx==0)
#         ),
#         row=row, col=col
#     )

# # Update layout
# fig.update_layout(
#     # title="Proportion of Cycles Recommended with PR by Hour with 95% Confidence Intervals",
#     height=800,
#     width=1400,
#     xaxis_title="Hour of Day",
#     font=dict(size=14),
#     margin=dict(r=25, t=50, l=50, b=50),  # Margins for the plot
#     showlegend=True,
#     legend=dict(
#         orientation="h",  # Horizontal legend layout
#         x=0.5,  # Centered horizontally
#         y=-0.1,  # Position below the plot
#         xanchor="center",
#         yanchor="top",
#         font=dict(size=16),
#         # bgcolor="rgba(255,255,255,0.8)",  # Add background for better readability
#     )
# )

# # Update axis labels for shared x/y axes
# fig.update_xaxes(
#     title_text="Hour of Day",
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Update y-axes for shared configuration
# fig.update_yaxes(
#     title_text="Avg. Platoon Ratio", 
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Export the Plotly figure as a high-resolution image
# fig.write_image("../reports/3.5.png", width=1400, height=800, scale=2)

# # Show plot
# fig.show()

#### Occupancy

##### Hourly

In [33]:
# df_occupancy_id_hourly = (
#     load_data(
#         dirpath="../data/production/atspm/fdot_d5/feature_extraction/feature/hourly/vehicle_traffic/occupancy",
#         signal_id=signal_id
#     )
# )

# print(df_occupancy_id_hourly.shape)
# df_occupancy_id_hourly.head(1)

In [34]:
# # Define a colormap for signal types
# dict_colormap = {
#     "green": "rgba(39, 174, 96, 1)",       # Base green
#     "yellow": "rgba(241, 196, 15, 1)",    # Base yellow
#     "redClearance": "rgba(236, 112, 99, 1)",  # Base red-clearance
#     "red": "rgba(203, 67, 53, 1)"         # Base red
# }

# # Transparency for confidence intervals
# ci_transparency = 0.25

# phase_no = 2

# # Define signal types and associated columns
# dict_columns = {
#     "green": [
#         column for column in df_occupancy_id_hourly.columns if f"{phase_no}" in column and "green" in column
#     ],
#     "yellow": [
#         column for column in df_occupancy_id_hourly.columns if f"{phase_no}" in column and "yellow" in column
#     ],
#     "redClearance": [
#         column for column in df_occupancy_id_hourly.columns if f"{phase_no}" in column and "redClearance" in column
#     ],
#     "red": [
#         column for column in df_occupancy_id_hourly.columns if f"{phase_no}" in column and "red" in column
#     ]
# }

# columns = [
#     col for cols in dict_columns.values() 
#     for col in cols 
#     if not any(k in col for k in ["Min", "Max", "Std"])
# ]

# # Group data by hour, date, and signal type to calculate stats
# proc_df_occupancy_id_hourly = pd.melt(
#     df_occupancy_id_hourly,
#     id_vars=["signalID", "hour", "date"],
#     value_vars=columns,
#     var_name="feature",
#     value_name="occupancy"
# )

# # # Parse signal type and lane type from the feature column
# # proc_df_occupancy_id_hourly["signalType"] = proc_df_occupancy_id_hourly["feature"].str.extract(r"([a-zA-Z]+)Occupancy")
# # proc_df_occupancy_id_hourly["laneType"] = proc_df_occupancy_id_hourly["feature"].str.extract(r"(T|TR)")

# # Aggregate to compute mean, std, and count for each hour and signal type
# df = (
#     proc_df_occupancy_id_hourly.groupby(["hour", "feature"])
#     .agg(
#         occupancyAvg=("occupancy", "mean"),
#         occupancyStd=("occupancy", "std"),
#         count=("occupancy", "count")
#     )
#     .reset_index()
# )

# # Calculate confidence intervals
# z = 1.96  # 95% confidence
# df["upperBound"] = df["occupancyAvg"] + z * (df["occupancyStd"] / np.sqrt(df["count"]))
# df["lowerBound"] = df["occupancyAvg"] - z * (df["occupancyStd"] / np.sqrt(df["count"]))

# # Parse signal type and lane type from the feature column
# df["signalType"] = df["feature"].str.extract(r"([a-zA-Z]+)AvgOccupancy")
# df["laneType"] = df["feature"].str.extract(r"(TR|T)")

# # Create subplots: One subplot per signal type
# fig = make_subplots(
#     rows=len(dict_colormap), cols=1,
#     subplot_titles=[f"Signal Type: {signal_type}" for signal_type in dict_colormap.keys()],
#     shared_xaxes=False,
#     vertical_spacing=0.075
# )

# # Transparency for confidence intervals
# ci_fill_transparency = "0.2"  # Lighter transparency for CI regions

# # Iterate over signal types and create subplots
# for idx, signal_type in enumerate(dict_colormap.keys()):
#     row = idx + 1

#     # Filter data for the current signal type
#     proc_df = df[df["signalType"] == signal_type]

#     # Get unique lane types dynamically
#     lane_types = proc_df["laneType"].unique()

#     # Generate colors dynamically for each lane type
#     dict_colormap_lanes = {
#         lane_type: dict_colormap[signal_type]
#         for lane_type in lane_types
#     }

#     # Iterate over lane types and add traces
#     for lane_type, color in dict_colormap_lanes.items():
#         proc_df_lane = proc_df[proc_df["laneType"] == lane_type]

#         # Add line trace for the average
#         fig.add_trace(
#             go.Scatter(
#                 x=proc_df_lane["hour"],
#                 y=proc_df_lane["occupancyAvg"],
#                 mode="lines+markers",
#                 marker=dict(color=color, size=8),
#                 line=dict(color=color, width=3),
#                 name=f"{signal_type} ({lane_type})",
#             ),
#             row=row, col=1
#         )

#         # Add shaded CI region with lighter transparency
#         fig.add_trace(
#             go.Scatter(
#                 x=pd.concat([proc_df_lane["hour"], proc_df_lane["hour"][::-1]]),
#                 y=pd.concat([proc_df_lane["upperBound"], proc_df_lane["lowerBound"][::-1]]),
#                 fill="toself",
#                 fillcolor=color.replace("1)", f"{ci_fill_transparency})"),  # Lighter fill for CI
#                 line=dict(color="rgba(0,0,0,0)"),  # No border
#                 hoverinfo="skip",
#                 name=f"{signal_type} ({lane_type})",
#                 showlegend=False,  # Don't show CI in the legend
#             ),
#             row=row, col=1
#         )

#         # Add lane type annotation at the end of the line
#         if not proc_df_lane.empty:
#             fig.add_annotation(
#                 x=proc_df_lane["hour"].iloc[-1],  # Last x-coordinate
#                 y=proc_df_lane["occupancyAvg"].iloc[-1],  # Last y-coordinate
#                 # text=f"<b>{laneType}</b>",  # Bold text
#                 text=f"{lane_type}",
#                 showarrow=False,
#                 font=dict(size=15, color="black"),  # Black font and increased size
#                 xanchor="left",
#                 yanchor="middle",
#                 row=row,
#                 col=1
#             )


# # Update layout
# fig.update_layout(
#     height=1700,  # Adjust height for readability
#     width=1400,
#     title=f"Hourly Occupancy Trends for Phase {phase_no} by Signal Type with Confidence Intervals",
#     title_x=0.5,
#     font=dict(size=17),
#     legend=dict(
#         title="Lane Type",
#         orientation="h",
#         x=0.5,
#         xanchor="center",
#         y=-0.1,
#         font=dict(size=14)
#     ),
#     margin=dict(l=50, r=50, t=100, b=50)  # Margins for better spacing
# )

# # Update axes for each subplot
# for i in range(1, len(dict_colormap) + 1):  # Iterate over rows
#     fig.update_xaxes(
#         title_text="Hour of Day",
#         tickmode="array",
#         # tickvals=list(range(24)),
#         # ticktext=[f"{hour}:00" for hour in range(24)],
#         title_font=dict(size=16),
#         tickfont=dict(size=16),
#         row=i, col=1
#     )
#     fig.update_yaxes(
#         title_text="Occupancy",
#         title_font=dict(size=16),
#         tickfont=dict(size=16),
#         row=i, col=1
#     )

# # Show the plot
# fig.show()

In [35]:
# columns = [
#     f"{signal_type}AvgOccupancyPhase{phase_no}" 
#     for phase_no in [1, 2, 3, 4, 5, 6, 7, 8] 
#     # for phase_no in [2, 4, 6, 8] 
#     # for signal_type in ["green", "yellow", "redClearance", "red"]
#     for signal_type in ["red"]
# ]

# for column in columns:
#     proc_columns = df_occupancy_id_hourly.columns.tolist()
#     proc_columns = [
#         proc_column for proc_column in proc_columns 
#         if column in proc_column and not any(k in proc_column for k in ["Min", "Max", "Std"])
#     ]
    
#     df_occupancy_id_hourly[column] = (
#         df_occupancy_id_hourly[proc_columns].apply(lambda row: row.mean(), axis=1)
#     )

# df_occupancy_id_hourly = df_occupancy_id_hourly[["signalID", "date", "hour"] + columns]

In [36]:
# columns = [
#     f"{signal_type}AvgOccupancyPhase{phase_no}" 
#     for phase_no in [1, 2, 3, 4, 5, 6, 7, 8] 
#     # for phase_no in [2, 4, 6, 8] 
#     # for signal_type in ["green", "yellow", "redClearance", "red"]
#     for signal_type in ["red"]
# ]

# # Group data by hour, date, and signal type to calculate stats
# proc_df_occupancy_id_hourly = pd.melt(
#     df_occupancy_id_hourly,
#     id_vars=["signalID", "hour", "date"],
#     value_vars=columns,
#     var_name="feature",
#     value_name="occupancy"
# )

In [37]:
# # Aggregate to compute mean, std, and count for each hour and signal type
# df = (
#     proc_df_occupancy_id_hourly.groupby(["hour", "feature"])
#     .agg(
#         occupancyAvg=("occupancy", "mean"),
#         occupancyStd=("occupancy", "std"),
#         count=("occupancy", "count")
#     )
#     .reset_index()
# )

# # Calculate confidence intervals
# z = 1.96  # 95% confidence
# df["upperBound"] = df["occupancyAvg"] + z * (df["occupancyStd"] / np.sqrt(df["count"]))
# df["lowerBound"] = df["occupancyAvg"] - z * (df["occupancyStd"] / np.sqrt(df["count"]))

# # Parse signal type and lane type from the feature column
# df["signalType"] = df["feature"].str.extract(r"([a-zA-Z]+)AvgOccupancy")
# df["phaseNo"] = df["feature"].str.extract(r"(\d)").astype(int)

# dict_approach_map = {
#     1: "major1", 6: "major1",
#     5: "major2", 2: "major2",
#     3: "minor1", 8: "minor1",
#     7: "minor2", 4: "minor2",
# }

# df["approachType"] = df["phaseNo"].map(dict_approach_map)

# # Define a colormap for signal types using valid color formats
# dict_colormap = {
#     # "green": "rgba(39, 174, 96, 1)",       # Solid green
#     # "yellow": "rgba(241, 196, 15, 1)",    # Solid yellow
#     # "redClearance": "rgba(236, 112, 99, 1)",  # Solid red-clearance
#     "red": "rgba(203, 67, 53, 1)"         # Solid red
# }

# dict_label = {
#     # "green": "Green",       
#     # "yellow": "Yellow",  
#     # "redClearance": "Red Clearance", 
#     "red": "Red"    
# }

# # Transparency for confidence intervals
# ci_transparency = 0.25

# # Approach Type to analyze
# approach_type = "minor2"

# phase_nos = df[df["approachType"] == approach_type]["phaseNo"].unique().tolist()
# phase_nos = sorted(phase_nos, key=lambda x: x % 2 == 0)

# # Create a 1x2 grid for subplots
# fig = make_subplots(
#     # rows=2, cols=2,
#     rows=1, cols=2,
#     subplot_titles=[f"Phase No: {phase_no}" for phase_no in phase_nos],
#     shared_xaxes=False, shared_yaxes=False,
#     vertical_spacing=0.15
# )

# # Iterate over phases and add traces
# # row_col_mapping = [(1, 1), (1, 2), (2, 1), (2, 2)]  # Map phases to subplot positions
# row_col_mapping = [(1, 1), (1, 2)]  # Map phases to subplot positions
# for idx, (phase_no, (row, col)) in enumerate(zip(phase_nos, row_col_mapping)):
#     proc_df = df[df["phaseNo"] == phase_no]

#     # Iterate over signal types and add traces
#     for signal_type, color in dict_colormap.items():
#         proc_df_signal = proc_df[proc_df["signalType"] == signal_type]
#         proc_df_signal = float_to_int(proc_df_signal)

#         # Add line trace for the average
#         fig.add_trace(
#             go.Scatter(
#                 x=proc_df_signal["hour"],
#                 y=proc_df_signal["occupancyAvg"],
#                 mode="lines+markers",
#                 marker=dict(color=color, size=8),
#                 line=dict(color=color, width=3),
#                 name=f"{dict_label[signal_type]}",
#                 showlegend=False
#             ),
#             row=row, col=col
#         )

#         # Add shaded CI region with lighter transparency
#         fig.add_trace(
#             go.Scatter(
#                 x=pd.concat([proc_df_signal["hour"], proc_df_signal["hour"][::-1]]),
#                 y=pd.concat([proc_df_signal["upperBound"], proc_df_signal["lowerBound"][::-1]]),
#                 fill="toself",
#                 fillcolor=color.replace("1)", f"{ci_fill_transparency})"),  # Lighter fill for CI
#                 line=dict(color="rgba(0,0,0,0)"),  # No border
#                 hoverinfo="skip",
#                 name=f"{dict_label[signal_type]} - 95% Confidence Interval",
#                 showlegend=(idx==0)
#             ),
#             row=row, col=col
#         )

# # Update layout
# fig.update_layout(
#     # title="Proportion of Cycles Recommended with PR by Hour with 95% Confidence Intervals",
#     height=400,
#     width=1400,
#     xaxis_title="Hour of Day",
#     font=dict(size=14),
#     margin=dict(r=25, t=50, l=50, b=50),  # Margins for the plot
#     # showlegend=False,
#     showlegend=True,
#     legend=dict(
#         orientation="h",  # Horizontal legend layout
#         x=0.5,  # Centered horizontally
#         y=-0.2,  # Position below the plot
#         xanchor="center",
#         yanchor="top",
#         font=dict(size=16),
#         # bgcolor="rgba(255,255,255,0.8)",  # Add background for better readability
#     )
# )

# # Update axis labels for shared x/y axes
# fig.update_xaxes(
#     title_text="Hour of Day",
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Update y-axes for shared configuration
# fig.update_yaxes(
#     title_text="Avg. Occupancy Time (Sec)", 
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Export the Plotly figure as a high-resolution image
# fig.write_image("../reports/3.6(d).png", width=1400, height=400, scale=2)

# # Show plot
# fig.show()

#### Split Failure

##### Hourly

In [65]:
# df_split_failure_id = (
#     load_data(
#         dirpath="../data/production/atspm/fdot_d5/feature_extraction/feature/cycle/vehicle_traffic/split_failure",
#         signal_id=signal_id
#     )
# )

# # print(df_split_failure_id.shape)
# # df_split_failure_id.head(1)

# # Extract unique phase numbers efficiently
# phase_nos = sorted(
#     set(
#         int(re.search(r"Phase(\d+)", column).group(1))
#         for column in df_split_failure_id.columns if "SplitFailure" in column
#     )
# )

# # Precompute the column groups for each phase
# dict_columns = {
#     phase_no: [col for col in df_split_failure_id.columns if f"Phase{phase_no}" in col]
#     for phase_no in phase_nos
# }

# # Vectorized operation without looping over rows
# for phase_no, columns in tqdm.tqdm(dict_columns.items()):
#     df_split_failure_id[f"splitFailurePhase{phase_no}"] = df_split_failure_id[columns].max(axis=1)

# # Drop all processed columns at once (efficient batch drop)
# df_split_failure_id.drop(columns=[col for cols in dict_columns.values() for col in cols], 
#                          inplace=True)

# df_split_failure_id["hour"] = df_split_failure_id["cycleBegin"].dt.hour

In [66]:
# phase_nos = sorted(
#     set(
#         int(re.search(r"Phase(\d+)", column).group(1))
#         for column in df_split_failure_id.columns if "splitFailure" in column
#     )
# )

# df_split_failure_id_hourly = (
#     df_split_failure_id
#     .groupby(["signalID", "date", "hour"])
#     .agg(
#         **{
#             f"splitFailurePhase{phase_no}": (f"splitFailurePhase{phase_no}", "sum")
#             for phase_no in phase_nos 
#         }
#     )
#     .reset_index()  
# )

# columns = [
#     f"splitFailurePhase{phase_no}" 
#     for phase_no in phase_nos 
# ]

# # Group data by hour, date, and signal type to calculate stats
# proc_df_split_failure_id_hourly = pd.melt(
#     df_split_failure_id_hourly,
#     id_vars=["signalID", "date", "hour"],
#     value_vars=columns,
#     var_name="feature",
#     value_name="splitFailure"
# )

In [76]:
# Summarize hourly split failures: mean, std, and sample count for every
# (hour, feature) pair in the melted frame.
# NOTE(review): `proc_df_split_failure_id_hourly` is only defined in an earlier cell
# that is currently commented out, so this cell depends on leftover kernel state and
# would raise NameError after "Restart Kernel -> Run All" -- confirm upstream cells.
df = (
    proc_df_split_failure_id_hourly
    .groupby(["hour", "feature"])
    .agg(
        splitFailureAvg=("splitFailure", "mean"),
        splitFailureStd=("splitFailure", "std"),
        count=("splitFailure", "count"),
    )
    .reset_index()
)

# Confidence band around the hourly mean: mean +/- z * std / sqrt(n)
z = 1.96  # 95% confidence
ci_half_width = z * (df["splitFailureStd"] / np.sqrt(df["count"]))
df["upperBound"] = df["splitFailureAvg"] + ci_half_width
df["lowerBound"] = df["splitFailureAvg"] - ci_half_width

# Parse the signal-type prefix (if any) and the phase number from the feature name.
# NOTE(review): the upstream melt appears to name features "splitFailurePhase<N>"
# (lowercase "s"), in which case this pattern never matches and `signalType` is all
# NaN; kept unchanged in case later cells reference the column -- confirm.
df["signalType"] = df["feature"].str.extract(r"([a-zA-Z]+)SplitFailure")
# Capture every digit after "Phase" so multi-digit phases (e.g. Phase10) are not
# truncated to their first digit; single-digit phases parse exactly as before.
df["phaseNo"] = df["feature"].str.extract(r"Phase(\d+)", expand=False).astype(int)

# Phase number -> approach label; phases sharing a label are plotted together.
# Phase numbers missing from this map get NaN in `approachType`.
dict_approach_map = {
    1: "major1", 6: "major1",
    5: "major2", 2: "major2",
    3: "minor1", 8: "minor1",
    7: "minor2", 4: "minor2",
}

df["approachType"] = df["phaseNo"].map(dict_approach_map)

# Transparency for confidence intervals
ci_fill_transparency = 0.25

# Approach Type to analyze
approach_type = "minor1"

# Phases belonging to the selected approach, odd-numbered phases first
# (the sort key is False for odd and True for even; the sort is stable).
phase_nos = df[df["approachType"] == approach_type]["phaseNo"].unique().tolist()
phase_nos = sorted(phase_nos, key=lambda x: x % 2 == 0)

# Line/marker color shared by every trace in this figure
color = "rgba(0, 51, 153, 1)"

# Create a 1x2 grid for subplots (one panel per phase of the selected approach)
fig = make_subplots(
    # rows=2, cols=2,
    rows=1, cols=2,
    subplot_titles=[f"Phase No: {phase_no}" for phase_no in phase_nos],
    shared_xaxes=False, shared_yaxes=False,
    vertical_spacing=0.15
)

# Iterate over phases and add traces
# row_col_mapping = [(1, 1), (1, 2), (2, 1), (2, 2)]  # Map phases to subplot positions
row_col_mapping = [(1, 1), (1, 2)]  # Map phases to subplot positions
for idx, (phase_no, (row, col)) in enumerate(zip(phase_nos, row_col_mapping)):
    proc_df = df[df["phaseNo"] == phase_no]

    # Add line trace for the average
    fig.add_trace(
        go.Scatter(
            x=proc_df["hour"],
            y=proc_df["splitFailureAvg"],
            mode="lines+markers",
            marker=dict(color=color, size=8),
            line=dict(color=color, width=3),
            showlegend=False
        ),
        row=row, col=col
    )

    # Add shaded CI region with lighter transparency
    fig.add_trace(
        go.Scatter(
            x=pd.concat([proc_df["hour"], proc_df["hour"][::-1]]),
            y=pd.concat([proc_df["upperBound"], proc_df["lowerBound"][::-1]]),
            fill="toself",
            fillcolor=color.replace("1)", f"{ci_fill_transparency})"),  # Lighter fill for CI
            line=dict(color="rgba(0,0,0,0)"),  # No border
            hoverinfo="skip",
            name="95% Confidence Interval",  
            showlegend=(idx == 0)
        ),
        row=row, col=col
    )

# Update layout
fig.update_layout(
    # title="Proportion of Cycles Recommended with PR by Hour with 95% Confidence Intervals",
    height=400,
    width=1400,
    xaxis_title="Hour of Day",
    font=dict(size=14),
    margin=dict(r=25, t=50, l=50, b=50),  # Margins for the plot
    showlegend=False,
    # showlegend=True,
    # legend=dict(
    #     orientation="h",  # Horizontal legend layout
    #     x=0.5,  # Centered horizontally
    #     y=-0.2,  # Position below the plot
    #     xanchor="center",
    #     yanchor="top",
    #     font=dict(size=16),
    #     # bgcolor="rgba(255,255,255,0.8)",  # Add background for better readability
    # )
)

# Update axis labels for shared x/y axes
fig.update_xaxes(
    title_text="Hour of Day",
    title_font=dict(size=16),
    tickfont=dict(size=16),
)

# df = float_to_int(df)

# for phase_no in phase_nos:
#     proc_df = df[df["phaseNo"] == phase_no]

#     # Iterate over lane types and add traces
#     for signal_type, color in dict_colormap.items():
#         proc_df_signal = proc_df[proc_df["signalType"] == signal_type]
#         # proc_df_signal = float_to_int(proc_df_signal)

#         max_val = int(proc_df_signal["splitFailureAvg"].max())

#         if max_val < 2:
#             # Update y-axes for shared configuration
#             fig.update_yaxes(
#                 title_text="Avg. Split Failure", 
#                 title_font=dict(size=16),
#                 tickfont=dict(size=16),
#                 # tickmode="linear",  # Ensure ticks are evenly spaced
#                 # tick0=0,            # Starting tick value
#                 # dtick=1,             # Interval between ticks (ensures integer values)
#                 # range=[-0.5, 1.5]   # Expand range slightly to force 1 to appear
#             )
#         else:
#             fig.update_yaxes(
#                 tickmode="linear",
#                 dtick=tick_interval,
#                 tickfont=dict(size=14)
#             )

fig.update_yaxes(
    title_text="No. of Cycles with Split Failure", 
    title_font=dict(size=16),
    tickfont=dict(size=16),
)

# Export the Plotly figure as a high-resolution image
fig.write_image("../reports/3.7(c).png", width=1400, height=400, scale=2)

# Show plot
fig.show()

#### Headway

##### Hourly

In [None]:
# df_headway_id_hourly = (
#     load_data(
#         dirpath="../data/production/atspm/fdot_d5/feature_extraction/feature/hourly/vehicle_traffic/headway",
#         signal_id=signal_id
#     )
# )

# print(df_headway_id_hourly.shape)
# df_headway_id_hourly.head(1)

In [None]:
# columns = [
#     f"{signal_type}AvgHeadwayPhase{phase_no}" 
#     for phase_no in [2, 4, 6, 8] 
#     for signal_type in ["green", "red"]
# ]

# for column in columns:
#     proc_columns = df_headway_id_hourly.columns.tolist()
#     proc_columns = [
#         proc_column for proc_column in proc_columns 
#         if column in proc_column and not any(k in proc_column for k in ["Min", "Max", "Std"])
#     ]
    
#     df_headway_id_hourly[column] = (
#         df_headway_id_hourly[proc_columns].apply(lambda row: row.mean(skipna=True), axis=1)
#     )

# df_headway_id_hourly = df_headway_id_hourly[["signalID", "date", "hour"] + columns]

In [None]:
# columns = [
#     f"{signal_type}AvgHeadwayPhase{phase_no}" 
#     for phase_no in [2, 4, 6, 8] 
#     for signal_type in ["green", "red"]
# ]

# # Group data by hour, date, and signal type to calculate stats
# proc_df_headway_id_hourly = pd.melt(
#     df_headway_id_hourly,
#     id_vars=["signalID", "hour", "date"],
#     value_vars=columns,
#     var_name="feature",
#     value_name="headway"
# )

In [None]:
# # Aggregate to compute mean, std, and count for each hour and signal type
# df = (
#     proc_df_headway_id_hourly.groupby(["hour", "feature"])
#     .agg(
#         headwayAvg=("headway", "mean"),
#         headwayStd=("headway", "std"),
#         count=("headway", "count")
#     )
#     .reset_index()
# )

# # Calculate confidence intervals
# z = 1.96  # 95% confidence
# df["upperBound"] = df["headwayAvg"] + z * (df["headwayStd"] / np.sqrt(df["count"]))
# df["lowerBound"] = df["headwayAvg"] - z * (df["headwayStd"] / np.sqrt(df["count"]))

# # Parse signal type and phase number from the feature column
# df["signalType"] = df["feature"].str.extract(r"([a-zA-Z]+)AvgHeadway")
# df["phaseNo"] = df["feature"].str.extract(r"(\d)").astype(int)

# # dict_approach_map = {
# #     1: "major1", 6: "major1",
# #     5: "major2", 2: "major2",
# #     3: "minor1", 8: "minor1",
# #     7: "minor2", 4: "minor2",
# # }

# # df["approachType"] = df["phaseNo"].map(dict_approach_map)

# # Define a colormap for signal types using valid color formats
# dict_colormap = {
#     "green": "rgba(39, 174, 96, 1)",       # Solid green
#     # "yellow": "rgba(241, 196, 15, 1)",    # Solid yellow
#     # "redClearance": "rgba(236, 112, 99, 1)",  # Solid red-clearance
#     "red": "rgba(203, 67, 53, 1)"         # Solid red
# }

# dict_label = {
#     "green": "Green",       
#     # "yellow": "Yellow",  
#     # "redClearance": "Red Clearance", 
#     "red": "Red"    
# }

# # Transparency for confidence intervals
# ci_transparency = 0.25

# # # Approach Type to analyze
# # approach_type = "major1"

# # phase_nos = df[df["approachType"] == approach_type]["phaseNo"].unique().tolist()
# phase_nos = [2, 6, 4, 8]

# # Create a 2x2 grid for subplots
# fig = make_subplots(
#     rows=2, cols=2,
#     # rows=1, cols=2,
#     subplot_titles=[f"Phase No: {phase_no}" for phase_no in phase_nos],
#     shared_xaxes=False, shared_yaxes=False,
#     vertical_spacing=0.15
# )

# # Track which signal types have been added to the legend
# legend_shown = {"green": False, "red": False}

# # Iterate over phases and add traces
# row_col_mapping = [(1, 1), (1, 2), (2, 1), (2, 2)]  # Map phases to subplot positions
# # row_col_mapping = [(1, 1), (1, 2)]  # Map phases to subplot positions
# for idx, (phase_no, (row, col)) in enumerate(zip(phase_nos, row_col_mapping)):
#     proc_df = df[df["phaseNo"] == phase_no]

#     # Iterate over lane types and add traces
#     for signal_type, color in dict_colormap.items():
#         proc_df_signal = proc_df[proc_df["signalType"] == signal_type]
#         # proc_df_signal = float_to_int(proc_df_signal)

#         # # Only show legend for the first occurrence of each signal type
#         # if not legend_shown[signal_type]:
#         #     show_legend = True
#         #     legend_shown[signal_type] = True  # Mark as shown
#         # else:
#         #     show_legend = False

#         # Add line trace for the average
#         fig.add_trace(
#             go.Scatter(
#                 x=proc_df_signal["hour"],
#                 y=proc_df_signal["headwayAvg"],
#                 mode="lines+markers",
#                 marker=dict(color=color, size=8),
#                 line=dict(color=color, width=3),
#                 name=f"{dict_label[signal_type]}",
#                 showlegend=False
#             ),
#             row=row, col=col
#         )

#         # Add shaded CI region with lighter transparency
#         fig.add_trace(
#             go.Scatter(
#                 x=pd.concat([proc_df_signal["hour"], proc_df_signal["hour"][::-1]]),
#                 y=pd.concat([proc_df_signal["upperBound"], proc_df_signal["lowerBound"][::-1]]),
#                 fill="toself",
#                 fillcolor=color.replace("1)", f"{ci_fill_transparency})"),  # Lighter fill for CI
#                 line=dict(color="rgba(0,0,0,0)"),  # No border
#                 hoverinfo="skip",
#                 name="95% Confidence Interval",
#                 showlegend=(idx==0)
#             ),
#             row=row, col=col
#         )

# # # Update layout
# # fig.update_layout(
# #     # title="Proportion of Cycles Recommended with PR by Hour with 95% Confidence Intervals",
# #     height=800,
# #     width=1400,
# #     showlegend=False,
# #     xaxis_title="Hour of Day",
# #     font=dict(size=14),
# #     margin=dict(r=25, t=50, l=50, b=50)  # Margins for the plot
# # )

# # Update layout with legend positioned at the bottom
# fig.update_layout(
#     height=800,
#     width=1400,
#     showlegend=True,  # Ensure legend is shown
#     xaxis_title="Hour of Day",
#     font=dict(size=14),
#     margin=dict(r=25, t=50, l=50, b=100),  # Increase bottom margin for legend space
#     legend=dict(
#         orientation="h",  # Horizontal legend layout
#         x=0.5,  # Centered horizontally
#         y=-0.1,  # Position below the plot
#         xanchor="center",
#         yanchor="top",
#         font=dict(size=15),
#         bgcolor="rgba(255,255,255,0.8)",  # Add background for better readability
#     )
# )

# # Update axis labels for shared x/y axes
# fig.update_xaxes(
#     title_text="Hour of Day",
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Update y-axes for shared configuration
# fig.update_yaxes(
#     title_text="Avg. Headway (Sec)", 
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Export the Plotly figure as a high-resolution image
# fig.write_image("../reports/3.8.png", width=1400, height=800, scale=2)

# # Show plot
# fig.show()

#### Conflict

##### Hourly

In [None]:
# df_conflict_id_hourly = (
#     load_data(
#         dirpath="../data/production/atspm/fdot_d5/feature_extraction/feature/hourly/vehicle_traffic/conflict",
#         signal_id=signal_id
#     )
# )

# print(df_conflict_id_hourly.shape)
# df_conflict_id_hourly.head(1)

In [None]:
# columns = [
#     f"{signal_type}Conflict1.0Phase{phase_no}" 
#     for phase_no in [2, 4, 6, 8] 
#     for signal_type in ["green", "red"]
#     # for signal_type in ["yellow", "redClearance"]
# ]

# for column in columns:
#     proc_columns = df_conflict_id_hourly.columns.tolist()
#     proc_columns = [
#         proc_column for proc_column in proc_columns 
#         if column in proc_column
#     ]
    
#     df_conflict_id_hourly[column] = (
#         df_conflict_id_hourly[proc_columns].apply(lambda row: row.sum(), axis=1)
#     )

# df_conflict_id_hourly = df_conflict_id_hourly[["signalID", "date", "hour"] + columns]

In [None]:
# columns = [
#     f"{signal_type}Conflict1.0Phase{phase_no}" 
#     for phase_no in [2, 4, 6, 8] 
#     for signal_type in ["green", "red"]
#     # for signal_type in ["yellow", "redClearance"]
# ]

# # Group data by hour, date, and signal type to calculate stats
# proc_df_conflict_id_hourly = pd.melt(
#     df_conflict_id_hourly,
#     id_vars=["signalID", "hour", "date"],
#     value_vars=columns,
#     var_name="feature",
#     value_name="conflict"
# )

In [None]:
# # Aggregate to compute mean, std, and count for each hour and signal type
# df = (
#     proc_df_conflict_id_hourly.groupby(["hour", "feature"])
#     .agg(
#         conflictAvg=("conflict", "mean"),
#         conflictStd=("conflict", "std"),
#         count=("conflict", "count")
#     )
#     .reset_index()
# )

# # Calculate confidence intervals
# z = 1.96  # 95% confidence
# df["upperBound"] = df["conflictAvg"] + z * (df["conflictStd"] / np.sqrt(df["count"]))
# df["lowerBound"] = df["conflictAvg"] - z * (df["conflictStd"] / np.sqrt(df["count"]))

# # Parse signal type and phase number from the feature column
# df["signalType"] = df["feature"].str.extract(r"([a-zA-Z]+)Conflict")
# df["phaseNo"] = df["feature"].str.extract(r"(\d+)(?!.*\d)").astype(int)

# # dict_approach_map = {
# #     1: "major1", 6: "major1",
# #     5: "major2", 2: "major2",
# #     3: "minor1", 8: "minor1",
# #     7: "minor2", 4: "minor2",
# # }

# # df["approachType"] = df["phaseNo"].map(dict_approach_map)

# # Define a colormap for signal types using valid color formats
# dict_colormap = {
#     "green": "rgba(39, 174, 96, 1)",       # Solid green
#     # "yellow": "rgba(241, 196, 15, 1)",    # Solid yellow
#     # "redClearance": "rgba(236, 112, 99, 1)",  # Solid red-clearance
#     "red": "rgba(203, 67, 53, 1)"         # Solid red
# }

# dict_label = {
#     "green": "Green",       
#     # "yellow": "Yellow",  
#     # "redClearance": "Red Clearance", 
#     "red": "Red"    
# }

# # Transparency for confidence intervals
# ci_transparency = 0.25

# # # Approach Type to analyze
# # approach_type = "major1"

# # phase_nos = df[df["approachType"] == approach_type]["phaseNo"].unique().tolist()
# phase_nos = [2, 6, 4, 8]

# # Create a 2x2 grid for subplots
# fig = make_subplots(
#     rows=2, cols=2,
#     # rows=1, cols=2,
#     subplot_titles=[f"Phase No: {phase_no}" for phase_no in phase_nos],
#     shared_xaxes=False, shared_yaxes=False,
#     vertical_spacing=0.15
# )

# # Track which signal types have been added to the legend
# legend_shown = {"green": False, "red": False}
# # legend_shown = {"yellow": False, "redClearance": False}

# # Iterate over phases and add traces
# row_col_mapping = [(1, 1), (1, 2), (2, 1), (2, 2)]  # Map phases to subplot positions
# # row_col_mapping = [(1, 1), (1, 2)]  # Map phases to subplot positions
# for idx, (phase_no, (row, col)) in enumerate(zip(phase_nos, row_col_mapping)):
#     proc_df = df[df["phaseNo"] == phase_no]

#     # Iterate over lane types and add traces
#     for signal_type, color in dict_colormap.items():
#         proc_df_signal = proc_df[proc_df["signalType"] == signal_type]
#         proc_df_signal = float_to_int(proc_df_signal)

#         # # Only show legend for the first occurrence of each signal type
#         # if not legend_shown[signal_type]:
#         #     show_legend = True
#         #     legend_shown[signal_type] = True  # Mark as shown
#         # else:
#         #     show_legend = False

#         # Add line trace for the average
#         fig.add_trace(
#             go.Scatter(
#                 x=proc_df_signal["hour"],
#                 y=proc_df_signal["conflictAvg"],
#                 mode="lines+markers",
#                 marker=dict(color=color, size=8),
#                 line=dict(color=color, width=3),
#                 name=f"{dict_label[signal_type]}",
#                 showlegend=False
#             ),
#             row=row, col=col
#         )

#         # Add shaded CI region with lighter transparency
#         fig.add_trace(
#             go.Scatter(
#                 x=pd.concat([proc_df_signal["hour"], proc_df_signal["hour"][::-1]]),
#                 y=pd.concat([proc_df_signal["upperBound"], proc_df_signal["lowerBound"][::-1]]),
#                 fill="toself",
#                 fillcolor=color.replace("1)", f"{ci_fill_transparency})"),  # Lighter fill for CI
#                 line=dict(color="rgba(0,0,0,0)"),  # No border
#                 hoverinfo="skip",
#                 name=f"{dict_label[signal_type]} - 95% Confidence Interval",
#                 showlegend=(idx==0)
#             ),
#             row=row, col=col
#         )

# # # Update layout
# # fig.update_layout(
# #     # title="Proportion of Cycles Recommended with PR by Hour with 95% Confidence Intervals",
# #     height=800,
# #     width=1400,
# #     showlegend=False,
# #     xaxis_title="Hour of Day",
# #     font=dict(size=14),
# #     margin=dict(r=25, t=50, l=50, b=50)  # Margins for the plot
# # )

# # Update layout with legend positioned at the bottom
# fig.update_layout(
#     height=800,
#     width=1400,
#     showlegend=True,  # Ensure legend is shown
#     xaxis_title="Hour of Day",
#     font=dict(size=14),
#     margin=dict(r=25, t=50, l=50, b=100),  # Increase bottom margin for legend space
#     legend=dict(
#         orientation="h",  # Horizontal legend layout
#         x=0.5,  # Centered horizontally
#         y=-0.1,  # Position below the plot
#         xanchor="center",
#         yanchor="top",
#         font=dict(size=15),
#         bgcolor="rgba(255,255,255,0.8)",  # Add background for better readability
#     )
# )

# # Update axis labels for shared x/y axes
# fig.update_xaxes(
#     title_text="Hour of Day",
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Update y-axes for shared configuration
# fig.update_yaxes(
#     title_text="No. of Conflicts (Headway < 2 sec)", 
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Export the Plotly figure as a high-resolution image
# fig.write_image("../reports/3.9.png", width=1400, height=800, scale=2)

# # Show plot
# fig.show()

#### Red Light Running

##### Hourly

In [None]:
# df_red_running_id_hourly = (
#     load_data(
#         dirpath="../data/production/atspm/fdot_d5/feature_extraction/feature/hourly/vehicle_traffic/red_running",
#         signal_id=signal_id
#     )
# )

# print(df_red_running_id_hourly.shape)
# df_red_running_id_hourly.head(1)

In [None]:
# columns = [
#     f"{signal_type}RunningFlagPhase{phase_no}" 
#     for phase_no in [1, 2, 3, 4, 5, 6, 7, 8] 
#     for signal_type in ["red"]
# ]

# for column in columns:
#     proc_columns = df_red_running_id_hourly.columns.tolist()
#     proc_columns = [
#         proc_column for proc_column in proc_columns 
#         if column in proc_column
#     ]
    
#     df_red_running_id_hourly[column] = (
#         df_red_running_id_hourly[proc_columns].apply(lambda row: row.max(), axis=1)
#     )

# df_red_running_id_hourly = df_red_running_id_hourly[["signalID", "date", "hour"] + columns]

In [None]:
# columns = [
#     f"{signal_type}RunningFlagPhase{phase_no}" 
#     for phase_no in [1, 2, 3, 4, 5, 6, 7, 8] 
#     for signal_type in ["red"]
# ]

# # Group data by hour, date, and signal type to calculate stats
# proc_df_red_running_id_hourly = pd.melt(
#     df_red_running_id_hourly,
#     id_vars=["signalID", "hour", "date"],
#     value_vars=columns,
#     var_name="feature",
#     value_name="redRunningFlag"
# )

In [None]:
# # Aggregate to compute mean, std, and count for each hour and signal type
# df = (
#     proc_df_red_running_id_hourly.groupby(["hour", "feature"])
#     .agg(
#         redRunningFlagAvg=("redRunningFlag", "mean"),
#         redRunningFlagStd=("redRunningFlag", "std"),
#         count=("redRunningFlag", "count")
#     )
#     .reset_index()
# )

# # Calculate confidence intervals
# z = 1.96  # 95% confidence
# df["upperBound"] = df["redRunningFlagAvg"] + z * (df["redRunningFlagStd"] / np.sqrt(df["count"]))
# df["lowerBound"] = df["redRunningFlagAvg"] - z * (df["redRunningFlagStd"] / np.sqrt(df["count"]))

# # Parse signal type and phase number from the feature column
# df["signalType"] = df["feature"].str.extract(r"([a-zA-Z]+)RunningFlag")
# df["phaseNo"] = df["feature"].str.extract(r"(\d)").astype(int)

# dict_approach_map = {
#     1: "major1", 6: "major1",
#     5: "major2", 2: "major2",
#     3: "minor1", 8: "minor1",
#     7: "minor2", 4: "minor2",
# }

# df["approachType"] = df["phaseNo"].map(dict_approach_map)

# # Define a colormap for signal types using valid color formats
# dict_colormap = {
#     # "green": "rgba(39, 174, 96, 1)",       # Solid green
#     # "yellow": "rgba(241, 196, 15, 1)",    # Solid yellow
#     # "redClearance": "rgba(236, 112, 99, 1)",  # Solid red-clearance
#     "red": "rgba(203, 67, 53, 1)"         # Solid red
# }

# dict_label = {
#     # "green": "Green",       
#     # "yellow": "Yellow",  
#     # "redClearance": "Red Clearance", 
#     "red": "Red"    
# }

# # Transparency for confidence intervals
# ci_transparency = 0.25

# # Approach Type to analyze
# approach_type = "minor2"

# phase_nos = df[df["approachType"] == approach_type]["phaseNo"].unique().tolist()
# phase_nos = sorted(phase_nos, key=lambda x: x % 2 == 0)

# # Create a 1x2 grid for subplots
# fig = make_subplots(
#     # rows=2, cols=2,
#     rows=1, cols=2,
#     subplot_titles=[f"Phase No: {phase_no}" for phase_no in phase_nos],
#     shared_xaxes=False, shared_yaxes=False,
#     vertical_spacing=0.15
# )

# # Iterate over phases and add traces
# # row_col_mapping = [(1, 1), (1, 2), (2, 1), (2, 2)]  # Map phases to subplot positions
# row_col_mapping = [(1, 1), (1, 2)]  # Map phases to subplot positions
# for idx, (phase_no, (row, col)) in enumerate(zip(phase_nos, row_col_mapping)):
#     proc_df = df[df["phaseNo"] == phase_no]

#     # Iterate over lane types and add traces
#     for signal_type, color in dict_colormap.items():
#         proc_df_signal = proc_df[proc_df["signalType"] == signal_type]
#         proc_df_signal = float_to_int(proc_df_signal)

#         # Add line trace for the average
#         fig.add_trace(
#             go.Scatter(
#                 x=proc_df_signal["hour"],
#                 y=proc_df_signal["redRunningFlagAvg"],
#                 mode="lines+markers",
#                 marker=dict(color=color, size=8),
#                 line=dict(color=color, width=3),
#                 name=f"{dict_label[signal_type]}",
#                 showlegend=False
#             ),
#             row=row, col=col
#         )

#         # Add shaded CI region with lighter transparency
#         fig.add_trace(
#             go.Scatter(
#                 x=pd.concat([proc_df_signal["hour"], proc_df_signal["hour"][::-1]]),
#                 y=pd.concat([proc_df_signal["upperBound"], proc_df_signal["lowerBound"][::-1]]),
#                 fill="toself",
#                 fillcolor=color.replace("1)", f"{ci_fill_transparency})"),  # Lighter fill for CI
#                 line=dict(color="rgba(0,0,0,0)"),  # No border
#                 hoverinfo="skip",
#                 name=f"{dict_label[signal_type]} - 95% Confidence Interval",
#                 showlegend=(idx==0)
#             ),
#             row=row, col=col
#         )

# # Update layout
# fig.update_layout(
#     # title="Proportion of Cycles Recommended with PR by Hour with 95% Confidence Intervals",
#     height=400,
#     width=1400,
#     xaxis_title="Hour of Day",
#     font=dict(size=14),
#     margin=dict(r=25, t=50, l=50, b=50),  # Margins for the plot
#     # showlegend=False,
#     showlegend=True,
#     legend=dict(
#         orientation="h",  # Horizontal legend layout
#         x=0.5,  # Centered horizontally
#         y=-0.2,  # Position below the plot
#         xanchor="center",
#         yanchor="top",
#         font=dict(size=15),
#         bgcolor="rgba(255,255,255,0.8)",  # Add background for better readability
#     )
# )

# # Update axis labels for shared x/y axes
# fig.update_xaxes(
#     title_text="Hour of Day",
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # df = float_to_int(df)

# # for phase_no in phase_nos:
# #     proc_df = df[df["phaseNo"] == phase_no]

# #     # Iterate over lane types and add traces
# #     for signal_type, color in dict_colormap.items():
# #         proc_df_signal = proc_df[proc_df["signalType"] == signal_type]
# #         # proc_df_signal = float_to_int(proc_df_signal)

# #         max_val = int(proc_df_signal["splitFailureAvg"].max())

# #         if max_val < 2:
# #             # Update y-axes for shared configuration
# #             fig.update_yaxes(
# #                 title_text="Avg. Split Failure", 
# #                 title_font=dict(size=16),
# #                 tickfont=dict(size=16),
# #                 # tickmode="linear",  # Ensure ticks are evenly spaced
# #                 # tick0=0,            # Starting tick value
# #                 # dtick=1,             # Interval between ticks (ensures integer values)
# #                 # range=[-0.5, 1.5]   # Expand range slightly to force 1 to appear
# #             )
# #         else:
# #             fig.update_yaxes(
# #                 tickmode="linear",
# #                 dtick=tick_interval,
# #                 tickfont=dict(size=14)
# #             )

# fig.update_yaxes(
#     title_text="No. of Cycles with RLR", 
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Export the Plotly figure as a high-resolution image
# fig.write_image("../reports/3.10(d).png", width=1400, height=400, scale=2)

# # Show plot
# fig.show()

### Pedestrian Traffic Features

#### Pedestrian Activity

##### Hourly

In [None]:
# Build the hourly pedestrian-activity table for `signal_id`:
#   1. load cycle-level SPaT and pedestrian-activity features,
#   2. derive a 0/1 "pedestrian activity" indicator per cycle and phase,
#   3. aggregate per (signalID, date, hour, phaseNo) into
#      cyclesWithPedestrian (sum of indicators) and totalCycles (row count).
# `load_data`, `float_to_int` and `signal_id` must already be defined in the kernel.

df_spat_id = (
    load_data(
        dirpath="../data/production/atspm/fdot_d5/feature_extraction/feature/cycle/vehicle_signal/spat/",
        signal_id=signal_id
    )
)

df_pedestrian_activity_id = (
    load_data(
        dirpath="../data/production/atspm/fdot_d5/feature_extraction/feature/cycle/pedestrian_traffic/activity",
        signal_id=signal_id
    )
)

# Drop every column whose name contains "45"
columns = [column for column in df_pedestrian_activity_id.columns if "45" not in column]
df_pedestrian_activity_id = df_pedestrian_activity_id[columns]
df_pedestrian_activity_id = float_to_int(df_pedestrian_activity_id)

# Join: keep every SPaT cycle, attaching pedestrian features where they exist
cycle_keys = ["signalID", "date", "cycleNo", "cycleBegin", "cycleEnd"]
df_pedestrian_activity_id = pd.merge(
    df_spat_id[cycle_keys], df_pedestrian_activity_id,
    on=cycle_keys,
    how="left"
)

# Shift: pull each "Prev" value up one row, so row i carries the value that the
# following row recorded in its "Prev" column
prev_columns = [column for column in df_pedestrian_activity_id.columns if "Prev" in column]

for column in prev_columns:
    df_pedestrian_activity_id[column] = df_pedestrian_activity_id[column].shift(-1)

# Cycles without a match in the left join (and the last shifted row) count as 0
df_pedestrian_activity_id = df_pedestrian_activity_id.fillna(0)

# Phase numbers, taken from the "90" columns that are not Curr/Prev-cycle variants.
# The full trailing number is parsed (not just the last character) so phases >= 10
# work, and duplicates are dropped while keeping column order.
phase_nos = []
for column in df_pedestrian_activity_id.columns:
    if "90" not in column or "Cycle" in column:
        continue
    match = re.search(r"Phase(\d+)$", column)
    if match and int(match.group(1)) not in phase_nos:
        phase_nos.append(int(match.group(1)))

# A row is followed by a gap when the next row's cycle number jumps by more than
# one; such rows get indicator 0. The last row has no successor (NaN difference),
# so the comparison is False and it is never treated as a gap.
cycle_no = df_pedestrian_activity_id["cycleNo"]
gap_before_next = (cycle_no.shift(-1) - cycle_no) > 1

# Indicator = 1 when the current-cycle or (shifted) previous-cycle activity is
# positive and the row is not followed by a gap; computed column-wise instead of
# looping over rows.
dict_indicator = {}
for phase_no in phase_nos:
    activity_curr = df_pedestrian_activity_id[f"pedestrianActivity90Phase{phase_no}CurrCycle"]
    activity_prev = df_pedestrian_activity_id[f"pedestrianActivity90Phase{phase_no}PrevCycle"]
    has_activity = (activity_curr > 0) | (activity_prev > 0)
    dict_indicator[f"pedestrianActivityIndicatorPhase{phase_no}"] = (
        (has_activity & ~gap_before_next).astype(int)
    )

# Replace the raw activity columns with the indicators
columns = [column for column in df_pedestrian_activity_id.columns if "Activity" in column]

df_pedestrian_activity_id = df_pedestrian_activity_id.drop(columns=columns)
df_pedestrian_activity_id = pd.concat(
    [df_pedestrian_activity_id, pd.DataFrame(dict_indicator, index=df_pedestrian_activity_id.index)],
    axis=1
)

df_pedestrian_activity_id["hour"] = df_pedestrian_activity_id["cycleBegin"].dt.hour

activity_columns = [col for col in df_pedestrian_activity_id.columns if "Activity" in col]

# Hourly aggregation per indicator column.
# Sum = number of cycles with pedestrian activity; Count = number of cycles in the hour.
df_pedestrian_activity_id_hourly = (
    df_pedestrian_activity_id
    .groupby(["signalID", "date", "hour"])[activity_columns]
    .agg(['sum', 'count'])
    .reset_index()
)

# Flatten the (column, statistic) MultiIndex: "<column>Sum" / "<column>Count";
# the group keys have an empty second level and keep their plain names.
df_pedestrian_activity_id_hourly.columns = [
    f"{name}{stat.capitalize()}" for name, stat in df_pedestrian_activity_id_hourly.columns
]

hourly_keys = ["signalID", "date", "hour"]

# Long format: cycles with pedestrian activity per phase
sum_columns = [col for col in df_pedestrian_activity_id_hourly.columns if 'Sum' in col]
df_pedestrian_activity_id_hourly_sum = pd.melt(
    df_pedestrian_activity_id_hourly,
    id_vars=hourly_keys,
    value_vars=sum_columns,
    var_name="phaseNo",
    value_name="cyclesWithPedestrian"
)
df_pedestrian_activity_id_hourly_sum["phaseNo"] = (
    df_pedestrian_activity_id_hourly_sum["phaseNo"].str.extract(r'Phase(\d+)', expand=False).astype(int)
)

# Long format: total cycles per phase
count_columns = [col for col in df_pedestrian_activity_id_hourly.columns if 'Count' in col]
df_pedestrian_activity_id_hourly_count = pd.melt(
    df_pedestrian_activity_id_hourly,
    id_vars=hourly_keys,
    value_vars=count_columns,
    var_name="phaseNo",
    value_name="totalCycles"
)
df_pedestrian_activity_id_hourly_count["phaseNo"] = (
    df_pedestrian_activity_id_hourly_count["phaseNo"].str.extract(r'Phase(\d+)', expand=False).astype(int)
)

# One row per (signalID, date, hour, phaseNo) with both measures side by side
df_pedestrian_activity_id_hourly = pd.merge(
    df_pedestrian_activity_id_hourly_sum, df_pedestrian_activity_id_hourly_count,
    on=hourly_keys + ["phaseNo"],
    how="inner"
)

In [None]:
# # Aggregate to compute mean, std, and count for each hour and signal type
# df = (
#     df_pedestrian_activity_id_hourly.groupby(["hour", "phaseNo"])
#     .agg(
#         cyclesWithPedestrianAvg=("cyclesWithPedestrian", "mean"),
#         cyclesWithPedestrianStd=("cyclesWithPedestrian", "std"),
#         count=("cyclesWithPedestrian", "count")
#     )
#     .reset_index()
# )

# # Calculate confidence intervals
# z = 1.96  # 95% confidence
# df["upperBound"] = df["cyclesWithPedestrianAvg"] + z * (df["cyclesWithPedestrianStd"] / np.sqrt(df["count"]))
# df["lowerBound"] = df["cyclesWithPedestrianAvg"] - z * (df["cyclesWithPedestrianStd"] / np.sqrt(df["count"]))

# # dict_approach_map = {
# #     1: "major1", 6: "major1",
# #     5: "major2", 2: "major2",
# #     3: "major3", 8: "major3",
# #     7: "major4", 4: "major4",
# # }

# # df["approachType"] = df["phaseNo"].map(dict_approach_map)

# # dict_approach_dir = {
# #     2: "Westbound (Major Road)",
# #     6: "Eastbound (Major Road)",
# #     4: "Northbound (Minor Road)",
# #     8: "Southbound (Minor Road)"
# # }

# # Transparency for confidence intervals
# ci_transparency = 0.25

# # # Approach Type to analyze
# # approach_type = "major2"

# # phase_nos = df[df["approachType"] == approach_type]["phaseNo"].unique().tolist()
# phase_nos = [2, 6, 4, 8]

# # Create a 2x2 grid of subplots (one per pedestrian phase)
# fig = make_subplots(
#     rows=2, cols=2,
#     subplot_titles=[f"Pedestrian Phase No: {phase_no}" for phase_no in phase_nos],
#     shared_xaxes=False, shared_yaxes=False,
#     vertical_spacing=0.15
# )

# color = "rgba(0, 51, 153, 1)"

# # Iterate over phases and add traces
# row_col_mapping = [(1, 1), (1, 2), (2, 1), (2, 2)]  # Map phases to subplot positions
# for idx, (phase_no, (row, col)) in enumerate(zip(phase_nos, row_col_mapping)):
#     proc_df = df[df["phaseNo"] == phase_no]

#     # Add line trace for the average
#     fig.add_trace(
#         go.Scatter(
#             x=proc_df["hour"],
#             y=proc_df["cyclesWithPedestrianAvg"],
#             mode="lines+markers",
#             marker=dict(color=color, size=8),
#             line=dict(color=color, width=3),
#             showlegend=False
#         ),
#         row=row, col=col
#     )

#     # Add shaded CI region with lighter transparency
#     fig.add_trace(
#         go.Scatter(
#             x=pd.concat([proc_df["hour"], proc_df["hour"][::-1]]),
#             y=pd.concat([proc_df["upperBound"], proc_df["lowerBound"][::-1]]),
#             fill="toself",
#             fillcolor=color.replace("1)", f"{ci_transparency})"),  # Lighter fill for CI
#             line=dict(color="rgba(0,0,0,0)"),  # No border
#             hoverinfo="skip",
#             name="95% Confidence Interval",
#             showlegend=(idx==0)
#         ),
#         row=row, col=col
#     )

# # Update layout
# fig.update_layout(
#     # title="Proportion of Cycles Recommended with PR by Hour with 95% Confidence Intervals",
#     height=800,
#     width=1400,
#     # showlegend=False,
#     xaxis_title="Hour of Day",
#     font=dict(size=14),
#     margin=dict(r=25, t=50, l=50, b=50),  # Margins for the plot
#     showlegend=True,
#     legend=dict(
#         orientation="h",  # Horizontal legend layout
#         x=0.5,  # Centered horizontally
#         y=-0.1,  # Position below the plot
#         xanchor="center",
#         yanchor="top",
#         font=dict(size=15),
#         bgcolor="rgba(255,255,255,0.8)",  # Add background for better readability
#     )
# )

# # Apply the same x-axis title and fonts to every subplot (axes are not shared)
# fig.update_xaxes(
#     title_text="Hour of Day",
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Apply the same y-axis title and fonts to every subplot
# fig.update_yaxes(
#     title_text="No. of Cycles With Pedestrian", 
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Export the Plotly figure as a high-resolution image
# fig.write_image("../reports/3.11.png", width=1400, height=800, scale=2)

# # Show plot
# fig.show()

#### Pedestrian Delay

##### Hourly

In [None]:
# df_pedestrian_delay_id_hourly = load_data(
#     dirpath="../data/production/atspm/fdot_d5/feature_extraction/feature/cycle/pedestrian_traffic/delay",
#     signal_id=signal_id
# )
# columns = [
#     col for col in df_pedestrian_delay_id_hourly.columns if all(suffix not in col for suffix in ["CurrCycle", "PrevCycle"]) and ("Delay" in col) and df_pedestrian_delay_id_hourly[col].dtype != "O"
# ]

# df_pedestrian_delay_id_hourly["hour"] = df_pedestrian_delay_id_hourly["cycleBegin"].dt.hour
# df_pedestrian_delay_id_hourly = (
#     pd.melt(df_pedestrian_delay_id_hourly, 
#             id_vars=["signalID", "date", "hour"], 
#             var_name="phaseNo",
#             value_vars=columns,
#             value_name="pedestrianDelay"
#            )
# )
# df_pedestrian_delay_id_hourly["phaseNo"] = df_pedestrian_delay_id_hourly["phaseNo"].str.extract(r'Phase(\d+)').astype(int)

# # Filter non-zero values
# df_pedestrian_delay_id_hourly = (
#     df_pedestrian_delay_id_hourly[df_pedestrian_delay_id_hourly["pedestrianDelay"] > 0]
# )

# df_pedestrian_delay_id_hourly.head(1)

In [None]:
# # Aggregate to compute mean, std, and count for each hour and phase
# df = (
#     df_pedestrian_delay_id_hourly.groupby(["hour", "phaseNo"])
#     .agg(
#         pedestrianDelayAvg=("pedestrianDelay", "mean"),
#         pedestrianDelayStd=("pedestrianDelay", "std"),
#         count=("pedestrianDelay", "count")
#     )
#     .reset_index()
# )

# # Calculate confidence intervals
# z = 1.96  # 95% confidence
# df["upperBound"] = df["pedestrianDelayAvg"] + z * (df["pedestrianDelayStd"] / np.sqrt(df["count"]))
# df["lowerBound"] = df["pedestrianDelayAvg"] - z * (df["pedestrianDelayStd"] / np.sqrt(df["count"]))

# # dict_approach_map = {
# #     1: "major1", 6: "major1",
# #     5: "major2", 2: "major2",
# #     3: "major3", 8: "major3",
# #     7: "major4", 4: "major4",
# # }

# # df["approachType"] = df["phaseNo"].map(dict_approach_map)

# # dict_approach_dir = {
# #     2: "Westbound (Major Road)",
# #     6: "Eastbound (Major Road)",
# #     4: "Northbound (Minor Road)",
# #     8: "Southbound (Minor Road)"
# # }

# # Transparency for confidence intervals
# ci_transparency = 0.25

# # # Approach Type to analyze
# # approach_type = "major2"

# # phase_nos = df[df["approachType"] == approach_type]["phaseNo"].unique().tolist()
# phase_nos = [2, 6, 4, 8]

# # Create a 2x2 grid of subplots (one per pedestrian phase)
# fig = make_subplots(
#     rows=2, cols=2,
#     subplot_titles=[f"Pedestrian Phase No: {phase_no}" for phase_no in phase_nos],
#     shared_xaxes=False, shared_yaxes=False,
#     vertical_spacing=0.15
# )

# color = "rgba(0, 51, 153, 1)"

# # Iterate over phases and add traces
# row_col_mapping = [(1, 1), (1, 2), (2, 1), (2, 2)]  # Map phases to subplot positions
# for idx, (phase_no, (row, col)) in enumerate(zip(phase_nos, row_col_mapping)):
#     proc_df = df[df["phaseNo"] == phase_no]

#     # Add line trace for the average
#     fig.add_trace(
#         go.Scatter(
#             x=proc_df["hour"],
#             y=proc_df["pedestrianDelayAvg"],
#             mode="lines+markers",
#             marker=dict(color=color, size=8),
#             line=dict(color=color, width=3),
#             showlegend=False
#         ),
#         row=row, col=col
#     )

#     # Add shaded CI region with lighter transparency
#     fig.add_trace(
#         go.Scatter(
#             x=pd.concat([proc_df["hour"], proc_df["hour"][::-1]]),
#             y=pd.concat([proc_df["upperBound"], proc_df["lowerBound"][::-1]]),
#             fill="toself",
#             fillcolor=color.replace("1)", f"{ci_transparency})"),  # Lighter fill for CI
#             line=dict(color="rgba(0,0,0,0)"),  # No border
#             hoverinfo="skip",
#             name="95% Confidence Interval",
#             showlegend=(idx==0)
#         ),
#         row=row, col=col
#     )

# # Update layout
# fig.update_layout(
#     # title="Proportion of Cycles Recommended with PR by Hour with 95% Confidence Intervals",
#     height=800,
#     width=1400,
#     # showlegend=False,
#     xaxis_title="Hour of Day",
#     font=dict(size=14),
#     margin=dict(r=25, t=50, l=50, b=50),  # Margins for the plot
#     showlegend=True,
#     legend=dict(
#         orientation="h",  # Horizontal legend layout
#         x=0.5,  # Centered horizontally
#         y=-0.1,  # Position below the plot
#         xanchor="center",
#         yanchor="top",
#         font=dict(size=15),
#         bgcolor="rgba(255,255,255,0.8)",  # Add background for better readability
#     )
# )

# # Apply the same x-axis title and fonts to every subplot (axes are not shared)
# fig.update_xaxes(
#     title_text="Hour of Day",
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Apply the same y-axis title and fonts to every subplot
# fig.update_yaxes(
#     title_text="Avg. Pedestrian Delay (Sec)", 
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Export the Plotly figure as a high-resolution image
# fig.write_image("../reports/3.12.png", width=1400, height=800, scale=2)

# # Show plot
# fig.show()