# Import Libraries

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import numpy as np
import pandas as pd
import psutil
import tqdm
import yaml
import glob
import os
import gc

from datetime import datetime
from multiprocessing import Pool
from IPython.display import clear_output

In [9]:
import plotly.io as pio
# Default Plotly renderer used by every `fig.show()` call in this notebook.
# The two alternatives below are kept for reference; only one may be active.
# NOTE(review): "iframe" presumably keeps figure HTML out of the .ipynb file — confirm against the Plotly renderer docs.
# pio.renderers.default = "notebook"
# pio.renderers.default = "notebook_connected"
pio.renderers.default = "iframe"

import plotly.graph_objects as go
import plotly.express as px

from plotly.subplots import make_subplots

In [3]:
from src.components.feature_extraction import data_quality_check
from src.utils import get_root_directory

In [4]:
# Resolve the project's root directory via the project helper `get_root_directory`.
# NOTE(review): the cells visible in this notebook build their paths from the relative
# prefix "../data/..." and never reference `root_dir` — confirm whether it is still needed.
root_dir = get_root_directory()

In [5]:
# Lift every pandas display limit so DataFrames are rendered without truncation.
# Setting an option to None removes the corresponding limit.
for display_option in (
    "display.max_colwidth",  # show the full content of each cell
    "display.max_columns",   # show all columns
    "display.width",         # do not wrap output at a fixed character width
    "display.max_rows",      # show all rows
):
    pd.set_option(display_option, None)

In [6]:
# Overall CPU utilisation (percent), measured over a blocking one-second window
cpu_usage = psutil.cpu_percent(1)
print(f"Current CPU usage: {cpu_usage}%")

# Utilisation of each individual core, measured over a second one-second window
cpu_per_core = psutil.cpu_percent(percpu=True, interval=1)
print(f"CPU usage per core: {cpu_per_core}")

# Number of logical cores reported by psutil
cpu_cores = psutil.cpu_count(logical=True)
print(f"Total CPU cores: {cpu_cores}")

Current CPU usage: 7.6%
CPU usage per core: [4.0, 3.0, 5.0, 6.1, 8.1, 13.0, 6.0, 12.1, 2.0, 7.1, 8.9, 14.0]
Total CPU cores: 12


# Data Quality Check

## FDOT D5

In [7]:
# Configuration: IDs of the signals to run the data quality check for.
# IDs are kept as strings because they are interpolated into file paths.
signal_ids = [
    "1285", "1290", "1300", "1315",
    "1325", "1330", "1455", "1470",
    "1490", "1500", "1555", "1707",
    "1725", "1790", "1795", "1960",
    "2055", "2485", "2665",
    # "D5I-3000"  (currently excluded)
]

In [None]:
# Run the "vehicle_signal" data quality check for every signal in `signal_ids`,
# one daily event file at a time, restricted to a single calendar month.

# Only dates that fall in this month (1-12) are checked; all others are skipped.
TARGET_MONTH = 6

# (signal_id, date, error) for every date whose check raised. Collected because
# clear_output() wipes the per-signal log, which would otherwise hide the error
# messages of every signal except the last one.
failed_checks = []

for signal_index, signal_id in enumerate(signal_ids):
    # Clear the previous signal's log only once the next signal starts, so the log
    # of the final signal and the failure summary below both stay visible.
    if signal_index > 0:
        clear_output(wait=True)

    print(f"Processing Signal ID: {signal_id}")
    print("=" * 40)

    # glob.glob() returns paths in arbitrary order. Sort them so the file dropped by
    # [1:] is always the same one (the earliest date for "YYYY-MM-DD.pkl" names).
    pattern = f"../data/interim/atspm/fdot_d5/event_data/{signal_id}/*.pkl"
    filepaths = sorted(glob.glob(pattern))[1:]  # Exclude first file, if needed

    for filepath in filepaths:
        # The date is the file name up to its first dot
        date = os.path.basename(filepath).split(".")[0]
        print(f"  Processing Date: {date}")

        try:
            # Parse the date string; raises ValueError for names that are not YYYY-MM-DD
            date_object = datetime.strptime(date, '%Y-%m-%d')

            # Extract day, month, and year
            day, month, year = date_object.day, date_object.month, date_object.year

            if month != TARGET_MONTH:
                continue

            # Check data quality
            checker = data_quality_check.DataQualityCheck(event_type="vehicle_signal")
            checker.check_data_quality(signal_id=signal_id,
                                       day=day, month=month, year=year)

        except Exception as e:
            # Deliberately best-effort: one failing date must not abort the whole run.
            print(f"Error Processing Date {date}: {e}")
            failed_checks.append((signal_id, date, repr(e)))

    # Explicitly call garbage collector after each signal
    gc.collect()

# Report every failure once more, since most per-date messages were cleared above
if failed_checks:
    print(f"\n{len(failed_checks)} date(s) failed the data quality check run:")
    for failed_signal_id, failed_date, failed_error in failed_checks:
        print(f"  Signal ID {failed_signal_id}, Date {failed_date}: {failed_error}")

Processing Signal ID: 2665
  Processing Date: 2024-06-02
  Processing Date: 2024-06-03
  Processing Date: 2024-06-04
  Processing Date: 2024-06-05
  Processing Date: 2024-06-06
  Processing Date: 2024-06-07


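## FDOT D7

_TODO: this section is empty — no data quality check has been run for FDOT D7 in this notebook yet._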

# Visualization

In [24]:
# Load every vehicle-signal data quality report (CSV) and stack them into one DataFrame.
filepaths = "../data/production/atspm/fdot_d5/data_quality_check/vehicle_signal/*.csv"
# glob.glob() returns paths in arbitrary order; sort for a reproducible row order
filepaths = sorted(glob.glob(filepaths))

# Read all files first and concatenate once. Growing a DataFrame with pd.concat()
# inside the loop copies all previous rows on every iteration, and concatenating
# onto an empty DataFrame() relies on deprecated pandas behaviour.
report_frames = [pd.read_csv(filepath) for filepath in filepaths]

if report_frames:
    df_data_quality_check = pd.concat(report_frames, axis=0, ignore_index=True)
else:
    # pd.concat() raises ValueError on an empty list; keep the empty-frame result instead
    print("No data quality check reports found.")
    df_data_quality_check = pd.DataFrame()

# print(df_data_quality_check.shape)
# df_data_quality_check.head()

In [25]:
# Collapse the report to one row per (signalID, date) pair by averaging the
# error and correct sequence percentages within each pair.
df_data_quality_check = (
    df_data_quality_check
    .groupby(["signalID", "date"])[["errorSequencePercent", "correctSequencePercent"]]
    .mean()
    .reset_index()
)

In [55]:
# proc_df_data_quality_check = df_data_quality_check.groupby("date")["signalID"].size().reset_index()

# # Generate all dates in June 2024
# dates = pd.date_range(start="2024-06-01", end="2024-06-30")

# # Plotly bar chart
# fig = px.bar(proc_df_data_quality_check, x='date', y='signalID')

# # Update layout
# fig.update_layout(
#     height=600,  # Adjust height for readability
#     width=1400
# )

# # Update x-axis for custom tick formatting
# fig.update_xaxes(
#     title_text="Date (Year: 2024)",
#     # tickmode="linear",  # Ensures all dates are shown if evenly spaced
#     tickmode="array",  # Use array mode to specify all ticks explicitly
#     tickvals=dates.strftime('%Y-%m-%d').tolist(),  # Use string format of date values for alignment
#     ticktext=dates.strftime('%B %d').tolist(),  # Custom tick labels in "Month Day" format
#     tickangle=-45,  # Rotate tick labels for readability
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Update y-axis to show all unique signal IDs
# fig.update_yaxes(
#     title_text="Frequency of Intersections with Data",
#     tickmode="array",  # Ensure all unique IDs are shown
#     tickvals=proc_df_data_quality_check['signalID'].unique(),  # Unique values for the y-axis
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Show the figure
# fig.show()

In [56]:
# proc_df_data_quality_check = df_data_quality_check.copy()

# # Generate all dates in June 2024
# dates = pd.date_range(start="2024-06-01", end="2024-06-30")

# fig = px.line(
#     proc_df_data_quality_check,
#     x="date",
#     y="errorSequencePercent",
#     color="signalID",  # Group lines by signalID
#     line_shape="spline",  # Smooth lines
#     # title="Error Sequence Percentage Over Time by Signal ID",
#     # hover_data={"date": "|%B %d, %Y"}  # Customize hover to show date in 'Month Day, Year' format
# )

# # Customize the x-axis for better date formatting
# fig.update_xaxes(
#     # dtick="D1",  # Adjust to show ticks for every day
#     # tickformat="%d %b",  # Format ticks as 'Day Month' (e.g., '01 Jun')
#     # title="Date"
#     title_text="Date (Year: 2024)",
#     # tickmode="linear",  # Ensures all dates are shown if evenly spaced
#     tickmode="array",  # Use array mode to specify all ticks explicitly
#     tickvals=dates.strftime('%Y-%m-%d').tolist(),  # Use string format of date values for alignment
#     ticktext=dates.strftime('%B %d').tolist(),  # Custom tick labels in "Month Day" format
#     tickangle=-45,  # Rotate tick labels for readability
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Customize the y-axis
# fig.update_yaxes(
#     title="Error Sequence Percentage (%)",
#     title_font=dict(size=16),
#     tickfont=dict(size=16),
# )

# # Update layout for readability
# fig.update_layout(
#     height=600,
#     width=1500,
#     # margin=dict(l=50, r=50, t=100, b=50),
#     legend_title="Signal ID",
#     font=dict(size=14)
# )

# # Show the plot
# fig.show()
