### Load and Prepare the Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Loop-detector readings. The CSV has no header row, so the column
# names are supplied explicitly. The 'time' column is parsed to datetime
# and the rows are ordered chronologically in a single chain.
df = (
    pd.read_csv(
        "loop_20150101_20151231.csv",
        header=None,
        names=["time", "detector", "direction", "speed", "volume", "occupancy"],
    )
    .assign(time=lambda frame: pd.to_datetime(frame["time"]))
    .sort_values("time")
)

### Pivot Data to Wide Format (Time × Detector) and Clean

In [3]:
# Wide layout: one row per timestamp, one column per detector, cells hold
# the speed reading. pivot_table's default aggregation (mean) applies if a
# (time, detector) pair occurs more than once.
speed_table = df.pivot_table(values="speed", index="time", columns="detector")
speed_table = speed_table.sort_index()

# A speed of exactly 0 indicates sensor failure or no data,
# so those cells are marked as missing (NaN)
wide = speed_table.mask(speed_table == 0)

### Identify and Mask Frozen Readings (Constant ≥ 1 hr)

In [4]:
MAX_RUN = 12  # 12 × 5min = 60 min

# A detector that reports the exact same speed for MAX_RUN or more
# consecutive time steps is treated as "frozen" for that whole run.


def mask_frozen_runs(values, max_run):
    """Return a boolean mask marking readings that sit inside a frozen run.

    A run is a maximal stretch of consecutive rows of one column that all
    hold the same value (consecutive NaNs also form a run). A run is
    frozen when it is at least ``max_run`` rows long and its value is not
    NaN; runs of NaN are never marked.

    Parameters
    ----------
    values : numpy.ndarray
        2-D float array, rows are time steps and columns are detectors.
    max_run : int
        Run length (in rows) at or above which a run is marked frozen.

    Returns
    -------
    numpy.ndarray
        Boolean array with the same shape as ``values``.
    """
    n_steps, n_detectors = values.shape
    frozen = np.zeros(values.shape, dtype=bool)
    if n_steps == 0:
        return frozen  # empty table: nothing to scan

    missing = np.isnan(values)
    # continues[t - 1, d] is True when reading t repeats reading t - 1
    # (two NaNs in a row count as a repeat, NaN vs. number does not)
    continues = (values[1:] == values[:-1]) | (missing[1:] & missing[:-1])

    for d in range(n_detectors):
        # A run starts at row 0 and wherever the value differs from the row above
        run_starts = np.flatnonzero(np.concatenate(([True], ~continues[:, d])))
        run_lengths = np.diff(np.append(run_starts, n_steps))
        is_frozen_run = (run_lengths >= max_run) & ~missing[run_starts, d]
        # Expand the per-run verdict back to one flag per time step
        frozen[:, d] = np.repeat(is_frozen_run, run_lengths)

    return frozen


# astype(float) yields a fresh array, so `wide` itself is not modified here
values = wide.to_numpy().astype(float)
T, D = values.shape  # T: time steps, D: detectors

# Boolean mask of frozen readings, then blank those readings out
frozen = mask_frozen_runs(values, MAX_RUN)
values[frozen] = np.nan

# Reconstruct DataFrame from the modified values
wide = pd.DataFrame(values, index=wide.index, columns=wide.columns)


### Detect Long Per-Sensor Data Gaps (Blackouts)

In [14]:
def find_streaks(bool_array):
    """Locate every run of consecutive truthy entries.

    Returns a list of (first_index, last_index) pairs, both inclusive,
    in the order the runs occur. A run still open at the end of the
    input is closed at the final index.
    """
    streaks = []
    open_at = None  # index where the current run began, None if no run is open

    for pos, flag in enumerate(bool_array):
        if flag:
            if open_at is None:
                open_at = pos
            continue
        # Falsy entry: close the run that was open, if any
        if open_at is not None:
            streaks.append((open_at, pos - 1))
            open_at = None

    if open_at is None:
        return streaks

    streaks.append((open_at, len(bool_array) - 1))
    return streaks

# Missing-value mask (True where a reading is NaN): rows = time steps,
# columns = detectors
mask = wide.isna().to_numpy()
timestamps = wide.index.to_numpy()
T, D = mask.shape

# A run of missing readings must span at least this many consecutive steps
# to be recorded as a blackout. With 5-minute steps, 2 means 10 minutes of
# continuous missing data; setting MIN_LEN = 12 would require 60 minutes.
MIN_LEN = 2

sensor_blackouts = {}
last_step = T - 1

for col_idx, det in enumerate(wide.columns):
    # Keep (start_time, end_time, length) for every run of NaNs that is long
    # enough. Runs touching the very first or very last time step are
    # skipped (structural missingness).
    long_ones = [
        (timestamps[s], timestamps[e], e - s + 1)
        for s, e in find_streaks(mask[:, col_idx])
        if s != 0 and e != last_step and (e - s + 1) >= MIN_LEN
    ]

    # Only detectors with at least one qualifying run get an entry
    if long_ones:
        sensor_blackouts[det] = long_ones

print(f"Num sensors with ≥{MIN_LEN} steps missing in a row:", len(sensor_blackouts))

Num sensors with ≥2 steps missing in a row: 144


### Filter for Active Detectors (≥30% Valid Data)

In [15]:
# A detector counts as active when at least this share of its readings
# is valid (non-NaN)
ACTIVE_THRESH = 0.3

# Share of valid readings per detector (mask is True where a reading is NaN)
missing_share = mask.mean(axis=0)
valid_frac_per_sensor = 1.0 - missing_share

# Detectors that clear the threshold, and the wide table restricted to them
is_active = valid_frac_per_sensor >= ACTIVE_THRESH
active_cols = wide.columns[is_active]
wide_active = wide.loc[:, active_cols]

# Fresh missing-value mask and timestamp array for the active detectors only
mask_active = wide_active.isna().to_numpy()
timestamps = wide_active.index.to_numpy()

### Identify Global Blackouts (Across Many Sensors)

In [16]:
# A timestamp belongs to a global blackout when at least this share of the
# active detectors is missing at that moment
THRESH = 0.1

# Per-timestamp share of active detectors with no reading
missing_frac = mask_active.mean(axis=1)
global_mask = missing_frac >= THRESH

# Consecutive flagged timestamps form one blackout: (first, last) index pairs
global_blackouts = find_streaks(global_mask)

print("\nGlobal / multi-sensor blackouts (first 10):")
first_ten = global_blackouts[:10]
for s, e in first_ten:
    # The reported missing_frac is the value at the blackout's first timestamp
    print(f"  {timestamps[s]} → {timestamps[e]}  "
          f"(len={e - s + 1}, missing_frac≈{missing_frac[s]:.3f})")


Global / multi-sensor blackouts (first 10):
  2015-01-18T02:40:00.000000000 → 2015-01-18T04:55:00.000000000  (len=28, missing_frac≈0.118)
  2015-02-07T20:50:00.000000000 → 2015-02-08T00:50:00.000000000  (len=49, missing_frac≈0.104)
  2015-06-06T06:05:00.000000000 → 2015-06-06T13:45:00.000000000  (len=93, missing_frac≈0.125)
  2015-06-27T01:25:00.000000000 → 2015-06-27T01:45:00.000000000  (len=5, missing_frac≈0.104)
  2015-06-27T02:00:00.000000000 → 2015-06-27T02:20:00.000000000  (len=5, missing_frac≈0.104)
  2015-08-15T01:40:00.000000000 → 2015-08-15T02:25:00.000000000  (len=10, missing_frac≈0.111)
  2015-08-15T02:40:00.000000000 → 2015-08-15T02:55:00.000000000  (len=4, missing_frac≈0.104)
  2015-08-29T13:35:00.000000000 → 2015-08-30T04:15:00.000000000  (len=177, missing_frac≈0.111)
  2015-11-17T14:15:00.000000000 → 2015-11-18T08:10:00.000000000  (len=216, missing_frac≈0.118)
  2015-11-21T22:35:00.000000000 → 2015-11-22T02:40:00.000000000  (len=50, missing_frac≈0.104)


### Convert Per-Sensor Blackouts to DataFrame

In [17]:
# Duration of one time step, used to convert step counts to minutes
STEP_MINUTES = 5  # each step = 5 min
BLACKOUT_COLUMNS = ["detector", "start", "end", "len_steps", "len_minutes"]

# Flatten {detector: [(start, end, n_steps), ...]} into one record per blackout
rows = [
    {
        "detector": det,
        "start": st,
        "end": et,
        "len_steps": n_steps,
        "len_minutes": n_steps * STEP_MINUTES,
    }
    for det, intervals in sensor_blackouts.items()
    for (st, et, n_steps) in intervals
]

# Passing the columns explicitly keeps the frame sortable even when no
# blackouts were found; without them an empty frame has no "start" /
# "detector" columns and sort_values raises KeyError.
blackouts_df = pd.DataFrame(rows, columns=BLACKOUT_COLUMNS).sort_values(["start", "detector"])

print("\nExample per-sensor blackouts:")
print(blackouts_df.head(15))

print(f"\nTotal detectors: {D}")
# The threshold shown is derived from ACTIVE_THRESH so the message cannot
# drift from the value actually used for filtering
print(f"Active detectors (>={ACTIVE_THRESH:.0%} data): {len(active_cols)}")


Example per-sensor blackouts:
       detector               start                 end  len_steps  \
648  405es02662 2015-01-01 22:00:00 2015-01-02 04:55:00         84   
918  520es00746 2015-01-02 06:35:00 2015-01-02 09:25:00         35   
921  520es00770 2015-01-02 06:35:00 2015-01-02 09:25:00         35   
649  405es02662 2015-01-02 22:00:00 2015-01-03 04:55:00         84   
650  405es02662 2015-01-03 22:00:00 2015-01-04 04:55:00         84   
651  405es02662 2015-01-04 22:00:00 2015-01-05 04:55:00         84   
652  405es02662 2015-01-05 22:00:00 2015-01-06 04:55:00         84   
653  405es02662 2015-01-06 22:00:00 2015-01-07 04:55:00         84   
654  405es02662 2015-01-07 22:00:00 2015-01-08 04:55:00         84   
479  405es01870 2015-01-08 01:05:00 2015-01-08 02:00:00         12   
496  405es01921 2015-01-08 01:05:00 2015-01-08 02:00:00         12   
508  405es01971 2015-01-08 01:05:00 2015-01-08 02:00:00         12   
497  405es01921 2015-01-08 05:30:00 2015-01-08 06:35:00    

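Note that some detectors (e.g. `405es02662`, missing from 22:00 to 04:55 every night in the sample above) drop out at the same time each night. These recurring nightly outages should not count as blackouts, so we need to identify those detectors and filter them out.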