<a href="https://colab.research.google.com/github/alessandro-rubin/databricks_training/blob/main/Find_consecutive_values.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AIM: to find samples with consecutive values of a signal in a dataframe with signals from many different vehicles

In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=7003ac0cd80428915ff39387a3b2bebd70cd7dbb07598dac0bc0b171622c0917
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [73]:
spark = SparkSession.builder.appName("example").getOrCreate()

import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import lag, lead
from pyspark.sql.window import Window
from pyspark.sql import functions as F

def find_consecutive_intervals(df, signal_column, v_0, n):
    # Define a window specification to order rows by datetime
    window_spec = Window.orderBy("datetime")

    # Create a column that indicates when the signal is equal to v_0
    df = df.withColumn("is_v_0", (F.col(signal_column) == v_0).cast("integer"))

    # Create a column that assigns a group ID to consecutive rows with the same is_v_0 value
    df = df.withColumn(
        "group_id",
        F.sum("is_v_0").over(window_spec.rowsBetween(Window.unboundedPreceding, 0))
    )

    # Create a column that counts the number of consecutive v_0 rows within each group
    df = df.withColumn(
        "consecutive_count",
        F.when(F.col("is_v_0") == 1, F.sum("is_v_0").over(window_spec)).otherwise(0)
    )
    df.orderBy('vehicle','datetime').show()

    # Filter rows where consecutive_count is greater than or equal to n
    filtered_df = df.filter(F.col("consecutive_count") >= n)

    # Calculate the start and end of each interval and interval length in samples
    result_df = filtered_df.groupBy("group_id").agg(
        F.min("datetime").alias("start_datetime"),
        F.max("datetime").alias("end_datetime"),
        F.count("*").alias("interval_length_samples")
    )

    return result_df

def find_consecutive_intervals2(df, signal_column, v_0, n):
    # Define a window specification to order rows by datetime
    window_spec = Window.orderBy("datetime")

    # Create a column that indicates when the signal is equal to v_0
    df = df.withColumn("is_v_0", (F.col(signal_column) == v_0).cast("integer"))

    # Create a column that assigns a group ID to consecutive rows with the same is_v_0 value
    df = df.withColumn(
        "group_id",
        -F.sum("is_v_0").over(window_spec) + F.row_number().over(window_spec)
    )

    # Create a column that counts the number of consecutive v_0 rows within each group
    df = df.withColumn(
        "consecutive_count",
        F.when(F.col("is_v_0") == 1, F.sum("is_v_0").over(Window.partitionBy('vehicle',"group_id"))).otherwise(0)
    )
    df.orderBy('vehicle','datetime').show()

    # Filter rows where consecutive_count is greater than or equal to n
    filtered_df = df.filter(F.col("consecutive_count") >= n)
    filtered_df.show()
    # Calculate the start and end of each interval and interval length in samples
    result_df = filtered_df.groupBy("group_id").agg(
        F.min("datetime").alias("start_datetime"),
        F.max("datetime").alias("end_datetime"),
        F.count("*").alias("interval_length_samples"),
        F.first('vehicle')
    )

    return result_df
# Usage example:
# Assuming 'df' is your PySpark DataFrame
columns=["vehicle", "signal", "datetime"]

data = [
        ("vehicle_2", "v_0", "2023-09-12 08:00:00"),
        ("vehicle_2", "v_1", "2023-09-12 08:01:00"),
        ("vehicle_2", "v_0", "2023-09-12 08:03:00"),
        ("vehicle_2", "v_0", "2023-09-12 08:02:00"),
        ("vehicle_2", "v_0", "2023-09-12 08:03:00"),
        ("vehicle_1", "v_0", "2023-09-12 08:00:00"),
        ("vehicle_1", "v_1", "2023-09-12 08:01:00"),
        ("vehicle_1", "v_0", "2023-09-12 08:02:00"),
        ("vehicle_1", "v_0", "2023-09-12 08:03:00"),
        ("vehicle_1", "v_0", "2023-09-12 08:04:00"),
        ("vehicle_2", "v_1", "2023-09-12 08:05:00"),
        ("vehicle_1", "v_0", "2023-09-12 08:06:00"),
        ("vehicle_1", "v_0", "2023-09-12 08:07:00"),
        ("vehicle_1", "v_0", "2023-09-12 08:08:00"),
    ]
df = spark.createDataFrame(data, columns)
n = 3  # Minimum consecutive rows with the same signal value


(('vehicle_2', 'v_0', '2023-09-12 08:03:00'),)

In [47]:
n_rows=100

np.random.randint(1,26,n_rows)

array([14,  6,  9, 24,  1,  7,  5, 16,  6,  3,  3, 25, 23, 21,  9, 13,  9,
       21, 11, 21, 17, 21,  5, 20, 21, 12,  2, 24,  6, 12, 18, 16, 21,  4,
        4, 21, 21,  5, 12, 13, 15,  2, 14,  1, 11, 18, 12, 13,  3, 12, 16,
        9, 11,  1, 10,  9,  8,  1,  9,  4,  7,  6, 24, 21, 20,  6,  4,  2,
        6,  7,  6,  6,  8, 15,  1,  2, 17, 14,  3,  1,  9, 11, 22,  7, 16,
       21, 21, 13, 19, 13,  5,  5, 20, 18, 15,  7, 23, 23, 20, 21])

In [48]:


result = find_consecutive_intervals(df,'signal','v_0', n)
result.show()


+---------+------+-------------------+------+--------+-----------------+
|  vehicle|signal|           datetime|is_v_0|group_id|consecutive_count|
+---------+------+-------------------+------+--------+-----------------+
|vehicle_1|   v_0|2023-09-12 08:00:00|     1|       1|                1|
|vehicle_1|   v_1|2023-09-12 08:01:00|     0|       1|                0|
|vehicle_1|   v_0|2023-09-12 08:02:00|     1|       2|                2|
|vehicle_1|   v_0|2023-09-12 08:03:00|     1|       3|                3|
|vehicle_1|   v_0|2023-09-12 08:04:00|     1|       4|                4|
|vehicle_2|   v_1|2023-09-12 08:05:00|     0|       4|                0|
|vehicle_1|   v_0|2023-09-12 08:06:00|     1|       5|                5|
|vehicle_1|   v_0|2023-09-12 08:07:00|     1|       6|                6|
|vehicle_1|   v_0|2023-09-12 08:08:00|     1|       7|                7|
+---------+------+-------------------+------+--------+-----------------+

+--------+-------------------+-------------------+

In [74]:
result = find_consecutive_intervals(df,'signal','v_0', n)
result.show()

+---------+------+-------------------+------+--------+-----------------+
|  vehicle|signal|           datetime|is_v_0|group_id|consecutive_count|
+---------+------+-------------------+------+--------+-----------------+
|vehicle_1|   v_0|2023-09-12 08:00:00|     1|       2|                2|
|vehicle_1|   v_1|2023-09-12 08:01:00|     0|       2|                0|
|vehicle_1|   v_0|2023-09-12 08:02:00|     1|       4|                4|
|vehicle_1|   v_0|2023-09-12 08:03:00|     1|       6|                6|
|vehicle_1|   v_0|2023-09-12 08:04:00|     1|       8|                8|
|vehicle_1|   v_0|2023-09-12 08:06:00|     1|       9|                9|
|vehicle_1|   v_0|2023-09-12 08:07:00|     1|      10|               10|
|vehicle_1|   v_0|2023-09-12 08:08:00|     1|      11|               11|
|vehicle_2|   v_0|2023-09-12 08:00:00|     1|       1|                2|
|vehicle_2|   v_1|2023-09-12 08:01:00|     0|       2|                0|
|vehicle_2|   v_0|2023-09-12 08:02:00|     1|      