<a href="https://colab.research.google.com/github/alessandro-rubin/databricks_training/blob/main/event_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, IntegerType, StringType, StructType, StructField, TimestampType
import random
import datetime
import pandas as pd
import pyspark.sql.functions as F

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=2e506c3f7e123d3713df7eda6efc7b0eecf9d8c227550ec4113b3b72bc1d047f
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [2]:
from pyspark.sql.functions import col

# Reuse the active SparkSession, or start one if none exists yet
spark = SparkSession.builder.getOrCreate()

# Sample events as (type, begin, end); the timestamps start out as strings
data = [
    ("Event A", "2023-07-01 10:00:00", "2023-07-01 10:05:00"),
    ("Event A", "2023-07-01 10:07:00", "2023-07-01 10:10:00"),
    ("Event A", "2023-07-01 10:15:00", "2023-07-01 10:20:00"),
    ("Event B", "2023-07-01 10:01:00", "2023-07-01 10:02:00"),
    ("Event B", "2023-07-01 10:04:00", "2023-07-01 10:06:00"),
    ("Event B", "2023-07-01 10:08:00", "2023-07-01 10:09:00"),
    ("Event C", "2023-07-01 10:02:00", "2023-07-01 10:05:00"),
    ("Event C", "2023-07-01 10:11:00", "2023-07-01 10:14:00"),
    ("Event C", "2023-07-01 10:17:00", "2023-07-01 10:19:00"),
]

# Build the DataFrame and, in the same projection, cast both boundary
# columns from string to timestamp while keeping their names and order
df = spark.createDataFrame(data, ["type", "begin", "end"]).select(
    "type",
    col("begin").cast("timestamp").alias("begin"),
    col("end").cast("timestamp").alias("end"),
)

# Print the input events
df.show()


+-------+-------------------+-------------------+
|   type|              begin|                end|
+-------+-------------------+-------------------+
|Event A|2023-07-01 10:00:00|2023-07-01 10:05:00|
|Event A|2023-07-01 10:07:00|2023-07-01 10:10:00|
|Event A|2023-07-01 10:15:00|2023-07-01 10:20:00|
|Event B|2023-07-01 10:01:00|2023-07-01 10:02:00|
|Event B|2023-07-01 10:04:00|2023-07-01 10:06:00|
|Event B|2023-07-01 10:08:00|2023-07-01 10:09:00|
|Event C|2023-07-01 10:02:00|2023-07-01 10:05:00|
|Event C|2023-07-01 10:11:00|2023-07-01 10:14:00|
|Event C|2023-07-01 10:17:00|2023-07-01 10:19:00|
+-------+-------------------+-------------------+



In [9]:

import pyspark.sql.functions as F
from pyspark.sql import Window

# Largest gap, in seconds, allowed between two events of the same type for
# them to be merged into one cluster. A gap of exactly this length still
# merges; only a strictly larger gap starts a new cluster.
time_interval = 300  # User-defined value

# Events of one type in chronological order of their begin timestamp.
events_by_type = Window.partitionBy("type").orderBy("begin")
# All rows strictly before the current one within that ordering.
earlier_events = events_by_type.rowsBetween(Window.unboundedPreceding, -1)

# Gap in seconds between this event's begin and the latest end among all
# earlier events of the same type. The running maximum is used instead of
# only the previous row's end so that a long event that is still running
# keeps later, overlapping events in its cluster. The first event of each
# type has no earlier rows, so its time_diff is null.
df = df.withColumn(
    "time_diff",
    F.col("begin").cast("long") - F.max("end").over(earlier_events).cast("long"),
)

# Cluster id = running count of "gap too large" markers within each type.
# A null time_diff fails the comparison and therefore contributes 0, which
# puts the first event of every type in cluster 0.
df = df.withColumn(
    "event_cluster",
    F.sum(F.when(F.col("time_diff") > time_interval, F.lit(1)).otherwise(0)).over(events_by_type),
)

# Collapse each (type, cluster) into one row: number of events and the
# overall begin/end of the cluster.
grouped_df = df.groupBy("type", "event_cluster").agg(
    F.count("*").alias("event_count"),
    F.min("begin").alias("begin"),
    F.max("end").alias("end"),
)

# Keep only the reporting columns; sort so the displayed order is stable
# across runs instead of depending on shuffle order.
output_df = grouped_df.select("type", "event_count", "begin", "end").orderBy("type", "begin")

# Display the clustered events
output_df.show()


+-------+-----------+-------------------+-------------------+
|   type|event_count|              begin|                end|
+-------+-----------+-------------------+-------------------+
|Event A|          3|2023-07-01 10:00:00|2023-07-01 10:20:00|
|Event B|          3|2023-07-01 10:01:00|2023-07-01 10:09:00|
|Event C|          1|2023-07-01 10:02:00|2023-07-01 10:05:00|
|Event C|          2|2023-07-01 10:11:00|2023-07-01 10:19:00|
+-------+-----------+-------------------+-------------------+

