<a href="https://colab.research.google.com/github/alessandro-rubin/databricks_training/blob/main/event_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, IntegerType, StringType, StructType, StructField, TimestampType
import random
import datetime
import pandas as pd
import pyspark.sql.functions as F

In [None]:
from pyspark.sql.functions import col

# Obtain (or reuse) the active Spark session.
spark = SparkSession.builder.getOrCreate()

# Hand-written sample events: (type, begin, end), timestamps given as strings.
data = [
    ("Event A", "2023-07-01 10:00:00", "2023-07-01 10:05:00"),
    ("Event A", "2023-07-01 10:07:00", "2023-07-01 10:10:00"),
    ("Event A", "2023-07-01 10:15:00", "2023-07-01 10:20:00"),
    ("Event B", "2023-07-01 10:01:00", "2023-07-01 10:02:00"),
    ("Event B", "2023-07-01 10:04:00", "2023-07-01 10:06:00"),
    ("Event B", "2023-07-01 10:08:00", "2023-07-01 10:09:00"),
    ("Event C", "2023-07-01 10:02:00", "2023-07-01 10:05:00"),
    ("Event C", "2023-07-01 10:11:00", "2023-07-01 10:14:00"),
    ("Event C", "2023-07-01 10:17:00", "2023-07-01 10:19:00"),
]

# Build the DataFrame and cast both string columns to real timestamps
# in a single chained expression.
df = (
    spark.createDataFrame(data, ["type", "begin", "end"])
    .withColumn("begin", col("begin").cast("timestamp"))
    .withColumn("end", col("end").cast("timestamp"))
)

# Display the prepared input events.
df.show()


In [None]:
from datetime import timedelta

# User-defined gap threshold used by the clustering cell below.
time_interval = timedelta(hours=0, days=1)

# Show the threshold expressed in seconds as the cell output.
time_interval.total_seconds()

In [None]:

import pyspark.sql.functions as F
from pyspark.sql import Window


# Convert the user-defined time interval to seconds so it can be compared
# with "time_diff", which is computed below as a difference of epoch seconds.
time_interval_seconds = time_interval.total_seconds()  # User-defined value

# Events are examined per type, ordered by their begin timestamp.
events_by_type = Window.partitionBy("type").orderBy("begin")

# Gap in seconds between this event's begin and the previous event's end
# within the same type. The first event of each type has no predecessor,
# so lag() yields null and "time_diff" is null for that row.
df = df.withColumn(
    "time_diff",
    F.col("begin").cast("long") - F.lag("end").over(events_by_type).cast("long"),
)

# Flag every event whose gap exceeds the threshold (a null gap falls through
# to otherwise(0)) and take a running sum of the flags: the running total is
# the cluster id. The comparison must use the threshold in seconds, not the
# timedelta object, because "time_diff" is an integer number of seconds.
df = df.withColumn(
    "event_cluster",
    F.sum(
        F.when(F.col("time_diff") > time_interval_seconds, F.lit(1)).otherwise(0)
    ).over(events_by_type),
)

# Aggregate each (type, cluster) into an event count and its overall
# begin/end timestamps.
grouped_df = df.groupBy("type", "event_cluster").agg(
    F.count("*").alias("event_count"),
    F.min("begin").alias("begin"),
    F.max("end").alias("end"),
)

# Keep only the output columns (the internal cluster id is dropped).
output_df = grouped_df.select("type", "event_count", "begin", "end")

# Display the resulting clusters.
output_df.show()


In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# NOTE: Spark's sum is accessed as F.sum rather than imported by name, so
# Python's builtin sum() is not shadowed for the rest of the kernel session.

# Create (or reuse) a SparkSession
spark = SparkSession.builder.getOrCreate()

# Create a sample DataFrame of (type, value) rows
data = [
    ("A", 10),
    ("A", 20),
    ("B", 15),
    ("B", 25),
    ("B", 30),
    ("C", 5),
]

df = spark.createDataFrame(data, ["type", "value"])

# Define the Window specification: one partition per type, ordered by value
window = Window.partitionBy("type").orderBy("value")

# Apply the window function: because the window is ordered, this produces a
# running sum of "value" within each type.
df = df.withColumn("sum_value", F.sum(F.col("value")).over(window))

df.show()
