In [13]:
from pyspark.sql import SparkSession
import os
from pyspark.sql.types import StringType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import concat, lit
from pyspark.sql.functions import avg, length
from pyspark.sql.functions import col, expr, count , row_number
from pyspark.sql.window import Window
import math

# PART 1:
## 1. Grouping the similar processes according to Jaccard Similarities
## 2. Creating the new data 

# code to start the Master:
1. Open cmd and admin
2. write "cd %SPARK_HOME%"
3. bin\spark-class2.cmd org.apache.spark.deploy.master.Master
# code to start the worker:
1. Open cmd and admin
2. write "cd %SPARK_HOME%"
3. write "bin\spark-class2.cmd org.apache.spark.deploy.worker.Worker -c 6 -m 10G spark://192.168.1.81:7077"
* in step 3:
* -c -> number of cores
* -m -> amount of RAM for the current worker
* the spark link is from the Master link ( go to the web page of the master and locate the spark link )

In [2]:
from pyspark.sql import SparkSession
import findspark
findspark.init()

spark = SparkSession.builder \
    .appName("part1Grouping") \
    .master("spark://192.168.1.81:7077") \
    .config("spark.executor.memory", "10g") \
    .config("spark.executor.cores", "6") \
    .config("spark.executor.instances", "2") \
    .config("spark.driver.memory", "10g") \
    .config("spark.driver.cores", "3") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.executor.heartbeatInterval", "100s") \
    .config("spark.sql.broadcastTimeout", "3600s") \
    .config("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2") \
    .config("spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored", "true") \
    .getOrCreate()



In [70]:
# Load the data into a DataFrame
data_path = "output2.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)
df.show()

+----------+--------+----+--------+---------+
|FromServer|ToServer|time|  action|processId|
+----------+--------+----+--------+---------+
|      null| lkVpiJ4|   0| Request|        1|
|   lkVpiJ4|    null|   6|Response|        1|
|      null| lkVpiJ4|   9| Request|        2|
|   lkVpiJ4|    null|  12|Response|        2|
|      null| OZBsEf0|  11| Request|        3|
|   OZBsEf0|    null|  13|Response|        3|
|      null|    Aum3|  18| Request|        4|
|      Aum3|    null|  28|Response|        4|
|      null|    Aum3|  22| Request|        5|
|      Aum3|    null|  24|Response|        5|
|      null|   qZGv1|  27| Request|        6|
|     qZGv1|    null|  36|Response|        6|
|      null|    asdf|  40| Request|        7|
|      asdf|    fdsa|  41| Request|        7|
|      fdsa|    asdf|  42|Response|        7|
|      asdf|    null|  43|Response|        7|
+----------+--------+----+--------+---------+



In [71]:
from pyspark.sql.functions import col, collect_list, struct

# Group by processID and collect the sequence of actions
processes_df = df.groupBy("processID").agg(collect_list(struct("FromServer", "ToServer", "time", "action")).alias("actions"))

# Convert actions to string for MinHash LSH
def actions_to_string(actions):
    return "".join([f"{action['FromServer']}{action['ToServer']}" for action in actions])

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

actions_to_string_udf = udf(actions_to_string, StringType())
processes_df = processes_df.withColumn("actions_str", actions_to_string_udf(col("actions")))
processes_df.show()

+---------+--------------------+--------------------+
|processID|             actions|         actions_str|
+---------+--------------------+--------------------+
|        1|[{null, lkVpiJ4, ...|nulllkVpiJ4lkVpiJ...|
|        6|[{null, qZGv1, 27...|  nullqZGv1qZGv1null|
|        3|[{null, OZBsEf0, ...|nullOZBsEf0OZBsEf...|
|        5|[{null, Aum3, 22,...|    nullAum3Aum3null|
|        4|[{null, Aum3, 18,...|    nullAum3Aum3null|
|        7|[{null, asdf, 40,...|nullasdfasdffdsaf...|
|        2|[{null, lkVpiJ4, ...|nulllkVpiJ4lkVpiJ...|
+---------+--------------------+--------------------+



In [72]:
# calculate the median process lenght to aproximate

# Calculate the length of actions_str column
df_with_length = processes_df.withColumn("length", length("actions_str"))

# Calculate the median length using window function and sorting
windowSpec = Window.orderBy("length")
df_with_length = df_with_length.withColumn("row_num", row_number().over(windowSpec))
count_df = df_with_length.count()

median_row = math.ceil(count_df / 2.0)

median_length = df_with_length.filter(col("row_num") == median_row).select("length").first()
cur_k = 5
thresholds = [(10000, 9), (5000, 8), (1000, 7), (100, 6)]
for threshold, value in thresholds:
    if median_length[0] > threshold:
        cur_k = value
        break
    

In [73]:
# Convert actions string into shingles
def get_shingles(row, k=5):
    concatenated_str = ''.join(row)
    shingles = [concatenated_str[i:i+k] for i in range(len(concatenated_str) - (k - 1))]
    return shingles
from pyspark.sql.types import ArrayType

get_shingles_udf = udf(lambda x: get_shingles(x,cur_k), ArrayType(StringType()))
processes_df = processes_df.withColumn("shingles", get_shingles_udf(col("actions_str")))
processes_df.show()


+---------+--------------------+--------------------+--------------------+
|processID|             actions|         actions_str|            shingles|
+---------+--------------------+--------------------+--------------------+
|        1|[{null, lkVpiJ4, ...|nulllkVpiJ4lkVpiJ...|[nulll, ulllk, ll...|
|        6|[{null, qZGv1, 27...|  nullqZGv1qZGv1null|[nullq, ullqZ, ll...|
|        3|[{null, OZBsEf0, ...|nullOZBsEf0OZBsEf...|[nullO, ullOZ, ll...|
|        5|[{null, Aum3, 22,...|    nullAum3Aum3null|[nullA, ullAu, ll...|
|        4|[{null, Aum3, 18,...|    nullAum3Aum3null|[nullA, ullAu, ll...|
|        7|[{null, asdf, 40,...|nullasdfasdffdsaf...|[nulla, ullas, ll...|
|        2|[{null, lkVpiJ4, ...|nulllkVpiJ4lkVpiJ...|[nulll, ulllk, ll...|
+---------+--------------------+--------------------+--------------------+



In [74]:
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol="shingles", outputCol="features",binary=True)
cv_model = cv.fit(processes_df)
vectorized_df = cv_model.transform(processes_df)

In [83]:
from pyspark.ml.feature import MinHashLSH

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=10)
mh_model = mh.fit(vectorized_df)
hashed_df = mh_model.transform(vectorized_df)
hashed_df.show()

+---------+--------------------+--------------------+--------------------+--------------------+--------------------+
|processID|             actions|         actions_str|            shingles|            features|              hashes|
+---------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        1|[{null, lkVpiJ4, ...|nulllkVpiJ4lkVpiJ...|[nulll, ulllk, ll...|(79,[0,1,2,6,10,1...|[[1.2504707E7], [...|
|        6|[{null, qZGv1, 27...|  nullqZGv1qZGv1null|[nullq, ullqZ, ll...|(79,[4,37,40,47,4...|[[3.3556605E8], [...|
|        3|[{null, OZBsEf0, ...|nullOZBsEf0OZBsEf...|[nullO, ullOZ, ll...|(79,[9,25,30,38,4...|[[7.4414757E7], [...|
|        5|[{null, Aum3, 22,...|    nullAum3Aum3null|[nullA, ullAu, ll...|(79,[3,5,8,11,13,...|[[1730258.0], [1....|
|        4|[{null, Aum3, 18,...|    nullAum3Aum3null|[nullA, ullAu, ll...|(79,[3,5,8,11,13,...|[[1730258.0], [1....|
|        7|[{null, asdf, 40,...|nullasdfasdffdsaf...|[nulla, ull

In [138]:
from pyspark.sql.functions import array, array_union, collect_list, explode, col
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import HashingTF

threshold = 0.8
# Find similar candidate process IDs using MinHashLSH
similarity_df = mh_model.approxSimilarityJoin(hashed_df, hashed_df, threshold, distCol="JaccardDistance") \
    .select(col("datasetA.processID").alias("processID_A"),
            col("datasetB.processID").alias("processID_B"),
            col("JaccardDistance"),col("datasetA.features").alias("featuresA"),col("datasetB.features").alias("featuresB"))

# Filter out self-joins and duplicates
similarity_df = similarity_df.filter(col("processID_A") < col("processID_B"))
similarity_df.show()
# Function to calculate Jaccard similarity
def jaccard_similarity(vec1, vec2):
    set1 = set(vec1.indices)
    set2 = set(vec2.indices)
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    if len(union) == 0:
        return 0.0
    return float(len(intersection)) / len(union)

# Register the function as a UDF
from pyspark.sql.types import FloatType
jaccard_similarity_udf = udf(jaccard_similarity, FloatType())

# Calculate Jaccard similarity for each candidate pair
similarity_df = similarity_df.withColumn("JaccardSimilarity", jaccard_similarity_udf(col("featuresA"), col("featuresB")))
# Filter pairs with Jaccard similarity above a threshold (e.g., 0.8)
similarity_df_filtered = similarity_df.filter(col("JaccardSimilarity") >= 0.9)
# Group by processID_A and collect similar processIDs
grouped_df = similarity_df.groupBy("processID_A").agg(collect_list("processID_B").alias("similar_processIDs"))

# Convert processID_A to an array and concatenate with similar_processIDs
grouped_df = grouped_df.withColumn("all_processIDs", array_union(array(col("processID_A")), col("similar_processIDs")))

# Explode the all_processIDs array to get a mapping of each process ID to its group
exploded_df = grouped_df.select(explode(col("all_processIDs")).alias("processID"), col("processID_A").alias("group_representative"))
similarity_df.show()

+-----------+-----------+---------------+--------------------+--------------------+
|processID_A|processID_B|JaccardDistance|           featuresA|           featuresB|
+-----------+-----------+---------------+--------------------+--------------------+
|          1|          2|            0.0|(79,[0,1,2,6,10,1...|(79,[0,1,2,6,10,1...|
|          4|          5|            0.0|(79,[3,5,8,11,13,...|(79,[3,5,8,11,13,...|
+-----------+-----------+---------------+--------------------+--------------------+

+-----------+-----------+---------------+--------------------+--------------------+-----------------+
|processID_A|processID_B|JaccardDistance|           featuresA|           featuresB|JaccardSimilarity|
+-----------+-----------+---------------+--------------------+--------------------+-----------------+
|          1|          2|            0.0|(79,[0,1,2,6,10,1...|(79,[0,1,2,6,10,1...|              1.0|
|          4|          5|            0.0|(79,[3,5,8,11,13,...|(79,[3,5,8,11,13,...|    

In [87]:
# Merge overlapping groups
def merge_groups(group_list):
    groups = []
    for group in group_list:
        merged = False
        for existing_group in groups:
            if any(item in group for item in existing_group):
                existing_group.update(group)
                merged = True
                break
        if not merged:
            groups.append(set(group))
    return [list(group) for group in groups]

merge_groups_udf = udf(lambda x: merge_groups(x), ArrayType(ArrayType(IntegerType())))

grouped_lists = exploded_df.groupBy("group_representative") \
    .agg(collect_list("processID").alias("group_list")) \
    .agg(collect_list("group_list").alias("group_lists"))

merged_groups = grouped_lists.withColumn("merged_groups", merge_groups_udf(col("group_lists"))) \
    .select(explode(col("merged_groups")).alias("final_group"))

# Convert the final groups to a DataFrame
from pyspark.sql.functions import concat_ws

final_groups_df = merged_groups.select(concat_ws("_", col("final_group")).alias("Group"), col("final_group"))

# Find the representative process for each final group
final_groups_exploded = final_groups_df.withColumn("processID", explode(col("final_group")))

# Join with the original DataFrame to keep only the representative process
filtered_df = df.join(final_groups_exploded, on="processID", how="inner")

# Select the smallest processID in each group as the representative
from pyspark.sql.functions import min

group_representative_df = final_groups_exploded.groupBy("Group").agg(min("processID").alias("representative_processID"))

# Join to get the full details of the representative processes
representative_processes_df = group_representative_df.join(filtered_df, filtered_df["processID"] == group_representative_df.representative_processID, "inner") \
    .select("processID", "FromServer", "ToServer", "time", "action")
representative_processes_df.show()
final_groups_df.show()

+---------+----------+--------+----+--------+
|processID|FromServer|ToServer|time|  action|
+---------+----------+--------+----+--------+
|        1|      null| lkVpiJ4|   0| Request|
|        1|   lkVpiJ4|    null|   6|Response|
|        4|      null|    Aum3|  18| Request|
|        4|      Aum3|    null|  28|Response|
+---------+----------+--------+----+--------+

+-----+-----------+
|Group|final_group|
+-----+-----------+
|  1_2|     [1, 2]|
|  4_5|     [4, 5]|
+-----+-----------+



In [88]:
representative_processes_df.show(truncate=False)

+---------+----------+--------+----+--------+
|processID|FromServer|ToServer|time|action  |
+---------+----------+--------+----+--------+
|1        |null      |lkVpiJ4 |0   |Request |
|1        |lkVpiJ4   |null    |6   |Response|
|4        |null      |Aum3    |18  |Request |
|4        |Aum3      |null    |28  |Response|
+---------+----------+--------+----+--------+



In [89]:
from pyspark.sql.functions import col, expr
# Step 1: Remove Processes in Groups
# Get the list of process IDs to remove
processes_to_remove = final_groups_df.selectExpr("explode(final_group) as processID").distinct()

# Filter out rows where processID is in processes_to_remove
df_without_groups = df.join(processes_to_remove, "processID", "left_anti")
df_without_groups = df_without_groups.select("FromServer", "ToServer", "time", "action","processID")
# Add a constant number to processID
constant_number = df.agg({"processID": "max"}).first()[0]
new_representative_processes_df = representative_processes_df.withColumn(
    "processID",
    expr(f"processID + {constant_number}")
)

# Show the final DataFrame
new_representative_processes_df = new_representative_processes_df.select("FromServer", "ToServer", "time", "action","processID").orderBy("time")

# Combine original DataFrame and representatives DataFrame
combined_df = df_without_groups.union(new_representative_processes_df)

# Show final combined DataFrame
combined_df.show(truncate=False)

+----------+--------+----+--------+---------+
|FromServer|ToServer|time|action  |processID|
+----------+--------+----+--------+---------+
|null      |OZBsEf0 |11  |Request |3        |
|OZBsEf0   |null    |13  |Response|3        |
|null      |qZGv1   |27  |Request |6        |
|qZGv1     |null    |36  |Response|6        |
|null      |asdf    |40  |Request |7        |
|asdf      |fdsa    |41  |Request |7        |
|fdsa      |asdf    |42  |Response|7        |
|asdf      |null    |43  |Response|7        |
|null      |lkVpiJ4 |0   |Request |8        |
|lkVpiJ4   |null    |6   |Response|8        |
|null      |Aum3    |18  |Request |11       |
|Aum3      |null    |28  |Response|11       |
+----------+--------+----+--------+---------+



# creating the txt files:
## The desired files will be in the folder output

In [36]:
def write_to_one_txt(df, local_path_name,wanted_list):
    correct_path = wanted_list + "/part1Output.txt"
    formatted_df = df.withColumn(
    "formatted_line",
    concat(lit("<"), df.FromServer, lit(","),
           df.ToServer, lit(","),
           df.time, lit(","),
           df.action, lit(","),
           df.processID, lit(">"))
)
    open(correct_path, "w")
    formatted_df.select("formatted_line").write.mode("overwrite").text(output_path)
    os.system(f'cat {local_path_name}/*.txt >> {correct_path}')
    os.system(f'rm -r {local_path_name}')
    

In [13]:

# Define the output path
output_path = "./part1OUT1"
output_path1 = "./output"
write_to_one_txt(combined_df,output_path,output_path1)
# Write the DataFrame to a CSV file


In [14]:
# creating a dataframe only with the processes that were grouped.
df_with_groups = df.join(processes_to_remove, "processID", "semi")
df_with_groups.show()

+---------+----------+--------+----+--------+
|processId|FromServer|ToServer|time|  action|
+---------+----------+--------+----+--------+
|        1|      null| lkVpiJ4|   0| Request|
|        1|   lkVpiJ4|    null|   6|Response|
|        2|      null| lkVpiJ4|   9| Request|
|        2|   lkVpiJ4|    null|  12|Response|
|        4|      null|    Aum3|  18| Request|
|        4|      Aum3|    null|  28|Response|
|        5|      null|    Aum3|  22| Request|
|        5|      Aum3|    null|  24|Response|
+---------+----------+--------+----+--------+



In [15]:
exploded_final_groups_df = final_groups_df.select("Group", explode("final_group").alias("processID"))
exploded_final_groups_df.show()
df_with_groups.show()
joined_df = df_with_groups.join(exploded_final_groups_df, "processID")

+-----+---------+
|Group|processID|
+-----+---------+
|  1_2|        1|
|  1_2|        2|
|  4_5|        4|
|  4_5|        5|
+-----+---------+

+---------+----------+--------+----+--------+
|processId|FromServer|ToServer|time|  action|
+---------+----------+--------+----+--------+
|        1|      null| lkVpiJ4|   0| Request|
|        1|   lkVpiJ4|    null|   6|Response|
|        2|      null| lkVpiJ4|   9| Request|
|        2|   lkVpiJ4|    null|  12|Response|
|        4|      null|    Aum3|  18| Request|
|        4|      Aum3|    null|  28|Response|
|        5|      null|    Aum3|  22| Request|
|        5|      Aum3|    null|  24|Response|
+---------+----------+--------+----+--------+



In [38]:
# Function to write groups to txt file
def write_groups_to_txt(grouped_df, output_path):
    with open(output_path, "w") as file:
        for row in grouped_df.collect():
            group_name = row["Group"]
            process_ids = row["processIDs"]
            formatted_rows = row["formatted_rows"]
            
            # Ensure process_ids are unique and sorted
            process_ids = sorted(set(process_ids))
            
            file.write(f"Group: {{{', '.join(map(str, process_ids))}}}\n")
            
            for process_id in process_ids:
                file.write(f"{process_id}:\n")
                
                # Find all formatted rows for the current process ID
                rows_for_process_id = [row for row in formatted_rows if row.endswith(f",{process_id}>")]
                
                if rows_for_process_id:
                    for formatted_row in rows_for_process_id:
                        file.write(f"{formatted_row}\n")
                else:
                    file.write("<No corresponding formatted rows found>\n")
                    
            file.write("\n")  # Add empty line between groups for clarity

In [37]:

# Format each row into the desired format
formatted_df = joined_df.withColumn(
    "formatted_row",
    concat_ws("", lit("<"), col("FromServer"), lit(","), col("ToServer"),
              lit(","), col("time"), lit(","), col("action"), lit(","), col("processID"), lit(">"))
)

# Group by Group name and aggregate process IDs and formatted rows
grouped_df = formatted_df.groupBy("Group").agg(
    collect_list("processID").alias("processIDs"),
    collect_list("formatted_row").alias("formatted_rows")
)



# Output path
output_path = "./output/part1Observations.txt"

# Call function to write to text file
write_groups_to_txt(grouped_df, output_path)

NameError: name 'joined_df' is not defined

# Evaluation

In [96]:
# Function to compute Jaccard similarity

    
def evaluateData(df,df_original, process_id_col='processID'):
    # Perform a cross join
    jaccard_similarity_udf = udf(jaccard_similarity, FloatType())
    cross_joined_df = df.alias("df1").join(df.alias("df2")).select(col("df1.processId").alias("processIdA"),col("df1.features").alias("processAFeatures"),
                                                                  col("df2.processId").alias("processIdB"),col("df2.features").alias("processBFeatures")).orderBy(col("processIdA"))
    
    # Filter out pairs where the process IDs are the same
    filtered_df = cross_joined_df.filter(col("processIdA") < col("processIdB"))
    filtered_df.show()
    # Calculate Jaccard similarity for each candidate pair
    similarity_df = filtered_df.withColumn("JaccardSimilarity", jaccard_similarity_udf(col("processAFeatures"), col("processBFeatures")))
 
    # Filter pairs with Jaccard similarity above a threshold (e.g., 0.6)
    similar_df = similarity_df.filter(col("JaccardSimilarity") >= 0.3)
    nonsimilar_df = similarity_df.filter(col("JaccardSimilarity") < 0.3)
    #similar_df = t1
    #nosimilar_df = f1
    print("similar:")
    similar_df.show()
    print("nonsimilar:")
    nonsimilar_df.show()
evaluateData(cv_df)


    

+----------+--------------------+----------+--------------------+
|processIdA|    processAFeatures|processIdB|    processBFeatures|
+----------+--------------------+----------+--------------------+
|         3| (7,[0,5],[1.0,1.0])|         6| (7,[0,2],[1.0,1.0])|
|         3| (7,[0,5],[1.0,1.0])|         7|(7,[0,3,4],[1.0,1...|
|         3| (7,[0,5],[1.0,1.0])|         8| (7,[0,1],[1.0,1.0])|
|         3| (7,[0,5],[1.0,1.0])|        11| (7,[0,6],[1.0,1.0])|
|         6| (7,[0,2],[1.0,1.0])|         7|(7,[0,3,4],[1.0,1...|
|         6| (7,[0,2],[1.0,1.0])|         8| (7,[0,1],[1.0,1.0])|
|         6| (7,[0,2],[1.0,1.0])|        11| (7,[0,6],[1.0,1.0])|
|         7|(7,[0,3,4],[1.0,1...|         8| (7,[0,1],[1.0,1.0])|
|         7|(7,[0,3,4],[1.0,1...|        11| (7,[0,6],[1.0,1.0])|
|         8| (7,[0,1],[1.0,1.0])|        11| (7,[0,6],[1.0,1.0])|
+----------+--------------------+----------+--------------------+

similar:
+----------+-------------------+----------+-------------------+---

In [139]:
# Function to compute Jaccard similarity

def evaluateData(original_df,df_similar_Minhash,threshold, process_id_col='processID'):
    jaccard_similarity_udf = udf(jaccard_similarity, FloatType())
    cross_joined_df = original_df.alias("df1").join(original_df.alias("df2")).select(col("df1.processId").alias("processIdA"),col("df1.features").alias("processAFeatures"),
                                                                  col("df2.processId").alias("processIdB"),col("df2.features").alias("processBFeatures")).orderBy(col("processIdA"))
    filtered_df = cross_joined_df.filter(col("processIdA") < col("processIdB"))
    # Calculate Jaccard similarity for each candidate pair
    similarity_df = filtered_df.withColumn("JaccardSimilarity", jaccard_similarity_udf(col("processAFeatures"), col("processBFeatures")))
    df_similar_Minhash = df_similar_Minhash.select(col("processIdA"),col("processIdB"),col("JaccardSimilarity"))
    similarity_df = similarity_df.select(col("processIdA"),col("processIdB"),col("JaccardSimilarity"))
    
    t1 = similarity_df.filter(col("JaccardSimilarity") >= threshold)
    f1 = similarity_df.filter(col("JaccardSimilarity") < threshold)
    t3 = df_similar_Minhash.filter(col("JaccardSimilarity") >= threshold)
    f3 = df_similar_Minhash.filter(col("JaccardSimilarity") < threshold)

    B = similarity_df.exceptAll(df_similar_Minhash)
    TN = f3.union(B).intersect(f1)
    TP = t3.intersect(t1)
    FP = t3.intersect(f1)
    FN = f3.union(B).intersect(t1)
    print("TN" , TN.count(), " TP " , TP.count(), " FP " , FP.count(),  " FN " , FN.count() )



# Part 2

In [142]:
from pyspark.sql.functions import collect_set, col, concat_ws, collect_list
from pyspark.ml.feature import CountVectorizer, MinHashLSH
from pyspark.sql.functions import array_union, explode,array
from pyspark.sql.types import FloatType
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, ArrayType, IntegerType, FloatType

dataForPart2 = combined_df
dataForPart2.show()
# Aggregate FromServer and ToServer into sets for each processId
agg_df = dataForPart2.groupBy("processId").agg(
    collect_set("FromServer").alias("servers_array")
)
 
agg_df.show(truncate=False)
 
# Use CountVectorizer to convert server names to feature vectors
cv = CountVectorizer(inputCol="servers_array", outputCol="features")
cv_model = cv.fit(agg_df)
cv_df = cv_model.transform(agg_df)
 
print("Distinct Attributes (Vocabulary):")
for i, attr in enumerate(cv_model.vocabulary):
    print(f"{i}. {attr}")
 
# Show the transformed DataFrame with processId and features
cv_df.select("processId", "features").show(truncate=False)
 
# Apply MinHash LSH
minhash = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = minhash.fit(cv_df)
transformed_df = model.transform(cv_df)
 
# Perform clustering based on MinHash LSH
similarity_df = model.approxSimilarityJoin(transformed_df, transformed_df, 0.8, distCol="JaccardDistance")
 
# Extract clusters from the similarity dataframe
candidates = similarity_df.select(col("datasetA.processId").alias("processIdA"),
                                  col("datasetB.processId").alias("processIdB"),
                                  col("JaccardDistance"))
 
# Filter out self-joins and duplicates
candidates = candidates.filter(col("processIdA") < col("processIdB"))
# Function to calculate Jaccard similarity
 
jaccard_similarity_udf = udf(jaccard_similarity, FloatType())
 
# Join with original DataFrame to get shingles for each process
similarity_df = candidates \
    .join(agg_df.select("processId", "servers_array"), candidates.processIdA == agg_df.processId) \
    .withColumnRenamed("servers_array", "servers_A") \
    .drop("processId") \
    .join(agg_df.select("processId", "servers_array"), candidates.processIdB == agg_df.processId) \
    .withColumnRenamed("servers_array", "servers_B") \
    .drop("processId")
 
# Calculate Jaccard similarity for each candidate pair
similarity_df = similarity_df.withColumn("JaccardSimilarity", jaccard_similarity_udf(col("servers_A"), col("servers_B")))
evaluateData(cv_df,similarity_df,0.3)
# # Filter pairs with Jaccard similarity above a threshold (e.g., 0.6)
# similarity_df = similarity_df.filter(col("JaccardSimilarity") >= 0.3)
 
# similarity_df.show()


# merge_groups_udf = udf(lambda x: merge_groups(x), ArrayType(ArrayType(IntegerType())))

# grouped_lists = exploded_df.groupBy("group_representative") \
#     .agg(collect_list("processID").alias("group_list")) \
#     .agg(collect_list("group_list").alias("group_lists"))

# merged_groups = grouped_lists.withColumn("merged_groups", merge_groups_udf(col("group_lists"))) \
#     .select(explode(col("merged_groups")).alias("final_group"))

# # Convert the final groups to a DataFrame
# from pyspark.sql.functions import concat_ws

# final_groups_df = merged_groups.select(concat_ws("_", col("final_group")).alias("Group"), col("final_group"))

# final_groups_df.show()
# # Output path
# output_path = "./output/part2Observations.txt"

# processes_from_groups = final_groups_df.selectExpr("explode(final_group) as processID").distinct()
# # creating a dataframe only with the processes that were grouped.
# df_with_groups = dataForPart2.join(processes_from_groups, "processID", "semi")
# df_with_groups.show()

# exploded_final_groups_df = final_groups_df.select("Group", explode("final_group").alias("processID"))
# joined_df = df_with_groups.join(exploded_final_groups_df, "processID")
# # Call function to write to text file
# joined_df.show()
# # write_groups_to_txt(final_groups_df, output_path)

+----------+--------+----+--------+---------+
|FromServer|ToServer|time|  action|processID|
+----------+--------+----+--------+---------+
|      null| OZBsEf0|  11| Request|        3|
|   OZBsEf0|    null|  13|Response|        3|
|      null|   qZGv1|  27| Request|        6|
|     qZGv1|    null|  36|Response|        6|
|      null|    asdf|  40| Request|        7|
|      asdf|    fdsa|  41| Request|        7|
|      fdsa|    asdf|  42|Response|        7|
|      asdf|    null|  43|Response|        7|
|      null| lkVpiJ4|   0| Request|        8|
|   lkVpiJ4|    null|   6|Response|        8|
|      null|    Aum3|  18| Request|       11|
|      Aum3|    null|  28|Response|       11|
+----------+--------+----+--------+---------+

+---------+------------------+
|processId|servers_array     |
+---------+------------------+
|6        |[qZGv1, null]     |
|3        |[OZBsEf0, null]   |
|7        |[fdsa, null, asdf]|
|8        |[lkVpiJ4, null]   |
|11       |[Aum3, null]      |
+---------+----

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `processAFeatures` cannot be resolved. Did you mean one of the following? [`processIdA`, `processIdB`, `JaccardDistance`, `servers_A`, `servers_B`].;
'Project [processIdA#173257, 'processAFeatures, processIdB#173258, 'processBFeatures, JaccardSimilarity#173409]
+- Project [processIdA#173257, processIdB#173258, JaccardDistance#173245, servers_A#173324, servers_B#173396, jaccard_similarity(servers_A#173324, servers_B#173396)#173408 AS JaccardSimilarity#173409]
   +- Project [processIdA#173257, processIdB#173258, JaccardDistance#173245, servers_A#173324, servers_B#173396]
      +- Project [processIdA#173257, processIdB#173258, JaccardDistance#173245, servers_A#173324, processId#173340, servers_array#172659 AS servers_B#173396]
         +- Join Inner, (processIdB#173258 = processId#173340)
            :- Project [processIdA#173257, processIdB#173258, JaccardDistance#173245, servers_A#173324]
            :  +- Project [processIdA#173257, processIdB#173258, JaccardDistance#173245, processId#173270, servers_array#172659 AS servers_A#173324]
            :     +- Join Inner, (processIdA#173257 = processId#173270)
            :        :- Filter (processIdA#173257 < processIdB#173258)
            :        :  +- Project [datasetA#173151.processId AS processIdA#173257, datasetB#173176.processId AS processIdB#173258, JaccardDistance#173245]
            :        :     +- Filter (JaccardDistance#173245 < 0.8)
            :        :        +- Project [datasetA#173151, datasetB#173176, UDF(datasetA#173151.features, datasetB#173176.features) AS JaccardDistance#173245]
            :        :           +- Deduplicate [datasetA#173151, datasetB#173176]
            :        :              +- Project [datasetA#173151, datasetB#173176]
            :        :                 +- Project [entry#173152, hashValue#173153, datasetA#173151, datasetB#173176]
            :        :                    +- Join Inner, ((entry#173152 = entry#173177) AND (hashValue#173153 = hashValue#173178))
            :        :                       :- Project [struct(processId, processId#62902, servers_array, servers_array#172659, features, features#172972, hashes, hashes#173139) AS datasetA#173151, entry#173152, hashValue#173153]
            :        :                       :  +- Generate posexplode(hashes#173139), false, [entry#173152, hashValue#173153]
            :        :                       :     +- Project [processId#62902, servers_array#172659, features#172972, UDF(features#172972) AS hashes#173139]
            :        :                       :        +- Project [processId#62902, servers_array#172659, UDF(servers_array#172659) AS features#172972]
            :        :                       :           +- Aggregate [processId#62902], [processId#62902, collect_set(FromServer#62898, 0, 0) AS servers_array#172659]
            :        :                       :              +- Union false, false
            :        :                       :                 :- Project [FromServer#62898, ToServer#62899, time#62900, action#62901, processID#62902]
            :        :                       :                 :  +- Project [processId#62902, FromServer#62898, ToServer#62899, time#62900, action#62901]
            :        :                       :                 :     +- Join LeftAnti, (processId#62902 = processID#65957)
            :        :                       :                 :        :- Relation [FromServer#62898,ToServer#62899,time#62900,action#62901,processId#62902] csv
            :        :                       :                 :        +- Deduplicate [processID#65957]
            :        :                       :                 :           +- Project [processID#65957]
            :        :                       :                 :              +- Generate explode(final_group#65585), false, [processID#65957]
            :        :                       :                 :                 +- Project [concat_ws(_, cast(final_group#65585 as array<string>)) AS Group#65587, final_group#65585]
            :        :                       :                 :                    +- Project [final_group#65585]
            :        :                       :                 :                       +- Generate explode(merged_groups#65581), false, [final_group#65585]
            :        :                       :                 :                          +- Project [group_lists#65576, <lambda>(group_lists#65576)#65580 AS merged_groups#65581]
            :        :                       :                 :                             +- Aggregate [collect_list(group_list#65572, 0, 0) AS group_lists#65576]
            :        :                       :                 :                                +- Aggregate [group_representative#65520], [group_representative#65520, collect_list(processID#65521, 0, 0) AS group_list#65572]
            :        :                       :                 :                                   +- Project [processID#65521, processID_A#65377 AS group_representative#65520]
            :        :                       :                 :                                      +- Generate explode(all_processIDs#65515), false, [processID#65521]
            :        :                       :                 :                                         +- Project [processID_A#65377, similar_processIDs#65512, array_union(array(processID_A#65377), similar_processIDs#65512) AS all_processIDs#65515]
            :        :                       :                 :                                            +- Aggregate [processID_A#65377], [processID_A#65377, collect_list(processID_B#65378, 0, 0) AS similar_processIDs#65512]
            :        :                       :                 :                                               +- Project [processID_A#65377, processID_B#65378, JaccardDistance#65365, featuresA#65379, featuresB#65380, jaccard_similarity(featuresA#65379, featuresB#65380)#65497 AS JaccardSimilarity#65498]
            :        :                       :                 :                                                  +- Filter (processID_A#65377 < processID_B#65378)
            :        :                       :                 :                                                     +- Project [datasetA#65311.processID AS processID_A#65377, datasetB#65344.processID AS processID_B#65378, JaccardDistance#65365, datasetA#65311.features AS featuresA#65379, datasetB#65344.features AS featuresB#65380]
            :        :                       :                 :                                                        +- Filter (JaccardDistance#65365 < 0.8)
            :        :                       :                 :                                                           +- Project [datasetA#65311, datasetB#65344, UDF(datasetA#65311.features, datasetB#65344.features) AS JaccardDistance#65365]
            :        :                       :                 :                                                              +- Deduplicate [datasetA#65311, datasetB#65344]
            :        :                       :                 :                                                                 +- Project [datasetA#65311, datasetB#65344]
            :        :                       :                 :                                                                    +- Project [entry#65312, hashValue#65313, datasetA#65311, datasetB#65344]
            :        :                       :                 :                                                                       +- Join Inner, ((entry#65312 = entry#65345) AND (hashValue#65313 = hashValue#65346))
            :        :                       :                 :                                                                          :- Project [struct(processID, processID#65963, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, features, features#63047, hashes, hashes#64627) AS datasetA#65311, entry#65312, hashValue#65313]
            :        :                       :                 :                                                                          :  +- Generate posexplode(hashes#64627), false, [entry#65312, hashValue#65313]
            :        :                       :                 :                                                                          :     +- Project [processID#65963, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :        :                       :                 :                                                                          :        +- Project [processID#65963, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :        :                       :                 :                                                                          :           +- Project [processID#65963, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :        :                       :                 :                                                                          :              +- Project [processID#65963, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :        :                       :                 :                                                                          :                 +- Aggregate [processID#65963], [processID#65963, collect_list(struct(FromServer, FromServer#65959, ToServer, ToServer#65960, time, time#65961, action, action#65962), 0, 0) AS actions#62941]
            :        :                       :                 :                                                                          :                    +- Relation [FromServer#65959,ToServer#65960,time#65961,action#65962,processId#65963] csv
            :        :                       :                 :                                                                          +- Project [struct(processID, processID#65354, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, hashes, hashes#64627, features, features#65324) AS datasetB#65344, entry#65345, hashValue#65346]
            :        :                       :                 :                                                                             +- Generate posexplode(hashes#64627), false, [entry#65345, hashValue#65346]
            :        :                       :                 :                                                                                +- Project [processID#65354, actions#62941, actions_str#62945, shingles#63004, hashes#64627, features#65324]
            :        :                       :                 :                                                                                   +- Project [processID#65354, actions#62941, actions_str#62945, shingles#63004, inputCol_3e7494a3a9b6#65317, hashes#64627, inputCol_3e7494a3a9b6#65317 AS features#65324]
            :        :                       :                 :                                                                                      +- Project [processID#65354, actions#62941, actions_str#62945, shingles#63004, features#63047 AS inputCol_3e7494a3a9b6#65317, hashes#64627]
            :        :                       :                 :                                                                                         +- Project [processID#65354, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :        :                       :                 :                                                                                            +- Project [processID#65354, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :        :                       :                 :                                                                                               +- Project [processID#65354, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :        :                       :                 :                                                                                                  +- Project [processID#65354, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :        :                       :                 :                                                                                                     +- Aggregate [processID#65354], [processID#65354, collect_list(struct(FromServer, FromServer#65350, ToServer, ToServer#65351, time, time#65352, action, action#65353), 0, 0) AS actions#62941]
            :        :                       :                 :                                                                                                        +- Relation [FromServer#65350,ToServer#65351,time#65352,action#65353,processId#65354] csv
            :        :                       :                 +- Sort [time#65623 ASC NULLS FIRST], true
            :        :                       :                    +- Project [FromServer#65621, ToServer#65622, time#65623, action#65624, processID#65988]
            :        :                       :                       +- Project [(processID#65625 + 7) AS processID#65988, FromServer#65621, ToServer#65622, time#65623, action#65624]
            :        :                       :                          +- Project [processID#65625, FromServer#65621, ToServer#65622, time#65623, action#65624]
            :        :                       :                             +- Join Inner, (processID#65625 = representative_processID#65618)
            :        :                       :                                :- Aggregate [Group#65587], [Group#65587, min(processID#65591) AS representative_processID#65618]
            :        :                       :                                :  +- Project [Group#65587, final_group#65585, processID#65591]
            :        :                       :                                :     +- Generate explode(final_group#65585), false, [processID#65591]
            :        :                       :                                :        +- Project [concat_ws(_, cast(final_group#65585 as array<string>)) AS Group#65587, final_group#65585]
            :        :                       :                                :           +- Project [final_group#65585]
            :        :                       :                                :              +- Generate explode(merged_groups#65581), false, [final_group#65585]
            :        :                       :                                :                 +- Project [group_lists#65576, <lambda>(group_lists#65576)#65580 AS merged_groups#65581]
            :        :                       :                                :                    +- Aggregate [collect_list(group_list#65572, 0, 0) AS group_lists#65576]
            :        :                       :                                :                       +- Aggregate [group_representative#65520], [group_representative#65520, collect_list(processID#65521, 0, 0) AS group_list#65572]
            :        :                       :                                :                          +- Project [processID#65521, processID_A#65377 AS group_representative#65520]
            :        :                       :                                :                             +- Generate explode(all_processIDs#65515), false, [processID#65521]
            :        :                       :                                :                                +- Project [processID_A#65377, similar_processIDs#65512, array_union(array(processID_A#65377), similar_processIDs#65512) AS all_processIDs#65515]
            :        :                       :                                :                                   +- Aggregate [processID_A#65377], [processID_A#65377, collect_list(processID_B#65378, 0, 0) AS similar_processIDs#65512]
            :        :                       :                                :                                      +- Project [processID_A#65377, processID_B#65378, JaccardDistance#65365, featuresA#65379, featuresB#65380, jaccard_similarity(featuresA#65379, featuresB#65380)#65497 AS JaccardSimilarity#65498]
            :        :                       :                                :                                         +- Filter (processID_A#65377 < processID_B#65378)
            :        :                       :                                :                                            +- Project [datasetA#65311.processID AS processID_A#65377, datasetB#65344.processID AS processID_B#65378, JaccardDistance#65365, datasetA#65311.features AS featuresA#65379, datasetB#65344.features AS featuresB#65380]
            :        :                       :                                :                                               +- Filter (JaccardDistance#65365 < 0.8)
            :        :                       :                                :                                                  +- Project [datasetA#65311, datasetB#65344, UDF(datasetA#65311.features, datasetB#65344.features) AS JaccardDistance#65365]
            :        :                       :                                :                                                     +- Deduplicate [datasetA#65311, datasetB#65344]
            :        :                       :                                :                                                        +- Project [datasetA#65311, datasetB#65344]
            :        :                       :                                :                                                           +- Project [entry#65312, hashValue#65313, datasetA#65311, datasetB#65344]
            :        :                       :                                :                                                              +- Join Inner, ((entry#65312 = entry#65345) AND (hashValue#65313 = hashValue#65346))
            :        :                       :                                :                                                                 :- Project [struct(processID, processID#66003, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, features, features#63047, hashes, hashes#64627) AS datasetA#65311, entry#65312, hashValue#65313]
            :        :                       :                                :                                                                 :  +- Generate posexplode(hashes#64627), false, [entry#65312, hashValue#65313]
            :        :                       :                                :                                                                 :     +- Project [processID#66003, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :        :                       :                                :                                                                 :        +- Project [processID#66003, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :        :                       :                                :                                                                 :           +- Project [processID#66003, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :        :                       :                                :                                                                 :              +- Project [processID#66003, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :        :                       :                                :                                                                 :                 +- Aggregate [processID#66003], [processID#66003, collect_list(struct(FromServer, FromServer#65999, ToServer, ToServer#66000, time, time#66001, action, action#66002), 0, 0) AS actions#62941]
            :        :                       :                                :                                                                 :                    +- Relation [FromServer#65999,ToServer#66000,time#66001,action#66002,processId#66003] csv
            :        :                       :                                :                                                                 +- Project [struct(processID, processID#66008, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, hashes, hashes#64627, features, features#65324) AS datasetB#65344, entry#65345, hashValue#65346]
            :        :                       :                                :                                                                    +- Generate posexplode(hashes#64627), false, [entry#65345, hashValue#65346]
            :        :                       :                                :                                                                       +- Project [processID#66008, actions#62941, actions_str#62945, shingles#63004, hashes#64627, features#65324]
            :        :                       :                                :                                                                          +- Project [processID#66008, actions#62941, actions_str#62945, shingles#63004, inputCol_3e7494a3a9b6#65317, hashes#64627, inputCol_3e7494a3a9b6#65317 AS features#65324]
            :        :                       :                                :                                                                             +- Project [processID#66008, actions#62941, actions_str#62945, shingles#63004, features#63047 AS inputCol_3e7494a3a9b6#65317, hashes#64627]
            :        :                       :                                :                                                                                +- Project [processID#66008, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :        :                       :                                :                                                                                   +- Project [processID#66008, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :        :                       :                                :                                                                                      +- Project [processID#66008, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :        :                       :                                :                                                                                         +- Project [processID#66008, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :        :                       :                                :                                                                                            +- Aggregate [processID#66008], [processID#66008, collect_list(struct(FromServer, FromServer#66004, ToServer, ToServer#66005, time, time#66006, action, action#66007), 0, 0) AS actions#62941]
            :        :                       :                                :                                                                                               +- Relation [FromServer#66004,ToServer#66005,time#66006,action#66007,processId#66008] csv
            :        :                       :                                +- Project [processId#65625, FromServer#65621, ToServer#65622, time#65623, action#65624, Group#65631, final_group#65585]
            :        :                       :                                   +- Join Inner, (processId#65625 = processID#65591)
            :        :                       :                                      :- Relation [FromServer#65621,ToServer#65622,time#65623,action#65624,processId#65625] csv
            :        :                       :                                      +- Project [Group#65631, final_group#65585, processID#65591]
            :        :                       :                                         +- Generate explode(final_group#65585), false, [processID#65591]
            :        :                       :                                            +- Project [concat_ws(_, cast(final_group#65585 as array<string>)) AS Group#65631, final_group#65585]
            :        :                       :                                               +- Project [final_group#65585]
            :        :                       :                                                  +- Generate explode(merged_groups#65581), false, [final_group#65585]
            :        :                       :                                                     +- Project [group_lists#65576, <lambda>(group_lists#65576)#65580 AS merged_groups#65581]
            :        :                       :                                                        +- Aggregate [collect_list(group_list#65572, 0, 0) AS group_lists#65576]
            :        :                       :                                                           +- Aggregate [group_representative#65520], [group_representative#65520, collect_list(processID#65521, 0, 0) AS group_list#65572]
            :        :                       :                                                              +- Project [processID#65521, processID_A#65377 AS group_representative#65520]
            :        :                       :                                                                 +- Generate explode(all_processIDs#65515), false, [processID#65521]
            :        :                       :                                                                    +- Project [processID_A#65377, similar_processIDs#65512, array_union(array(processID_A#65377), similar_processIDs#65512) AS all_processIDs#65515]
            :        :                       :                                                                       +- Aggregate [processID_A#65377], [processID_A#65377, collect_list(processID_B#65378, 0, 0) AS similar_processIDs#65512]
            :        :                       :                                                                          +- Project [processID_A#65377, processID_B#65378, JaccardDistance#65365, featuresA#65379, featuresB#65380, jaccard_similarity(featuresA#65379, featuresB#65380)#65497 AS JaccardSimilarity#65498]
            :        :                       :                                                                             +- Filter (processID_A#65377 < processID_B#65378)
            :        :                       :                                                                                +- Project [datasetA#65311.processID AS processID_A#65377, datasetB#65344.processID AS processID_B#65378, JaccardDistance#65365, datasetA#65311.features AS featuresA#65379, datasetB#65344.features AS featuresB#65380]
            :        :                       :                                                                                   +- Filter (JaccardDistance#65365 < 0.8)
            :        :                       :                                                                                      +- Project [datasetA#65311, datasetB#65344, UDF(datasetA#65311.features, datasetB#65344.features) AS JaccardDistance#65365]
            :        :                       :                                                                                         +- Deduplicate [datasetA#65311, datasetB#65344]
            :        :                       :                                                                                            +- Project [datasetA#65311, datasetB#65344]
            :        :                       :                                                                                               +- Project [entry#65312, hashValue#65313, datasetA#65311, datasetB#65344]
            :        :                       :                                                                                                  +- Join Inner, ((entry#65312 = entry#65345) AND (hashValue#65313 = hashValue#65346))
            :        :                       :                                                                                                     :- Project [struct(processID, processID#65599, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, features, features#63047, hashes, hashes#64627) AS datasetA#65311, entry#65312, hashValue#65313]
            :        :                       :                                                                                                     :  +- Generate posexplode(hashes#64627), false, [entry#65312, hashValue#65313]
            :        :                       :                                                                                                     :     +- Project [processID#65599, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :        :                       :                                                                                                     :        +- Project [processID#65599, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :        :                       :                                                                                                     :           +- Project [processID#65599, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :        :                       :                                                                                                     :              +- Project [processID#65599, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :        :                       :                                                                                                     :                 +- Aggregate [processID#65599], [processID#65599, collect_list(struct(FromServer, FromServer#65595, ToServer, ToServer#65596, time, time#65597, action, action#65598), 0, 0) AS actions#62941]
            :        :                       :                                                                                                     :                    +- Relation [FromServer#65595,ToServer#65596,time#65597,action#65598,processId#65599] csv
            :        :                       :                                                                                                     +- Project [struct(processID, processID#65630, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, hashes, hashes#64627, features, features#65324) AS datasetB#65344, entry#65345, hashValue#65346]
            :        :                       :                                                                                                        +- Generate posexplode(hashes#64627), false, [entry#65345, hashValue#65346]
            :        :                       :                                                                                                           +- Project [processID#65630, actions#62941, actions_str#62945, shingles#63004, hashes#64627, features#65324]
            :        :                       :                                                                                                              +- Project [processID#65630, actions#62941, actions_str#62945, shingles#63004, inputCol_3e7494a3a9b6#65317, hashes#64627, inputCol_3e7494a3a9b6#65317 AS features#65324]
            :        :                       :                                                                                                                 +- Project [processID#65630, actions#62941, actions_str#62945, shingles#63004, features#63047 AS inputCol_3e7494a3a9b6#65317, hashes#64627]
            :        :                       :                                                                                                                    +- Project [processID#65630, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :        :                       :                                                                                                                       +- Project [processID#65630, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :        :                       :                                                                                                                          +- Project [processID#65630, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :        :                       :                                                                                                                             +- Project [processID#65630, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :        :                       :                                                                                                                                +- Aggregate [processID#65630], [processID#65630, collect_list(struct(FromServer, FromServer#65626, ToServer, ToServer#65627, time, time#65628, action, action#65629), 0, 0) AS actions#62941]
            :        :                       :                                                                                                                                   +- Relation [FromServer#65626,ToServer#65627,time#65628,action#65629,processId#65630] csv
            :        :                       +- Project [struct(processId, processId#173186, servers_array, servers_array#172659, hashes, hashes#173139, features, features#173162) AS datasetB#173176, entry#173177, hashValue#173178]
            :        :                          +- Generate posexplode(hashes#173139), false, [entry#173177, hashValue#173178]
            :        :                             +- Project [processId#173186, servers_array#172659, hashes#173139, features#173162]
            :        :                                +- Project [processId#173186, servers_array#172659, inputCol_245ba9db7dc3#173157, hashes#173139, inputCol_245ba9db7dc3#173157 AS features#173162]
            :        :                                   +- Project [processId#173186, servers_array#172659, features#172972 AS inputCol_245ba9db7dc3#173157, hashes#173139]
            :        :                                      +- Project [processId#173186, servers_array#172659, features#172972, UDF(features#172972) AS hashes#173139]
            :        :                                         +- Project [processId#173186, servers_array#172659, UDF(servers_array#172659) AS features#172972]
            :        :                                            +- Aggregate [processId#173186], [processId#173186, collect_set(FromServer#173182, 0, 0) AS servers_array#172659]
            :        :                                               +- Union false, false
            :        :                                                  :- Project [FromServer#173182, ToServer#173183, time#173184, action#173185, processID#173186]
            :        :                                                  :  +- Project [processId#173186, FromServer#173182, ToServer#173183, time#173184, action#173185]
            :        :                                                  :     +- Join LeftAnti, (processId#173186 = processID#65957)
            :        :                                                  :        :- Relation [FromServer#173182,ToServer#173183,time#173184,action#173185,processId#173186] csv
            :        :                                                  :        +- Deduplicate [processID#65957]
            :        :                                                  :           +- Project [processID#65957]
            :        :                                                  :              +- Generate explode(final_group#65585), false, [processID#65957]
            :        :                                                  :                 +- Project [concat_ws(_, cast(final_group#65585 as array<string>)) AS Group#65587, final_group#65585]
            :        :                                                  :                    +- Project [final_group#65585]
            :        :                                                  :                       +- Generate explode(merged_groups#65581), false, [final_group#65585]
            :        :                                                  :                          +- Project [group_lists#65576, <lambda>(group_lists#65576)#65580 AS merged_groups#65581]
            :        :                                                  :                             +- Aggregate [collect_list(group_list#65572, 0, 0) AS group_lists#65576]
            :        :                                                  :                                +- Aggregate [group_representative#65520], [group_representative#65520, collect_list(processID#65521, 0, 0) AS group_list#65572]
            :        :                                                  :                                   +- Project [processID#65521, processID_A#65377 AS group_representative#65520]
            :        :                                                  :                                      +- Generate explode(all_processIDs#65515), false, [processID#65521]
            :        :                                                  :                                         +- Project [processID_A#65377, similar_processIDs#65512, array_union(array(processID_A#65377), similar_processIDs#65512) AS all_processIDs#65515]
            :        :                                                  :                                            +- Aggregate [processID_A#65377], [processID_A#65377, collect_list(processID_B#65378, 0, 0) AS similar_processIDs#65512]
            :        :                                                  :                                               +- Project [processID_A#65377, processID_B#65378, JaccardDistance#65365, featuresA#65379, featuresB#65380, jaccard_similarity(featuresA#65379, featuresB#65380)#65497 AS JaccardSimilarity#65498]
            :        :                                                  :                                                  +- Filter (processID_A#65377 < processID_B#65378)
            :        :                                                  :                                                     +- Project [datasetA#65311.processID AS processID_A#65377, datasetB#65344.processID AS processID_B#65378, JaccardDistance#65365, datasetA#65311.features AS featuresA#65379, datasetB#65344.features AS featuresB#65380]
            :        :                                                  :                                                        +- Filter (JaccardDistance#65365 < 0.8)
            :        :                                                  :                                                           +- Project [datasetA#65311, datasetB#65344, UDF(datasetA#65311.features, datasetB#65344.features) AS JaccardDistance#65365]
            :        :                                                  :                                                              +- Deduplicate [datasetA#65311, datasetB#65344]
            :        :                                                  :                                                                 +- Project [datasetA#65311, datasetB#65344]
            :        :                                                  :                                                                    +- Project [entry#65312, hashValue#65313, datasetA#65311, datasetB#65344]
            :        :                                                  :                                                                       +- Join Inner, ((entry#65312 = entry#65345) AND (hashValue#65313 = hashValue#65346))
            :        :                                                  :                                                                          :- Project [struct(processID, processID#173191, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, features, features#63047, hashes, hashes#64627) AS datasetA#65311, entry#65312, hashValue#65313]
            :        :                                                  :                                                                          :  +- Generate posexplode(hashes#64627), false, [entry#65312, hashValue#65313]
            :        :                                                  :                                                                          :     +- Project [processID#173191, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :        :                                                  :                                                                          :        +- Project [processID#173191, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :        :                                                  :                                                                          :           +- Project [processID#173191, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :        :                                                  :                                                                          :              +- Project [processID#173191, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :        :                                                  :                                                                          :                 +- Aggregate [processID#173191], [processID#173191, collect_list(struct(FromServer, FromServer#173187, ToServer, ToServer#173188, time, time#173189, action, action#173190), 0, 0) AS actions#62941]
            :        :                                                  :                                                                          :                    +- Relation [FromServer#173187,ToServer#173188,time#173189,action#173190,processId#173191] csv
            :        :                                                  :                                                                          +- Project [struct(processID, processID#173196, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, hashes, hashes#64627, features, features#65324) AS datasetB#65344, entry#65345, hashValue#65346]
            :        :                                                  :                                                                             +- Generate posexplode(hashes#64627), false, [entry#65345, hashValue#65346]
            :        :                                                  :                                                                                +- Project [processID#173196, actions#62941, actions_str#62945, shingles#63004, hashes#64627, features#65324]
            :        :                                                  :                                                                                   +- Project [processID#173196, actions#62941, actions_str#62945, shingles#63004, inputCol_3e7494a3a9b6#65317, hashes#64627, inputCol_3e7494a3a9b6#65317 AS features#65324]
            :        :                                                  :                                                                                      +- Project [processID#173196, actions#62941, actions_str#62945, shingles#63004, features#63047 AS inputCol_3e7494a3a9b6#65317, hashes#64627]
            :        :                                                  :                                                                                         +- Project [processID#173196, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :        :                                                  :                                                                                            +- Project [processID#173196, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :        :                                                  :                                                                                               +- Project [processID#173196, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :        :                                                  :                                                                                                  +- Project [processID#173196, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :        :                                                  :                                                                                                     +- Aggregate [processID#173196], [processID#173196, collect_list(struct(FromServer, FromServer#173192, ToServer, ToServer#173193, time, time#173194, action, action#173195), 0, 0) AS actions#62941]
            :        :                                                  :                                                                                                        +- Relation [FromServer#173192,ToServer#173193,time#173194,action#173195,processId#173196] csv
            :        :                                                  +- Sort [time#173209 ASC NULLS FIRST], true
            :        :                                                     +- Project [FromServer#173207, ToServer#173208, time#173209, action#173210, processID#65988]
            :        :                                                        +- Project [(processID#173211 + 7) AS processID#65988, FromServer#173207, ToServer#173208, time#173209, action#173210]
            :        :                                                           +- Project [processID#173211, FromServer#173207, ToServer#173208, time#173209, action#173210]
            :        :                                                              +- Join Inner, (processID#173211 = representative_processID#65618)
            :        :                                                                 :- Aggregate [Group#65587], [Group#65587, min(processID#65591) AS representative_processID#65618]
            :        :                                                                 :  +- Project [Group#65587, final_group#65585, processID#65591]
            :        :                                                                 :     +- Generate explode(final_group#65585), false, [processID#65591]
            :        :                                                                 :        +- Project [concat_ws(_, cast(final_group#65585 as array<string>)) AS Group#65587, final_group#65585]
            :        :                                                                 :           +- Project [final_group#65585]
            :        :                                                                 :              +- Generate explode(merged_groups#65581), false, [final_group#65585]
            :        :                                                                 :                 +- Project [group_lists#65576, <lambda>(group_lists#65576)#65580 AS merged_groups#65581]
            :        :                                                                 :                    +- Aggregate [collect_list(group_list#65572, 0, 0) AS group_lists#65576]
            :        :                                                                 :                       +- Aggregate [group_representative#65520], [group_representative#65520, collect_list(processID#65521, 0, 0) AS group_list#65572]
            :        :                                                                 :                          +- Project [processID#65521, processID_A#65377 AS group_representative#65520]
            :        :                                                                 :                             +- Generate explode(all_processIDs#65515), false, [processID#65521]
            :        :                                                                 :                                +- Project [processID_A#65377, similar_processIDs#65512, array_union(array(processID_A#65377), similar_processIDs#65512) AS all_processIDs#65515]
            :        :                                                                 :                                   +- Aggregate [processID_A#65377], [processID_A#65377, collect_list(processID_B#65378, 0, 0) AS similar_processIDs#65512]
            :        :                                                                 :                                      +- Project [processID_A#65377, processID_B#65378, JaccardDistance#65365, featuresA#65379, featuresB#65380, jaccard_similarity(featuresA#65379, featuresB#65380)#65497 AS JaccardSimilarity#65498]
            :        :                                                                 :                                         +- Filter (processID_A#65377 < processID_B#65378)
            :        :                                                                 :                                            +- Project [datasetA#65311.processID AS processID_A#65377, datasetB#65344.processID AS processID_B#65378, JaccardDistance#65365, datasetA#65311.features AS featuresA#65379, datasetB#65344.features AS featuresB#65380]
            :        :                                                                 :                                               +- Filter (JaccardDistance#65365 < 0.8)
            :        :                                                                 :                                                  +- Project [datasetA#65311, datasetB#65344, UDF(datasetA#65311.features, datasetB#65344.features) AS JaccardDistance#65365]
            :        :                                                                 :                                                     +- Deduplicate [datasetA#65311, datasetB#65344]
            :        :                                                                 :                                                        +- Project [datasetA#65311, datasetB#65344]
            :        :                                                                 :                                                           +- Project [entry#65312, hashValue#65313, datasetA#65311, datasetB#65344]
            :        :                                                                 :                                                              +- Join Inner, ((entry#65312 = entry#65345) AND (hashValue#65313 = hashValue#65346))
            :        :                                                                 :                                                                 :- Project [struct(processID, processID#173201, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, features, features#63047, hashes, hashes#64627) AS datasetA#65311, entry#65312, hashValue#65313]
            :        :                                                                 :                                                                 :  +- Generate posexplode(hashes#64627), false, [entry#65312, hashValue#65313]
            :        :                                                                 :                                                                 :     +- Project [processID#173201, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :        :                                                                 :                                                                 :        +- Project [processID#173201, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :        :                                                                 :                                                                 :           +- Project [processID#173201, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :        :                                                                 :                                                                 :              +- Project [processID#173201, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :        :                                                                 :                                                                 :                 +- Aggregate [processID#173201], [processID#173201, collect_list(struct(FromServer, FromServer#173197, ToServer, ToServer#173198, time, time#173199, action, action#173200), 0, 0) AS actions#62941]
            :        :                                                                 :                                                                 :                    +- Relation [FromServer#173197,ToServer#173198,time#173199,action#173200,processId#173201] csv
            :        :                                                                 :                                                                 +- Project [struct(processID, processID#173206, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, hashes, hashes#64627, features, features#65324) AS datasetB#65344, entry#65345, hashValue#65346]
            :        :                                                                 :                                                                    +- Generate posexplode(hashes#64627), false, [entry#65345, hashValue#65346]
            :        :                                                                 :                                                                       +- Project [processID#173206, actions#62941, actions_str#62945, shingles#63004, hashes#64627, features#65324]
            :        :                                                                 :                                                                          +- Project [processID#173206, actions#62941, actions_str#62945, shingles#63004, inputCol_3e7494a3a9b6#65317, hashes#64627, inputCol_3e7494a3a9b6#65317 AS features#65324]
            :        :                                                                 :                                                                             +- Project [processID#173206, actions#62941, actions_str#62945, shingles#63004, features#63047 AS inputCol_3e7494a3a9b6#65317, hashes#64627]
            :        :                                                                 :                                                                                +- Project [processID#173206, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :        :                                                                 :                                                                                   +- Project [processID#173206, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :        :                                                                 :                                                                                      +- Project [processID#173206, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :        :                                                                 :                                                                                         +- Project [processID#173206, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :        :                                                                 :                                                                                            +- Aggregate [processID#173206], [processID#173206, collect_list(struct(FromServer, FromServer#173202, ToServer, ToServer#173203, time, time#173204, action, action#173205), 0, 0) AS actions#62941]
            :        :                                                                 :                                                                                               +- Relation [FromServer#173202,ToServer#173203,time#173204,action#173205,processId#173206] csv
            :        :                                                                 +- Project [processId#173211, FromServer#173207, ToServer#173208, time#173209, action#173210, Group#65631, final_group#65585]
            :        :                                                                    +- Join Inner, (processId#173211 = processID#65591)
            :        :                                                                       :- Relation [FromServer#173207,ToServer#173208,time#173209,action#173210,processId#173211] csv
            :        :                                                                       +- Project [Group#65631, final_group#65585, processID#65591]
            :        :                                                                          +- Generate explode(final_group#65585), false, [processID#65591]
            :        :                                                                             +- Project [concat_ws(_, cast(final_group#65585 as array<string>)) AS Group#65631, final_group#65585]
            :        :                                                                                +- Project [final_group#65585]
            :        :                                                                                   +- Generate explode(merged_groups#65581), false, [final_group#65585]
            :        :                                                                                      +- Project [group_lists#65576, <lambda>(group_lists#65576)#65580 AS merged_groups#65581]
            :        :                                                                                         +- Aggregate [collect_list(group_list#65572, 0, 0) AS group_lists#65576]
            :        :                                                                                            +- Aggregate [group_representative#65520], [group_representative#65520, collect_list(processID#65521, 0, 0) AS group_list#65572]
            :        :                                                                                               +- Project [processID#65521, processID_A#65377 AS group_representative#65520]
            :        :                                                                                                  +- Generate explode(all_processIDs#65515), false, [processID#65521]
            :        :                                                                                                     +- Project [processID_A#65377, similar_processIDs#65512, array_union(array(processID_A#65377), similar_processIDs#65512) AS all_processIDs#65515]
            :        :                                                                                                        +- Aggregate [processID_A#65377], [processID_A#65377, collect_list(processID_B#65378, 0, 0) AS similar_processIDs#65512]
            :        :                                                                                                           +- Project [processID_A#65377, processID_B#65378, JaccardDistance#65365, featuresA#65379, featuresB#65380, jaccard_similarity(featuresA#65379, featuresB#65380)#65497 AS JaccardSimilarity#65498]
            :        :                                                                                                              +- Filter (processID_A#65377 < processID_B#65378)
            :        :                                                                                                                 +- Project [datasetA#65311.processID AS processID_A#65377, datasetB#65344.processID AS processID_B#65378, JaccardDistance#65365, datasetA#65311.features AS featuresA#65379, datasetB#65344.features AS featuresB#65380]
            :        :                                                                                                                    +- Filter (JaccardDistance#65365 < 0.8)
            :        :                                                                                                                       +- Project [datasetA#65311, datasetB#65344, UDF(datasetA#65311.features, datasetB#65344.features) AS JaccardDistance#65365]
            :        :                                                                                                                          +- Deduplicate [datasetA#65311, datasetB#65344]
            :        :                                                                                                                             +- Project [datasetA#65311, datasetB#65344]
            :        :                                                                                                                                +- Project [entry#65312, hashValue#65313, datasetA#65311, datasetB#65344]
            :        :                                                                                                                                   +- Join Inner, ((entry#65312 = entry#65345) AND (hashValue#65313 = hashValue#65346))
            :        :                                                                                                                                      :- Project [struct(processID, processID#173216, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, features, features#63047, hashes, hashes#64627) AS datasetA#65311, entry#65312, hashValue#65313]
            :        :                                                                                                                                      :  +- Generate posexplode(hashes#64627), false, [entry#65312, hashValue#65313]
            :        :                                                                                                                                      :     +- Project [processID#173216, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :        :                                                                                                                                      :        +- Project [processID#173216, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :        :                                                                                                                                      :           +- Project [processID#173216, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :        :                                                                                                                                      :              +- Project [processID#173216, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :        :                                                                                                                                      :                 +- Aggregate [processID#173216], [processID#173216, collect_list(struct(FromServer, FromServer#173212, ToServer, ToServer#173213, time, time#173214, action, action#173215), 0, 0) AS actions#62941]
            :        :                                                                                                                                      :                    +- Relation [FromServer#173212,ToServer#173213,time#173214,action#173215,processId#173216] csv
            :        :                                                                                                                                      +- Project [struct(processID, processID#173221, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, hashes, hashes#64627, features, features#65324) AS datasetB#65344, entry#65345, hashValue#65346]
            :        :                                                                                                                                         +- Generate posexplode(hashes#64627), false, [entry#65345, hashValue#65346]
            :        :                                                                                                                                            +- Project [processID#173221, actions#62941, actions_str#62945, shingles#63004, hashes#64627, features#65324]
            :        :                                                                                                                                               +- Project [processID#173221, actions#62941, actions_str#62945, shingles#63004, inputCol_3e7494a3a9b6#65317, hashes#64627, inputCol_3e7494a3a9b6#65317 AS features#65324]
            :        :                                                                                                                                                  +- Project [processID#173221, actions#62941, actions_str#62945, shingles#63004, features#63047 AS inputCol_3e7494a3a9b6#65317, hashes#64627]
            :        :                                                                                                                                                     +- Project [processID#173221, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :        :                                                                                                                                                        +- Project [processID#173221, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :        :                                                                                                                                                           +- Project [processID#173221, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :        :                                                                                                                                                              +- Project [processID#173221, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :        :                                                                                                                                                                 +- Aggregate [processID#173221], [processID#173221, collect_list(struct(FromServer, FromServer#173217, ToServer, ToServer#173218, time, time#173219, action, action#173220), 0, 0) AS actions#62941]
            :        :                                                                                                                                                                    +- Relation [FromServer#173217,ToServer#173218,time#173219,action#173220,processId#173221] csv
            :        +- Project [processId#173270, servers_array#172659]
            :           +- Aggregate [processId#173270], [processId#173270, collect_set(FromServer#173266, 0, 0) AS servers_array#172659]
            :              +- Union false, false
            :                 :- Project [FromServer#173266, ToServer#173267, time#173268, action#173269, processID#173270]
            :                 :  +- Project [processId#173270, FromServer#173266, ToServer#173267, time#173268, action#173269]
            :                 :     +- Join LeftAnti, (processId#173270 = processID#65957)
            :                 :        :- Relation [FromServer#173266,ToServer#173267,time#173268,action#173269,processId#173270] csv
            :                 :        +- Deduplicate [processID#65957]
            :                 :           +- Project [processID#65957]
            :                 :              +- Generate explode(final_group#65585), false, [processID#65957]
            :                 :                 +- Project [concat_ws(_, cast(final_group#65585 as array<string>)) AS Group#65587, final_group#65585]
            :                 :                    +- Project [final_group#65585]
            :                 :                       +- Generate explode(merged_groups#65581), false, [final_group#65585]
            :                 :                          +- Project [group_lists#65576, <lambda>(group_lists#65576)#65580 AS merged_groups#65581]
            :                 :                             +- Aggregate [collect_list(group_list#65572, 0, 0) AS group_lists#65576]
            :                 :                                +- Aggregate [group_representative#65520], [group_representative#65520, collect_list(processID#65521, 0, 0) AS group_list#65572]
            :                 :                                   +- Project [processID#65521, processID_A#65377 AS group_representative#65520]
            :                 :                                      +- Generate explode(all_processIDs#65515), false, [processID#65521]
            :                 :                                         +- Project [processID_A#65377, similar_processIDs#65512, array_union(array(processID_A#65377), similar_processIDs#65512) AS all_processIDs#65515]
            :                 :                                            +- Aggregate [processID_A#65377], [processID_A#65377, collect_list(processID_B#65378, 0, 0) AS similar_processIDs#65512]
            :                 :                                               +- Project [processID_A#65377, processID_B#65378, JaccardDistance#65365, featuresA#65379, featuresB#65380, jaccard_similarity(featuresA#65379, featuresB#65380)#65497 AS JaccardSimilarity#65498]
            :                 :                                                  +- Filter (processID_A#65377 < processID_B#65378)
            :                 :                                                     +- Project [datasetA#65311.processID AS processID_A#65377, datasetB#65344.processID AS processID_B#65378, JaccardDistance#65365, datasetA#65311.features AS featuresA#65379, datasetB#65344.features AS featuresB#65380]
            :                 :                                                        +- Filter (JaccardDistance#65365 < 0.8)
            :                 :                                                           +- Project [datasetA#65311, datasetB#65344, UDF(datasetA#65311.features, datasetB#65344.features) AS JaccardDistance#65365]
            :                 :                                                              +- Deduplicate [datasetA#65311, datasetB#65344]
            :                 :                                                                 +- Project [datasetA#65311, datasetB#65344]
            :                 :                                                                    +- Project [entry#65312, hashValue#65313, datasetA#65311, datasetB#65344]
            :                 :                                                                       +- Join Inner, ((entry#65312 = entry#65345) AND (hashValue#65313 = hashValue#65346))
            :                 :                                                                          :- Project [struct(processID, processID#173275, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, features, features#63047, hashes, hashes#64627) AS datasetA#65311, entry#65312, hashValue#65313]
            :                 :                                                                          :  +- Generate posexplode(hashes#64627), false, [entry#65312, hashValue#65313]
            :                 :                                                                          :     +- Project [processID#173275, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :                 :                                                                          :        +- Project [processID#173275, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :                 :                                                                          :           +- Project [processID#173275, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :                 :                                                                          :              +- Project [processID#173275, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :                 :                                                                          :                 +- Aggregate [processID#173275], [processID#173275, collect_list(struct(FromServer, FromServer#173271, ToServer, ToServer#173272, time, time#173273, action, action#173274), 0, 0) AS actions#62941]
            :                 :                                                                          :                    +- Relation [FromServer#173271,ToServer#173272,time#173273,action#173274,processId#173275] csv
            :                 :                                                                          +- Project [struct(processID, processID#173280, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, hashes, hashes#64627, features, features#65324) AS datasetB#65344, entry#65345, hashValue#65346]
            :                 :                                                                             +- Generate posexplode(hashes#64627), false, [entry#65345, hashValue#65346]
            :                 :                                                                                +- Project [processID#173280, actions#62941, actions_str#62945, shingles#63004, hashes#64627, features#65324]
            :                 :                                                                                   +- Project [processID#173280, actions#62941, actions_str#62945, shingles#63004, inputCol_3e7494a3a9b6#65317, hashes#64627, inputCol_3e7494a3a9b6#65317 AS features#65324]
            :                 :                                                                                      +- Project [processID#173280, actions#62941, actions_str#62945, shingles#63004, features#63047 AS inputCol_3e7494a3a9b6#65317, hashes#64627]
            :                 :                                                                                         +- Project [processID#173280, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :                 :                                                                                            +- Project [processID#173280, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :                 :                                                                                               +- Project [processID#173280, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :                 :                                                                                                  +- Project [processID#173280, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :                 :                                                                                                     +- Aggregate [processID#173280], [processID#173280, collect_list(struct(FromServer, FromServer#173276, ToServer, ToServer#173277, time, time#173278, action, action#173279), 0, 0) AS actions#62941]
            :                 :                                                                                                        +- Relation [FromServer#173276,ToServer#173277,time#173278,action#173279,processId#173280] csv
            :                 +- Sort [time#173293 ASC NULLS FIRST], true
            :                    +- Project [FromServer#173291, ToServer#173292, time#173293, action#173294, processID#65988]
            :                       +- Project [(processID#173295 + 7) AS processID#65988, FromServer#173291, ToServer#173292, time#173293, action#173294]
            :                          +- Project [processID#173295, FromServer#173291, ToServer#173292, time#173293, action#173294]
            :                             +- Join Inner, (processID#173295 = representative_processID#65618)
            :                                :- Aggregate [Group#65587], [Group#65587, min(processID#65591) AS representative_processID#65618]
            :                                :  +- Project [Group#65587, final_group#65585, processID#65591]
            :                                :     +- Generate explode(final_group#65585), false, [processID#65591]
            :                                :        +- Project [concat_ws(_, cast(final_group#65585 as array<string>)) AS Group#65587, final_group#65585]
            :                                :           +- Project [final_group#65585]
            :                                :              +- Generate explode(merged_groups#65581), false, [final_group#65585]
            :                                :                 +- Project [group_lists#65576, <lambda>(group_lists#65576)#65580 AS merged_groups#65581]
            :                                :                    +- Aggregate [collect_list(group_list#65572, 0, 0) AS group_lists#65576]
            :                                :                       +- Aggregate [group_representative#65520], [group_representative#65520, collect_list(processID#65521, 0, 0) AS group_list#65572]
            :                                :                          +- Project [processID#65521, processID_A#65377 AS group_representative#65520]
            :                                :                             +- Generate explode(all_processIDs#65515), false, [processID#65521]
            :                                :                                +- Project [processID_A#65377, similar_processIDs#65512, array_union(array(processID_A#65377), similar_processIDs#65512) AS all_processIDs#65515]
            :                                :                                   +- Aggregate [processID_A#65377], [processID_A#65377, collect_list(processID_B#65378, 0, 0) AS similar_processIDs#65512]
            :                                :                                      +- Project [processID_A#65377, processID_B#65378, JaccardDistance#65365, featuresA#65379, featuresB#65380, jaccard_similarity(featuresA#65379, featuresB#65380)#65497 AS JaccardSimilarity#65498]
            :                                :                                         +- Filter (processID_A#65377 < processID_B#65378)
            :                                :                                            +- Project [datasetA#65311.processID AS processID_A#65377, datasetB#65344.processID AS processID_B#65378, JaccardDistance#65365, datasetA#65311.features AS featuresA#65379, datasetB#65344.features AS featuresB#65380]
            :                                :                                               +- Filter (JaccardDistance#65365 < 0.8)
            :                                :                                                  +- Project [datasetA#65311, datasetB#65344, UDF(datasetA#65311.features, datasetB#65344.features) AS JaccardDistance#65365]
            :                                :                                                     +- Deduplicate [datasetA#65311, datasetB#65344]
            :                                :                                                        +- Project [datasetA#65311, datasetB#65344]
            :                                :                                                           +- Project [entry#65312, hashValue#65313, datasetA#65311, datasetB#65344]
            :                                :                                                              +- Join Inner, ((entry#65312 = entry#65345) AND (hashValue#65313 = hashValue#65346))
            :                                :                                                                 :- Project [struct(processID, processID#173285, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, features, features#63047, hashes, hashes#64627) AS datasetA#65311, entry#65312, hashValue#65313]
            :                                :                                                                 :  +- Generate posexplode(hashes#64627), false, [entry#65312, hashValue#65313]
            :                                :                                                                 :     +- Project [processID#173285, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :                                :                                                                 :        +- Project [processID#173285, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :                                :                                                                 :           +- Project [processID#173285, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :                                :                                                                 :              +- Project [processID#173285, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :                                :                                                                 :                 +- Aggregate [processID#173285], [processID#173285, collect_list(struct(FromServer, FromServer#173281, ToServer, ToServer#173282, time, time#173283, action, action#173284), 0, 0) AS actions#62941]
            :                                :                                                                 :                    +- Relation [FromServer#173281,ToServer#173282,time#173283,action#173284,processId#173285] csv
            :                                :                                                                 +- Project [struct(processID, processID#173290, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, hashes, hashes#64627, features, features#65324) AS datasetB#65344, entry#65345, hashValue#65346]
            :                                :                                                                    +- Generate posexplode(hashes#64627), false, [entry#65345, hashValue#65346]
            :                                :                                                                       +- Project [processID#173290, actions#62941, actions_str#62945, shingles#63004, hashes#64627, features#65324]
            :                                :                                                                          +- Project [processID#173290, actions#62941, actions_str#62945, shingles#63004, inputCol_3e7494a3a9b6#65317, hashes#64627, inputCol_3e7494a3a9b6#65317 AS features#65324]
            :                                :                                                                             +- Project [processID#173290, actions#62941, actions_str#62945, shingles#63004, features#63047 AS inputCol_3e7494a3a9b6#65317, hashes#64627]
            :                                :                                                                                +- Project [processID#173290, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :                                :                                                                                   +- Project [processID#173290, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :                                :                                                                                      +- Project [processID#173290, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :                                :                                                                                         +- Project [processID#173290, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :                                :                                                                                            +- Aggregate [processID#173290], [processID#173290, collect_list(struct(FromServer, FromServer#173286, ToServer, ToServer#173287, time, time#173288, action, action#173289), 0, 0) AS actions#62941]
            :                                :                                                                                               +- Relation [FromServer#173286,ToServer#173287,time#173288,action#173289,processId#173290] csv
            :                                +- Project [processId#173295, FromServer#173291, ToServer#173292, time#173293, action#173294, Group#65631, final_group#65585]
            :                                   +- Join Inner, (processId#173295 = processID#65591)
            :                                      :- Relation [FromServer#173291,ToServer#173292,time#173293,action#173294,processId#173295] csv
            :                                      +- Project [Group#65631, final_group#65585, processID#65591]
            :                                         +- Generate explode(final_group#65585), false, [processID#65591]
            :                                            +- Project [concat_ws(_, cast(final_group#65585 as array<string>)) AS Group#65631, final_group#65585]
            :                                               +- Project [final_group#65585]
            :                                                  +- Generate explode(merged_groups#65581), false, [final_group#65585]
            :                                                     +- Project [group_lists#65576, <lambda>(group_lists#65576)#65580 AS merged_groups#65581]
            :                                                        +- Aggregate [collect_list(group_list#65572, 0, 0) AS group_lists#65576]
            :                                                           +- Aggregate [group_representative#65520], [group_representative#65520, collect_list(processID#65521, 0, 0) AS group_list#65572]
            :                                                              +- Project [processID#65521, processID_A#65377 AS group_representative#65520]
            :                                                                 +- Generate explode(all_processIDs#65515), false, [processID#65521]
            :                                                                    +- Project [processID_A#65377, similar_processIDs#65512, array_union(array(processID_A#65377), similar_processIDs#65512) AS all_processIDs#65515]
            :                                                                       +- Aggregate [processID_A#65377], [processID_A#65377, collect_list(processID_B#65378, 0, 0) AS similar_processIDs#65512]
            :                                                                          +- Project [processID_A#65377, processID_B#65378, JaccardDistance#65365, featuresA#65379, featuresB#65380, jaccard_similarity(featuresA#65379, featuresB#65380)#65497 AS JaccardSimilarity#65498]
            :                                                                             +- Filter (processID_A#65377 < processID_B#65378)
            :                                                                                +- Project [datasetA#65311.processID AS processID_A#65377, datasetB#65344.processID AS processID_B#65378, JaccardDistance#65365, datasetA#65311.features AS featuresA#65379, datasetB#65344.features AS featuresB#65380]
            :                                                                                   +- Filter (JaccardDistance#65365 < 0.8)
            :                                                                                      +- Project [datasetA#65311, datasetB#65344, UDF(datasetA#65311.features, datasetB#65344.features) AS JaccardDistance#65365]
            :                                                                                         +- Deduplicate [datasetA#65311, datasetB#65344]
            :                                                                                            +- Project [datasetA#65311, datasetB#65344]
            :                                                                                               +- Project [entry#65312, hashValue#65313, datasetA#65311, datasetB#65344]
            :                                                                                                  +- Join Inner, ((entry#65312 = entry#65345) AND (hashValue#65313 = hashValue#65346))
            :                                                                                                     :- Project [struct(processID, processID#173300, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, features, features#63047, hashes, hashes#64627) AS datasetA#65311, entry#65312, hashValue#65313]
            :                                                                                                     :  +- Generate posexplode(hashes#64627), false, [entry#65312, hashValue#65313]
            :                                                                                                     :     +- Project [processID#173300, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :                                                                                                     :        +- Project [processID#173300, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :                                                                                                     :           +- Project [processID#173300, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :                                                                                                     :              +- Project [processID#173300, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :                                                                                                     :                 +- Aggregate [processID#173300], [processID#173300, collect_list(struct(FromServer, FromServer#173296, ToServer, ToServer#173297, time, time#173298, action, action#173299), 0, 0) AS actions#62941]
            :                                                                                                     :                    +- Relation [FromServer#173296,ToServer#173297,time#173298,action#173299,processId#173300] csv
            :                                                                                                     +- Project [struct(processID, processID#173305, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, hashes, hashes#64627, features, features#65324) AS datasetB#65344, entry#65345, hashValue#65346]
            :                                                                                                        +- Generate posexplode(hashes#64627), false, [entry#65345, hashValue#65346]
            :                                                                                                           +- Project [processID#173305, actions#62941, actions_str#62945, shingles#63004, hashes#64627, features#65324]
            :                                                                                                              +- Project [processID#173305, actions#62941, actions_str#62945, shingles#63004, inputCol_3e7494a3a9b6#65317, hashes#64627, inputCol_3e7494a3a9b6#65317 AS features#65324]
            :                                                                                                                 +- Project [processID#173305, actions#62941, actions_str#62945, shingles#63004, features#63047 AS inputCol_3e7494a3a9b6#65317, hashes#64627]
            :                                                                                                                    +- Project [processID#173305, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
            :                                                                                                                       +- Project [processID#173305, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
            :                                                                                                                          +- Project [processID#173305, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
            :                                                                                                                             +- Project [processID#173305, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
            :                                                                                                                                +- Aggregate [processID#173305], [processID#173305, collect_list(struct(FromServer, FromServer#173301, ToServer, ToServer#173302, time, time#173303, action, action#173304), 0, 0) AS actions#62941]
            :                                                                                                                                   +- Relation [FromServer#173301,ToServer#173302,time#173303,action#173304,processId#173305] csv
            +- Project [processId#173340, servers_array#172659]
               +- Aggregate [processId#173340], [processId#173340, collect_set(FromServer#173336, 0, 0) AS servers_array#172659]
                  +- Union false, false
                     :- Project [FromServer#173336, ToServer#173337, time#173338, action#173339, processID#173340]
                     :  +- Project [processId#173340, FromServer#173336, ToServer#173337, time#173338, action#173339]
                     :     +- Join LeftAnti, (processId#173340 = processID#65957)
                     :        :- Relation [FromServer#173336,ToServer#173337,time#173338,action#173339,processId#173340] csv
                     :        +- Deduplicate [processID#65957]
                     :           +- Project [processID#65957]
                     :              +- Generate explode(final_group#65585), false, [processID#65957]
                     :                 +- Project [concat_ws(_, cast(final_group#65585 as array<string>)) AS Group#65587, final_group#65585]
                     :                    +- Project [final_group#65585]
                     :                       +- Generate explode(merged_groups#65581), false, [final_group#65585]
                     :                          +- Project [group_lists#65576, <lambda>(group_lists#65576)#65580 AS merged_groups#65581]
                     :                             +- Aggregate [collect_list(group_list#65572, 0, 0) AS group_lists#65576]
                     :                                +- Aggregate [group_representative#65520], [group_representative#65520, collect_list(processID#65521, 0, 0) AS group_list#65572]
                     :                                   +- Project [processID#65521, processID_A#65377 AS group_representative#65520]
                     :                                      +- Generate explode(all_processIDs#65515), false, [processID#65521]
                     :                                         +- Project [processID_A#65377, similar_processIDs#65512, array_union(array(processID_A#65377), similar_processIDs#65512) AS all_processIDs#65515]
                     :                                            +- Aggregate [processID_A#65377], [processID_A#65377, collect_list(processID_B#65378, 0, 0) AS similar_processIDs#65512]
                     :                                               +- Project [processID_A#65377, processID_B#65378, JaccardDistance#65365, featuresA#65379, featuresB#65380, jaccard_similarity(featuresA#65379, featuresB#65380)#65497 AS JaccardSimilarity#65498]
                     :                                                  +- Filter (processID_A#65377 < processID_B#65378)
                     :                                                     +- Project [datasetA#65311.processID AS processID_A#65377, datasetB#65344.processID AS processID_B#65378, JaccardDistance#65365, datasetA#65311.features AS featuresA#65379, datasetB#65344.features AS featuresB#65380]
                     :                                                        +- Filter (JaccardDistance#65365 < 0.8)
                     :                                                           +- Project [datasetA#65311, datasetB#65344, UDF(datasetA#65311.features, datasetB#65344.features) AS JaccardDistance#65365]
                     :                                                              +- Deduplicate [datasetA#65311, datasetB#65344]
                     :                                                                 +- Project [datasetA#65311, datasetB#65344]
                     :                                                                    +- Project [entry#65312, hashValue#65313, datasetA#65311, datasetB#65344]
                     :                                                                       +- Join Inner, ((entry#65312 = entry#65345) AND (hashValue#65313 = hashValue#65346))
                     :                                                                          :- Project [struct(processID, processID#173345, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, features, features#63047, hashes, hashes#64627) AS datasetA#65311, entry#65312, hashValue#65313]
                     :                                                                          :  +- Generate posexplode(hashes#64627), false, [entry#65312, hashValue#65313]
                     :                                                                          :     +- Project [processID#173345, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
                     :                                                                          :        +- Project [processID#173345, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
                     :                                                                          :           +- Project [processID#173345, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
                     :                                                                          :              +- Project [processID#173345, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
                     :                                                                          :                 +- Aggregate [processID#173345], [processID#173345, collect_list(struct(FromServer, FromServer#173341, ToServer, ToServer#173342, time, time#173343, action, action#173344), 0, 0) AS actions#62941]
                     :                                                                          :                    +- Relation [FromServer#173341,ToServer#173342,time#173343,action#173344,processId#173345] csv
                     :                                                                          +- Project [struct(processID, processID#173350, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, hashes, hashes#64627, features, features#65324) AS datasetB#65344, entry#65345, hashValue#65346]
                     :                                                                             +- Generate posexplode(hashes#64627), false, [entry#65345, hashValue#65346]
                     :                                                                                +- Project [processID#173350, actions#62941, actions_str#62945, shingles#63004, hashes#64627, features#65324]
                     :                                                                                   +- Project [processID#173350, actions#62941, actions_str#62945, shingles#63004, inputCol_3e7494a3a9b6#65317, hashes#64627, inputCol_3e7494a3a9b6#65317 AS features#65324]
                     :                                                                                      +- Project [processID#173350, actions#62941, actions_str#62945, shingles#63004, features#63047 AS inputCol_3e7494a3a9b6#65317, hashes#64627]
                     :                                                                                         +- Project [processID#173350, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
                     :                                                                                            +- Project [processID#173350, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
                     :                                                                                               +- Project [processID#173350, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
                     :                                                                                                  +- Project [processID#173350, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
                     :                                                                                                     +- Aggregate [processID#173350], [processID#173350, collect_list(struct(FromServer, FromServer#173346, ToServer, ToServer#173347, time, time#173348, action, action#173349), 0, 0) AS actions#62941]
                     :                                                                                                        +- Relation [FromServer#173346,ToServer#173347,time#173348,action#173349,processId#173350] csv
                     +- Sort [time#173363 ASC NULLS FIRST], true
                        +- Project [FromServer#173361, ToServer#173362, time#173363, action#173364, processID#65988]
                           +- Project [(processID#173365 + 7) AS processID#65988, FromServer#173361, ToServer#173362, time#173363, action#173364]
                              +- Project [processID#173365, FromServer#173361, ToServer#173362, time#173363, action#173364]
                                 +- Join Inner, (processID#173365 = representative_processID#65618)
                                    :- Aggregate [Group#65587], [Group#65587, min(processID#65591) AS representative_processID#65618]
                                    :  +- Project [Group#65587, final_group#65585, processID#65591]
                                    :     +- Generate explode(final_group#65585), false, [processID#65591]
                                    :        +- Project [concat_ws(_, cast(final_group#65585 as array<string>)) AS Group#65587, final_group#65585]
                                    :           +- Project [final_group#65585]
                                    :              +- Generate explode(merged_groups#65581), false, [final_group#65585]
                                    :                 +- Project [group_lists#65576, <lambda>(group_lists#65576)#65580 AS merged_groups#65581]
                                    :                    +- Aggregate [collect_list(group_list#65572, 0, 0) AS group_lists#65576]
                                    :                       +- Aggregate [group_representative#65520], [group_representative#65520, collect_list(processID#65521, 0, 0) AS group_list#65572]
                                    :                          +- Project [processID#65521, processID_A#65377 AS group_representative#65520]
                                    :                             +- Generate explode(all_processIDs#65515), false, [processID#65521]
                                    :                                +- Project [processID_A#65377, similar_processIDs#65512, array_union(array(processID_A#65377), similar_processIDs#65512) AS all_processIDs#65515]
                                    :                                   +- Aggregate [processID_A#65377], [processID_A#65377, collect_list(processID_B#65378, 0, 0) AS similar_processIDs#65512]
                                    :                                      +- Project [processID_A#65377, processID_B#65378, JaccardDistance#65365, featuresA#65379, featuresB#65380, jaccard_similarity(featuresA#65379, featuresB#65380)#65497 AS JaccardSimilarity#65498]
                                    :                                         +- Filter (processID_A#65377 < processID_B#65378)
                                    :                                            +- Project [datasetA#65311.processID AS processID_A#65377, datasetB#65344.processID AS processID_B#65378, JaccardDistance#65365, datasetA#65311.features AS featuresA#65379, datasetB#65344.features AS featuresB#65380]
                                    :                                               +- Filter (JaccardDistance#65365 < 0.8)
                                    :                                                  +- Project [datasetA#65311, datasetB#65344, UDF(datasetA#65311.features, datasetB#65344.features) AS JaccardDistance#65365]
                                    :                                                     +- Deduplicate [datasetA#65311, datasetB#65344]
                                    :                                                        +- Project [datasetA#65311, datasetB#65344]
                                    :                                                           +- Project [entry#65312, hashValue#65313, datasetA#65311, datasetB#65344]
                                    :                                                              +- Join Inner, ((entry#65312 = entry#65345) AND (hashValue#65313 = hashValue#65346))
                                    :                                                                 :- Project [struct(processID, processID#173355, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, features, features#63047, hashes, hashes#64627) AS datasetA#65311, entry#65312, hashValue#65313]
                                    :                                                                 :  +- Generate posexplode(hashes#64627), false, [entry#65312, hashValue#65313]
                                    :                                                                 :     +- Project [processID#173355, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
                                    :                                                                 :        +- Project [processID#173355, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
                                    :                                                                 :           +- Project [processID#173355, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
                                    :                                                                 :              +- Project [processID#173355, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
                                    :                                                                 :                 +- Aggregate [processID#173355], [processID#173355, collect_list(struct(FromServer, FromServer#173351, ToServer, ToServer#173352, time, time#173353, action, action#173354), 0, 0) AS actions#62941]
                                    :                                                                 :                    +- Relation [FromServer#173351,ToServer#173352,time#173353,action#173354,processId#173355] csv
                                    :                                                                 +- Project [struct(processID, processID#173360, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, hashes, hashes#64627, features, features#65324) AS datasetB#65344, entry#65345, hashValue#65346]
                                    :                                                                    +- Generate posexplode(hashes#64627), false, [entry#65345, hashValue#65346]
                                    :                                                                       +- Project [processID#173360, actions#62941, actions_str#62945, shingles#63004, hashes#64627, features#65324]
                                    :                                                                          +- Project [processID#173360, actions#62941, actions_str#62945, shingles#63004, inputCol_3e7494a3a9b6#65317, hashes#64627, inputCol_3e7494a3a9b6#65317 AS features#65324]
                                    :                                                                             +- Project [processID#173360, actions#62941, actions_str#62945, shingles#63004, features#63047 AS inputCol_3e7494a3a9b6#65317, hashes#64627]
                                    :                                                                                +- Project [processID#173360, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
                                    :                                                                                   +- Project [processID#173360, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
                                    :                                                                                      +- Project [processID#173360, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
                                    :                                                                                         +- Project [processID#173360, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
                                    :                                                                                            +- Aggregate [processID#173360], [processID#173360, collect_list(struct(FromServer, FromServer#173356, ToServer, ToServer#173357, time, time#173358, action, action#173359), 0, 0) AS actions#62941]
                                    :                                                                                               +- Relation [FromServer#173356,ToServer#173357,time#173358,action#173359,processId#173360] csv
                                    +- Project [processId#173365, FromServer#173361, ToServer#173362, time#173363, action#173364, Group#65631, final_group#65585]
                                       +- Join Inner, (processId#173365 = processID#65591)
                                          :- Relation [FromServer#173361,ToServer#173362,time#173363,action#173364,processId#173365] csv
                                          +- Project [Group#65631, final_group#65585, processID#65591]
                                             +- Generate explode(final_group#65585), false, [processID#65591]
                                                +- Project [concat_ws(_, cast(final_group#65585 as array<string>)) AS Group#65631, final_group#65585]
                                                   +- Project [final_group#65585]
                                                      +- Generate explode(merged_groups#65581), false, [final_group#65585]
                                                         +- Project [group_lists#65576, <lambda>(group_lists#65576)#65580 AS merged_groups#65581]
                                                            +- Aggregate [collect_list(group_list#65572, 0, 0) AS group_lists#65576]
                                                               +- Aggregate [group_representative#65520], [group_representative#65520, collect_list(processID#65521, 0, 0) AS group_list#65572]
                                                                  +- Project [processID#65521, processID_A#65377 AS group_representative#65520]
                                                                     +- Generate explode(all_processIDs#65515), false, [processID#65521]
                                                                        +- Project [processID_A#65377, similar_processIDs#65512, array_union(array(processID_A#65377), similar_processIDs#65512) AS all_processIDs#65515]
                                                                           +- Aggregate [processID_A#65377], [processID_A#65377, collect_list(processID_B#65378, 0, 0) AS similar_processIDs#65512]
                                                                              +- Project [processID_A#65377, processID_B#65378, JaccardDistance#65365, featuresA#65379, featuresB#65380, jaccard_similarity(featuresA#65379, featuresB#65380)#65497 AS JaccardSimilarity#65498]
                                                                                 +- Filter (processID_A#65377 < processID_B#65378)
                                                                                    +- Project [datasetA#65311.processID AS processID_A#65377, datasetB#65344.processID AS processID_B#65378, JaccardDistance#65365, datasetA#65311.features AS featuresA#65379, datasetB#65344.features AS featuresB#65380]
                                                                                       +- Filter (JaccardDistance#65365 < 0.8)
                                                                                          +- Project [datasetA#65311, datasetB#65344, UDF(datasetA#65311.features, datasetB#65344.features) AS JaccardDistance#65365]
                                                                                             +- Deduplicate [datasetA#65311, datasetB#65344]
                                                                                                +- Project [datasetA#65311, datasetB#65344]
                                                                                                   +- Project [entry#65312, hashValue#65313, datasetA#65311, datasetB#65344]
                                                                                                      +- Join Inner, ((entry#65312 = entry#65345) AND (hashValue#65313 = hashValue#65346))
                                                                                                         :- Project [struct(processID, processID#173370, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, features, features#63047, hashes, hashes#64627) AS datasetA#65311, entry#65312, hashValue#65313]
                                                                                                         :  +- Generate posexplode(hashes#64627), false, [entry#65312, hashValue#65313]
                                                                                                         :     +- Project [processID#173370, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
                                                                                                         :        +- Project [processID#173370, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
                                                                                                         :           +- Project [processID#173370, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
                                                                                                         :              +- Project [processID#173370, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
                                                                                                         :                 +- Aggregate [processID#173370], [processID#173370, collect_list(struct(FromServer, FromServer#173366, ToServer, ToServer#173367, time, time#173368, action, action#173369), 0, 0) AS actions#62941]
                                                                                                         :                    +- Relation [FromServer#173366,ToServer#173367,time#173368,action#173369,processId#173370] csv
                                                                                                         +- Project [struct(processID, processID#173375, actions, actions#62941, actions_str, actions_str#62945, shingles, shingles#63004, hashes, hashes#64627, features, features#65324) AS datasetB#65344, entry#65345, hashValue#65346]
                                                                                                            +- Generate posexplode(hashes#64627), false, [entry#65345, hashValue#65346]
                                                                                                               +- Project [processID#173375, actions#62941, actions_str#62945, shingles#63004, hashes#64627, features#65324]
                                                                                                                  +- Project [processID#173375, actions#62941, actions_str#62945, shingles#63004, inputCol_3e7494a3a9b6#65317, hashes#64627, inputCol_3e7494a3a9b6#65317 AS features#65324]
                                                                                                                     +- Project [processID#173375, actions#62941, actions_str#62945, shingles#63004, features#63047 AS inputCol_3e7494a3a9b6#65317, hashes#64627]
                                                                                                                        +- Project [processID#173375, actions#62941, actions_str#62945, shingles#63004, features#63047, UDF(features#63047) AS hashes#64627]
                                                                                                                           +- Project [processID#173375, actions#62941, actions_str#62945, shingles#63004, UDF(shingles#63004) AS features#63047]
                                                                                                                              +- Project [processID#173375, actions#62941, actions_str#62945, <lambda>(actions_str#62945)#63003 AS shingles#63004]
                                                                                                                                 +- Project [processID#173375, actions#62941, actions_to_string(actions#62941)#62944 AS actions_str#62945]
                                                                                                                                    +- Aggregate [processID#173375], [processID#173375, collect_list(struct(FromServer, FromServer#173371, ToServer, ToServer#173372, time, time#173373, action, action#173374), 0, 0) AS actions#62941]
                                                                                                                                       +- Relation [FromServer#173371,ToServer#173372,time#173373,action#173374,processId#173375] csv


# creating the txt files:
## The desired files will be in the folder output

In [48]:
# Format each row into the desired format
formatted_df = joined_df.withColumn(
    "formatted_row",
    concat_ws("", lit("<"), col("FromServer"), lit(","), col("ToServer"),
              lit(","), col("time"), lit(","), col("action"), lit(","), col("processID"), lit(">"))
)

# Group by Group name and aggregate process IDs and formatted rows
grouped_df = formatted_df.groupBy("Group").agg(
    collect_list("processID").alias("processIDs"),
    collect_list("formatted_row").alias("formatted_rows")
)



# Output path
output_path = "./output/part2Observations.txt"

# Call function to write to text file
write_groups_to_txt(grouped_df, output_path)