In [1]:
from pyspark.sql import SparkSession
import os
from pyspark.sql.types import StringType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import concat, lit


# Code to start the Master:
1. Open cmd as administrator.
2. Run "cd %SPARK_HOME%".
3. Run "bin\spark-class2.cmd org.apache.spark.deploy.master.Master".
# Code to start the Worker:
1. Open cmd as administrator.
2. Run "cd %SPARK_HOME%".
3. Run "bin\spark-class2.cmd org.apache.spark.deploy.worker.Worker -c 6 -m 10G spark://192.168.1.81:7077".
* In step 3:
* -c -> number of cores for the current worker
* -m -> amount of RAM for the current worker
* The spark:// URL is the Master's URL (open the Master's web page and copy the spark:// URL shown there).

In [2]:
from pyspark.sql import SparkSession
import findspark
findspark.init()

# Create (or reuse) the session for this notebook, attached to the standalone master.
spark = (
    SparkSession.builder
    .appName("part1Grouping")
    .master("spark://192.168.1.81:7077")
    # Executor resources.
    .config("spark.executor.memory", "10g")
    .config("spark.executor.cores", "6")
    .config("spark.executor.instances", "3")
    # Driver resources.
    .config("spark.driver.memory", "10g")
    .config("spark.driver.cores", "3")
    # Shuffle parallelism and timeouts.
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.executor.heartbeatInterval", "100s")
    .config("spark.sql.broadcastTimeout", "3600s")
    # Hadoop file-output-committer settings.
    .config("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
    .config("spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored", "true")
    .getOrCreate()
)



In [3]:
# Read the event log: the first CSV row holds the column names and the
# column types are inferred from the data.
data_path = "output2.csv"
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(data_path)
)
df.show()

+----------+--------+----+--------+---------+
|FromServer|ToServer|time|  action|processId|
+----------+--------+----+--------+---------+
|      null| lkVpiJ4|   0| Request|        1|
|   lkVpiJ4|    null|   6|Response|        1|
|      null| lkVpiJ4|   9| Request|        2|
|   lkVpiJ4|    null|  12|Response|        2|
|      null| OZBsEf0|  11| Request|        3|
|   OZBsEf0|    null|  13|Response|        3|
|      null|    Aum3|  18| Request|        4|
|      Aum3|    null|  28|Response|        4|
|      null|    Aum3|  22| Request|        5|
|      Aum3|    null|  24|Response|        5|
|      null|   qZGv1|  27| Request|        6|
|     qZGv1|    null|  36|Response|        6|
|      null|    asdf|  40| Request|        7|
|      asdf|    fdsa|  41| Request|        7|
|      fdsa|    asdf|  42|Response|        7|
|      asdf|    null|  43|Response|        7|
+----------+--------+----+--------+---------+



In [4]:
from pyspark.sql.functions import col, collect_list, struct

# One row per process: every event of the process is packed into a struct and
# the structs are gathered into the "actions" array.
processes_df = (
    df.groupBy("processID")
    .agg(
        collect_list(struct("FromServer", "ToServer", "time", "action")).alias("actions")
    )
)

# Convert actions to string for MinHash LSH
def actions_to_string(actions):
    """Concatenate the From/To server names of a process's events into one string.

    The events are sorted by their ``time`` field before concatenation:
    ``collect_list`` does not guarantee element order after a shuffle, and the
    shingles derived from this string depend on that order.

    Args:
        actions: sequence of mappings / Rows exposing the keys ``'FromServer'``,
            ``'ToServer'`` and ``'time'``. ``None`` or an empty sequence is
            accepted.

    Returns:
        str: ``FromServer`` followed by ``ToServer`` for every event, in
        ascending ``time`` order; ``""`` when there are no events.
    """
    if not actions:
        return ""

    def _time_key(action):
        moment = action['time']
        # Events without a timestamp sort last instead of failing on a
        # None-vs-int comparison.
        return (moment is None, 0 if moment is None else moment)

    # sorted() is stable, so events sharing a timestamp keep their input order.
    ordered = sorted(actions, key=_time_key)
    return "".join(f"{action['FromServer']}{action['ToServer']}" for action in ordered)

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Wrap the helper as a string-returning Spark UDF and attach its result to
# every process row as "actions_str".
actions_to_string_udf = udf(actions_to_string, returnType=StringType())
processes_df = processes_df.withColumn(
    "actions_str",
    actions_to_string_udf(processes_df["actions"]),
)
processes_df.show()

+---------+--------------------+--------------------+
|processID|             actions|         actions_str|
+---------+--------------------+--------------------+
|        1|[{null, lkVpiJ4, ...|nulllkVpiJ4lkVpiJ...|
|        6|[{null, qZGv1, 27...|  nullqZGv1qZGv1null|
|        3|[{null, OZBsEf0, ...|nullOZBsEf0OZBsEf...|
|        5|[{null, Aum3, 22,...|    nullAum3Aum3null|
|        4|[{null, Aum3, 18,...|    nullAum3Aum3null|
|        7|[{null, asdf, 40,...|nullasdfasdffdsaf...|
|        2|[{null, lkVpiJ4, ...|nulllkVpiJ4lkVpiJ...|
+---------+--------------------+--------------------+



In [5]:
# Convert actions string into shingles
def get_shingles(row, k=5):
    """Split text into overlapping character k-grams (shingles).

    Args:
        row: a string, or an iterable of strings that is concatenated first.
            ``None`` is treated as empty text.
        k: shingle length in characters (default 5).

    Returns:
        list[str]: every length-``k`` substring, in order of appearance.
        Non-empty text shorter than ``k`` yields a single shingle holding the
        whole text; empty text yields ``[]``.
    """
    if row is None:
        return []
    concatenated_str = ''.join(row)
    if not concatenated_str:
        return []
    if len(concatenated_str) < k:
        # Without this, short text produces no shingles at all, which turns
        # into an all-zero feature vector that MinHashLSH cannot hash.
        return [concatenated_str]
    return [concatenated_str[i:i + k] for i in range(len(concatenated_str) - k + 1)]
from pyspark.sql.types import ArrayType

# Expose get_shingles (called with its default k) as a UDF returning
# array<string>, then add the "shingles" column.
get_shingles_udf = udf(get_shingles, returnType=ArrayType(StringType()))
processes_df = processes_df.withColumn(
    "shingles",
    get_shingles_udf(processes_df["actions_str"]),
)
processes_df.show()


+---------+--------------------+--------------------+--------------------+
|processID|             actions|         actions_str|            shingles|
+---------+--------------------+--------------------+--------------------+
|        1|[{null, lkVpiJ4, ...|nulllkVpiJ4lkVpiJ...|[nulll, ulllk, ll...|
|        6|[{null, qZGv1, 27...|  nullqZGv1qZGv1null|[nullq, ullqZ, ll...|
|        3|[{null, OZBsEf0, ...|nullOZBsEf0OZBsEf...|[nullO, ullOZ, ll...|
|        5|[{null, Aum3, 22,...|    nullAum3Aum3null|[nullA, ullAu, ll...|
|        4|[{null, Aum3, 18,...|    nullAum3Aum3null|[nullA, ullAu, ll...|
|        7|[{null, asdf, 40,...|nullasdfasdffdsaf...|[nulla, ullas, ll...|
|        2|[{null, lkVpiJ4, ...|nulllkVpiJ4lkVpiJ...|[nulll, ulllk, ll...|
+---------+--------------------+--------------------+--------------------+



In [6]:
from pyspark.ml.feature import CountVectorizer

# Encode each process's shingles as a sparse vector over the shingle
# vocabulary; binary=True records presence/absence instead of counts.
cv = CountVectorizer(inputCol="shingles", outputCol="features", binary=True)
cv_model = cv.fit(processes_df)
vectorized_df = cv_model.transform(processes_df)
# show() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
vectorized_df.show(truncate=True)

# print("Distinct Attributes (Vocabulary):")
# for i, attr in enumerate(cv_model.vocabulary):
#     print(f"{i}. {attr}")

+---------+--------------------+--------------------+--------------------+--------------------+
|processID|             actions|         actions_str|            shingles|            features|
+---------+--------------------+--------------------+--------------------+--------------------+
|        1|[{null, lkVpiJ4, ...|nulllkVpiJ4lkVpiJ...|[nulll, ulllk, ll...|(79,[0,1,2,6,10,1...|
|        6|[{null, qZGv1, 27...|  nullqZGv1qZGv1null|[nullq, ullqZ, ll...|(79,[4,37,40,47,4...|
|        3|[{null, OZBsEf0, ...|nullOZBsEf0OZBsEf...|[nullO, ullOZ, ll...|(79,[9,25,30,38,4...|
|        5|[{null, Aum3, 22,...|    nullAum3Aum3null|[nullA, ullAu, ll...|(79,[3,5,8,11,13,...|
|        4|[{null, Aum3, 18,...|    nullAum3Aum3null|[nullA, ullAu, ll...|(79,[3,5,8,11,13,...|
|        7|[{null, asdf, 40,...|nullasdfasdffdsaf...|[nulla, ullas, ll...|(79,[7,12,15,21,3...|
|        2|[{null, lkVpiJ4, ...|nulllkVpiJ4lkVpiJ...|[nulll, ulllk, ll...|(79,[0,1,2,6,10,1...|
+---------+--------------------+--------

In [7]:
from pyspark.ml.feature import MinHashLSH

# MinHash draws random hash functions; a fixed seed keeps the hashes (and the
# candidate pairs derived from them) identical across kernel restarts.
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=10, seed=42)
mh_model = mh.fit(vectorized_df)
hashed_df = mh_model.transform(vectorized_df)


In [8]:
from pyspark.sql.functions import array, array_union, collect_list, explode, col
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import HashingTF

# Upper bound on the Jaccard *distance* passed to the approximate join.
threshold = 0.8

# Self-join the hashed processes through the LSH model to obtain candidate pairs.
similarity_df = (
    mh_model.approxSimilarityJoin(hashed_df, hashed_df, threshold, distCol="JaccardDistance")
    .select(
        col("datasetA.processID").alias("processID_A"),
        col("datasetB.processID").alias("processID_B"),
        col("JaccardDistance"),
    )
    # Keeping only A < B removes self-pairs and mirrored (B, A) duplicates.
    .filter(col("processID_A") < col("processID_B"))
)
similarity_df.show()
# Function to calculate Jaccard similarity
def jaccard_similarity(x, y):
    """Return the Jaccard similarity |X ∩ Y| / |X ∪ Y| of two collections.

    Args:
        x: iterable of hashable items (duplicates are ignored); ``None`` is
            treated as empty.
        y: iterable of hashable items (duplicates are ignored); ``None`` is
            treated as empty.

    Returns:
        float: a value in [0.0, 1.0]. Two empty collections are considered
        identical and give 1.0.
    """
    x_set = set(x or ())
    y_set = set(y or ())
    union = len(x_set | y_set)
    if union == 0:
        # Both sides empty: avoid ZeroDivisionError and follow the usual
        # convention J(∅, ∅) = 1.
        return 1.0
    return len(x_set & y_set) / union

# Register the function as a UDF
from pyspark.sql.types import FloatType
# Expose the exact Jaccard computation to Spark as a float-returning UDF.
jaccard_similarity_udf = udf(jaccard_similarity, FloatType())

# Join with original DataFrame to get shingles for each process
similarity_df = similarity_df \
    .join(processes_df.select("processID", "shingles"), similarity_df.processID_A == processes_df.processID) \
    .withColumnRenamed("shingles", "shingles_A") \
    .drop("processID") \
    .join(processes_df.select("processID", "shingles"), similarity_df.processID_B == processes_df.processID) \
    .withColumnRenamed("shingles", "shingles_B") \
    .drop("processID")

# Calculate the exact Jaccard similarity for each candidate pair
similarity_df = similarity_df.withColumn("JaccardSimilarity", jaccard_similarity_udf(col("shingles_A"), col("shingles_B")))
# Keep only the pairs whose exact Jaccard similarity is at least 0.9
similarity_df_filtered = similarity_df.filter(col("JaccardSimilarity") >= 0.9)
# Group the *filtered* pairs by processID_A and collect the similar processIDs.
# (Grouping the unfiltered candidates would ignore the similarity cut-off above.)
grouped_df = similarity_df_filtered.groupBy("processID_A").agg(collect_list("processID_B").alias("similar_processIDs"))

# Convert processID_A to an array and concatenate with similar_processIDs
grouped_df = grouped_df.withColumn("all_processIDs", array_union(array(col("processID_A")), col("similar_processIDs")))

# Explode the all_processIDs array to get a mapping of each process ID to its group
exploded_df = grouped_df.select(explode(col("all_processIDs")).alias("processID"), col("processID_A").alias("group_representative"))
# Display every candidate pair together with its exact similarity.
similarity_df.show()

+-----------+-----------+---------------+
|processID_A|processID_B|JaccardDistance|
+-----------+-----------+---------------+
|          4|          5|            0.0|
|          1|          2|            0.0|
+-----------+-----------+---------------+

+-----------+-----------+---------------+--------------------+--------------------+-----------------+
|processID_A|processID_B|JaccardDistance|          shingles_A|          shingles_B|JaccardSimilarity|
+-----------+-----------+---------------+--------------------+--------------------+-----------------+
|          4|          5|            0.0|[nullA, ullAu, ll...|[nullA, ullAu, ll...|              1.0|
|          1|          2|            0.0|[nulll, ulllk, ll...|[nulll, ulllk, ll...|              1.0|
+-----------+-----------+---------------+--------------------+--------------------+-----------------+



In [None]:
# Merge overlapping groups
def merge_groups(group_list):
    """Merge groups that share at least one member into disjoint groups.

    Unlike a first-match merge, a group that overlaps *several* already
    collected groups fuses all of them, so the result is always pairwise
    disjoint (e.g. ``[[1, 2], [3, 4], [2, 3]]`` becomes ``[[1, 2, 3, 4]]``).

    Args:
        group_list: iterable of iterables of sortable, hashable IDs. ``None``
            is treated as an empty list.

    Returns:
        list[list]: the merged groups, each sorted ascending. A merged group
        keeps the position of the earliest group it absorbed.
    """
    merged = []
    for group in group_list or []:
        current = set(group)
        kept = []
        slot = None  # index where the fused group goes, if anything overlapped
        for existing in merged:
            if existing.isdisjoint(current):
                kept.append(existing)
            else:
                if slot is None:
                    slot = len(kept)
                # Absorb every overlapping group, not just the first one.
                current |= existing
        if slot is None:
            kept.append(current)
        else:
            kept.insert(slot, current)
        merged = kept
    return [sorted(members) for members in merged]

# Expose merge_groups to Spark; it returns an array of integer arrays.
merge_groups_udf = udf(lambda x: merge_groups(x), ArrayType(ArrayType(IntegerType())))

# Collect each representative's members, then gather all member lists into a
# single row so the merge can see every group at once.
grouped_lists = exploded_df.groupBy("group_representative") \
    .agg(collect_list("processID").alias("group_list")) \
    .agg(collect_list("group_list").alias("group_lists"))

merged_groups = grouped_lists.withColumn("merged_groups", merge_groups_udf(col("group_lists"))) \
    .select(explode(col("merged_groups")).alias("final_group"))

# Convert the final groups to a DataFrame
from pyspark.sql.functions import concat_ws

final_groups_df = merged_groups.select(concat_ws("_", col("final_group")).alias("Group"), col("final_group"))

# One row per (group, member process)
final_groups_exploded = final_groups_df.withColumn("processID", explode(col("final_group")))

# Join with the original DataFrame to keep only the events of grouped processes
filtered_df = df.join(final_groups_exploded, on="processID", how="inner")

# Select the smallest processID in each group as the representative.
# Imported under an alias so Python's builtin min() is not shadowed for the
# rest of the notebook.
from pyspark.sql.functions import min as spark_min

group_representative_df = final_groups_exploded.groupBy("Group").agg(spark_min("processID").alias("representative_processID"))

# Join to get the full details of the representative processes
representative_processes_df = group_representative_df.join(filtered_df, filtered_df["processID"] == group_representative_df.representative_processID, "inner") \
    .select("processID", "FromServer", "ToServer", "time", "action")
representative_processes_df.show()
final_groups_df.show()

+---------+----------+--------+----+--------+
|processID|FromServer|ToServer|time|  action|
+---------+----------+--------+----+--------+
|        1|      null| lkVpiJ4|   0| Request|
|        1|   lkVpiJ4|    null|   6|Response|
|        4|      null|    Aum3|  18| Request|
|        4|      Aum3|    null|  28|Response|
+---------+----------+--------+----+--------+



In [None]:
# Display every event row of the representative processes without truncating
# the cell values.
representative_processes_df.show(truncate=False)

# Optional: persist the groups and the representative rows as CSV (disabled).
# NOTE(review): final_groups_df carries the array column "final_group"; confirm
# the CSV writer accepts it before enabling the first line.
# final_groups_df.write.csv("path_to_output_groups_file.csv", header=True)
# representative_processes_df.write.csv("path_to_output_filtered_file.csv", header=True)


In [None]:
from pyspark.sql.functions import col, expr
# Step 1: drop every process that belongs to a similarity group.
# Distinct IDs of all grouped processes.
processes_to_remove = (
    final_groups_df
    .selectExpr("explode(final_group) as processID")
    .distinct()
)

# The anti-join keeps only rows whose processID is NOT in processes_to_remove;
# the select fixes the column order used by the union below.
df_without_groups = (
    df.join(processes_to_remove, "processID", "left_anti")
    .select("FromServer", "ToServer", "time", "action", "processID")
)

# Offset the representatives' IDs by the largest processID present in df.
constant_number = df.agg({"processID": "max"}).first()[0]
new_representative_processes_df = (
    representative_processes_df
    .withColumn("processID", expr(f"processID + {constant_number}"))
    .select("FromServer", "ToServer", "time", "action", "processID")
    .orderBy("time")
)

# union() matches columns by position, hence the identical selects above.
combined_df = df_without_groups.union(new_representative_processes_df)

# Display the combined result in full.
combined_df.show(truncate=False)

# Creating the txt files:
## The resulting files are written to the `output` folder.

In [None]:
def write_to_one_txt(df, local_path_name, wanted_list):
    """Write ``df`` to the single text file ``<wanted_list>/part1Output.txt``.

    Every row becomes one line of the form
    ``<FromServer,ToServer,time,action,processID>``. Spark first writes its
    part files into ``local_path_name``; those are concatenated in file-name
    order into the final file and the scratch directory is then removed.

    Args:
        df: Spark DataFrame with the columns FromServer, ToServer, time,
            action and processID.
        local_path_name: scratch directory for Spark's part files. It is
            overwritten by the write and deleted afterwards.
        wanted_list: destination directory; created when missing.
    """
    import glob
    import shutil

    os.makedirs(wanted_list, exist_ok=True)
    correct_path = os.path.join(wanted_list, "part1Output.txt")

    formatted_df = df.withColumn(
        "formatted_line",
        concat(lit("<"), df.FromServer, lit(","),
               df.ToServer, lit(","),
               df.time, lit(","),
               df.action, lit(","),
               df.processID, lit(">")),
    )
    # Write to the directory that was passed in rather than to whatever a
    # notebook-global ``output_path`` happens to hold at call time.
    formatted_df.select("formatted_line").write.mode("overwrite").text(local_path_name)

    # Concatenate in pure Python: `cat` / `rm` are not available in Windows
    # cmd, and interpolating paths into a shell command is unsafe.
    part_files = sorted(glob.glob(os.path.join(local_path_name, "*.txt")))
    with open(correct_path, "wb") as merged_file:
        for part_file in part_files:
            with open(part_file, "rb") as part:
                shutil.copyfileobj(part, merged_file)

    # Best-effort cleanup of the scratch directory, as before.
    shutil.rmtree(local_path_name, ignore_errors=True)
    

In [None]:

# output_path  -> passed as local_path_name (directory for Spark's part files)
# output_path1 -> passed as wanted_list (directory receiving part1Output.txt)
output_path = "./part1OUT1"
output_path1 = "./output"
write_to_one_txt(combined_df,output_path,output_path1)
# NOTE: the call above produces a text file (part1Output.txt), not a CSV file.


In [None]:
# Keep only the events of processes that ended up in a similarity group
# (semi join: rows of df with a matching processID, no extra columns).
df_with_groups = df.join(processes_to_remove, on="processID", how="semi")
df_with_groups.show()

In [None]:
# One row per (Group, member processID).
exploded_final_groups_df = final_groups_df.select(
    col("Group"),
    explode(col("final_group")).alias("processID"),
)
exploded_final_groups_df.show()
df_with_groups.show()
# Tag every grouped event with the name of the group its process belongs to.
joined_df = df_with_groups.join(exploded_final_groups_df, on="processID")

In [None]:

# Render each event as "<FromServer,ToServer,time,action,processID>".
formatted_df = joined_df.withColumn(
    "formatted_row",
    concat_ws(
        "",
        lit("<"),
        col("FromServer"), lit(","),
        col("ToServer"), lit(","),
        col("time"), lit(","),
        col("action"), lit(","),
        col("processID"),
        lit(">"),
    ),
)

# Per group: the member process IDs and every rendered event line.
grouped_df = (
    formatted_df.groupBy("Group")
    .agg(
        collect_list("processID").alias("processIDs"),
        collect_list("formatted_row").alias("formatted_rows"),
    )
)

# Function to write groups to txt file
def write_groups_to_txt(grouped_df, output_path):
    """Write a per-group report of processes and their event lines.

    For every collected row the file receives a ``Group: {id, id, ...}``
    header, then for each distinct process ID (ascending) an ``<id>:`` line
    followed by the formatted rows ending in ``,<id>>``, and finally a blank
    line separating it from the next group.

    Args:
        grouped_df: object whose ``collect()`` returns rows exposing the keys
            ``"processIDs"`` and ``"formatted_rows"``.
        output_path: path of the text file to (over)write. Its parent
            directory is created when missing.
    """
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding so the report does not depend on the platform default.
    with open(output_path, "w", encoding="utf-8") as file:
        for group_row in grouped_df.collect():
            # Ensure process_ids are unique and sorted
            process_ids = sorted(set(group_row["processIDs"]))
            formatted_rows = group_row["formatted_rows"]

            file.write(f"Group: {{{', '.join(map(str, process_ids))}}}\n")

            for process_id in process_ids:
                file.write(f"{process_id}:\n")

                # The leading comma keeps ID 2 from matching lines of ID 12.
                suffix = f",{process_id}>"
                rows_for_process_id = [line for line in formatted_rows if line.endswith(suffix)]

                if rows_for_process_id:
                    file.writelines(f"{line}\n" for line in rows_for_process_id)
                else:
                    file.write("<No corresponding formatted rows found>\n")

            file.write("\n")  # Add empty line between groups for clarity

# Destination file for the per-group report.
# NOTE(review): this rebinds the notebook-level name output_path, which an
# earlier cell set to "./part1OUT1".
output_path = "./output/part1Observations.txt"

# Collect grouped_df on the driver and write the report to output_path.
write_groups_to_txt(grouped_df, output_path)