In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
print(os.getcwd())


from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder 
    .appName("OptimizedLocalSpark") 
    .config("spark.driver.memory", "8g")        
    .config("spark.executor.memory", "8g")    
    .config("spark.executor.cores", "4")       
    .config("spark.cores.max", "12")           
    .config("spark.sql.shuffle.partitions", "28")  
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
    .getOrCreate()
)
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


# identify reverse pairs

In [2]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("ReversePairsSelfJoin").getOrCreate()

# Sample Data: Pairs of connections (like friendships, trades, etc.)
data = [
    ("Alice", "Bob"),
    ("Bob", "Alice"),
    ("Charlie", "David"),
    ("David", "Charlie"),
    ("Eva", "Frank"),
    ("Frank", "Eva"),
    ("Grace", "Henry"),
    ("Ivy", "Jack"),
    ("Jack", "Ivy"),
    ("Liam", "Mia"),
    ("Mia", "Liam"),
    ("Noah", "Olivia"),
    ("Peter", "Quinn"),
    ("Quinn", "Peter")
]

# Create DataFrame
df = spark.createDataFrame(data, ["person1", "person2"])

# Create a Temporary View for Spark SQL
df.createOrReplaceTempView("connections")

# Show the Original DataFrame
df.show(truncate=False)


+-------+-------+
|person1|person2|
+-------+-------+
|Alice  |Bob    |
|Bob    |Alice  |
|Charlie|David  |
|David  |Charlie|
|Eva    |Frank  |
|Frank  |Eva    |
|Grace  |Henry  |
|Ivy    |Jack   |
|Jack   |Ivy    |
|Liam   |Mia    |
|Mia    |Liam   |
|Noah   |Olivia |
|Peter  |Quinn  |
|Quinn  |Peter  |
+-------+-------+



In [4]:

#Self-Join (c1 and c2): Match rows where person1 of c1 is equal to person2 of c2 and vice versa.
#WHERE c1.person1 < c1.person2: Ensures each reverse pair is shown only once (e.g., (Alice, Bob) and not (Bob, Alice)).



res = spark.sql(""" 
                
SELECT c1.person1, c1.person2
FROM connections c1
JOIN connections c2 ON c1.person1 = c2.person2 AND c1.person2 = c2.person1
WHERE c1.person1 < c1.person2;


              
                """)
res.show()


+-------+-------+
|person1|person2|
+-------+-------+
|  Alice|    Bob|
|Charlie|  David|
|    Eva|  Frank|
|    Ivy|   Jack|
|   Liam|    Mia|
|  Peter|  Quinn|
+-------+-------+



In [6]:
#df.alias("c1") and df.alias("c2"): Aliases are used to create a self-join.
#Conditions: Join where person1 from c1 equals person2 from c2 and vice versa.
#Filter: Keep only one entry per reverse pair using .filter("c1.person1 < c1.person2").

# Perform Self-Join to find reverse pairs
df_reversed = df.alias("c1").join(
    df.alias("c2"),
    (df.alias("c1")["person1"] == df.alias("c2")["person2"]) & (df.alias("c1")["person2"] == df.alias("c2")["person1"]),
    "inner"
).select("c1.person1", "c1.person2").filter("c1.person1 < c1.person2")

# Show the result
df_reversed.show(truncate=False)



+-------+-------+
|person1|person2|
+-------+-------+
+-------+-------+



In [11]:
from pyspark.sql.functions import col

# Reverse columns to create a mirrored DataFrame
df_reversed = df.withColumnRenamed("person1", "person2_rev").withColumnRenamed("person2", "person1_rev")

# Perform the self-join to find reverse pairs
df_reverse_pairs = df.alias("original").join(
    df_reversed.alias("reversed"),
    (col("original.person1") == col("reversed.person1_rev")) & (col("original.person2") == col("reversed.person2_rev")),
    "inner"
).select(col("original.person1"), col("original.person2")).filter(col("original.person1") < col("original.person2"))

# Show the result
df_reverse_pairs.show(truncate=False)



+-------+-------+
|person1|person2|
+-------+-------+
|Alice  |Bob    |
|Charlie|David  |
|Eva    |Frank  |
|Ivy    |Jack   |
|Liam   |Mia    |
|Peter  |Quinn  |
+-------+-------+



In [7]:
#CASE WHEN: Create a unique identifier to ensure (A, B) and (B, A) are considered the same.
#DISTINCT: Remove any duplicates from the result set.



res1 = spark.sql(""" 
                
WITH reverse_pairs AS (
    SELECT c1.person1, c1.person2, 
           CASE WHEN c1.person1 < c1.person2 THEN CONCAT(c1.person1, '-', c1.person2) 
                ELSE CONCAT(c1.person2, '-', c1.person1) END AS pair_id
    FROM connections c1
    JOIN connections c2 ON c1.person1 = c2.person2 AND c1.person2 = c2.person1
)
SELECT DISTINCT person1, person2 
FROM reverse_pairs;



              
                """)
res1.show()

+-------+-------+
|person1|person2|
+-------+-------+
|    Bob|  Alice|
|  Alice|    Bob|
|  David|Charlie|
|Charlie|  David|
|  Frank|    Eva|
|    Eva|  Frank|
|   Jack|    Ivy|
|    Ivy|   Jack|
|    Mia|   Liam|
|   Liam|    Mia|
|  Quinn|  Peter|
|  Peter|  Quinn|
+-------+-------+



In [9]:
from pyspark.sql.functions import concat, lit, when,col

# Create a unique identifier for reverse pairs
df_pairs = df.withColumn(
    "pair_id", 
    when(col("person1") < col("person2"), concat(col("person1"), lit("-"), col("person2")))
    .otherwise(concat(col("person2"), lit("-"), col("person1")))
)

# Select distinct pairs based on the unique identifier
df_unique_pairs = df_pairs.dropDuplicates(["pair_id"]).select("person1", "person2")

# Show the result
df_unique_pairs.show(truncate=False)


+-------+-------+
|person1|person2|
+-------+-------+
|Alice  |Bob    |
|Charlie|David  |
|Eva    |Frank  |
|Grace  |Henry  |
|Ivy    |Jack   |
|Liam   |Mia    |
|Noah   |Olivia |
|Peter  |Quinn  |
+-------+-------+

