In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
print(os.getcwd())


from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder 
    .appName("OptimizedLocalSpark") 
    .config("spark.driver.memory", "8g")        
    .config("spark.executor.memory", "8g")    
    .config("spark.executor.cores", "4")       
    .config("spark.cores.max", "12")           
    .config("spark.sql.shuffle.partitions", "28")  
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
    .getOrCreate()
)
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


In [2]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("ComplexDataSwapColumns").getOrCreate()

# Sample Data: Employee information with columns to swap
data = [
    (1, "Alice", "HR", 4500, "Manager", "New York", 5, 1001, "A"),
    (2, "Bob", "Finance", 5500, "Analyst", "Los Angeles", 7, 1002, "B"),
    (3, "Charlie", "IT", 7000, "Developer", "San Francisco", 4, 1003, "A"),
    (4, "David", "HR", 4800, "Recruiter", "Seattle", 3, 1004, "C"),
    (5, "Eva", "Marketing", 6000, "Executive", "Austin", 6, 1005, "B"),
    (6, "Frank", "Finance", 6200, "Consultant", "Chicago", 5, 1006, "A"),
    (7, "Grace", "IT", 7200, "Architect", "Houston", 8, 1007, "A"),
    (8, "Henry", "HR", 4600, "Manager", "Miami", 5, 1008, "B"),
    (9, "Ivy", "Marketing", 6100, "Executive", "Dallas", 4, 1009, "C"),
    (10, "Jack", "Finance", 5800, "Analyst", "Phoenix", 7, 1010, "A")
]

# Create DataFrame
df = spark.createDataFrame(data, ["emp_id", "name", "department", "salary", "designation", "location", "experience", "emp_code", "grade"])

# Create a Temporary View for Spark SQL
df.createOrReplaceTempView("employee_table")

# Show the Original DataFrame
df.show(truncate=False)


+------+-------+----------+------+-----------+-------------+----------+--------+-----+
|emp_id|name   |department|salary|designation|location     |experience|emp_code|grade|
+------+-------+----------+------+-----------+-------------+----------+--------+-----+
|1     |Alice  |HR        |4500  |Manager    |New York     |5         |1001    |A    |
|2     |Bob    |Finance   |5500  |Analyst    |Los Angeles  |7         |1002    |B    |
|3     |Charlie|IT        |7000  |Developer  |San Francisco|4         |1003    |A    |
|4     |David  |HR        |4800  |Recruiter  |Seattle      |3         |1004    |C    |
|5     |Eva    |Marketing |6000  |Executive  |Austin       |6         |1005    |B    |
|6     |Frank  |Finance   |6200  |Consultant |Chicago      |5         |1006    |A    |
|7     |Grace  |IT        |7200  |Architect  |Houston      |8         |1007    |A    |
|8     |Henry  |HR        |4600  |Manager    |Miami        |5         |1008    |B    |
|9     |Ivy    |Marketing |6100  |Executive

#### Swaps values between the salary and experience columns based on a condition (e.g., when grade is A).

In [3]:
res = spark.sql(""" 
                
SELECT emp_id, name, department, 
       CASE WHEN grade = 'A' THEN salary ELSE experience END AS new_salary,
       CASE WHEN grade = 'A' THEN experience ELSE salary END AS new_experience,
       designation, location, emp_code, grade
FROM employee_table;

              
                """)
res.show()


+------+-------+----------+----------+--------------+-----------+-------------+--------+-----+
|emp_id|   name|department|new_salary|new_experience|designation|     location|emp_code|grade|
+------+-------+----------+----------+--------------+-----------+-------------+--------+-----+
|     1|  Alice|        HR|      4500|             5|    Manager|     New York|    1001|    A|
|     2|    Bob|   Finance|         7|          5500|    Analyst|  Los Angeles|    1002|    B|
|     3|Charlie|        IT|      7000|             4|  Developer|San Francisco|    1003|    A|
|     4|  David|        HR|         3|          4800|  Recruiter|      Seattle|    1004|    C|
|     5|    Eva| Marketing|         6|          6000|  Executive|       Austin|    1005|    B|
|     6|  Frank|   Finance|      6200|             5| Consultant|      Chicago|    1006|    A|
|     7|  Grace|        IT|      7200|             8|  Architect|      Houston|    1007|    A|
|     8|  Henry|        HR|         5|          46

##### Swap Without Condition    Simply swaps the values of salary and experience

In [4]:
res1 = spark.sql(""" 
                
SELECT emp_id, name, department, 
       experience AS new_salary,
       salary AS new_experience,
       designation, location, emp_code, grade
FROM employee_table;


              
                """)
res1.show()


+------+-------+----------+----------+--------------+-----------+-------------+--------+-----+
|emp_id|   name|department|new_salary|new_experience|designation|     location|emp_code|grade|
+------+-------+----------+----------+--------------+-----------+-------------+--------+-----+
|     1|  Alice|        HR|         5|          4500|    Manager|     New York|    1001|    A|
|     2|    Bob|   Finance|         7|          5500|    Analyst|  Los Angeles|    1002|    B|
|     3|Charlie|        IT|         4|          7000|  Developer|San Francisco|    1003|    A|
|     4|  David|        HR|         3|          4800|  Recruiter|      Seattle|    1004|    C|
|     5|    Eva| Marketing|         6|          6000|  Executive|       Austin|    1005|    B|
|     6|  Frank|   Finance|         5|          6200| Consultant|      Chicago|    1006|    A|
|     7|  Grace|        IT|         8|          7200|  Architect|      Houston|    1007|    A|
|     8|  Henry|        HR|         5|          46

In [5]:
from pyspark.sql.functions import when, col

# Swap columns based on a condition (e.g., if the grade is 'A')
df_swapped = df.withColumn("new_salary", when(col("grade") == "A", col("experience")).otherwise(col("salary"))) \
               .withColumn("new_experience", when(col("grade") == "A", col("salary")).otherwise(col("experience")))

# Show the result
df_swapped.show(truncate=False)


+------+-------+----------+------+-----------+-------------+----------+--------+-----+----------+--------------+
|emp_id|name   |department|salary|designation|location     |experience|emp_code|grade|new_salary|new_experience|
+------+-------+----------+------+-----------+-------------+----------+--------+-----+----------+--------------+
|1     |Alice  |HR        |4500  |Manager    |New York     |5         |1001    |A    |5         |4500          |
|2     |Bob    |Finance   |5500  |Analyst    |Los Angeles  |7         |1002    |B    |5500      |7             |
|3     |Charlie|IT        |7000  |Developer  |San Francisco|4         |1003    |A    |4         |7000          |
|4     |David  |HR        |4800  |Recruiter  |Seattle      |3         |1004    |C    |4800      |3             |
|5     |Eva    |Marketing |6000  |Executive  |Austin       |6         |1005    |B    |6000      |6             |
|6     |Frank  |Finance   |6200  |Consultant |Chicago      |5         |1006    |A    |5         

In [6]:
# Swap columns directly
df_simple_swap = df.withColumn("new_salary", col("experience")) \
                   .withColumn("new_experience", col("salary"))

# Show the result
df_simple_swap.show(truncate=False)


+------+-------+----------+------+-----------+-------------+----------+--------+-----+----------+--------------+
|emp_id|name   |department|salary|designation|location     |experience|emp_code|grade|new_salary|new_experience|
+------+-------+----------+------+-----------+-------------+----------+--------+-----+----------+--------------+
|1     |Alice  |HR        |4500  |Manager    |New York     |5         |1001    |A    |5         |4500          |
|2     |Bob    |Finance   |5500  |Analyst    |Los Angeles  |7         |1002    |B    |7         |5500          |
|3     |Charlie|IT        |7000  |Developer  |San Francisco|4         |1003    |A    |4         |7000          |
|4     |David  |HR        |4800  |Recruiter  |Seattle      |3         |1004    |C    |3         |4800          |
|5     |Eva    |Marketing |6000  |Executive  |Austin       |6         |1005    |B    |6         |6000          |
|6     |Frank  |Finance   |6200  |Consultant |Chicago      |5         |1006    |A    |5         

#### Swap with Multiple Columns

In [7]:
res2 = spark.sql(""" 
                
SELECT emp_id, name, department, 
       experience AS new_salary,
       salary AS new_experience,
       location AS new_designation,
       designation AS new_location,
       emp_code, grade
FROM employee_table;


              
                """)
res2.show()


+------+-------+----------+----------+--------------+---------------+------------+--------+-----+
|emp_id|   name|department|new_salary|new_experience|new_designation|new_location|emp_code|grade|
+------+-------+----------+----------+--------------+---------------+------------+--------+-----+
|     1|  Alice|        HR|         5|          4500|       New York|     Manager|    1001|    A|
|     2|    Bob|   Finance|         7|          5500|    Los Angeles|     Analyst|    1002|    B|
|     3|Charlie|        IT|         4|          7000|  San Francisco|   Developer|    1003|    A|
|     4|  David|        HR|         3|          4800|        Seattle|   Recruiter|    1004|    C|
|     5|    Eva| Marketing|         6|          6000|         Austin|   Executive|    1005|    B|
|     6|  Frank|   Finance|         5|          6200|        Chicago|  Consultant|    1006|    A|
|     7|  Grace|        IT|         8|          7200|        Houston|   Architect|    1007|    A|
|     8|  Henry|    

In [8]:
# Swap multiple column pairs
df_multi_swap = df.withColumn("new_salary", col("experience")) \
                  .withColumn("new_experience", col("salary")) \
                  .withColumn("new_designation", col("location")) \
                  .withColumn("new_location", col("designation"))

# Show the result
df_multi_swap.show(truncate=False)


+------+-------+----------+------+-----------+-------------+----------+--------+-----+----------+--------------+---------------+------------+
|emp_id|name   |department|salary|designation|location     |experience|emp_code|grade|new_salary|new_experience|new_designation|new_location|
+------+-------+----------+------+-----------+-------------+----------+--------+-----+----------+--------------+---------------+------------+
|1     |Alice  |HR        |4500  |Manager    |New York     |5         |1001    |A    |5         |4500          |New York       |Manager     |
|2     |Bob    |Finance   |5500  |Analyst    |Los Angeles  |7         |1002    |B    |7         |5500          |Los Angeles    |Analyst     |
|3     |Charlie|IT        |7000  |Developer  |San Francisco|4         |1003    |A    |4         |7000          |San Francisco  |Developer   |
|4     |David  |HR        |4800  |Recruiter  |Seattle      |3         |1004    |C    |3         |4800          |Seattle        |Recruiter   |
|5    

In [9]:
# Define a UDF or use a temporary DataFrame for complex swaps
def swap_values(row):
    new_salary = row.experience
    new_experience = row.salary
    return (row.emp_id, row.name, row.department, new_salary, new_experience, row.designation, row.location, row.emp_code, row.grade)

# Apply swap logic using RDD transformation
rdd_swapped = df.rdd.map(lambda row: swap_values(row))
df_swapped_final = spark.createDataFrame(rdd_swapped, schema=["emp_id", "name", "department", "new_salary", "new_experience", "designation", "location", "emp_code", "grade"])

# Show the result
df_swapped_final.show(truncate=False)


+------+-------+----------+----------+--------------+-----------+-------------+--------+-----+
|emp_id|name   |department|new_salary|new_experience|designation|location     |emp_code|grade|
+------+-------+----------+----------+--------------+-----------+-------------+--------+-----+
|1     |Alice  |HR        |5         |4500          |Manager    |New York     |1001    |A    |
|2     |Bob    |Finance   |7         |5500          |Analyst    |Los Angeles  |1002    |B    |
|3     |Charlie|IT        |4         |7000          |Developer  |San Francisco|1003    |A    |
|4     |David  |HR        |3         |4800          |Recruiter  |Seattle      |1004    |C    |
|5     |Eva    |Marketing |6         |6000          |Executive  |Austin       |1005    |B    |
|6     |Frank  |Finance   |5         |6200          |Consultant |Chicago      |1006    |A    |
|7     |Grace  |IT        |8         |7200          |Architect  |Houston      |1007    |A    |
|8     |Henry  |HR        |5         |4600        