In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [2]:
from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder 
    .appName("OptimizedLocalSpark") 
    .config("spark.driver.memory", "8g")        
    .config("spark.executor.memory", "8g")    
    .config("spark.executor.cores", "4")       
    .config("spark.cores.max", "12")           
    .config("spark.sql.shuffle.partitions", "28")  
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
    .getOrCreate()
)


In [7]:
from pyspark.sql import Row
from datetime import date



# Sample employee data
data = [
    Row(EmployeeID=1, EmployeeName="Alice", Department="HR", Salary=5000, HireDate=date(2022, 5, 15)),
    Row(EmployeeID=2, EmployeeName="Bob", Department="HR", Salary=4500, HireDate=date(2020, 3, 10)),
    Row(EmployeeID=3, EmployeeName="Charlie", Department="HR", Salary=5500, HireDate=date(2021, 7, 1)),
    Row(EmployeeID=4, EmployeeName="David", Department="IT", Salary=6000, HireDate=date(2019, 1, 25)),
    Row(EmployeeID=5, EmployeeName="Eve", Department="IT", Salary=6500, HireDate=date(2022, 8, 30)),
    Row(EmployeeID=6, EmployeeName="Frank", Department="IT", Salary=5800, HireDate=date(2021, 6, 14)),
    Row(EmployeeID=7, EmployeeName="Grace", Department="Finance", Salary=5200, HireDate=date(2020, 11, 20)),
    Row(EmployeeID=8, EmployeeName="Heidi", Department="Finance", Salary=4800, HireDate=date(2021, 12, 1)),
    Row(EmployeeID=9, EmployeeName="Ivan", Department="Finance", Salary=5300, HireDate=date(2018, 7, 18)),
    Row(EmployeeID=10, EmployeeName="Judy", Department="Sales", Salary=4000, HireDate=date(2023, 1, 5)),
    Row(EmployeeID=11, EmployeeName="Kevin", Department="Sales", Salary=4200, HireDate=date(2021, 9, 13)),
    Row(EmployeeID=12, EmployeeName="Laura", Department="Sales", Salary=4500, HireDate=date(2020, 4, 2)),
    Row(EmployeeID=13, EmployeeName="Mallory", Department="Marketing", Salary=4900, HireDate=date(2023, 3, 25)),
    Row(EmployeeID=14, EmployeeName="Niaj", Department="Marketing", Salary=5000, HireDate=date(2021, 2, 10)),
    Row(EmployeeID=15, EmployeeName="Oscar", Department="Marketing", Salary=4600, HireDate=date(2019, 10, 3))
]

# Create DataFrame
df = spark.createDataFrame(data)
df.createOrReplaceTempView('Employees')
df.cache()
# Display the DataFrame
df.show()

+----------+------------+----------+------+----------+
|EmployeeID|EmployeeName|Department|Salary|  HireDate|
+----------+------------+----------+------+----------+
|         1|       Alice|        HR|  5000|2022-05-15|
|         2|         Bob|        HR|  4500|2020-03-10|
|         3|     Charlie|        HR|  5500|2021-07-01|
|         4|       David|        IT|  6000|2019-01-25|
|         5|         Eve|        IT|  6500|2022-08-30|
|         6|       Frank|        IT|  5800|2021-06-14|
|         7|       Grace|   Finance|  5200|2020-11-20|
|         8|       Heidi|   Finance|  4800|2021-12-01|
|         9|        Ivan|   Finance|  5300|2018-07-18|
|        10|        Judy|     Sales|  4000|2023-01-05|
|        11|       Kevin|     Sales|  4200|2021-09-13|
|        12|       Laura|     Sales|  4500|2020-04-02|
|        13|     Mallory| Marketing|  4900|2023-03-25|
|        14|        Niaj| Marketing|  5000|2021-02-10|
|        15|       Oscar| Marketing|  4600|2019-10-03|
+---------

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType
from datetime import datetime



# Define schema using StructType and StructField
schema = StructType([
    StructField("EmployeeID", IntegerType(), True),
    StructField("EmployeeName", StringType(), True),
    StructField("Department", StringType(), True),
    StructField("Salary", DoubleType(), True),
    StructField("ManagerID", IntegerType(), True),
    StructField("HireDate", DateType(), True)
])

# Sample data (EmployeeID, EmployeeName, Department, Salary, ManagerID, HireDate)
data = [
    (1, "Alice", "HR", 5000.0, None, datetime.strptime("2020-10-10", "%Y-%m-%d").date()),
    (2, "Bob", "HR", 4500.0, 1, datetime.strptime("2018-09-15", "%Y-%m-%d").date()),
    (3, "Charlie", "HR", 5500.0, 1, datetime.strptime("2019-08-20", "%Y-%m-%d").date()),
    (4, "David", "IT", 6000.0, None, datetime.strptime("2022-01-05", "%Y-%m-%d").date()),
    (5, "Eve", "IT", 6500.0, 4, datetime.strptime("2021-03-30", "%Y-%m-%d").date()),
    (6, "Frank", "IT", 5800.0, 4, datetime.strptime("2020-07-14", "%Y-%m-%d").date()),
    (7, "Grace", "Finance", 5200.0, None, datetime.strptime("2019-06-01", "%Y-%m-%d").date()),
    (8, "Heidi", "Finance", 4800.0, 7, datetime.strptime("2020-02-17", "%Y-%m-%d").date()),
    (9, "Ivan", "Finance", 5300.0, 7, datetime.strptime("2017-11-19", "%Y-%m-%d").date()),
    (10, "Judy", "Sales", 4000.0, None, datetime.strptime("2021-04-10", "%Y-%m-%d").date()),
    (11, "Kevin", "Sales", 4200.0, 10, datetime.strptime("2020-05-07", "%Y-%m-%d").date()),
    (12, "Laura", "Sales", 4500.0, 10, datetime.strptime("2019-07-12", "%Y-%m-%d").date()),
    (13, "Mallory", "Marketing", 4900.0, None, datetime.strptime("2019-03-01", "%Y-%m-%d").date()),
    (14, "Niaj", "Marketing", 5000.0, 13, datetime.strptime("2021-08-15", "%Y-%m-%d").date()),
    (15, "Oscar", "Marketing", 4600.0, 13, datetime.strptime("2020-01-25", "%Y-%m-%d").date())
]

# Create DataFrame
df1 = spark.createDataFrame(data, schema)
df1.createOrReplaceTempView("Employees")

df1.show()



+----------+------------+----------+------+---------+----------+
|EmployeeID|EmployeeName|Department|Salary|ManagerID|  HireDate|
+----------+------------+----------+------+---------+----------+
|         1|       Alice|        HR|5000.0|     null|2020-10-10|
|         2|         Bob|        HR|4500.0|        1|2018-09-15|
|         3|     Charlie|        HR|5500.0|        1|2019-08-20|
|         4|       David|        IT|6000.0|     null|2022-01-05|
|         5|         Eve|        IT|6500.0|        4|2021-03-30|
|         6|       Frank|        IT|5800.0|        4|2020-07-14|
|         7|       Grace|   Finance|5200.0|     null|2019-06-01|
|         8|       Heidi|   Finance|4800.0|        7|2020-02-17|
|         9|        Ivan|   Finance|5300.0|        7|2017-11-19|
|        10|        Judy|     Sales|4000.0|     null|2021-04-10|
|        11|       Kevin|     Sales|4200.0|       10|2020-05-07|
|        12|       Laura|     Sales|4500.0|       10|2019-07-12|
|        13|     Mallory|

In [5]:
df.dtypes

[('EmployeeID', 'bigint'),
 ('EmployeeName', 'string'),
 ('Department', 'string'),
 ('Salary', 'bigint'),
 ('HireDate', 'date')]

# Spark SQL

In [18]:
# SQL Query using DATE_ADD for increment logic
query2 = spark.sql("""
SELECT EmployeeID, EmployeeName, Department,
       CASE WHEN HireDate <= DATE_ADD(current_date(), -730) THEN Salary * 1.1 ELSE Salary END AS NewSalary, HireDate
FROM Employees
""")


query2.show()


+----------+------------+----------+---------+----------+
|EmployeeID|EmployeeName|Department|NewSalary|  HireDate|
+----------+------------+----------+---------+----------+
|         1|       Alice|        HR|   5500.0|2022-05-15|
|         2|         Bob|        HR|   4950.0|2020-03-10|
|         3|     Charlie|        HR|   6050.0|2021-07-01|
|         4|       David|        IT|   6600.0|2019-01-25|
|         5|         Eve|        IT|   7150.0|2022-08-30|
|         6|       Frank|        IT|   6380.0|2021-06-14|
|         7|       Grace|   Finance|   5720.0|2020-11-20|
|         8|       Heidi|   Finance|   5280.0|2021-12-01|
|         9|        Ivan|   Finance|   5830.0|2018-07-18|
|        10|        Judy|     Sales|   4000.0|2023-01-05|
|        11|       Kevin|     Sales|   4620.0|2021-09-13|
|        12|       Laura|     Sales|   4950.0|2020-04-02|
|        13|     Mallory| Marketing|   4900.0|2023-03-25|
|        14|        Niaj| Marketing|   5500.0|2021-02-10|
|        15|  

In [16]:
# SQL Query to increment salaries for employees with 2+ years in the organization
query = spark.sql("""
SELECT EmployeeID, EmployeeName, Department,
       CASE WHEN DATEDIFF(current_date(), HireDate) >= 730 THEN Salary * 1.1 ELSE Salary END AS NewSalary,HireDate
FROM Employees
""")

# Execute the query
query.show()


+----------+------------+----------+---------+----------+
|EmployeeID|EmployeeName|Department|NewSalary|  HireDate|
+----------+------------+----------+---------+----------+
|         1|       Alice|        HR|   5500.0|2022-05-15|
|         2|         Bob|        HR|   4950.0|2020-03-10|
|         3|     Charlie|        HR|   6050.0|2021-07-01|
|         4|       David|        IT|   6600.0|2019-01-25|
|         5|         Eve|        IT|   7150.0|2022-08-30|
|         6|       Frank|        IT|   6380.0|2021-06-14|
|         7|       Grace|   Finance|   5720.0|2020-11-20|
|         8|       Heidi|   Finance|   5280.0|2021-12-01|
|         9|        Ivan|   Finance|   5830.0|2018-07-18|
|        10|        Judy|     Sales|   4000.0|2023-01-05|
|        11|       Kevin|     Sales|   4620.0|2021-09-13|
|        12|       Laura|     Sales|   4950.0|2020-04-02|
|        13|     Mallory| Marketing|   4900.0|2023-03-25|
|        14|        Niaj| Marketing|   5500.0|2021-02-10|
|        15|  

In [8]:
res = spark.sql(""" 

SELECT EmployeeID, EmployeeName, Department, Salary, HireDate,
       CASE WHEN DATEDIFF(CURRENT_DATE(), HireDate) / 365 >= 2 THEN Salary * 1.10 ELSE Salary END AS NewSalary
FROM Employees
             """)
res.show()

+----------+------------+----------+------+----------+---------+
|EmployeeID|EmployeeName|Department|Salary|  HireDate|NewSalary|
+----------+------------+----------+------+----------+---------+
|         1|       Alice|        HR|  5000|2022-05-15|  5500.00|
|         2|         Bob|        HR|  4500|2020-03-10|  4950.00|
|         3|     Charlie|        HR|  5500|2021-07-01|  6050.00|
|         4|       David|        IT|  6000|2019-01-25|  6600.00|
|         5|         Eve|        IT|  6500|2022-08-30|  7150.00|
|         6|       Frank|        IT|  5800|2021-06-14|  6380.00|
|         7|       Grace|   Finance|  5200|2020-11-20|  5720.00|
|         8|       Heidi|   Finance|  4800|2021-12-01|  5280.00|
|         9|        Ivan|   Finance|  5300|2018-07-18|  5830.00|
|        10|        Judy|     Sales|  4000|2023-01-05|  4000.00|
|        11|       Kevin|     Sales|  4200|2021-09-13|  4620.00|
|        12|       Laura|     Sales|  4500|2020-04-02|  4950.00|
|        13|     Mallory|

# Pyspark

In [10]:
from pyspark.sql.functions import col, when, current_date, datediff

# Define the increment percentage (e.g., 10%)
increment_percentage = 0.10

# Calculate tenure in days and convert to years
df_with_increment = df \
.withColumn("YearsWithOrg", datediff(current_date(), col("HireDate")) / 365) \
.withColumn("NewSalary", when(col("YearsWithOrg") >= 2, col("Salary") * (1 + increment_percentage)).otherwise(col("Salary")))


# Show updated salary details
df_with_increment.select("EmployeeID", "EmployeeName", "Department", "Salary", "NewSalary","HireDate").show()


+----------+------------+----------+------+-----------------+----------+
|EmployeeID|EmployeeName|Department|Salary|        NewSalary|  HireDate|
+----------+------------+----------+------+-----------------+----------+
|         1|       Alice|        HR|  5000|           5500.0|2022-05-15|
|         2|         Bob|        HR|  4500|           4950.0|2020-03-10|
|         3|     Charlie|        HR|  5500|6050.000000000001|2021-07-01|
|         4|       David|        IT|  6000|6600.000000000001|2019-01-25|
|         5|         Eve|        IT|  6500|7150.000000000001|2022-08-30|
|         6|       Frank|        IT|  5800|6380.000000000001|2021-06-14|
|         7|       Grace|   Finance|  5200|5720.000000000001|2020-11-20|
|         8|       Heidi|   Finance|  4800|           5280.0|2021-12-01|
|         9|        Ivan|   Finance|  5300|5830.000000000001|2018-07-18|
|        10|        Judy|     Sales|  4000|           4000.0|2023-01-05|
|        11|       Kevin|     Sales|  4200|        

In [11]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from datetime import datetime

# Define a UDF to calculate the new salary
def calculate_new_salary(salary, hire_date):
    current_date = datetime.now().date()
    tenure_years = (current_date - hire_date).days / 365
    if tenure_years >= 2:
        return salary * 1.10  # 10% increment
    else:
        return salary

# Register the UDF
increment_udf = udf(calculate_new_salary, DoubleType())

# Apply the UDF to calculate new salaries
df_with_new_salary = df.withColumn("NewSalary", increment_udf(col("Salary"), col("HireDate")))

# Show the result
df_with_new_salary.select("EmployeeID", "EmployeeName", "Department", "Salary", "NewSalary", "HireDate").show()


+----------+------------+----------+------+-----------------+----------+
|EmployeeID|EmployeeName|Department|Salary|        NewSalary|  HireDate|
+----------+------------+----------+------+-----------------+----------+
|         1|       Alice|        HR|  5000|           5500.0|2022-05-15|
|         2|         Bob|        HR|  4500|           4950.0|2020-03-10|
|         3|     Charlie|        HR|  5500|6050.000000000001|2021-07-01|
|         4|       David|        IT|  6000|6600.000000000001|2019-01-25|
|         5|         Eve|        IT|  6500|7150.000000000001|2022-08-30|
|         6|       Frank|        IT|  5800|6380.000000000001|2021-06-14|
|         7|       Grace|   Finance|  5200|5720.000000000001|2020-11-20|
|         8|       Heidi|   Finance|  4800|           5280.0|2021-12-01|
|         9|        Ivan|   Finance|  5300|5830.000000000001|2018-07-18|
|        10|        Judy|     Sales|  4000|             null|2023-01-05|
|        11|       Kevin|     Sales|  4200|        

In [17]:
# Increment salary for employees with 2+ years using PySpark DataFrame methods
df_with_increment = df.withColumn(
    "NewSalary",
    F.when(F.datediff(F.current_date(), F.col("HireDate")) >= 730, F.col("Salary") * 1.1).otherwise(F.col("Salary"))
)

# Show the result
df_with_increment.show()


+----------+------------+----------+------+----------+-----------------+
|EmployeeID|EmployeeName|Department|Salary|  HireDate|        NewSalary|
+----------+------------+----------+------+----------+-----------------+
|         1|       Alice|        HR|  5000|2022-05-15|           5500.0|
|         2|         Bob|        HR|  4500|2020-03-10|           4950.0|
|         3|     Charlie|        HR|  5500|2021-07-01|6050.000000000001|
|         4|       David|        IT|  6000|2019-01-25|6600.000000000001|
|         5|         Eve|        IT|  6500|2022-08-30|7150.000000000001|
|         6|       Frank|        IT|  5800|2021-06-14|6380.000000000001|
|         7|       Grace|   Finance|  5200|2020-11-20|5720.000000000001|
|         8|       Heidi|   Finance|  4800|2021-12-01|           5280.0|
|         9|        Ivan|   Finance|  5300|2018-07-18|5830.000000000001|
|        10|        Judy|     Sales|  4000|2023-01-05|           4000.0|
|        11|       Kevin|     Sales|  4200|2021-09-

In [19]:
# Calculate employees with 2+ years and apply salary increment
two_years_ago = F.date_sub(F.current_date(), 730)

# Use filter to find employees eligible for increment
filtered_df = df.filter(F.col("HireDate") <= two_years_ago).withColumn("NewSalary", F.col("Salary") * 1.1)
non_filtered_df = df.filter(F.col("HireDate") > two_years_ago).withColumn("NewSalary", F.col("Salary"))

# Union results for the final output
final_incremented_df = filtered_df.union(non_filtered_df)
final_incremented_df.show()


+----------+------------+----------+------+----------+-----------------+
|EmployeeID|EmployeeName|Department|Salary|  HireDate|        NewSalary|
+----------+------------+----------+------+----------+-----------------+
|         1|       Alice|        HR|  5000|2022-05-15|           5500.0|
|         2|         Bob|        HR|  4500|2020-03-10|           4950.0|
|         3|     Charlie|        HR|  5500|2021-07-01|6050.000000000001|
|         4|       David|        IT|  6000|2019-01-25|6600.000000000001|
|         5|         Eve|        IT|  6500|2022-08-30|7150.000000000001|
|         6|       Frank|        IT|  5800|2021-06-14|6380.000000000001|
|         7|       Grace|   Finance|  5200|2020-11-20|5720.000000000001|
|         8|       Heidi|   Finance|  4800|2021-12-01|           5280.0|
|         9|        Ivan|   Finance|  5300|2018-07-18|5830.000000000001|
|        11|       Kevin|     Sales|  4200|2021-09-13|           4620.0|
|        12|       Laura|     Sales|  4500|2020-04-