In [1]:
import os

# Switch the kernel's working directory to the project folder.
# A raw string is used because "\p" is not a valid escape sequence: the
# non-raw literal only produced the right path by accident and triggers an
# invalid-escape warning on newer Python versions.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
os.chdir(r"H:\pyspark_advanced-coding_interview")

# Last expression of the cell: displays the directory now in effect.
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from datetime import datetime

# Start (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("EmployeesHiredInLastNMonths")
    .getOrCreate()
)

# Column layout of the employee table; every column is declared nullable
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("EmployeeID", IntegerType()),
            ("EmployeeName", StringType()),
            ("Department", StringType()),
            ("HireDate", DateType()),
        )
    ]
)

# Sample rows as (EmployeeID, EmployeeName, Department, hire date text)
raw_rows = [
    (1, "Alice", "HR", "2023-01-15"),
    (2, "Bob", "HR", "2022-11-20"),
    (3, "Charlie", "IT", "2023-03-05"),
    (4, "David", "IT", "2022-12-25"),
    (5, "Eve", "IT", "2023-06-30"),
    (6, "Frank", "Finance", "2022-10-11"),
    (7, "Grace", "Finance", "2023-08-22"),
    (8, "Heidi", "HR", "2023-02-18"),
    (9, "Ivan", "Sales", "2023-04-10"),
    (10, "Judy", "Sales", "2023-07-01"),
    (11, "Kevin", "Sales", "2022-09-15"),
    (12, "Laura", "IT", "2023-05-20"),
    (13, "Mallory", "Marketing", "2022-08-05"),
    (14, "Niaj", "Marketing", "2023-07-30"),
    (15, "Oscar", "Marketing", "2023-09-10"),
]

# Parse each hire date text into a datetime.date so it fits the DateType column
data = [
    (emp_id, emp_name, dept, datetime.strptime(hired_on, "%Y-%m-%d").date())
    for emp_id, emp_name, dept, hired_on in raw_rows
]

# Build the DataFrame from the rows and schema, then print it
df = spark.createDataFrame(data, schema)
df.show()


+----------+------------+----------+----------+
|EmployeeID|EmployeeName|Department|  HireDate|
+----------+------------+----------+----------+
|         1|       Alice|        HR|2023-01-15|
|         2|         Bob|        HR|2022-11-20|
|         3|     Charlie|        IT|2023-03-05|
|         4|       David|        IT|2022-12-25|
|         5|         Eve|        IT|2023-06-30|
|         6|       Frank|   Finance|2022-10-11|
|         7|       Grace|   Finance|2023-08-22|
|         8|       Heidi|        HR|2023-02-18|
|         9|        Ivan|     Sales|2023-04-10|
|        10|        Judy|     Sales|2023-07-01|
|        11|       Kevin|     Sales|2022-09-15|
|        12|       Laura|        IT|2023-05-20|
|        13|     Mallory| Marketing|2022-08-05|
|        14|        Niaj| Marketing|2023-07-30|
|        15|       Oscar| Marketing|2023-09-10|
+----------+------------+----------+----------+



In [3]:
# How to find employees hired in the last n months
# Register the DataFrame as a temporary view named "Employees" so that it can
# be referenced by name from Spark SQL queries
df.createOrReplaceTempView("Employees")


# Spark SQL

In [8]:
# Look-back window, in months (n)
n = 6

# Approximate the window in days by treating every month as 30 days
days = 30 * n

# Build the query text first, then hand it to Spark SQL
query = f"""
SELECT *
FROM Employees
WHERE HireDate >= DATE_SUB(CURRENT_DATE(), {days})
"""
res = spark.sql(query)
res.show()

+----------+------------+----------+--------+
|EmployeeID|EmployeeName|Department|HireDate|
+----------+------------+----------+--------+
+----------+------------+----------+--------+



# PySpark

In [4]:
from pyspark.sql.functions import current_date, date_sub

# Look-back window, in months (n)
n = 6

# Cutoff date: today minus n * 30 days (months approximated as 30 days each)
n_months_ago = date_sub(current_date(), 30 * n)

# Keep only the rows whose hire date is on or after the cutoff
hired_since_cutoff = df.HireDate >= n_months_ago
recent_hires_df = df.where(hired_since_cutoff)

recent_hires_df.show()


+----------+------------+----------+--------+
|EmployeeID|EmployeeName|Department|HireDate|
+----------+------------+----------+--------+
+----------+------------+----------+--------+



In [6]:
# current_date is imported here as well so the cell does not depend on an
# import made in a different cell.
from pyspark.sql.functions import add_months, current_date

# Define the number of months (n)
n = 6

# Filter employees hired in the last n months using add_months:
# add_months(current_date(), -n) shifts today's date back by n months, and
# only rows with HireDate on or after that cutoff are kept.
recent_hires_exact_df = df.filter(df["HireDate"] >= add_months(current_date(), -n))

recent_hires_exact_df.show()


+----------+------------+----------+--------+
|EmployeeID|EmployeeName|Department|HireDate|
+----------+------------+----------+--------+
+----------+------------+----------+--------+



In [None]:
# current_date is imported here as well so the cell does not depend on an
# import made in a different cell.
from pyspark.sql.functions import current_date, months_between, lit

# Define the number of months (n) in this cell too, as the other filter cells
# do, so it does not rely on a value left over from an earlier cell.
n = 6

# Filter employees hired within the last n months using months_between:
# keep rows where the number of months from HireDate to today is at most n.
recent_hires_months_between_df = df.filter(months_between(current_date(), df["HireDate"]) <= n)
recent_hires_months_between_df.show(truncate=False)
