# How to find departments having only male / female employees

In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

# Pysaprk

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Initialize Spark Session
spark = SparkSession.builder.appName("GenderSpecificDepartments").getOrCreate()

# Define schema and sample data
schema = StructType([
    StructField("EmployeeID", IntegerType(), True),
    StructField("EmployeeName", StringType(), True),
    StructField("Department", StringType(), True),
    StructField("Gender", StringType(), True)
])

# Sample data
data = [
    (1, "Alice", "Marketing", "Female"),
    (2, "Bob", "Marketing", "Male"),
    (3, "Charlie", "IT", "Male"),
    (4, "David", "IT", "Male"),
    (5, "Eve", "HR", "Female"),
    (6, "Frank", "HR", "Male"),
    (7, "Grace", "Finance", "Female"),
    (8, "Heidi", "Finance", "Female"),
    (9, "Ivan", "Production", "Male"),
    (10, "Judy", "Production", "Male"),
    (11, "Kevin", "Sales", "Male"),
    (12, "Laura", "Sales", "Female"),
    (13, "Mallory", "Legal", "Female"),
    (14, "Niaj", "Legal", "Female"),
    (15, "Oscar", "Logistics", "Male")
]

# Create DataFrame
df = spark.createDataFrame(data, schema)
df.show(truncate=False)


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Gender|
+----------+------------+----------+------+
|1         |Alice       |Marketing |Female|
|2         |Bob         |Marketing |Male  |
|3         |Charlie     |IT        |Male  |
|4         |David       |IT        |Male  |
|5         |Eve         |HR        |Female|
|6         |Frank       |HR        |Male  |
|7         |Grace       |Finance   |Female|
|8         |Heidi       |Finance   |Female|
|9         |Ivan        |Production|Male  |
|10        |Judy        |Production|Male  |
|11        |Kevin       |Sales     |Male  |
|12        |Laura       |Sales     |Female|
|13        |Mallory     |Legal     |Female|
|14        |Niaj        |Legal     |Female|
|15        |Oscar       |Logistics |Male  |
+----------+------------+----------+------+



In [4]:
from pyspark.sql.functions import countDistinct, col

# Count distinct genders in each department
department_gender_count = df.groupBy("Department") \
    .agg(countDistinct("Gender").alias("GenderCount"))

# Filter departments having only one distinct gender (either all male or all female)
single_gender_departments = department_gender_count.filter(col("GenderCount") == 1)

# Join back to get gender-specific departments
result_df = single_gender_departments.join(df, "Department").select("Department").distinct()
result_df.show(truncate=False)


+----------+
|Department|
+----------+
|IT        |
|Finance   |
|Production|
|Legal     |
|Logistics |
+----------+



In [14]:
from pyspark.sql.functions import countDistinct, when

# Calculate distinct counts of males and females in each department
df_gender_count = df.groupBy("Department") \
    .agg(
        countDistinct(when(df.Gender == "Male", "EmployeeID")).alias("MaleCount"),
        countDistinct(when(df.Gender == "Female", "EmployeeID")).alias("FemaleCount")
    )

# Filter departments that have only males or only females
df_only_gender_dept = df_gender_count.filter(
    (df_gender_count.MaleCount > 0) & (df_gender_count.FemaleCount == 0) |
    (df_gender_count.FemaleCount > 0) & (df_gender_count.MaleCount == 0)
)

df_only_gender_dept.show(truncate=False)


+----------+---------+-----------+
|Department|MaleCount|FemaleCount|
+----------+---------+-----------+
|Finance   |0        |1          |
|Production|1        |0          |
|Legal     |0        |1          |
|IT        |1        |0          |
|Logistics |1        |0          |
+----------+---------+-----------+



In [10]:
from pyspark.sql.functions import collect_set, size, col

# Use collect_set to get a list of unique genders for each department
department_gender_set = df.groupBy("Department") \
    .agg(collect_set("Gender").alias("Genders"))

# Filter departments with a single gender in the list
single_gender_set = department_gender_set.filter(size(col("Genders")) == 1)
single_gender_set.show(truncate=False)


+----------+--------+
|Department|Genders |
+----------+--------+
|IT        |[Male]  |
|Finance   |[Female]|
|Production|[Male]  |
|Legal     |[Female]|
|Logistics |[Male]  |
+----------+--------+



# Spark SQL

In [3]:
# Register the DataFrame as a temporary table
df.createOrReplaceTempView("Employees")


In [6]:
# SQL Query
sql_query = """
SELECT Department
FROM Employees
GROUP BY Department
HAVING COUNT(DISTINCT Gender) = 1
"""

# Execute the query
result_sql = spark.sql(sql_query)
result_sql.show(truncate=False)


+----------+
|Department|
+----------+
|Finance   |
|Production|
|Legal     |
|IT        |
|Logistics |
+----------+



In [11]:
# SQL Query to find departments with only male or only female employees
query_gender_based_departments = """
SELECT Department
FROM (
    SELECT Department,
           COUNT(DISTINCT CASE WHEN Gender = 'Male' THEN 1 END) AS MaleCount,
           COUNT(DISTINCT CASE WHEN Gender = 'Female' THEN 1 END) AS FemaleCount
    FROM Employees
    GROUP BY Department
) AS dept_count
WHERE (MaleCount > 0 AND FemaleCount = 0) OR (FemaleCount > 0 AND MaleCount = 0)
"""

# Execute the query
result_gender_dept = spark.sql(query_gender_based_departments)
result_gender_dept.show(truncate=False)


+----------+
|Department|
+----------+
|Finance   |
|Production|
|Legal     |
|IT        |
|Logistics |
+----------+



In [12]:
# SQL Query using HAVING clause
query_having = """
SELECT Department
FROM Employees
GROUP BY Department
HAVING COUNT(DISTINCT CASE WHEN Gender = 'Male' THEN 1 END) = COUNT(EmployeeID)
    OR COUNT(DISTINCT CASE WHEN Gender = 'Female' THEN 1 END) = COUNT(EmployeeID)
"""

# Execute the query
result_having = spark.sql(query_having)
result_having.show(truncate=False)


+----------+
|Department|
+----------+
|Logistics |
+----------+



In [13]:
# SQL Query using EXISTS and NOT EXISTS
query_exists = """
SELECT DISTINCT a.Department
FROM Employees a
WHERE EXISTS (
    SELECT 1 FROM Employees b WHERE a.Department = b.Department AND b.Gender = 'Male'
)
AND NOT EXISTS (
    SELECT 1 FROM Employees c WHERE a.Department = c.Department AND c.Gender = 'Female'
)
UNION
SELECT DISTINCT a.Department
FROM Employees a
WHERE EXISTS (
    SELECT 1 FROM Employees b WHERE a.Department = b.Department AND b.Gender = 'Female'
)
AND NOT EXISTS (
    SELECT 1 FROM Employees c WHERE a.Department = c.Department AND c.Gender = 'Male'
)
"""

# Execute the query
result_exists = spark.sql(query_exists)
result_exists.show(truncate=False)


+----------+
|Department|
+----------+
|Production|
|IT        |
|Logistics |
|Finance   |
|Legal     |
+----------+



In [7]:
# SQL Query using COLLECT_SET equivalent in SQL
sql_query_collect_set = """
SELECT Department
FROM (
    SELECT Department, COLLECT_SET(Gender) AS Genders
    FROM Employees
    GROUP BY Department
) AS DepartmentGender
WHERE SIZE(Genders) = 1
"""

# Execute the query
result_collect_set_sql = spark.sql(sql_query_collect_set)
result_collect_set_sql.show(truncate=False)


+----------+
|Department|
+----------+
|IT        |
|Finance   |
|Production|
|Legal     |
|Logistics |
+----------+



# Python

In [8]:
import pandas as pd

# Create pandas DataFrame
data = [
    (1, "Alice", "Marketing", "Female"),
    (2, "Bob", "Marketing", "Male"),
    (3, "Charlie", "IT", "Male"),
    (4, "David", "IT", "Male"),
    (5, "Eve", "HR", "Female"),
    (6, "Frank", "HR", "Male"),
    (7, "Grace", "Finance", "Female"),
    (8, "Heidi", "Finance", "Female"),
    (9, "Ivan", "Production", "Male"),
    (10, "Judy", "Production", "Male"),
    (11, "Kevin", "Sales", "Male"),
    (12, "Laura", "Sales", "Female"),
    (13, "Mallory", "Legal", "Female"),
    (14, "Niaj", "Legal", "Female"),
    (15, "Oscar", "Logistics", "Male")
]

# Convert to pandas DataFrame
df_pandas = pd.DataFrame(data, columns=["EmployeeID", "EmployeeName", "Department", "Gender"])

# Group by Department and get unique Gender counts
gender_count = df_pandas.groupby("Department")["Gender"].nunique().reset_index()
single_gender_departments = gender_count[gender_count["Gender"] == 1]["Department"]
print(single_gender_departments)


0       Finance
2            IT
3         Legal
4     Logistics
6    Production
Name: Department, dtype: object


In [9]:
# Create dictionary to store departments with genders
department_dict = {}

for row in data:
    department = row[2]
    gender = row[3]
    if department not in department_dict:
        department_dict[department] = set()
    department_dict[department].add(gender)

# Identify departments with only one unique gender
single_gender_departments = [dept for dept, genders in department_dict.items() if len(genders) == 1]
print(single_gender_departments)


['IT', 'Finance', 'Production', 'Legal', 'Logistics']
