In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
os.getcwd()

'H:\\pyspark_advanced-coding_interview'

# Difference between Count(*), Count(1), Count(colname)

# Pyspark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Initialize Spark Session
spark = SparkSession.builder.appName("CountDifference").getOrCreate()

# Define schema using StructType and StructField
schema = StructType([
    StructField("EmployeeID", IntegerType(), True),
    StructField("EmployeeName", StringType(), True),
    StructField("Department", StringType(), True),
    StructField("Salary", IntegerType(), True)
])

# Sample data with some null values
data = [
    (1, "Alice", "HR", 70000),
    (2, "Bob", "Engineering", 85000),
    (3, "Charlie", "Engineering", 90000),
    (4, "David", "HR", 80000),
    (5, "Eve", "Marketing", None),
    (6, None, "Engineering", 75000),
    (7, "Grace", None, 60000),
    (8, "Heidi", "HR", None),
    (9, "Ivan", "Marketing", 88000),
    (10, "Judy", None, None),
    (11, "Kevin", "Engineering", 99000),
    (12, "Laura", "Sales", 72000),
    (13, None, "Sales", 65000),
    (14, "Mallory", "IT", 77000),
    (15, "Oscar", "IT", 81000)
]

# Create DataFrame
df = spark.createDataFrame(data, schema)
df.show(truncate=False)


+----------+------------+-----------+------+
|EmployeeID|EmployeeName|Department |Salary|
+----------+------------+-----------+------+
|1         |Alice       |HR         |70000 |
|2         |Bob         |Engineering|85000 |
|3         |Charlie     |Engineering|90000 |
|4         |David       |HR         |80000 |
|5         |Eve         |Marketing  |null  |
|6         |null        |Engineering|75000 |
|7         |Grace       |null       |60000 |
|8         |Heidi       |HR         |null  |
|9         |Ivan        |Marketing  |88000 |
|10        |Judy        |null       |null  |
|11        |Kevin       |Engineering|99000 |
|12        |Laura       |Sales      |72000 |
|13        |null        |Sales      |65000 |
|14        |Mallory     |IT         |77000 |
|15        |Oscar       |IT         |81000 |
+----------+------------+-----------+------+



In [11]:
# COUNT(1): Similar to COUNT(*), it counts all rows.
count_all_rows_1 = df.agg({"1": "count"}).alias("total_number_rows")
count_all_rows_1.show()


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `1` cannot be resolved. Did you mean one of the following? [`EmployeeID`, `EmployeeName`, `Department`, `Salary`].

In [12]:
# COUNT(EmployeeName): Counts non-NULL values in the EmployeeName column.
count_non_null = df.agg({"EmployeeName": "count"}).alias("NonNullEmployeeNames")
count_non_null.show()


+-------------------+
|count(EmployeeName)|
+-------------------+
|                 13|
+-------------------+



In [8]:
# COUNT(*): Counts all rows, including those with NULL values in any or all columns.
count_all_rows = df.agg({"*": "count"}).alias("TotalRows")
count_all_rows.show()


+--------+
|count(1)|
+--------+
|      15|
+--------+



In [13]:
from pyspark.sql.functions import count

# Group by Department and use different COUNT functions
df_grouped_count = df.groupBy("Department").agg(
    count("*").alias("TotalRows"),         # Equivalent to COUNT(*)
    count("EmployeeID").alias("CountID"),  # Counts non-NULL EmployeeID
    count("EmployeeName").alias("NonNullEmployeeNames")  # Counts non-NULL EmployeeName
)

df_grouped_count.show(truncate=False)


+-----------+---------+-------+--------------------+
|Department |TotalRows|CountID|NonNullEmployeeNames|
+-----------+---------+-------+--------------------+
|HR         |3        |3      |3                   |
|Engineering|4        |4      |3                   |
|Marketing  |2        |2      |2                   |
|null       |2        |2      |2                   |
|Sales      |2        |2      |1                   |
|IT         |2        |2      |2                   |
+-----------+---------+-------+--------------------+



# Spark SQL

In [4]:

# Register the DataFrame as a temporary table
df.createOrReplaceTempView("Employees")


# SQL Query using COUNT(*)
sql_count_star = """
SELECT COUNT(*) AS TotalRows
FROM Employees
"""

# Execute the query
result_count_star = spark.sql(sql_count_star)
result_count_star.show()


+---------+
|TotalRows|
+---------+
|       15|
+---------+



In [5]:
# SQL Query using COUNT(1)
sql_count_one = """
SELECT COUNT(1) AS TotalRows
FROM Employees
"""

# Execute the query
result_count_one = spark.sql(sql_count_one)
result_count_one.show()


+---------+
|TotalRows|
+---------+
|       15|
+---------+



In [6]:
# SQL Query using COUNT(EmployeeName)
sql_count_col = """
SELECT COUNT(EmployeeName) AS TotalEmployeeNames
FROM Employees
"""

# Execute the query
result_count_col = spark.sql(sql_count_col)
result_count_col.show()


+------------------+
|TotalEmployeeNames|
+------------------+
|                13|
+------------------+



In [7]:
# SQL Query using COUNT(*) and COUNT(EmployeeName) together
sql_count_group_by = """
SELECT Department, 
       COUNT(*) AS TotalRows, 
       COUNT(EmployeeName) AS NonNullEmployeeNames,
       COUNT(Salary) AS NonNullSalaries
FROM Employees
GROUP BY Department
"""

# Execute the query
result_count_group_by = spark.sql(sql_count_group_by)
result_count_group_by.show()


+-----------+---------+--------------------+---------------+
| Department|TotalRows|NonNullEmployeeNames|NonNullSalaries|
+-----------+---------+--------------------+---------------+
|         HR|        3|                   3|              2|
|Engineering|        4|                   3|              4|
|  Marketing|        2|                   2|              1|
|       null|        2|                   2|              1|
|      Sales|        2|                   1|              2|
|         IT|        2|                   2|              2|
+-----------+---------+--------------------+---------------+



# Python

In [5]:
# Full example code showing all methods
import pandas as pd

# Sample data
data = [
    (1, "Alice", "HR", 70000),
    (2, "Bob", "Engineering", 85000),
    (3, "Charlie", "Engineering", 90000),
    (4, "David", "HR", 80000),
    (5, "Eve", "Marketing", None),
    (6, None, "Engineering", 75000),
    (7, "Grace", None, 60000),
    (8, "Heidi", "HR", None),
    (9, "Ivan", "Marketing", 88000),
    (10, "Judy", None, None),
    (11, "Kevin", "Engineering", 99000),
    (12, "Laura", "Sales", 72000),
    (13, None, "Sales", 65000),
    (14, "Mallory", "IT", 77000),
    (15, "Oscar", "IT", 81000)
]

# Create DataFrame
df = pd.DataFrame(data, columns=["EmployeeID", "EmployeeName", "Department", "Salary"])

# Method 1: Count all rows (COUNT(*))
count_all = len(df)
print("Total Rows (COUNT(*)):", count_all)

# Method 2: Count all rows (COUNT(1))
count_one = len(df)
print("Total Rows (COUNT(1)):", count_one)

# Method 3: Count non-NULL values in columns (COUNT(column_name))
count_employee_names = df["EmployeeName"].count()
count_salaries = df["Salary"].count()
print("Total Non-NULL Employee Names (COUNT(EmployeeName)):", count_employee_names)
print("Total Non-NULL Salaries (COUNT(Salary)):", count_salaries)

# Using groupby and agg
grouped_count = df.groupby("Department").agg(
    TotalRows=pd.NamedAgg(column="Department", aggfunc="size"),
    NonNullEmployeeNames=pd.NamedAgg(column="EmployeeName", aggfunc="count"),
    NonNullSalaries=pd.NamedAgg(column="Salary", aggfunc="count")
)
print("\nGrouped Count:")
print(grouped_count)

# Count NULL values
null_employee_names = df["EmployeeName"].isnull().sum()
null_salaries = df["Salary"].isnull().sum()
print("NULL Employee Names:", null_employee_names)
print("NULL Salaries:", null_salaries)


Total Rows (COUNT(*)): 15
Total Rows (COUNT(1)): 15
Total Non-NULL Employee Names (COUNT(EmployeeName)): 13
Total Non-NULL Salaries (COUNT(Salary)): 12

Grouped Count:
             TotalRows  NonNullEmployeeNames  NonNullSalaries
Department                                                   
Engineering          4                     3                4
HR                   3                     3                2
IT                   2                     2                2
Marketing            2                     2                1
Sales                2                     1                2
NULL Employee Names: 2
NULL Salaries: 3
