In [62]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from datetime import datetime

# Start a Spark session, or reuse the one already running in this kernel
spark = (
    SparkSession.builder
    .appName("CreateDFWithPartialNulls")
    .getOrCreate()
)

# Schema: id, user_id, first_login_timestamp, location -- every field is nullable
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("user_id", StringType(), True)
    .add("first_login_timestamp", TimestampType(), True)
    .add("location", StringType(), True)
)

# Sample rows; None marks a missing value.
# Timestamps are given to the hour (minutes and seconds default to 0).
data = [
    (1, "user_1", datetime(2023, 1, 1, 8), "New York"),
    (2, None, datetime(2023, 1, 2, 9), "Los Angeles"),
    (3, "user_3", None, "Chicago"),
    (4, "user_4", datetime(2023, 1, 4, 10), None),
    (5, None, None, "Houston"),
    (6, "user_6", datetime(2023, 1, 6, 11), "Phoenix"),
    (7, None, datetime(2023, 1, 7, 12), None),
    (8, "user_8", None, "Philadelphia"),
    (9, "user_9", datetime(2023, 1, 9, 13), None),
    (10, None, None, "San Francisco"),
]

# Build the DataFrame from the rows using the explicit schema
df = spark.createDataFrame(data, schema)

# Print the DataFrame contents as a text table
df.show()


+---+-------+---------------------+-------------+
| id|user_id|first_login_timestamp|     location|
+---+-------+---------------------+-------------+
|  1| user_1|  2023-01-01 08:00:00|     New York|
|  2|   null|  2023-01-02 09:00:00|  Los Angeles|
|  3| user_3|                 null|      Chicago|
|  4| user_4|  2023-01-04 10:00:00|         null|
|  5|   null|                 null|      Houston|
|  6| user_6|  2023-01-06 11:00:00|      Phoenix|
|  7|   null|  2023-01-07 12:00:00|         null|
|  8| user_8|                 null| Philadelphia|
|  9| user_9|  2023-01-09 13:00:00|         null|
| 10|   null|                 null|San Francisco|
+---+-------+---------------------+-------------+



### Static Approach

In [63]:
from pyspark.sql.functions import count, isnull, col, when

In [64]:
# count("*") counts every row, whereas count(<column>) skips rows where that
# column is null, so each per-column figure is the number of non-null values.
df.select(
    count("*").alias("total_count"),
    count(col("user_id")).alias("user_id_count"),
    count(col("first_login_timestamp")).alias("flt_count"),
    count(col("location")).alias("location_count"),
).show()

+-----------+-------------+---------+--------------+
|total_count|user_id_count|flt_count|location_count|
+-----------+-------------+---------+--------------+
|         10|            6|        6|             7|
+-----------+-------------+---------+--------------+



In [6]:
# when(cond, 1) with no otherwise() yields null for rows that fail the condition,
# and count() ignores nulls, so each expression counts only the rows where the
# column IS null.
df.select(
    count("*").alias("total_count"),
    count(when(col("user_id").isNull(), 1)).alias("user_nulls_count"),
    count(when(col("first_login_timestamp").isNull(), 1)).alias("flt_nulls_count"),
    count(when(col("location").isNull(), 1)).alias("location_nulls_count"),
).show()

+-----------+----------------+---------------+--------------------+
|total_count|user_nulls_count|flt_nulls_count|location_nulls_count|
+-----------+----------------+---------------+--------------------+
|         10|               4|              4|                   3|
+-----------+----------------+---------------+--------------------+



#### To do this dynamically for all columns, a Python list comprehension and the unpacking operator (`*`) are used

In [17]:
# Raw Python rows (a list of tuples) that the DataFrame was built from
print(data)

[(1, 'user_1', datetime.datetime(2023, 1, 1, 8, 0), 'New York'), (2, None, datetime.datetime(2023, 1, 2, 9, 0), 'Los Angeles'), (3, 'user_3', None, 'Chicago'), (4, 'user_4', datetime.datetime(2023, 1, 4, 10, 0), None), (5, None, None, 'Houston'), (6, 'user_6', datetime.datetime(2023, 1, 6, 11, 0), 'Phoenix'), (7, None, datetime.datetime(2023, 1, 7, 12, 0), None), (8, 'user_8', None, 'Philadelphia'), (9, 'user_9', datetime.datetime(2023, 1, 9, 13, 0), None), (10, None, None, 'San Francisco')]


In [26]:
# Plain for-loop: print the user_id field (tuple index 1) of every row, one per line
for row in data:
    print(row[1])

user_1
None
user_3
user_4
None
user_6
None
user_8
user_9
None


In [31]:
# Same values via a list comprehension: the expression to the left of "for"
# is evaluated for each element of "data", and the results are collected
# into a new list on the fly.
print([record[1] for record in data])

['user_1', None, 'user_3', 'user_4', None, 'user_6', None, 'user_8', 'user_9', None]


In [54]:
# The unpacking operator (*) spreads the list so that each element is passed
# to print() as its own positional argument, instead of the list as a whole.
print(*[record[1] for record in data])

user_1 None user_3 user_4 None user_6 None user_8 user_9 None


In [38]:
# Column names of the DataFrame; these drive the dynamic aggregations below
df.columns

['id', 'user_id', 'first_login_timestamp', 'location']

In [49]:
# Keep the column names under their own name so they can be inspected
cols_list = df.columns

In [50]:
# Display the stored column names
cols_list

['id', 'user_id', 'first_login_timestamp', 'location']

In [51]:
# df.columns is a plain Python list, so it can be iterated in a comprehension
type(cols_list)

list

In [65]:
# One count() expression per column, generated from df.columns and unpacked
# into agg(); with no alias, each result column is named "count(<column>)".
columns_counts = df.agg(*(count(col(name)) for name in df.columns))

In [66]:
# Print the single-row result holding the non-null count of every column
columns_counts.show()

+---------+--------------+----------------------------+---------------+
|count(id)|count(user_id)|count(first_login_timestamp)|count(location)|
+---------+--------------+----------------------------+---------------+
|       10|             6|                           6|              7|
+---------+--------------+----------------------------+---------------+



In [15]:
# Null count for every column, collapsed into one row: build one
# "<column>_nulls_count" expression per column and unpack them into agg().
df.agg(
    *(
        count(when(isnull(col(name)), 1)).alias(f"{name}_nulls_count")
        for name in df.columns
    )
)

id_nulls_count,user_id_nulls_count,first_login_timestamp_nulls_count,location_nulls_count
0,4,4,3
