In [1]:
import pyspark.sql.functions as F

# Define a dataset.

df = sc.parallelize([
    (10, '', 10000), 
    (20, 'Female', 30000), 
    (None, 'Male', 80000), 
    (None, 'Male', 5000)
]).toDF(["age", "gender", "income"])

df.show()



+----+------+------+
| age|gender|income|
+----+------+------+
|  10|      | 10000|
|  20|Female| 30000|
|null|  Male| 80000|
|null|  Male|  5000|
+----+------+------+



                                                                                

In [2]:
df.describe().show()



+-------+------------------+------+-----------------+
|summary|               age|gender|           income|
+-------+------------------+------+-----------------+
|  count|                 2|     4|                4|
|   mean|              15.0|  null|          31250.0|
| stddev|7.0710678118654755|  null|34247.87098005753|
|    min|                10|      |             5000|
|    max|                20|  Male|            80000|
+-------+------------------+------+-----------------+



                                                                                

In [6]:
# Treat Null Value (None) with Average one.

avg_age = df.na.drop().agg(F.avg("age")).collect()[0][0]

sparkf_replaceNull = F.udf(lambda x: avg_age if x is None else x)

no_null_df = df.withColumn('age', sparkf_replaceNull(F.col('age')))

no_null_df.show()

+----+------+------+
| age|gender|income|
+----+------+------+
|  10|      | 10000|
|  20|Female| 30000|
|15.0|  Male| 80000|
|15.0|  Male|  5000|
+----+------+------+



In [7]:
# Treat Missing Value with Defined Values.

treat_missing = F.udf(lambda x: "Male_Assume" if x == "" else x)

no_missing_df = no_null_df.withColumn(
    'new_gender', 
    treat_missing(no_null_df.gender)
)

no_missing_df.show()

+----+------+------+-----------+
| age|gender|income| new_gender|
+----+------+------+-----------+
|  10|      | 10000|Male_Assume|
|  20|Female| 30000|     Female|
|15.0|  Male| 80000|       Male|
|15.0|  Male|  5000|       Male|
+----+------+------+-----------+



In [8]:
# Treat Outliner with Remove one.

no_outlier_df = no_missing_df.filter(F.col('income') >= 10000)

no_outlier_df.show()

+----+------+------+-----------+
| age|gender|income| new_gender|
+----+------+------+-----------+
|  10|      | 10000|Male_Assume|
|  20|Female| 30000|     Female|
|15.0|  Male| 80000|       Male|
+----+------+------+-----------+



# Filter column names using 'startswith'

In [9]:
df.select([
    x for x in df.columns
    if x.startswith('in')
]).show()

+------+
|income|
+------+
| 10000|
| 30000|
| 80000|
|  5000|
+------+

