In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start a Spark session named 'filters', or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('filters')
    .getOrCreate()
)

In [3]:
# Load the raw CSV: the first row holds the column names, and column types
# are inferred from the data instead of defaulting to strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("null.csv")
)

In [4]:
# Preview the raw data, missing values included (show() defaults made explicit).
df.show(n=20, truncate=True)

+----+----------+------+
| age|experience|salary|
+----+----------+------+
|  25|         2| 45000|
|  32|         7| 62000|
|NULL|         5| 54000|
|  45|      NULL| 90000|
|  29|         4|  NULL|
|  38|        12| 78000|
|  41|      NULL| 85000|
|NULL|         3| 50000|
|  27|         1| 42000|
|  50|        25|  NULL|
|  36|        10| 70000|
|  31|      NULL| 61000|
|NULL|         6| 56000|
|  42|        15|  NULL|
|  28|         2| 48000|
|  34|         8| 65000|
|  39|      NULL| 80000|
|NULL|         4| 52000|
|  46|        20|  NULL|
|  30|         5| 58000|
+----+----------+------+



In [9]:
from pyspark.ml.feature import Imputer

# Columns whose missing values are filled in; each one gets a companion
# "<name>_imputed" output column so the original columns stay untouched.
impute_cols = ['age', 'experience', 'salary']

imputer = Imputer(
    strategy='median',
    inputCols=impute_cols,
    outputCols=[name + "_imputed" for name in impute_cols],
)

In [10]:
# Fit the imputer on the raw frame (this computes the per-column fill values),
# then apply the fitted model to the same frame to add the *_imputed columns.
imputer_model = imputer.fit(df)
df_m = imputer_model.transform(df)

In [11]:
# Original columns side by side with their imputed counterparts.
df_m.show(n=20, truncate=True)

+----+----------+------+-----------+------------------+--------------+
| age|experience|salary|age_imputed|experience_imputed|salary_imputed|
+----+----------+------+-----------+------------------+--------------+
|  25|         2| 45000|         25|                 2|         45000|
|  32|         7| 62000|         32|                 7|         62000|
|NULL|         5| 54000|         34|                 5|         54000|
|  45|      NULL| 90000|         45|                 5|         90000|
|  29|         4|  NULL|         29|                 4|         58000|
|  38|        12| 78000|         38|                12|         78000|
|  41|      NULL| 85000|         41|                 5|         85000|
|NULL|         3| 50000|         34|                 3|         50000|
|  27|         1| 42000|         27|                 1|         42000|
|  50|        25|  NULL|         50|                25|         58000|
|  36|        10| 70000|         36|                10|         70000|
|  31|

In [12]:
# Keep only the imputed columns and give them short names
# (imputed column -> final column name, in output order).
rename_map = {
    'age_imputed': 'age',
    'experience_imputed': 'exp',
    'salary_imputed': 'salary',
}
df_f = df_m.select(*(df_m[src].alias(dst) for src, dst in rename_map.items()))

In [13]:
# Cleaned frame: no missing values, columns age / exp / salary.
df_f.show(n=20, truncate=True)

+---+---+------+
|age|exp|salary|
+---+---+------+
| 25|  2| 45000|
| 32|  7| 62000|
| 34|  5| 54000|
| 45|  5| 90000|
| 29|  4| 58000|
| 38| 12| 78000|
| 41|  5| 85000|
| 34|  3| 50000|
| 27|  1| 42000|
| 50| 25| 58000|
| 36| 10| 70000|
| 31|  5| 61000|
| 34|  6| 56000|
| 42| 15| 58000|
| 28|  2| 48000|
| 34|  8| 65000|
| 39|  5| 80000|
| 34|  4| 52000|
| 46| 20| 58000|
| 30|  5| 58000|
+---+---+------+



## Filter Operations

In [16]:
# Rows with fewer than 5 years of experience, filtered with a Column expression.
# The comparison is strict, so rows with exp == 5 are not included.
df_juniors = df_f.where(df_f['exp'] < 5)

In [17]:
# Display the junior subset.
df_juniors.show(n=20, truncate=True)

+---+---+------+
|age|exp|salary|
+---+---+------+
| 25|  2| 45000|
| 29|  4| 58000|
| 34|  3| 50000|
| 27|  1| 42000|
| 28|  2| 48000|
| 34|  4| 52000|
+---+---+------+



In [19]:
# Rows with more than 5 years of experience, filtered with a SQL expression
# string instead of a Column expression.
# NOTE(review): the comparison is strict, so rows with exp == 5 are excluded
# here as well as from the `exp < 5` filter -- confirm that is intended.
df_seniors = df_f.where("exp > 5")

In [20]:
# Display the senior subset.
df_seniors.show(n=20, truncate=True)

+---+---+------+
|age|exp|salary|
+---+---+------+
| 32|  7| 62000|
| 38| 12| 78000|
| 50| 25| 58000|
| 36| 10| 70000|
| 34|  6| 56000|
| 42| 15| 58000|
| 34|  8| 65000|
| 46| 20| 58000|
+---+---+------+



In [22]:
# Rows whose salary lies strictly between 20000 and 80000 (both bounds excluded).
# The two conditions are applied as successive filters, which is the same as AND.
df_salary = (
    df_f
    .where(df_f.salary > 20000)
    .where(df_f.salary < 80000)
)

In [23]:
# Display the salary-band subset in its original row order.
df_salary.show(n=20, truncate=True)

+---+---+------+
|age|exp|salary|
+---+---+------+
| 25|  2| 45000|
| 32|  7| 62000|
| 34|  5| 54000|
| 29|  4| 58000|
| 38| 12| 78000|
| 34|  3| 50000|
| 27|  1| 42000|
| 50| 25| 58000|
| 36| 10| 70000|
| 31|  5| 61000|
| 34|  6| 56000|
| 42| 15| 58000|
| 28|  2| 48000|
| 34|  8| 65000|
| 34|  4| 52000|
| 46| 20| 58000|
| 30|  5| 58000|
+---+---+------+



In [27]:
# Same subset ordered by salary, lowest first (ascending is the default order).
df_salary_sorted = df_salary.orderBy('salary')
df_salary_sorted.show(n=20, truncate=True)

+---+---+------+
|age|exp|salary|
+---+---+------+
| 27|  1| 42000|
| 25|  2| 45000|
| 28|  2| 48000|
| 34|  3| 50000|
| 34|  4| 52000|
| 34|  5| 54000|
| 34|  6| 56000|
| 29|  4| 58000|
| 50| 25| 58000|
| 42| 15| 58000|
| 46| 20| 58000|
| 30|  5| 58000|
| 31|  5| 61000|
| 32|  7| 62000|
| 34|  8| 65000|
| 36| 10| 70000|
| 38| 12| 78000|
+---+---+------+

