In [None]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Read the headerless CSV with schema inference and give the two
# auto-named columns (_c0, _c1) meaningful names in a single pipeline.
pointDF = (
    spark.read
    .option('inferSchema', 'true')
    .csv('datasets/points-null-values.txt')
    .withColumnRenamed('_c0', 'x')
    .withColumnRenamed('_c1', 'y')
)

# Row count includes rows with nulls; describe() counts only non-null values
# per column, so the two numbers differ when values are missing.
print('Total : ', pointDF.count())
pointDF.describe().show()

Total :  21
+-------+-----------------+-----------------+
|summary|                x|                y|
+-------+-----------------+-----------------+
|  count|               17|               18|
|   mean|         588289.0|555612.3333333334|
| stddev|2425344.472743975|2357009.932025095|
|    min|                5|                5|
|    max|         10000008|         10000006|
+-------+-----------------+-----------------+



In [None]:
# Keep only the rows whose x value is present.
# Rows with a null y are retained - only x is checked here.
notNullDF = pointDF.filter(pointDF['x'].isNotNull())

print('Total :', notNullDF.count())
notNullDF.describe().show()

Total : 17
+-------+-----------------+-----------------+
|summary|                x|                y|
+-------+-----------------+-----------------+
|  count|               17|               15|
|   mean|         588289.0|666720.0666666667|
| stddev|2425344.472743975|2581975.785142247|
|    min|                5|                5|
|    max|         10000008|         10000006|
+-------+-----------------+-----------------+



In [None]:
# Show the points as loaded so the null entries are visible.
pointDF.show()

# Substitute 0 for the nulls in a new DataFrame; pointDF itself is not reassigned.
replacedDF = pointDF.fillna(0)
replacedDF.show()

+----+----+
|   x|   y|
+----+----+
|   5|   5|
| 105| 107|
| 105| 106|
|null|   6|
|   6|   6|
| 106|null|
|   7|   7|
|null|null|
| 107| 107|
|   6|   7|
|   7|null|
| 108| 108|
|   8|   6|
|null| 108|
|   5|   8|
| 107| 106|
|   8|   8|
|null| 107|
| 107| 108|
| 108| 106|
+----+----+
only showing top 20 rows

+---+---+
|  x|  y|
+---+---+
|  5|  5|
|105|107|
|105|106|
|  0|  6|
|  6|  6|
|106|  0|
|  7|  7|
|  0|  0|
|107|107|
|  6|  7|
|  7|  0|
|108|108|
|  8|  6|
|  0|108|
|  5|  8|
|107|106|
|  8|  8|
|  0|107|
|107|108|
|108|106|
+---+---+
only showing top 20 rows



In [None]:
# Per-column replacement values: a null x becomes 0, a null y becomes 1.
# pointDF is rebound to the filled frame, so the null-bearing version is
# no longer reachable under this name after the cell runs.
pointDF = pointDF.fillna({'x': 0, 'y': 1})
pointDF.show()

+---+---+
|  x|  y|
+---+---+
|  5|  5|
|105|107|
|105|106|
|  0|  6|
|  6|  6|
|106|  1|
|  7|  7|
|  0|  1|
|107|107|
|  6|  7|
|  7|  1|
|108|108|
|  8|  6|
|  0|108|
|  5|  8|
|107|106|
|  8|  8|
|  0|107|
|107|108|
|108|106|
+---+---+
only showing top 20 rows



In [None]:
from pyspark.ml.feature import Imputer

# Reload the raw points so the nulls are back in place for imputation.
pointDF = (
    spark.read
    .option('inferSchema', 'true')
    .csv('datasets/points-null-values.txt')
    .withColumnRenamed('_c0', 'x')
    .withColumnRenamed('_c1', 'y')
)

# Using a mean imputer might not be a good idea if you have outliers:
# first filter out the outliers and then use the mean imputer,
# or use a median imputer instead.
meanImputer = Imputer(strategy='mean', inputCol='x', outputCol='x_Imputed')

# fit() computes the replacement value for x; transform() adds the
# x_Imputed column with that value substituted for the nulls.
resultDF = (
    meanImputer
    .fit(pointDF)
    .transform(pointDF)
)
resultDF.show(1000)

+--------+--------+---------+
|       x|       y|x_Imputed|
+--------+--------+---------+
|       5|       5|        5|
|     105|     107|      105|
|     105|     106|      105|
|    null|       6|   588289|
|       6|       6|        6|
|     106|    null|      106|
|       7|       7|        7|
|    null|    null|   588289|
|     107|     107|      107|
|       6|       7|        6|
|       7|    null|        7|
|     108|     108|      108|
|       8|       6|        8|
|    null|     108|   588289|
|       5|       8|        5|
|     107|     106|      107|
|       8|       8|        8|
|    null|     107|   588289|
|     107|     108|      107|
|     108|     106|      108|
|10000008|10000006| 10000008|
+--------+--------+---------+



In [None]:
from pyspark.ml.feature import Imputer

# Reload the raw points (nulls and outlier row included).
pointDF = (
    spark.read
    .option('inferSchema', 'true')
    .csv('datasets/points-null-values.txt')
    .withColumnRenamed('_c0', 'x')
    .withColumnRenamed('_c1', 'y')
)
pointDF.describe().show()

# Derive the outlier cutoff from the data instead of pasting the stddev
# printed by describe() into the code, so the cell stays correct when the
# input file changes.
xStddev = pointDF.agg({'x': 'stddev'}).first()[0]
if xStddev is None:
    # The aggregate is null when there are no usable x values to compute it from.
    raise ValueError("Cannot compute stddev of 'x'; no outlier cutoff available")
outlierCutoff = 2 * xStddev

# Keep rows whose x is below twice the standard deviation.
# NOTE: a comparison against a null x evaluates to null, so rows with a
# missing x are dropped by this filter as well, not just the outliers.
noOutliersDF = pointDF.filter(pointDF['x'] < outlierCutoff)
noOutliersDF.show(100)

+-------+-----------------+-----------------+
|summary|                x|                y|
+-------+-----------------+-----------------+
|  count|               17|               18|
|   mean|         588289.0|555612.3333333334|
| stddev|2425344.472743975|2357009.932025095|
|    min|                5|                5|
|    max|         10000008|         10000006|
+-------+-----------------+-----------------+

+---+----+
|  x|   y|
+---+----+
|  5|   5|
|105| 107|
|105| 106|
|  6|   6|
|106|null|
|  7|   7|
|107| 107|
|  6|   7|
|  7|null|
|108| 108|
|  8|   6|
|  5|   8|
|107| 106|
|  8|   8|
|107| 108|
|108| 106|
+---+----+



In [None]:
# Median imputer: fill missing x values with the median of the observed ones.
from pyspark.ml.feature import Imputer

# Reload the raw points so the nulls are present again.
pointDF = (
    spark.read
    .option('inferSchema', 'true')
    .csv('datasets/points-null-values.txt')
    .withColumnRenamed('_c0', 'x')
    .withColumnRenamed('_c1', 'y')
)

medianImputer = Imputer(strategy='median', inputCol='x', outputCol='x_Imputed')

# fit() determines the replacement value; transform() appends x_Imputed.
resultDF = (
    medianImputer
    .fit(pointDF)
    .transform(pointDF)
)
resultDF.show(1000)

+--------+--------+---------+
|       x|       y|x_Imputed|
+--------+--------+---------+
|       5|       5|        5|
|     105|     107|      105|
|     105|     106|      105|
|    null|       6|      105|
|       6|       6|        6|
|     106|    null|      106|
|       7|       7|        7|
|    null|    null|      105|
|     107|     107|      107|
|       6|       7|        6|
|       7|    null|        7|
|     108|     108|      108|
|       8|       6|        8|
|    null|     108|      105|
|       5|       8|        5|
|     107|     106|      107|
|       8|       8|        8|
|    null|     107|      105|
|     107|     108|      107|
|     108|     106|      108|
|10000008|10000006| 10000008|
+--------+--------+---------+

