# Dealing with unclean data

We're going to look at data that may require some cleansing.

In [1]:
# Start a Spark session using the helper module that lives one directory up.
import os
import sys

# Resolve the parent of the current working directory and make sure it is
# importable, so that `init_spark` can be found.
top_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if not (top_dir in sys.path):
    sys.path.append(top_dir)

from init_spark import init_spark

spark = init_spark()
# Leave the session as the last expression so the notebook displays it.
spark

Initializing Spark...
Spark found in :  /Users/sujee/spark
Spark config:
	 spark.app.name=TestApp
	spark.master=local[*]
	executor.memory=2g
	spark.sql.warehouse.dir=/var/folders/lp/qm_skljd2hl4xtps5vw0tdgm0000gn/T/tmpd544opvp
	some_property=some_value
Spark UI running on port 4040


## Step 1: Read the admissions data, which is not so clean

In [2]:
# Read the dirty admissions CSV: the first row is the header and column
# types are inferred from the data.
admissions = spark.read.csv(
    "/data/college-admissions/admission-data-dirty.csv",
    header=True,
    inferSchema=True,
)
print("count ", admissions.count())


count  20


## Step 2: Print out the schema and see the data

In [3]:
# Inspect the inferred column types, then display the first 20 rows.
admissions.printSchema()
admissions.show(n=20)

root
 |-- admit: integer (nullable = true)
 |-- gre: integer (nullable = true)
 |-- gpa: double (nullable = true)
 |-- rank: string (nullable = true)

+-----+----+----+----+
|admit| gre| gpa|rank|
+-----+----+----+----+
|    1| 400|3.23|   4|
|    1| 700|3.56|   1|
|    1| 800| 4.0|   2|
|    0| 500|3.53|   4|
|    0| 560|3.78|   2|
|    0|null|3.35|null|
|    1| 520|null|   3|
|    0| 440|3.17|   2|
|    1| 760| 3.0|   2|
| null| 600|2.82|   4|
|    1| 500| 3.6|   3|
|    0| 500|3.95|   4|
| null| 680|3.27|   2|
|    1| 560|3.59|   2|
|    0| 700|3.65|   2|
|    0| 520|2.98|   2|
|    0| 700|3.92|   2|
|    1| 620| 4.0|   x|
|    0| 640|3.51|   2|
|    1| 600|3.58|   1|
+-----+----+----+----+



## Step 3: Get Summary
See what we get. Note that `describe()` skips null values when computing each statistic.

In [4]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# NOTE: in the output below 'rank' reports max = 'x' -- a sign that this
# column contains non-numeric data.
admissions.describe().show()

+-------+------------------+-----------------+------------------+------------------+
|summary|             admit|              gre|               gpa|              rank|
+-------+------------------+-----------------+------------------+------------------+
|  count|                18|               19|                19|                19|
|   mean|               0.5|594.7368421052631| 3.499473684210527|2.4444444444444446|
| stddev|0.5144957554275265| 109.309368361984|0.3534665422514952|0.9835244081556432|
|    min|                 0|              400|              2.82|                 1|
|    max|                 1|              800|               4.0|                 x|
+-------+------------------+-----------------+------------------+------------------+



In [5]:
## Summary statistics restricted to a single column
admissions.describe('gre').show()

+-------+-----------------+
|summary|              gre|
+-------+-----------------+
|  count|               19|
|   mean|594.7368421052631|
| stddev| 109.309368361984|
|    min|              400|
|    max|              800|
+-------+-----------------+



In [6]:
## Exercise : summarise several columns at once -- here gre and gpa
## Hint : pass 'gpa' alongside 'gre'
admissions.describe('gre', 'gpa').show()

+-------+-----------------+------------------+
|summary|              gre|               gpa|
+-------+-----------------+------------------+
|  count|               19|                19|
|   mean|594.7368421052631| 3.499473684210527|
| stddev| 109.309368361984|0.3534665422514952|
|    min|              400|              2.82|
|    max|              800|               4.0|
+-------+-----------------+------------------+



## Step 4: Drop all null values

In [7]:
# Remove every row that has a null in any column (how="any" is the default).
print("raw data count ", admissions.count())
dropped_na = admissions.na.drop(how="any")
print("after drop count ", dropped_na.count())
dropped_na.show()


raw data count  20
after drop count  16
+-----+---+----+----+
|admit|gre| gpa|rank|
+-----+---+----+----+
|    1|400|3.23|   4|
|    1|700|3.56|   1|
|    1|800| 4.0|   2|
|    0|500|3.53|   4|
|    0|560|3.78|   2|
|    0|440|3.17|   2|
|    1|760| 3.0|   2|
|    1|500| 3.6|   3|
|    0|500|3.95|   4|
|    1|560|3.59|   2|
|    0|700|3.65|   2|
|    0|520|2.98|   2|
|    0|700|3.92|   2|
|    1|620| 4.0|   x|
|    0|640|3.51|   2|
|    1|600|3.58|   1|
+-----+---+----+----+



In [8]:
# Drop a row only when 'admit' or 'gre' is null; nulls in the other
# columns are left in place.
print("raw data count ", admissions.count())
dropped2 = admissions.na.drop(how="any", subset=["admit", "gre"])
print("after drop count ", dropped2.count())
dropped2.show()

raw data count  20
after drop count  17
+-----+---+----+----+
|admit|gre| gpa|rank|
+-----+---+----+----+
|    1|400|3.23|   4|
|    1|700|3.56|   1|
|    1|800| 4.0|   2|
|    0|500|3.53|   4|
|    0|560|3.78|   2|
|    1|520|null|   3|
|    0|440|3.17|   2|
|    1|760| 3.0|   2|
|    1|500| 3.6|   3|
|    0|500|3.95|   4|
|    1|560|3.59|   2|
|    0|700|3.65|   2|
|    0|520|2.98|   2|
|    0|700|3.92|   2|
|    1|620| 4.0|   x|
|    0|640|3.51|   2|
|    1|600|3.58|   1|
+-----+---+----+----+



## Step 5: Fill in the missing values

In [9]:
# Fill nulls with zero. Only the numeric columns are affected: in the output
# below, the string column 'rank' still shows null.
zero_fill = admissions.na.fill(value=0)
zero_fill.show()


+-----+---+----+----+
|admit|gre| gpa|rank|
+-----+---+----+----+
|    1|400|3.23|   4|
|    1|700|3.56|   1|
|    1|800| 4.0|   2|
|    0|500|3.53|   4|
|    0|560|3.78|   2|
|    0|  0|3.35|null|
|    1|520| 0.0|   3|
|    0|440|3.17|   2|
|    1|760| 3.0|   2|
|    0|600|2.82|   4|
|    1|500| 3.6|   3|
|    0|500|3.95|   4|
|    0|680|3.27|   2|
|    1|560|3.59|   2|
|    0|700|3.65|   2|
|    0|520|2.98|   2|
|    0|700|3.92|   2|
|    1|620| 4.0|   x|
|    0|640|3.51|   2|
|    1|600|3.58|   1|
+-----+---+----+----+



In [10]:
# Alternatively, supply a separate default for each column via a dict.
## Exercise : choose a different default value per column
##            (for example, default value for gre = -100)
fill2 = admissions.na.fill(
    {
        'admit': 2,
        'gre': -100,
        'gpa': -1,
        'rank': 10,
    }
)
fill2.show()

+-----+----+----+----+
|admit| gre| gpa|rank|
+-----+----+----+----+
|    1| 400|3.23|   4|
|    1| 700|3.56|   1|
|    1| 800| 4.0|   2|
|    0| 500|3.53|   4|
|    0| 560|3.78|   2|
|    0|-100|3.35|  10|
|    1| 520|-1.0|   3|
|    0| 440|3.17|   2|
|    1| 760| 3.0|   2|
|    2| 600|2.82|   4|
|    1| 500| 3.6|   3|
|    0| 500|3.95|   4|
|    2| 680|3.27|   2|
|    1| 560|3.59|   2|
|    0| 700|3.65|   2|
|    0| 520|2.98|   2|
|    0| 700|3.92|   2|
|    1| 620| 4.0|   x|
|    0| 640|3.51|   2|
|    1| 600|3.58|   1|
+-----+----+----+----+



## Step 6: Replace values

In [11]:
# Swap every gre score of 800 for 1000, touching only the 'gre' column.
admissions.na.replace(to_replace=800, value=1000, subset=['gre']).show()

+-----+----+----+----+
|admit| gre| gpa|rank|
+-----+----+----+----+
|    1| 400|3.23|   4|
|    1| 700|3.56|   1|
|    1|1000| 4.0|   2|
|    0| 500|3.53|   4|
|    0| 560|3.78|   2|
|    0|null|3.35|null|
|    1| 520|null|   3|
|    0| 440|3.17|   2|
|    1| 760| 3.0|   2|
| null| 600|2.82|   4|
|    1| 500| 3.6|   3|
|    0| 500|3.95|   4|
| null| 680|3.27|   2|
|    1| 560|3.59|   2|
|    0| 700|3.65|   2|
|    0| 520|2.98|   2|
|    0| 700|3.92|   2|
|    1| 620| 4.0|   x|
|    0| 640|3.51|   2|
|    1| 600|3.58|   1|
+-----+----+----+----+



In [12]:
# Attempt to replace every rank 4 with rank 5, using integer values
admissions.na.replace(4,5, ['rank']).show()

# Why is this not working? (The rank-4 rows are unchanged in the output below.)
# Hint : print(admissions.schema) and check the data type of the 'rank' column

+-----+----+----+----+
|admit| gre| gpa|rank|
+-----+----+----+----+
|    1| 400|3.23|   4|
|    1| 700|3.56|   1|
|    1| 800| 4.0|   2|
|    0| 500|3.53|   4|
|    0| 560|3.78|   2|
|    0|null|3.35|null|
|    1| 520|null|   3|
|    0| 440|3.17|   2|
|    1| 760| 3.0|   2|
| null| 600|2.82|   4|
|    1| 500| 3.6|   3|
|    0| 500|3.95|   4|
| null| 680|3.27|   2|
|    1| 560|3.59|   2|
|    0| 700|3.65|   2|
|    0| 520|2.98|   2|
|    0| 700|3.92|   2|
|    1| 620| 4.0|   x|
|    0| 640|3.51|   2|
|    1| 600|3.58|   1|
+-----+----+----+----+



## Step 7: Filter out dirty data

In [13]:
# Keep only the rows whose 'rank' is not null.
# Each count() call triggers its own Spark job, so the cleaned count is
# computed once and reused for both printouts instead of being recomputed.
print("raw data count ", admissions.count())
a = admissions.filter(admissions.rank.isNotNull())
cleaned_count = a.count()
print("cleaned count ", cleaned_count)
a.show()
print(cleaned_count)

raw data count  20
cleaned count  19
+-----+---+----+----+
|admit|gre| gpa|rank|
+-----+---+----+----+
|    1|400|3.23|   4|
|    1|700|3.56|   1|
|    1|800| 4.0|   2|
|    0|500|3.53|   4|
|    0|560|3.78|   2|
|    1|520|null|   3|
|    0|440|3.17|   2|
|    1|760| 3.0|   2|
| null|600|2.82|   4|
|    1|500| 3.6|   3|
|    0|500|3.95|   4|
| null|680|3.27|   2|
|    1|560|3.59|   2|
|    0|700|3.65|   2|
|    0|520|2.98|   2|
|    0|700|3.92|   2|
|    1|620| 4.0|   x|
|    0|640|3.51|   2|
|    1|600|3.58|   1|
+-----+---+----+----+

19


In [14]:
# Keep only the rows whose 'rank' is one of the valid categories 1-4.
# 'rank' was inferred as a string column (see the schema printed in Step 2),
# so it is compared against string literals. Passing integers would rely on
# Spark's implicit string/integer coercion, which may behave differently
# (or fail on values such as 'x') depending on the Spark version and ANSI
# settings.
# Rows with a null rank are dropped as well, since isin() on null is not true.
print("raw data count ", admissions.count())
b = admissions.filter(admissions.rank.isin(["1", "2", "3", "4"]))
print("clean count  ", b.count())
b.show()


raw data count  20
clean count   18
+-----+---+----+----+
|admit|gre| gpa|rank|
+-----+---+----+----+
|    1|400|3.23|   4|
|    1|700|3.56|   1|
|    1|800| 4.0|   2|
|    0|500|3.53|   4|
|    0|560|3.78|   2|
|    1|520|null|   3|
|    0|440|3.17|   2|
|    1|760| 3.0|   2|
| null|600|2.82|   4|
|    1|500| 3.6|   3|
|    0|500|3.95|   4|
| null|680|3.27|   2|
|    1|560|3.59|   2|
|    0|700|3.65|   2|
|    0|520|2.98|   2|
|    0|700|3.92|   2|
|    0|640|3.51|   2|
|    1|600|3.58|   1|
+-----+---+----+----+

