# Dealing with unclean data

We're going to look at data that may require some cleansing.

In [2]:
import numpy as np
import pandas as pd

# Show where the Spark UI can be reached. The port is taken as the third
# ':'-separated piece of sc.uiWebUrl (assumes the form http://host:port).
print('Spark UI running on http://YOURIPADDRESS:', sc.uiWebUrl.split(':')[2], sep='')

Spark UI running on http://YOURIPADDRESS:4040


## Read the admissions data that is not so clean

In [3]:
# Load the "dirty" admissions CSV: the first row supplies the column names
# and the column types are inferred from the data.
admissions = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/data/college-admissions/admission-data-dirty.csv")
)

# Preview up to 20 rows, then report the total row count as the cell output.
admissions.show(n=20)
admissions.count()

+-----+----+----+----+
|admit| gre| gpa|rank|
+-----+----+----+----+
|    1| 400|3.23|   4|
|    1| 700|3.56|   1|
|    1| 800| 4.0|   2|
|    0| 500|3.53|   4|
|    0| 560|3.78|   2|
|    0|null|3.35|null|
|    1| 520|null|   3|
|    0| 440|3.17|   2|
|    1| 760| 3.0|   2|
| null| 600|2.82|   4|
|    1| 500| 3.6|   3|
|    0| 500|3.95|   4|
| null| 680|3.27|   2|
|    1| 560|3.59|   2|
|    0| 700|3.65|   2|
|    0| 520|2.98|   2|
|    0| 700|3.92|   2|
|    1| 620| 4.0|   x|
|    0| 640|3.51|   2|
|    1| 600|3.58|   1|
+-----+----+----+----+



20

## Get Summary
See what we get. `describe()` skips null values when computing each statistic, so the `count` row can differ from column to column.

In [4]:
# Build the summary-statistics frame for all columns, then print it.
summary_stats = admissions.describe()
summary_stats.show()

+-------+------------------+-----------------+------------------+------------------+
|summary|             admit|              gre|               gpa|              rank|
+-------+------------------+-----------------+------------------+------------------+
|  count|                18|               19|                19|                19|
|   mean|               0.5|594.7368421052631| 3.499473684210527|2.4444444444444446|
| stddev|0.5144957554275265| 109.309368361984|0.3534665422514952|0.9835244081556432|
|    min|                 0|              400|              2.82|                 1|
|    max|                 1|              800|               4.0|                 x|
+-------+------------------+-----------------+------------------+------------------+



## Drop all null values

In [None]:
# Drop every row that has a null in any column.
dropped_na = admissions.dropna(how='any')
dropped_na.show()
print(dropped_na.count())

# Drop a row only when 'admit' or 'gre' is null; nulls in the other
# columns are kept.
dropped2 = admissions.dropna(how='any', subset=['admit', 'gre'])
dropped2.show()
print(dropped2.count())

## Fill in the values

In [None]:
# fill every thing with zero
admissions.na.fill(0).show()

# or we can specify per column default value
admissions.na.fill({'admit': 2, 'gre': -100 , 'gpa':-1, 'rank':10}).show()

## Replace values

In [None]:
# replace all 800 gre into 1000
admissions.na.replace(800, 1000, ['gre']).show()

# replace all rank 4 into rank 5
admissions.na.replace(4,5, ['rank']).show()

# why is this not working?
# Hint : print(admissions.schema)

## Filter out dirty data

In [None]:
# Row count before filtering, for comparison with the counts below.
print(admissions.count())

# Keep only the rows whose rank is not null.
a = admissions.where(admissions['rank'].isNotNull())
a.show()
print(a.count())

# Keep only the rows whose rank is one of 1, 2, 3 or 4.
b = admissions.where(admissions['rank'].isin(1, 2, 3, 4))
b.show()
print(b.count())

# NOTE(review): left disabled. A pyspark Column does not appear to provide
# an is_integer() method; confirm before enabling this line.
#c = admissions.where(admissions.rank.is_integer())