## Data Cleansing

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the rest of this notebook.
session_builder = SparkSession.builder.appName('learn_data_cleansing')
spark = session_builder.getOrCreate()

In [None]:
# Load the customer CSV. "header" treats the first row as column names and
# "inferSchema" asks Spark to detect column types; the option key must be
# spelled exactly "inferSchema", otherwise it is ignored and every column is
# read as a string.
customers_df = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("customers.csv")
)

# Register the DataFrame as a temporary view named "customers_table".
customers_df.createOrReplaceTempView("customers_table")

### Assessing Data

In [None]:
# DATA TYPES & SUMMARY STATISTICS

# printSchema() prints the schema itself and returns None, so it must not be
# wrapped in print() (that only adds a stray "None" line to the output).
customers_df.printSchema()
customers_df.summary().show()

root
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- home_address: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)

None
+-------+------------------+-------------+-----------------+------------------+--------------------+-----------------+---------+--------------------+---------+
|summary|       customer_id|customer_name|           gender|               age|        home_address|         zip_code|     city|               state|  country|
+-------+------------------+-------------+-----------------+------------------+--------------------+-----------------+---------+--------------------+---------+
|  count|              1007|         1007|              989|              1007|                1007|             1007|     1007|                1007|   

**MISSING VALUE**

In [None]:
from pyspark.sql.functions import isnan, isnull

# Rows whose gender is either NULL or NaN.
gender_is_missing = isnull('gender') | isnan('gender')
missing_gender_rows = customers_df.where(gender_is_missing)

missing_gender_rows.show()

print("jumlah missing value : ", missing_gender_rows.count())

+-----------+-------------+------+---+--------------------+--------+--------------------+--------------------+---------+
|customer_id|customer_name|gender|age|        home_address|zip_code|                city|               state|  country|
+-----------+-------------+------+---+--------------------+--------+--------------------+--------------------+---------+
|         39|     fulan 39|  NULL| 80|7440 Cameron Esta...|    4622|North Victoriache...|  Northern Territory|Australia|
|        168|    fulan 168|  NULL| 27|2781 Berge MallSu...|    1975|      North Leoburgh|   Western Australia|Australia|
|        322|    fulan 322|  NULL| 30|593 Becker Circle...|    1640|          Jacobiview|   Western Australia|Australia|
|        393|    fulan 393|  NULL| 34|5158 Levi HillSui...|    1474|          Johnsburgh|          Queensland|Australia|
|        442|    fulan 442|  NULL| 26|5157 Feil RoadApt...|    7249|          Port Chloe|     New South Wales|Australia|
|        720|    fulan 720|  NUL

**PERIKSA DATA DUPLIKAT**

In [None]:
# Number of duplicates = total rows minus distinct rows.
total_rows = customers_df.count()
unique_rows = customers_df.distinct().count()
print("jumlah duplikasi : ", total_rows - unique_rows)

jumlah duplikasi :  6


### Data Cleansing

In [None]:
# CHANGE DATA TYPES WITH CAST()

from pyspark.sql.functions import col

# Cast the identifier-like columns to string so they are not treated as
# numbers in mathematical operations.
new_customers_df = customers_df
for column_name in ("customer_id", "zip_code"):
    new_customers_df = new_customers_df.withColumn(
        column_name, col(column_name).cast("string")
    )

new_customers_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- home_address: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)



Pada contoh kode di atas, kita mengubah tipe data pada kolom "customer_id" dan "zip_code" menjadi "string". Hal ini dilakukan untuk mencegah adanya operasi matematis terhadap kedua kolom tersebut.

#### Dropping

Menghapus seluruh kolom/baris yang memiliki missing value

In [None]:
# Drop the rows that contain missing values. The result is not assigned here,
# so new_customers_df itself is unchanged by this cell.
new_customers_df.na.drop()

# new_customers_df.na.drop(how="any") # Drop a row if it contains any missing value (even just one)
# customers_df.na.drop(how="all") # Drop a row only if all of its values are null or NaN
# new_customers_df.na.drop(thresh=2) # Keep only rows with at least 2 non-null values (rows with fewer are dropped)


DataFrame[customer_id: string, customer_name: string, gender: string, age: string, home_address: string, zip_code: string, city: string, state: string, country: string]

In [None]:
# KEEP ONLY THE ROWS WITHOUT MISSING VALUES

new_customers_df = new_customers_df.dropna()

# Re-count the rows with a missing gender to confirm none are left.
remaining_missing = new_customers_df.where(isnull('gender') | isnan('gender')).count()
print("jumlah missing value : ", remaining_missing)

jumlah missing value :  0


#### Imputation

Metode ini bekerja dengan cara mengisi (fill) missing value dengan nilai tertentu. Hal ini tentunya akan mencegah hilangnya informasi akibat missing value.

In [None]:
# Fill missing gender values with the label the dataset already uses
# ("Prefer not to say"), so imputed rows do not form a separate lowercase
# category. The result is not assigned, so customers_df is left unchanged.
customers_df.na.fill("Prefer not to say", subset=["gender"])

DataFrame[customer_id: string, customer_name: string, gender: string, age: string, home_address: string, zip_code: string, city: string, state: string, country: string]

#### Interpolation

Interpolasi merupakan salah satu pendekatan numerik yang digunakan untuk menghitung titik data baru berdasarkan range data yang sudah ada.

Perhitungan tersebut membuat metode ini sangat cocok digunakan untuk menangani missing value pada data time series.

In [None]:
import pyspark.pandas as ps

# Load the CSV through the pandas API on Spark and linearly interpolate the
# "age" column; the interpolated series is only displayed, not stored.
customers_df_pandas = ps.read_csv("customers.csv")
age_series = customers_df_pandas["age"]
age_series.interpolate(method='linear')

**Atasi invalid value**

In [None]:
# Show the customers whose recorded age is greater than 100.
age_over_100 = new_customers_df["age"] > 100
new_customers_df.filter(age_over_100).show()

+-----------+-------------+-----------------+---+--------------------+--------+----------+------------------+---------+
|customer_id|customer_name|           gender|age|        home_address|zip_code|      city|             state|  country|
+-----------+-------------+-----------------+---+--------------------+--------+----------+------------------+---------+
|        216|    fulan 216|Prefer not to say|500|038 Haley MewsApt...|    3991| Bayertown|Northern Territory|Australia|
|        961|    fulan 961|Prefer not to say|700|29 Farrell Parade...|    6528|New Joseph|   South Australia|Australia|
+-----------+-------------+-----------------+---+--------------------+--------+----------+------------------+---------+



In [None]:
from pyspark.sql.functions import when

# Replace the two out-of-range ages (700 -> 70, 500 -> 50) and keep every
# other age as it is.
age = new_customers_df["age"]
corrected_age = when(age == 700, 70).when(age == 500, 50).otherwise(age)
new_customers_df = new_customers_df.withColumn("age", corrected_age)

new_customers_df.summary().show()

+-------+------------------+-------------+-----------------+------------------+--------------------+-----------------+---------+--------------------+---------+
|summary|       customer_id|customer_name|           gender|               age|        home_address|         zip_code|     city|               state|  country|
+-------+------------------+-------------+-----------------+------------------+--------------------+-----------------+---------+--------------------+---------+
|  count|               989|          989|              989|               989|                 989|              989|      989|                 989|      989|
|   mean|498.27805864509605|         NULL|             NULL|49.876643073811934|                NULL|5026.199191102123|     NULL|                NULL|     NULL|
| stddev|287.67376465771207|         NULL|             NULL|17.651855611617894|                NULL|2880.569897954812|     NULL|                NULL|     NULL|
|    min|                 1|      fulan 

In [None]:
# Remove duplicate rows, then confirm that none remain.
new_customers_df = new_customers_df.dropDuplicates()
row_count = new_customers_df.count()
distinct_count = new_customers_df.distinct().count()
print("Jumlah duplikasi: ", row_count - distinct_count)

Jumlah duplikasi:  0
