# Spark Bank Marketing Cleansing
This notebook is an experiment in cleansing the bank marketing dataset with Spark.

In [None]:
# Prints a fixed marker string -- presumably a quick check that the kernel is running.
print("test")

: 

## Import Libs

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import when
from pyspark.sql.functions import regexp_replace

: 

In [None]:
# Create (or reuse, if one already exists) a SparkSession running locally
# with master "local[*]" and the application name "sparkcleansing".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("sparkcleansing")
    .getOrCreate()
)

: 

In [None]:
# Show the active SparkSession; as the cell's last expression it is rendered as the cell output.
spark

: 

## Load Data

In [None]:
# Print the current working directory via an IPython shell escape,
# as a sanity check on where relative paths will resolve.
!pwd

: 

In [None]:
# Load the raw bank marketing CSV: fields are separated by ";" and the first
# line holds the column names. No schema is supplied or inferred here.
df_bank_marketing = spark.read.options(sep=";", header=True).csv(
    "/home/jovyan/work/data/bank-additional-full.csv"
)

: 

In [None]:
# Preview the raw data: first 20 rows, with long cell values truncated (the show() defaults).
df_bank_marketing.show(n=20, truncate=True)

: 

In [None]:
# Number of rows loaded from the CSV; the bare expression is shown as the cell output.
df_bank_marketing.count()

: 

In [None]:
# Register the raw DataFrame as the temporary view "bank_marketing" so it can be
# queried through Spark SQL (replaces any existing view with the same name).
df_bank_marketing.createOrReplaceTempView(name="bank_marketing")

: 

## Format Standardization

In [None]:
# Replace the dots in these column names with underscores. Spark treats an
# unquoted dot in a column reference as nested-field access, so dotted names
# would otherwise have to be wrapped in backticks every time they are used.
df_transform1 = (
    df_bank_marketing
    .withColumnRenamed("emp.var.rate", "emp_var_rate")
    .withColumnRenamed("cons.price.idx", "cons_price_idx")
    .withColumnRenamed("cons.conf.idx", "cons_conf_idx")
    .withColumnRenamed("nr.employed", "nr_employed")
)

: 

In [None]:
# Print the schema of the renamed DataFrame to confirm the new column names.
df_transform1.printSchema()

: 

In [None]:
# Collapse the education values basic.4y, basic.6y and basic.9y into the single
# category "basic".
# A single anchored pattern with an escaped dot does the whole job: the earlier
# when()/regexp_replace() chain used an unescaped "." (which matches ANY
# character) and repeated the same replacement three times.
# Values that do not match the pattern -- including nulls -- pass through unchanged.
df_transform2 = df_transform1.withColumn(
    "education",
    regexp_replace(df_transform1["education"], r"^basic\.[469]y$", "basic"),
)

: 

In [None]:
# Preview the result of the education normalization: first 20 rows, long values truncated.
df_transform2.show(n=20, truncate=True)

: 

## Cleanse Null Data

In [None]:
# Drop only the rows in which EVERY column is null; rows that contain
# some (but not all) null values are kept.
df_transform3 = df_transform2.dropna(how="all")

: 

In [None]:
# Row count after the null drop. Seems like there's no null data to remove.
# NOTE(review): the drop above uses the "all" mode, which only removes rows
# where every column is null -- so an unchanged count shows there are no
# fully-null rows, not that the data contains no nulls at all.
df_transform3.count()

: 

## Save File

In [None]:
# Write the cleansed data as semicolon-separated CSV with a header row.
# coalesce(1) merges the data into one partition so a single part file is
# produced; "overwrite" mode replaces any existing contents of spark_output/.
(
    df_transform3.coalesce(1)
    .write
    .mode("overwrite")
    .options(header="true", sep=";")
    .csv("spark_output/")
)

: 