# Spark Bank Marketing Cleansing
This notebook will be used for experiment purpose to cleanse the data

## Import Libs

In [23]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [24]:
# Initate spark session
from pyspark.sql import SparkSession

# Spark session & context
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("sparkxperiment") \
    .getOrCreate()

In [25]:
spark

## Load Data

In [26]:
# See current dir
!pwd

/home/jovyan/work/notebooks


In [44]:
df_bank_marketing = (
    spark.read
    .format("csv")
    .option("sep", "")
    .option("header", True)
    .load("/home/jovyan/work/data/bank-additional-full.csv")
)

In [45]:
df_bank_marketing.show()

+---+-----------+--------+-------------------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
|age|        job| marital|          education|default|housing|loan|  contact|month|day_of_week|duration|campaign|pdays|previous|   poutcome|emp.var.rate|cons.price.idx|cons.conf.idx|euribor3m|nr.employed|  y|
+---+-----------+--------+-------------------+-------+-------+----+---------+-----+-----------+--------+--------+-----+--------+-----------+------------+--------------+-------------+---------+-----------+---+
| 56|  housemaid| married|           basic.4y|     no|     no|  no|telephone|  may|        mon|     261|       1|  999|       0|nonexistent|         1.1|        93.994|        -36.4|    4.857|       5191| no|
| 57|   services| married|        high.school|unknown|     no|  no|telephone|  may|        mon|     149|       1|  999|       0|nonexistent|         1.1|        93.

In [46]:
df_bank_marketing.count()

41188

In [47]:
df_bank_marketing.createOrReplaceTempView('bank_marketing')

## Format Standarization

In [78]:
# Rename column with dots, because spark cant read them
df_transform1 = df_bank_marketing.withColumnRenamed('emp.var.rate', 'emp_var_rate') \
    .withColumnRenamed('cons.price.idx', 'cons_price_idx') \
    .withColumnRenamed('cons.conf.idx', 'cons_conf_idx') \
    .withColumnRenamed('nr.employed', 'nr_employed')

In [79]:
df_transform1.printSchema()

root
 |-- age: string (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- campaign: string (nullable = true)
 |-- pdays: string (nullable = true)
 |-- previous: string (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- emp_var_rate: string (nullable = true)
 |-- cons_price_idx: string (nullable = true)
 |-- cons_conf_idx: string (nullable = true)
 |-- euribor3m: string (nullable = true)
 |-- nr_employed: string (nullable = true)
 |-- y: string (nullable = true)



## Cleanse Null Data

In [93]:
df_transform1.na.drop("all").show(False)

Py4JError: An error occurred while calling o264.showString. Trace:
py4j.Py4JException: Method showString([class java.lang.Boolean, class java.lang.Integer, class java.lang.Boolean]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)



In [91]:
df_transform1.count()

41188