<a href="https://colab.research.google.com/github/aekanun2020/2022-PUB_COC-Data-Science-for-Tourism/blob/main/WORKSHOP_Spark_Data_Frame_for_EDA_and_Data_Prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz
!tar xf spark-2.4.1-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.1-bin-hadoop2.7"
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
from pyspark.sql import SparkSession
spark = SparkSession.builder \
   .appName("Neural Network Model") \
   .config("spark.executor.memory", "3gb") \
   .getOrCreate()
   
sc = spark.sparkContext
sc

In [2]:
from pyspark.sql import functions as sparkf

# Define a dataset.

df = sc.parallelize([
    (10, '', 10000), (20, 'Female', 30000), (None, 'Male', 80000), (None, 'Male', 5000)
]).toDF(["age", "gender", "income"])

df.show()


+----+------+------+
| age|gender|income|
+----+------+------+
|  10|      | 10000|
|  20|Female| 30000|
|null|  Male| 80000|
|null|  Male|  5000|
+----+------+------+



In [3]:
df.describe().show()

+-------+------------------+------+-----------------+
|summary|               age|gender|           income|
+-------+------------------+------+-----------------+
|  count|                 2|     4|                4|
|   mean|              15.0|  null|          31250.0|
| stddev|7.0710678118654755|  null|34247.87098005753|
|    min|                10|      |             5000|
|    max|                20|  Male|            80000|
+-------+------------------+------+-----------------+



In [4]:
df.summary().take(10)

[Row(summary='count', age='2', gender='4', income='4'),
 Row(summary='mean', age='15.0', gender=None, income='31250.0'),
 Row(summary='stddev', age='7.0710678118654755', gender=None, income='34247.87098005753'),
 Row(summary='min', age='10', gender='', income='5000'),
 Row(summary='25%', age='10', gender=None, income='5000'),
 Row(summary='50%', age='10', gender=None, income='10000'),
 Row(summary='75%', age='20', gender=None, income='30000'),
 Row(summary='max', age='20', gender='Male', income='80000')]

In [5]:
median_age = float(df.summary().take(10)[5]['age'])

In [6]:
median_age

10.0

In [7]:
mean_age = float(df.summary().take(10)[1]['age'])

In [8]:
mean_age

15.0

In [9]:
df.select(sparkf.avg(sparkf.col('age'))).take(10)

[Row(avg(age)=15.0)]

In [10]:
df.select(sparkf.avg(sparkf.col('age'))).take(1)[0]['avg(age)']

15.0

In [11]:
mode_gender = df.groupBy('gender').count().orderBy('count',ascending=False).first()['gender']

In [12]:
# Treat Null Value (None) with Average one.

df.withColumn('age',sparkf.when(sparkf.col('age').isNull(),mean_age).otherwise(sparkf.col('age'))).show()

+----+------+------+
| age|gender|income|
+----+------+------+
|10.0|      | 10000|
|20.0|Female| 30000|
|15.0|  Male| 80000|
|15.0|  Male|  5000|
+----+------+------+



In [13]:
# Treat Empty Value with Defined Values.


df.withColumn('gender',sparkf.when(sparkf.col('gender')=='',mode_gender).otherwise(sparkf.col('gender'))).show()



+----+------+------+
| age|gender|income|
+----+------+------+
|  10|  Male| 10000|
|  20|Female| 30000|
|null|  Male| 80000|
|null|  Male|  5000|
+----+------+------+



In [14]:
# Treat Null Value (None) with Average one.

avg_age = df.na.drop().agg(sparkf.avg("age")).collect()[0][0]

sparkf_replaceNull = sparkf.udf(lambda x: avg_age if x == None else x)

no_null_df = df.withColumn('age', sparkf_replaceNull(sparkf.col('age')))

no_null_df.show()


+----+------+------+
| age|gender|income|
+----+------+------+
|  10|      | 10000|
|  20|Female| 30000|
|15.0|  Male| 80000|
|15.0|  Male|  5000|
+----+------+------+



In [15]:
# Treat Empty Value with Defined Values.
treat_missing = sparkf.udf(lambda x: "Male_Assume" if x == "" else x)

no_missing_df = no_null_df.withColumn('new_gender',treat_missing(no_null_df.gender))

no_missing_df.show()


+----+------+------+-----------+
| age|gender|income| new_gender|
+----+------+------+-----------+
|  10|      | 10000|Male_Assume|
|  20|Female| 30000|     Female|
|15.0|  Male| 80000|       Male|
|15.0|  Male|  5000|       Male|
+----+------+------+-----------+



In [16]:
# Treat Outliner with Remove one.

no_outlier_df = no_missing_df.filter(sparkf.col('income') >= 10000)

no_outlier_df .show()


+----+------+------+-----------+
| age|gender|income| new_gender|
+----+------+------+-----------+
|  10|      | 10000|Male_Assume|
|  20|Female| 30000|     Female|
|15.0|  Male| 80000|       Male|
+----+------+------+-----------+



In [17]:
# Treat Empty Value with Defined Values.

treat_missing = sparkf.udf(lambda x: "Male_Assume" if x == "" else x)

no_missing_df = no_null_df.withColumn('new_gender',treat_missing(no_null_df.gender))
no_missing_df = no_null_df.withColumn('target_variable',treat_missing(no_null_df.gender))

no_missing_df.show()

+----+------+------+---------------+
| age|gender|income|target_variable|
+----+------+------+---------------+
|  10|      | 10000|    Male_Assume|
|  20|Female| 30000|         Female|
|15.0|  Male| 80000|           Male|
|15.0|  Male|  5000|           Male|
+----+------+------+---------------+

