<a href="https://colab.research.google.com/github/aekanun2020/2022-PUB_COC-Data-Science-for-Tourism/blob/main/D_Checked_Preparation_using_PySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz
!tar xf spark-2.4.1-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.1-bin-hadoop2.7"
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
from pyspark.sql import SparkSession
spark = SparkSession.builder \
   .appName("Neural Network Model") \
   .config("spark.executor.memory", "3gb") \
   .getOrCreate()
   
sc = spark.sparkContext
sc

In [21]:
# Sample color names used as the raw input for the demo DataFrame.
colors = [
    'white',
    'green',
    'yellow',
    'red',
    'brown',
    'pink',
]

In [22]:
# Distribute the color names as an RDD, pair each name with its character
# count, and convert the pairs into a two-column DataFrame.
color_df = (
    sc.parallelize(colors)
    .map(lambda name: (name, len(name)))
    .toDF(['color', 'length'])
)

In [23]:
# Evaluating the DataFrame by itself shows only its column names and types, not its rows.
color_df

DataFrame[color: string, length: bigint]

In [24]:
# Print the schema as a tree: each column's name, data type and nullability.
color_df.printSchema()

root
 |-- color: string (nullable = true)
 |-- length: long (nullable = true)



In [25]:
# Count the rows in color_df (one row per color name).
color_df.count()

6

In [26]:
# Print the rows of color_df as a text table.
color_df.show()

+------+------+
| color|length|
+------+------+
| white|     5|
| green|     5|
|yellow|     6|
|   red|     3|
| brown|     5|
|  pink|     4|
+------+------+



In [27]:
from pyspark.sql import functions as sparkf

In [28]:
# Overwrite the existing 'length' column with twice its value.
# withColumn replaces a column when the given name already exists.
(
    color_df
    .withColumn('length', sparkf.col('length') * 2)
    .show()
)

+------+------+
| color|length|
+------+------+
| white|    10|
| green|    10|
|yellow|    12|
|   red|     6|
| brown|    10|
|  pink|     8|
+------+------+



In [29]:
# Add a derived column 'double_length' equal to length * 2,
# keeping the original 'length' column alongside it.
(
    color_df
    .withColumn('double_length', sparkf.col('length') * 2)
    .show()
)

+------+------+-------------+
| color|length|double_length|
+------+------+-------------+
| white|     5|           10|
| green|     5|           10|
|yellow|     6|           12|
|   red|     3|            6|
| brown|     5|           10|
|  pink|     4|            8|
+------+------+-------------+



In [30]:
# Conditionally rewrite 'color': 'white' becomes 'grey', every other value is kept as is.
(
    color_df
    .withColumn(
        'color',
        sparkf.when(sparkf.col('color') == 'white', 'grey')
              .otherwise(sparkf.col('color')),
    )
    .show()
)

+------+------+
| color|length|
+------+------+
|  grey|     5|
| green|     5|
|yellow|     6|
|   red|     3|
| brown|     5|
|  pink|     4|
+------+------+



In [31]:
# Show color_df again: withColumn returned a new DataFrame, so color_df itself still contains 'white'.
color_df.show()

+------+------+
| color|length|
+------+------+
| white|     5|
| green|     5|
|yellow|     6|
|   red|     3|
| brown|     5|
|  pink|     4|
+------+------+



In [32]:
# Replace 'white' with 'grey' in the 'color' column and keep the result in a
# new DataFrame; color_df itself is left untouched.
new_color_df = color_df.withColumn(
    'color',
    sparkf.when(sparkf.col('color') == 'white', 'grey').otherwise(sparkf.col('color')),
)

In [33]:
# Print new_color_df to confirm that 'white' was replaced by 'grey'.
new_color_df.show()

+------+------+
| color|length|
+------+------+
|  grey|     5|
| green|     5|
|yellow|     6|
|   red|     3|
| brown|     5|
|  pink|     4|
+------+------+



3 เซลล์ถัดจากนี้ คือ การสร้าง Column ใหม่ชื่อ 'new_length' โดยให้มีค่าเป็น length ซึ่งผ่านการ normalize แบบ min-max normalization: (length - min) / (max - min)

In [34]:
# Largest value in 'length', pulled back to the driver as a plain Python number.
max_length = color_df.agg(sparkf.max('length')).first()[0]

In [35]:
# Smallest value in 'length', pulled back to the driver as a plain Python number.
min_length = color_df.agg(sparkf.min('length')).first()[0]

In [36]:
# Min-max normalization as a Column expression:
# new_length = (length - min_length) / (max_length - min_length)
(
    color_df
    .withColumn(
        'new_length',
        (sparkf.col('length') - min_length) / (max_length - min_length),
    )
    .show()
)

+------+------+------------------+
| color|length|        new_length|
+------+------+------------------+
| white|     5|0.6666666666666666|
| green|     5|0.6666666666666666|
|yellow|     6|               1.0|
|   red|     3|               0.0|
| brown|     5|0.6666666666666666|
|  pink|     4|0.3333333333333333|
+------+------+------------------+



2 เซลล์ถัดจากนี้ คือ การสร้าง Column ใหม่ชื่อ 'new_length' โดยให้มีค่าเป็น length ซึ่งผ่านการ normalize แบบ min-max normalization เช่นเดียวกัน แต่ใช้อีกวิธีการหนึ่ง คือ User-Defined Function (UDF)

In [37]:
def _min_max_scale(value):
    """Min-max normalize ``value`` using the notebook-level min_length / max_length.

    Returns (value - min_length) / (max_length - min_length) as a float.
    Returns None when ``value`` is null or when max_length equals min_length,
    instead of raising TypeError / ZeroDivisionError inside the Python worker.
    """
    span = max_length - min_length
    if value is None or span == 0:
        return None
    return float(value - min_length) / span


# Declare the return type explicitly: without it, udf() defaults to StringType
# and 'new_length' would be a string column rather than a numeric one.
sparkf_normalized = sparkf.udf(_min_max_scale, 'double')

In [38]:
# Apply the normalization UDF to 'length' and show the result as 'new_length'.
(
    color_df
    .withColumn('new_length', sparkf_normalized(sparkf.col('length')))
    .show()
)

+------+------+------------------+
| color|length|        new_length|
+------+------+------------------+
| white|     5|0.6666666666666666|
| green|     5|0.6666666666666666|
|yellow|     6|               1.0|
|   red|     3|               0.0|
| brown|     5|0.6666666666666666|
|  pink|     4|0.3333333333333333|
+------+------+------------------+

