In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.functions import col 
from pyspark.sql.types import *
from pyspark.sql.functions import udf

In [2]:
spark = (SparkSession
          .builder\
          .master("local[8]")
          .appName("demo_salmon")\
          .getOrCreate())

In [3]:
s3url = 'spark_demo_salmon_data.csv'
df = spark.read.format("csv") \
    .option("innerSchema","true") \
    .option("header","true") \
    .option("nullValue","None") \
    .load(s3url)

In [4]:
df.printSchema()

root
 |-- 編號: string (nullable = true)
 |-- 鄉鎮市區: string (nullable = true)
 |-- 主要用途: string (nullable = true)
 |-- 單價元坪: string (nullable = true)
 |-- 建築完成年: string (nullable = true)



In [5]:
# Data Mining
df.select('主要用途').distinct().show()
df.select('鄉鎮市區').distinct().show()
df.select('建築完成年').na.drop().distinct().show()

+--------+
|主要用途|
+--------+
|  住家用|
+--------+

+--------+
|鄉鎮市區|
+--------+
|  中正區|
|  中山區|
|  內湖區|
|  士林區|
|  北投區|
|  文山區|
|  大同區|
|  萬華區|
|  南港區|
|  松山區|
|  大安區|
|  信義區|
+--------+

+----------+
|建築完成年|
+----------+
|          |
|       084|
|       071|
|       070|
|       060|
|       106|
|       099|
|       069|
|       083|
|       107|
|       095|
|       068|
|       097|
|       089|
|       104|
|       072|
|       067|
|       078|
|       052|
|       073|
+----------+
only showing top 20 rows



In [6]:
# sql api 

In [7]:
df.createOrReplaceTempView("DF")

sql = f"""
    SELECT `編號`, `鄉鎮市區`, `主要用途`, `單價元坪`, 
            CAST(`建築完成年` AS INT)+1911 AS `建築完成西元年`
    FROM DF
    WHERE `單價元坪` BETWEEN 200000 and 500000 
"""

result = spark.sql(sql)

In [10]:
result.show()

+-------------------+--------+--------+------------------+--------------+
|               編號|鄉鎮市區|主要用途|          單價元坪|建築完成西元年|
+-------------------+--------+--------+------------------+--------------+
|RPQOMLNJNHOFFAA57CA|  中正區|  住家用|244284.29752066117|          1985|
|RPOQMLNJNHOFFAA57CA|  文山區|  住家用| 424525.6198347107|          null|
|RPQQMLNJNHOFFAA77CA|  文山區|  住家用|302614.87603305787|          1999|
|RPPNMLRJNHOFFAA77CA|  文山區|  住家用|282003.30578512396|          null|
|RPWPMLNJNHOFFAA28CA|  文山區|  住家用| 418532.2314049587|          1998|
|RPWNMLTJNHOFFAA68CA|  文山區|  住家用| 405818.1818181818|          1999|
|RPTOMLNKNHOFFAA97CA|  文山區|  住家用|323408.26446280995|          1995|
|RPQNMLRKNHOFFAA97CA|  文山區|  住家用| 365990.0826446281|          1982|
|RPRNMLMJNHOFFBA57CA|  大同區|  住家用|410429.75206611573|          1973|
|RPRPMLQJNHOFFBA18CA|  大同區|  住家用|352948.76033057855|          1981|
|RPTNMLOLNHOFFBA18CA|  萬華區|  住家用| 484879.3388429752|          2011|
|RPPRMLMJNHOFFCA87CA|  中山區|  住家用|  421606.611570248

In [11]:
# function api
df.withColumn('建築完成西元年' ,col('建築完成年').cast(IntegerType())+1911)\
  .select(col('編號'),
          col('鄉鎮市區'),
          col('主要用途'),
          col('單價元坪'),
          col('建築完成西元年')) \
    .where(col('單價元坪')>=200000) \
    .where(col('單價元坪')<=500000) \
    .show()

+-------------------+--------+--------+------------------+--------------+
|               編號|鄉鎮市區|主要用途|          單價元坪|建築完成西元年|
+-------------------+--------+--------+------------------+--------------+
|RPQOMLNJNHOFFAA57CA|  中正區|  住家用|244284.29752066117|          1985|
|RPOQMLNJNHOFFAA57CA|  文山區|  住家用| 424525.6198347107|          null|
|RPQQMLNJNHOFFAA77CA|  文山區|  住家用|302614.87603305787|          1999|
|RPPNMLRJNHOFFAA77CA|  文山區|  住家用|282003.30578512396|          null|
|RPWPMLNJNHOFFAA28CA|  文山區|  住家用| 418532.2314049587|          1998|
|RPWNMLTJNHOFFAA68CA|  文山區|  住家用| 405818.1818181818|          1999|
|RPTOMLNKNHOFFAA97CA|  文山區|  住家用|323408.26446280995|          1995|
|RPQNMLRKNHOFFAA97CA|  文山區|  住家用| 365990.0826446281|          1982|
|RPRNMLMJNHOFFBA57CA|  大同區|  住家用|410429.75206611573|          1973|
|RPRPMLQJNHOFFBA18CA|  大同區|  住家用|352948.76033057855|          1981|
|RPTNMLOLNHOFFBA18CA|  萬華區|  住家用| 484879.3388429752|          2011|
|RPPRMLMJNHOFFCA87CA|  中山區|  住家用|  421606.611570248

In [31]:
spark.stop()