In [None]:
from __future__ import print_function, division
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [None]:
# Start Spark (not needed if a session is already running).
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("test")
    .enableHiveSupport()
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

## 讀取資料

In [None]:
# Load the ratings CSV, taking column names from the header row.
# inferSchema=True makes Spark derive numeric column types from the data;
# without it every column is loaded as a string and the numeric filters and
# aggregations further down would rely on implicit string-to-number casts.
df = spark.read.csv('../data/ratings.csv', header=True, inferSchema=True)

In [None]:
# Preview the DataFrame; n=20 and truncate=True are show()'s defaults, spelled out.
df.show(n=20, truncate=True)

## 選擇欄位

In [None]:
# Project a single column.
user_ids = df.select('userID')
user_ids.show()

In [None]:
# Project two columns.
user_movie_pairs = df.select('userID', 'movieID')
user_movie_pairs.show()

## 操作欄位

In [None]:
# Select userID under the alias "id" using a SQL expression.
renamed_ids = df.selectExpr('userID as id')
renamed_ids.show()

In [None]:
# Select movieID plus a computed column holding twice the rating.
doubled_ratings = df.selectExpr('movieID', 'rating * 2 as rating_2')
doubled_ratings.show()

## 篩選

In [None]:
# Keep only rows whose rating is greater than 3.
high_ratings = df.where('rating > 3')
high_ratings.show()

In [None]:
# Combine two conditions in one SQL predicate: user 2 and rating above 3.
user2_high_ratings = df.where('userId = 2 and rating > 3')
user2_high_ratings.show()

In [None]:
# Project first, then filter the projected columns with the same predicate style.
user_rating_cols = df.select('userID', 'rating')
user_rating_cols.filter('userID = 2 and rating > 3').show()

## 聚合運算

In [None]:
# Total number of rows in the DataFrame.
total_rows = df.count()
total_rows

In [None]:
# Number of rows belonging to user 1.
user1_rows = df.filter('userID = 1')
user1_rows.count()

In [None]:
# Whole-table aggregates: count of userID values and average rating.
overall_stats = df.agg({'userID': 'count', 'rating': 'avg'})
overall_stats.show()

## Group By

In [None]:
# How many movies did each user rate, and what was their average score?

In [None]:
# Per-user aggregates: number of movieID values and average rating.
ratings_by_user = df.groupBy('userID')
ratings_by_user.agg({'movieID': 'count', 'rating': 'avg'}).show()

In [None]:
# Import the functions module under a namespace instead of `import *`:
# the star import replaces Python builtins such as round, sum, min and max
# with their Spark column-function namesakes for the rest of the notebook.
from pyspark.sql import functions as F

# Per-user movie count and average rating rounded to 2 decimal places.
df.groupBy('userID').agg(
    F.count('movieID'),
    F.round(F.avg(df.rating), 2),
).show()

In [None]:
# Namespaced import rather than `import *`, which would shadow Python
# builtins (round, sum, min, max, ...) with Spark column functions.
from pyspark.sql import functions as F

# Same aggregation as the preceding cell: per-user movie count and
# average rating rounded to 2 decimal places.
df.groupBy('userID').agg(
    F.count('movieID'),
    F.round(F.avg(df.rating), 2),
).show()

## Join tables

In [None]:
# Build a small in-memory profile table with one record per person.
d = [
    {'name': person, 'age': years}
    for person, years in [('Alice', 1), ('Bryan', 3), ('Cool', 2)]
]
df_profile = spark.createDataFrame(d)
df_profile.show()

In [None]:
# Build the parent table: each record links a parent's name to a child's name.
# 'Ken' has no matching profile and 'Cool' has no parent, which makes the
# different join types below produce different results.
d = [
    {'name': parent, 'child': kid}
    for parent, kid in [
        ('Jason', 'Alice'),
        ('Bill', 'Bryan'),
        ('Sera', 'Bryan'),
        ('Jill', 'Ken'),
    ]
]
df_parents = spark.createDataFrame(d)
df_parents.show()

![SQL joins guide and syntax](http://4.bp.blogspot.com/-_HsHikmChBI/VmQGJjLKgyI/AAAAAAAAEPw/JaLnV0bsbEo/s1600/sql%2Bjoins%2Bguide%2Band%2Bsyntax.jpg)

In [None]:
# Join with no explicit join type: match each profile to the parents of that child.
child_match = df_profile.name == df_parents.child
df_profile.join(df_parents, on=child_match).show()

In [None]:
# Left join: keep every profile row, matched or not.
child_match = df_profile.name == df_parents.child
df_profile.join(df_parents, on=child_match, how='left').show()

In [None]:
# Right join: keep every parent row, matched or not.
child_match = df_profile.name == df_parents.child
df_profile.join(df_parents, on=child_match, how='right').show()

In [None]:
# Full outer join: keep unmatched rows from both sides.
child_match = df_profile.name == df_parents.child
df_profile.join(df_parents, on=child_match, how='outer').show()

## 注册为 SQL 表

In [None]:
# Expose the DataFrame to Spark SQL under the name "table".
# createOrReplaceTempView supersedes registerTempTable (deprecated since
# Spark 2.0) and can be re-run safely because it replaces an existing view.
df.createOrReplaceTempView("table")

In [None]:
def run_sql(sql_string):
    """Run a SQL statement on the notebook's Spark session and print the result.

    Args:
        sql_string: SQL text handed to ``spark.sql``.

    Returns:
        None. The resulting DataFrame is printed with ``show()``.
    """
    result = spark.sql(sql_string)
    result.show()

In [None]:
# Select every column from the view named "table".
sql_string = (
    "\n"
    "select * \n"
    "from table\n"
)
run_sql(sql_string)

In [None]:
# Select the userid column under the alias "id".
sql_string = (
    "\n"
    "select userid as id \n"
    "from table\n"
)
run_sql(sql_string)

In [None]:
# SQL counterpart of the earlier selectExpr('movieID', 'rating * 2 as rating_2') cell.
# NOTE(review): the original query was just "select s" -- no FROM clause and an
# unknown column -- so the cell could not run. Confirm this is the intended query.
sql_string = """
select movieID, rating * 2 as rating_2
from table
"""
run_sql(sql_string)