# spark sql study

In [15]:
sc?

In [27]:
lines=sc.parallelize(["pan","i like pan"])

In [36]:
lines.first()

'pan'

In [37]:
lines.count()

2

In [2]:
for i in range(1):
    print i*i

0


In [6]:
from pyspark.sql import HiveContext,Row

In [7]:
from pyspark.sql import SQLContext,Row

In [None]:
sqlctx=sqlContextl(sc)

In [8]:
hivectx=HiveContext(sc)

入门

In [21]:
from pyspark.sql import SparkSession
spark=SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option","some-value") \
    .getOrCreate()

In [1]:
df=spark.read.json("../../examples/src/main/resources/people.json")
#df=spark.read.json("./examples/src/main/resources/people.json")
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [2]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [4]:
df.select("name").show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [8]:
df.select(df['name'],df['age'],df['age']+1).show()

+-------+----+---------+
|   name| age|(age + 1)|
+-------+----+---------+
|Michael|null|     null|
|   Andy|  30|       31|
| Justin|  19|       20|
+-------+----+---------+



In [9]:
df.filter(df['age']>21).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [11]:
df.groupby('age').count().show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



In [12]:
df.createOrReplaceTempView('people')

In [13]:
sqlDF=spark.sql("select * from people")

In [14]:
sqlDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



Global Temporary View

In [15]:
df.createGlobalTempView('people')

In [17]:
spark.sql("select * from global_temp.people").show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [18]:
spark.newSession().sql("select * from global_temp.people").show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



创建数据集(Dataset)

In [16]:
from pyspark.sql import Row

sc=spark.sparkContext

# Load a text file and convert each line to a Row.
lines=sc.textFile("examples/src/main/resources/people.txt")
parts=lines.map(lambda l:l.split(","))
people=parts.map(lambda p: Row(name=p[0],age=int(p[1])))

# Infer the schema, and register the DataFrame as a table.
schemaPeople =spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")
#spark.sql("select * from people").show()
#SQL can be run over DataFrames that have been registered as a table.
teenagers=spark.sql("select name from people where age>=13 and age <=19")
teenagers.show()
# The results of SQL queries are Dataframe objects.
# rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.

teenNames=teenagers.rdd.map(lambda p: "Name: " +p.name).collect()
for name in teenNames:
    print(name)

# for name in teenagers:
#     print(name)




+------+
|  name|
+------+
|Justin|
+------+

Name: Justin


In [26]:
#import data types
from pyspark.sql.types import *

sc=spark.sparkContext

lines=sc.textFile("examples/src/main/resources/people.txt")
parts=lines.map(lambda l:l.split(","))
# Each line is converted to a tuple.
people=parts.map(lambda p:(p[0],p[1].strip()))

# The schema is encoded in a string.
schemaString="name age"

fields=[StructField(field_name,StringType(),True) for field_name in schemaString.split()]
schema=StructType(fields)

# Apply the schema to the RDD.
schemaPeople=spark.createDataFrame(people,schema)

schemaPeople.createOrReplaceTempView("people")

#results=spark.sql("select name from people")
results=spark.sql("select * from people")
results.show()





+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 19|
+-------+---+



In [30]:
df=spark.read.load("examples/src/main/resources/users.parquet")
df.select("name","favorite_color").write.save("namesAndFavColors.parquet")
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [35]:
df=spark.sql("select * from parquet.`examples/src/main/resources/users.parquet`")
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



JSON Datasets

In [36]:
sc=spark.sparkContext
path="examples/src/main/resources/people.json"
peopleDF=spark.read.json(path)

peopleDF.printSchema()

peopleDF.createOrReplaceTempView("people")

teenagerNamesDF=spark.sql("select name from people where age between 13 and 19")
teenagerNamesDF.show()

jsonString=['{"name":"yin","adress":{"city":"Beijing","state":"xizhimen"}}']
otehpeopleRDD=sc.parallelize(jsonString)
otherpeople=spark.read.json(otehpeopleRDD)
otherpeople.show()




root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

+------+
|  name|
+------+
|Justin|
+------+

+------------------+----+
|            adress|name|
+------------------+----+
|[Beijing,xizhimen]| yin|
+------------------+----+



# JDBC To Other Databases

In [None]:
# Note: JDBC loading and saving can be achieved via either the load/save or jdbc methods
# Loading data from a JDBC source
jdbcDF = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql:dbserver") \
    .option("dbtable", "schema.tablename") \
    .option("user", "username") \
    .option("password", "password") \
    .load()

jdbcDF2 = spark.read \
    .jdbc("jdbc:postgresql:dbserver", "schema.tablename",
          properties={"user": "username", "password": "password"})

# Saving data to a JDBC source
jdbcDF.write \
    .format("jdbc") \
    .option("url", "jdbc:postgresql:dbserver") \
    .option("dbtable", "schema.tablename") \
    .option("user", "username") \
    .option("password", "password") \
    .save()

jdbcDF2.write \
    .jdbc("jdbc:postgresql:dbserver", "schema.tablename",
          properties={"user": "username", "password": "password"})

分布式SQL引擎

In [None]:
#Thrift JDBC / ODBC服务器：
./bin/beeline
beeline> !connect jdbc:hive2://localhost:10000

# 开始测试

In [1]:
sc=spark.sparkContext
path="../../spark-test.csv"
financeDF=spark.read.csv(path,header=True)
# financeDF.printSchema()

financeDF.createGlobalTempView("finance")
# teenagerNamesDF=spark.sql("select * from global_temp.finance limit 10 ")
# teenagerNamesDF.cache("fa")
# teenagerNamesDF.show()

In [2]:
teenagerNamesDF=spark.sql("""select date as shi from global_temp.finance limit 10""")      

In [3]:
teenagerNamesDF=spark.sql(""" select date  ,substring(data_type,5) as data_type,
sum( case when subject_type='收入' then money else 0 end )  `收入`
from global_temp.finance 
group by date,data_type
order by date,data_type
  """)
teenagerNamesDF.show()


+--------+---------+-----------------+
|    date|data_type|               收入|
+--------+---------+-----------------+
|2016/4/1|       实际|       1310081.17|
|2016/5/1|       实际|5463629.550000001|
|2016/6/1|       实际|             40.0|
+--------+---------+-----------------+



In [6]:
teenagerNamesDF.createGlobalTempView("finance_basic")
fbDF=spark.sql("""select * from global_temp.finance_basic""")
fbDF.show()

# teenagerNamesDF.createOrReplaceTempView("finance_basic")
# fbDF=spark.sql("""select * from finance_basic""")
# fbDF.show()

+--------+---------+-----------------+
|    date|data_type|               收入|
+--------+---------+-----------------+
|2016/4/1|       实际|       1310081.17|
|2016/5/1|       实际|5463629.550000001|
|2016/6/1|       实际|             40.0|
+--------+---------+-----------------+



In [8]:
fbDF=spark.sql("""select * from global_temp.finance_basic limit 1""")
fbDF.show()

+--------+---------+----------+
|    date|data_type|        收入|
+--------+---------+----------+
|2016/4/1|       实际|1310081.17|
+--------+---------+----------+



In [9]:
teenagerNamesDF=spark.sql("""  select date_part('y',DATEADD('mon',-3,date)) as `财年`,to_char(date,'yyyy-mm') as `月份`
,to_char(add_months(date,-12),'yyyy-mm') as `去年同期月份`,
date `日期`,area `区域`,data_type `数据类型`,
substring(data_type,5) `数据类型2`,
sum( case when subject_type='收入' then money else 0 end ) as `全收入`,
sum( case when subject_type='收入' and subject_code='60011108' then money else 0 end ) as `齿科双算`, -- 60011108收入 
sum( case when subject_detail_type='体检收入' then money else 0 end ) as `体检收入`,
sum( case when subject_code='64010201' or subject_code='64010202' then money else 0 end ) as  `变动成本1` ,   --64010201 and 64010202
sum( case when subject_detail_type='疾病检测收入' then money else 0 end ) as `疾病检测收入`,
sum( case when subject_detail_type='齿科收入' then money else 0 end ) as `齿科收入`,
sum( case when subject_code='64010221' then money else 0 end ) as `变动成本3`   ,  --64010221
sum( case when subject_detail_type='门诊收入' then money else 0 end ) as `门诊收入`,
sum( case when subject_code='64010222' then money else 0 end ) as `变动成本4`  , --64010222
sum( case when subject_detail_type='医疗管理收入' then money else 0 end ) as `医疗管理收入`,
sum( case when subject_code='64010211' then money else 0 end ) as `变动成本5`,   --64010211
sum( case when subject_detail_type='销售商品收入' then money else 0 end ) as `销售商品收入`,
sum( case when subject_detail_type='其他收入' then money else 0 end ) as `其他收入`,
sum( case when subject_detail_type='落关联成本' then money else 0 end ) as `落关联成本`,
sum( case when subject_type='成本费用' then money else 0 end ) as `成本费用`, 
nvl( sum( case when subject_type='收入' then money else 0 end ),0)-nvl(sum( case when subject_type='收入' and subject_code='60011108' then money else 0 end ),0) as `收入`,
-- nvl( sum( case when subject_type='收入' then money else 0 end ),0)-nvl(sum( case when subject_type='成本费用' then money else 0 end ) ,0) as `税前利润`,
case when area in ('西康','华检','元化医疗','健维管理','臻景','香港','BVI') then '非一体化'  else '一体化' end as `是否一体化`
from global_temp.finace
group by date_part('y',DATEADD('mon',-3,date)),to_char(date,'yyyy-mm'),to_char(add_months(date,-12),'yyyy-mm'),
	date ,area ,data_type,substring(data_type,5),
	case when area in ('西康','华检','元化医疗','健维管理','臻景','香港','BVI') then '非一体化'  else '一体化' end
limit 10 """)



ERROR:root:An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 9))



AnalysisException: u"Table or view not found: `global_temp`.`finace`; line 23 pos 5;\n'GlobalLimit 10\n+- 'LocalLimit 10\n   +- 'Aggregate ['date_part(y, 'DATEADD(mon, -3, 'date)), 'to_char('date, yyyy-mm), 'to_char('add_months('date, -12), yyyy-mm), 'date, 'area, 'data_type, 'substring('data_type, 5), CASE WHEN 'area IN (\u897f\u5eb7,\u534e\u68c0,\u5143\u5316\u533b\u7597,\u5065\u7ef4\u7ba1\u7406,\u81fb\u666f,\u9999\u6e2f,BVI) THEN \u975e\u4e00\u4f53\u5316 ELSE \u4e00\u4f53\u5316 END], ['date_part(y, 'DATEADD(mon, -3, 'date)) AS \u8d22\u5e74#130, 'to_char('date, yyyy-mm) AS \u6708\u4efd#131, 'to_char('add_months('date, -12), yyyy-mm) AS \u53bb\u5e74\u540c\u671f\u6708\u4efd#132, 'date AS \u65e5\u671f#133, 'area AS \u533a\u57df#134, 'data_type AS \u6570\u636e\u7c7b\u578b#135, 'substring('data_type, 5) AS \u6570\u636e\u7c7b\u578b2#136, 'sum(CASE WHEN ('subject_type = \u6536\u5165) THEN 'money ELSE 0 END) AS \u5168\u6536\u5165#137, 'sum(CASE WHEN (('subject_type = \u6536\u5165) && ('subject_code = 60011108)) THEN 'money ELSE 0 END) AS \u9f7f\u79d1\u53cc\u7b97#138, 'sum(CASE WHEN ('subject_detail_type = \u4f53\u68c0\u6536\u5165) THEN 'money ELSE 0 END) AS \u4f53\u68c0\u6536\u5165#139, 'sum(CASE WHEN (('subject_code = 64010201) || ('subject_code = 64010202)) THEN 'money ELSE 0 END) AS \u53d8\u52a8\u6210\u672c1#140, 'sum(CASE WHEN ('subject_detail_type = \u75be\u75c5\u68c0\u6d4b\u6536\u5165) THEN 'money ELSE 0 END) AS \u75be\u75c5\u68c0\u6d4b\u6536\u5165#141, 'sum(CASE WHEN ('subject_detail_type = \u9f7f\u79d1\u6536\u5165) THEN 'money ELSE 0 END) AS \u9f7f\u79d1\u6536\u5165#142, 'sum(CASE WHEN ('subject_code = 64010221) THEN 'money ELSE 0 END) AS \u53d8\u52a8\u6210\u672c3#143, 'sum(CASE WHEN ('subject_detail_type = \u95e8\u8bca\u6536\u5165) THEN 'money ELSE 0 END) AS \u95e8\u8bca\u6536\u5165#144, 'sum(CASE WHEN ('subject_code = 64010222) THEN 'money ELSE 0 END) AS \u53d8\u52a8\u6210\u672c4#145, 'sum(CASE WHEN ('subject_detail_type = \u533b\u7597\u7ba1\u7406\u6536\u5165) THEN 'money ELSE 0 END) AS \u533b\u7597\u7ba1\u7406\u6536\u5165#146, 'sum(CASE WHEN ('subject_code = 64010211) THEN 'money ELSE 0 END) AS \u53d8\u52a8\u6210\u672c5#147, 'sum(CASE WHEN ('subject_detail_type = \u9500\u552e\u5546\u54c1\u6536\u5165) THEN 'money ELSE 0 END) AS \u9500\u552e\u5546\u54c1\u6536\u5165#148, 'sum(CASE WHEN ('subject_detail_type = \u5176\u4ed6\u6536\u5165) THEN 'money ELSE 0 END) AS \u5176\u4ed6\u6536\u5165#149, 'sum(CASE WHEN ('subject_detail_type = \u843d\u5173\u8054\u6210\u672c) THEN 'money ELSE 0 END) AS \u843d\u5173\u8054\u6210\u672c#150, 'sum(CASE WHEN ('subject_type = \u6210\u672c\u8d39\u7528) THEN 'money ELSE 0 END) AS \u6210\u672c\u8d39\u7528#151, ('nvl('sum(CASE WHEN ('subject_type = \u6536\u5165) THEN 'money ELSE 0 END), 0) - 'nvl('sum(CASE WHEN (('subject_type = \u6536\u5165) && ('subject_code = 60011108)) THEN 'money ELSE 0 END), 0)) AS \u6536\u5165#152, CASE WHEN 'area IN (\u897f\u5eb7,\u534e\u68c0,\u5143\u5316\u533b\u7597,\u5065\u7ef4\u7ba1\u7406,\u81fb\u666f,\u9999\u6e2f,BVI) THEN \u975e\u4e00\u4f53\u5316 ELSE \u4e00\u4f53\u5316 END AS \u662f\u5426\u4e00\u4f53\u5316#153]\n      +- 'UnresolvedRelation `global_temp`.`finace`\n"