# Spark SQL Study

In [15]:
# IPython help lookup (`obj?`) on `sc`; this is an interactive development aid and is
# not valid plain Python. NOTE(review): `sc` is presumably the SparkContext supplied by
# the PySpark shell/kernel -- confirm, and consider removing this cell from the final notebook.
sc?

In [27]:
# Distribute a small in-memory list of strings as an RDD named `lines`.
sample_strings = ["pan", "i like pan"]
lines = sc.parallelize(sample_strings)

In [36]:
# Display the first element of the `lines` RDD.
lines.first()

'pan'

In [37]:
# Display how many elements the `lines` RDD holds.
lines.count()

2

In [2]:
# Print the square of every index in the range.
# print() is called as a function so the cell runs on Python 3 and matches the
# print(...) calls used elsewhere in this notebook.
for i in range(1):
    print(i*i)

0


In [6]:
from pyspark.sql import HiveContext,Row

In [7]:
from pyspark.sql import SQLContext,Row

In [None]:
# Wrap the existing SparkContext in a SQLContext.
# The class is `SQLContext` (imported from pyspark.sql above); the previous
# spelling `sqlContextl` was an undefined name and raised NameError.
sqlctx=SQLContext(sc)

In [8]:
# Create a HiveContext on top of the existing SparkContext `sc`.
# NOTE(review): HiveContext is reported as deprecated since Spark 2.0 in favour of
# SparkSession.builder.enableHiveSupport() -- confirm against the Spark version in use.
hivectx=HiveContext(sc)

入门 (Getting Started)

In [21]:
from pyspark.sql import SparkSession

# Build the SparkSession used by every cell below; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [1]:
# Load the sample people records from JSON into a DataFrame and display them.
# NOTE(review): this path climbs two directories ("../../"), while later cells read
# from "examples/..." directly -- confirm which working directory the notebook assumes.
people_json_path = "../../examples/src/main/resources/people.json"
df = spark.read.json(people_json_path)
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [2]:
# Print the column names, types and nullability of `df` as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [4]:
# Project only the "name" column and display it.
names_only = df.select("name")
names_only.show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [8]:
# Show every person with their age and their age incremented by one.
age_col = df['age']
df.select(df['name'], age_col, age_col + 1).show()

+-------+----+---------+
|   name| age|(age + 1)|
+-------+----+---------+
|Michael|null|     null|
|   Andy|  30|       31|
| Justin|  19|       20|
+-------+----+---------+



In [9]:
# Keep only the rows whose age is greater than 21 (rows with a null age drop out,
# as the rendered output shows).
older_than_21 = df['age'] > 21
df.where(older_than_21).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [11]:
# Count how many people share each age value.
people_per_age = df.groupBy('age').count()
people_per_age.show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



In [12]:
# Register `df` under the view name 'people' so it can be queried with spark.sql();
# an existing view with that name is replaced.
df.createOrReplaceTempView('people')

In [13]:
# Query the 'people' view with SQL; the result is kept in `sqlDF`.
sqlDF=spark.sql("select * from people")

In [14]:
# Display the rows returned by the SQL query.
sqlDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



Global Temporary View

In [15]:
# Register `df` as the global temporary view 'people' (queried as global_temp.people).
# createGlobalTempView() raises if the view already exists, so any previous
# registration is dropped first; this keeps the cell safe to re-run in the same
# Spark application. Dropping a view that does not exist is not an error.
spark.catalog.dropGlobalTempView('people')
df.createGlobalTempView('people')

In [17]:
# Read the global temporary view back through its `global_temp` qualifier.
global_people = spark.sql("select * from global_temp.people")
global_people.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [18]:
# Run the same query from a brand-new session to show the global view is
# visible outside the session that created it.
fresh_session = spark.newSession()
fresh_session.sql("select * from global_temp.people").show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



创建数据集 (Creating Datasets)

In [16]:
from pyspark.sql import Row

sc = spark.sparkContext

# Read the text file; every line is split on commas and turned into a
# Row carrying a string `name` and an integer `age`.
lines = sc.textFile("examples/src/main/resources/people.txt")
parts = lines.map(lambda text: text.split(","))
people = parts.map(lambda fields: Row(name=fields[0], age=int(fields[1])))

# Let Spark infer the schema from the Rows, then expose the DataFrame to SQL
# under the view name "people".
schemaPeople = spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")

# SQL runs against any DataFrame registered as a view; the result is a DataFrame.
teenagers = spark.sql("select name from people where age>=13 and age <=19")
teenagers.show()

# `.rdd` exposes the result as an RDD of Row objects, so ordinary RDD
# transformations can be applied before collecting to the driver.
teenNames = teenagers.rdd.map(lambda row: "Name: " + row.name).collect()
for teen_name in teenNames:
    print(teen_name)




+------+
|  name|
+------+
|Justin|
+------+

Name: Justin


In [26]:
# Import only the schema types this cell uses; a wildcard import would pull every
# name from pyspark.sql.types into the notebook namespace and hide where names come from.
from pyspark.sql.types import StringType, StructField, StructType

sc=spark.sparkContext

lines=sc.textFile("examples/src/main/resources/people.txt")
parts=lines.map(lambda l:l.split(","))
# Each line is converted to a (name, age) tuple; the age keeps its string form
# with surrounding whitespace stripped.
people=parts.map(lambda p:(p[0],p[1].strip()))

# The schema is encoded in a string: one whitespace-separated field name per column.
schemaString="name age"

# Every field is declared as a nullable string column.
fields=[StructField(field_name,StringType(),True) for field_name in schemaString.split()]
schema=StructType(fields)

# Apply the schema to the RDD.
schemaPeople=spark.createDataFrame(people,schema)

# Expose the DataFrame to SQL; this replaces any earlier "people" view.
schemaPeople.createOrReplaceTempView("people")

results=spark.sql("select * from people")
results.show()





+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 19|
+-------+---+



In [30]:
# Load the sample users file with the default data source and write a
# two-column projection back out.
df=spark.read.load("examples/src/main/resources/users.parquet")
# mode="overwrite" replaces a previous copy of the output; with the default save
# mode the write raises once "namesAndFavColors.parquet" exists, which broke re-runs.
df.select("name","favorite_color").write.save("namesAndFavColors.parquet",mode="overwrite")
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [35]:
# Query the parquet file directly from SQL, without loading it through the
# read API first, and display the result.
users_parquet_query = "select * from parquet.`examples/src/main/resources/users.parquet`"
df = spark.sql(users_parquet_query)
df.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



JSON Datasets

In [36]:
sc = spark.sparkContext

# Read the JSON file into a DataFrame and print the schema Spark derived for it.
path = "examples/src/main/resources/people.json"
peopleDF = spark.read.json(path)
peopleDF.printSchema()

# Expose the DataFrame to SQL and select the names of people aged 13 to 19.
peopleDF.createOrReplaceTempView("people")
teenagerNamesDF = spark.sql("select name from people where age between 13 and 19")
teenagerNamesDF.show()

# A DataFrame can also be built from an RDD of JSON strings, one document per element;
# the nested object shows up as a single struct column in the output.
jsonString = [
    '{"name":"yin","adress":{"city":"Beijing","state":"xizhimen"}}',
]
otehpeopleRDD = sc.parallelize(jsonString)
otherpeople = spark.read.json(otehpeopleRDD)
otherpeople.show()




root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

+------+
|  name|
+------+
|Justin|
+------+

+------------------+----+
|            adress|name|
+------------------+----+
|[Beijing,xizhimen]| yin|
+------------------+----+



# JDBC To Other Databases

In [None]:
# Note: JDBC loading and saving can be achieved via either the load/save or jdbc methods
import os

# Connection settings are defined once instead of being repeated in every call.
# Credentials are read from the environment so real secrets never need to be typed
# into the notebook; the defaults are the original placeholder values.
JDBC_URL = "jdbc:postgresql:dbserver"
JDBC_TABLE = "schema.tablename"
JDBC_USER = os.environ.get("JDBC_USER", "username")
JDBC_PASSWORD = os.environ.get("JDBC_PASSWORD", "password")
jdbc_properties = {"user": JDBC_USER, "password": JDBC_PASSWORD}

# Loading data from a JDBC source: generic format/option/load form.
jdbcDF = (
    spark.read
    .format("jdbc")
    .option("url", JDBC_URL)
    .option("dbtable", JDBC_TABLE)
    .option("user", JDBC_USER)
    .option("password", JDBC_PASSWORD)
    .load()
)

# Loading data from a JDBC source: jdbc() shortcut form.
jdbcDF2 = spark.read.jdbc(JDBC_URL, JDBC_TABLE, properties=jdbc_properties)

# Saving data to a JDBC source: generic format/option/save form.
# NOTE(review): this writes to the same table that was just read, with no save mode
# given -- confirm the intended target table and mode before running for real.
(
    jdbcDF.write
    .format("jdbc")
    .option("url", JDBC_URL)
    .option("dbtable", JDBC_TABLE)
    .option("user", JDBC_USER)
    .option("password", JDBC_PASSWORD)
    .save()
)

# Saving data to a JDBC source: jdbc() shortcut form.
jdbcDF2.write.jdbc(JDBC_URL, JDBC_TABLE, properties=jdbc_properties)