In [1]:
from pyspark.sql import SparkSession

'''
来自：examples/src/main/python/sql/basic.py
包含：
    1、DataFrame的基本使用、创建视图、执行sql语句
    2、RDD和Datasets互相转换：
                    （1）转换一个 Row 对象的 RDD 成一个 DataFrame。
                    （2）StructType
                    
DataFrame函数：http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame

以下示例均单机测试
'''

spark = SparkSession\
    .builder\
    .master("local")\
    .appName("basicfunc")\
    .getOrCreate()

df = spark.read.json("data/people.json")
df.show()
df.printSchema()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [2]:
df.select("age").show()
df.select("age","name").show()
df.select(df["age"],df["name"]).show()
df.select((df["age"]+1).alias("age"),df["name"]).show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  31|   Andy|
|  20| Justin|
+----+-------+



In [9]:
df.filter(df["age"]>20).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [11]:
df.groupBy(df["age"]).count().show()
df.groupBy("age").count().show()
df.groupBy("age","name").count().show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+

+----+-------+-----+
| age|   name|count|
+----+-------+-----+
|null|Michael|    1|
|  30|   Andy|    1|
|  19| Justin|    1|
+----+-------+-----+



In [13]:
df.createOrReplaceTempView("peoplev")
spark.sql("select * from peoplev").show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [19]:
spark.catalog.dropGlobalTempView("peoplevgl")
df.createGlobalTempView("peoplevgl")

In [20]:
spark.sql("select * from global_temp.peoplevgl").show()
spark.newSession().sql("select * from global_temp.peoplevgl").show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [26]:
from pyspark.sql import Row
'''
schema推断
'''
def schema_inference_example(spark):
    sc = spark.sparkContext  # s是小写
    data = sc.textFile("data/people.txt")
    parts = data.map(lambda x:x.split(","))
    people = parts.map(lambda x:Row(name = x[0],age = int(x[1])))  # Row
    
    schemaPeople = spark.createDataFrame(people)
    schemaPeople.createOrReplaceTempView("people")
    teenagers = spark.sql("select name from people where age >=13 and age <=19")
    
    teenNames = teenagers.rdd.map(lambda x:"name:"+x.name).collect()
    for name in teenNames:
        print(name)
    
schema_inference_example(spark)

name:Justin


In [3]:
from pyspark.sql.types import StructField, StringType, StructType
'''
schema推断
'''
def programmatic_schema_example(spark):
    sc = spark.sparkContext
    data = sc.textFile("data/people.txt")
    parts = data.map(lambda x:x.split(","))
    people = parts.map(lambda x:(x[0],x[1].strip()))

    schemaString = "age name"
    fields = [StructField(filed_name,StringType(),True) for filed_name in schemaString.split(" ")]
    schema = StructType(fields)
    
    schemaPeople = spark.createDataFrame(people,schema)
    schemaPeople.createOrReplaceTempView("people")
    
    spark.sql("select * from people").show()

programmatic_schema_example(spark)

+-------+----+
|    age|name|
+-------+----+
|Michael|  29|
|   Andy|  30|
| Justin|  19|
+-------+----+

