In [1]:
from pyspark.sql import SparkSession

'''
Adapted from: examples/src/main/python/sql/datasource.py
Contents:
    1. Reading and writing data (three ways each)
    2. Creating temporary views from parquet files, JSON files, etc.
    3. Merging parquet schemas
    4. Connecting to MySQL through JDBC

DataFrame API: http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame
'''

# Build (or reuse) a session on a local master; the cells below use this `spark`.
spark = (
    SparkSession.builder
    .master("local")
    .appName("datasource")
    .getOrCreate()
)

In [2]:
# Three ways to read the same parquet file

df1 = spark.read.load("data/users.parquet")                    # generic load(), no format given
df2 = spark.read.parquet("data/users.parquet")                 # format-specific reader
df3 = spark.sql("select * from parquet.`data/users.parquet`")  # SQL run directly on the file

# Display each result in turn.
for frame in (df1, df2, df3):
    frame.show()

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+

+------+--------------+----------------+
|  name|favorite_color|favorite_numbers|
+------+--------------+----------------+
|Alyssa|          null|  [3, 9, 15, 20]|
|   Ben|           red|              []|
+------+--------------+----------------+



In [18]:
# Three ways to write a DataFrame (run with pyspark==2.4.4).
# Every write sets mode "overwrite": the default save mode raises an error when
# the target path already exists, so without it this cell fails on any re-run.

df1.write.save("data/ouput1", format="json", mode="overwrite")
df1.write.partitionBy("name").format("json").mode("overwrite").save("data/ouput2")
df1.write.json("data/ouput3", mode="overwrite")
# Writing to a persistent table
# df1.write.partitionBy("favorite_color")\
#     .bucketBy(42,"name")\
#     .saveAsTable("people_partitioned_bucketed")

In [None]:
# Reading files in other formats.
# Both writes below set mode "overwrite" so the cell can be re-run: the default
# save mode raises an error once the output path exists.

# JSON in, two selected columns out as parquet.
dfj = spark.read.load("data/people.json", format="json")
(
    dfj.select("name", "age")
    .write
    .save("namesAndAges.parquet", format="parquet", mode="overwrite")
)

# CSV with a custom separator, a header row and schema inference.
dfc = spark.read.load(
    "data/people.csv",
    format="csv",
    sep=":",
    inferSchema="true",
    header="true",
)

# ORC in, ORC out with writer-specific options.
dfo = spark.read.orc("data/users.orc")
(
    dfo.write.format("orc")
    .option("orc.bloom.filter.columns", "favorite_color")
    .option("orc.dictionary.key.threshold", "1.0")
    .mode("overwrite")
    .save("users_with_options.orc")
)

In [19]:
# Create a temporary view from a parquet file

def parquet_example(spark):
    """Round-trip people.json through parquet and query it via a temp view.

    Reads data/people.json, writes it to data/people.parquet, reads that
    parquet data back, registers it as the temp view "parquetFile" and shows
    the names of rows whose age is between 13 and 19 inclusive.

    Args:
        spark: the active SparkSession.
    """
    peopleDF = spark.read.json("data/people.json")
    # Overwrite instead of the default save mode, which raises an error when
    # data/people.parquet is left over from a previous run.
    peopleDF.write.parquet("data/people.parquet", mode="overwrite")

    parquetFile = spark.read.parquet("data/people.parquet")

    # The view name is what the SQL statement below refers to.
    parquetFile.createOrReplaceTempView("parquetFile")

    spark.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19").show()

parquet_example(spark)

+------+
|  name|
+------+
|Justin|
+------+



In [21]:
# 合并 parquet schema

from pyspark.sql import Row
def parquet_schema_merging_example(spark):
    """Write two parquet directories with different columns and read them merged.

    data/test_table/key=1 receives rows with columns (single, double) for 1..5
    and data/test_table/key=2 receives rows with (single, triple) for 6..10.
    The parent directory is then read with the "mergeSchema" option, and the
    resulting schema and rows are printed.

    Args:
        spark: the active SparkSession.
    """
    sc = spark.sparkContext

    squaresDF = spark.createDataFrame(
        sc.parallelize(range(1, 6)).map(lambda x: Row(single=x, double=x ** 2))
    )
    # Overwrite: the default save mode raises an error when the directory is
    # left over from a previous run of this cell.
    squaresDF.write.parquet("data/test_table/key=1", mode="overwrite")

    cubesDF = spark.createDataFrame(
        sc.parallelize(range(6, 11)).map(lambda i: Row(single=i, triple=i ** 3))
    )
    cubesDF.write.parquet("data/test_table/key=2", mode="overwrite")

    # Read the parent directory so both key=... subdirectories are picked up.
    mergedDF = spark.read.option("mergeSchema", "true").parquet("data/test_table")
    mergedDF.printSchema()
    mergedDF.show()

parquet_schema_merging_example(spark)

root
 |-- double: long (nullable = true)
 |-- single: long (nullable = true)
 |-- triple: long (nullable = true)
 |-- key: integer (nullable = true)

+------+------+------+---+
|double|single|triple|key|
+------+------+------+---+
|  null|     6|   216|  2|
|  null|     7|   343|  2|
|  null|     8|   512|  2|
|  null|     9|   729|  2|
|  null|    10|  1000|  2|
|     1|     1|  null|  1|
|     4|     2|  null|  1|
|     9|     3|  null|  1|
|    16|     4|  null|  1|
|    25|     5|  null|  1|
+------+------+------+---+



In [None]:
# Another way to read JSON: from an RDD of strings

def json_dataset_example(spark):
    """Load JSON both from a file path and from an RDD of JSON strings.

    The argument to spark.read.json is either a string path to a JSON dataset
    or an RDD of strings storing JSON objects; both forms are used here.
    """
    # Form 1: a path to a JSON file.
    people = spark.read.json("data/people.json")
    people.printSchema()

    # Register a temp view so the data can be queried with SQL.
    people.createOrReplaceTempView("people")
    spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19").show()

    # Form 2: an RDD[String] holding one JSON object per string.
    json_rdd = spark.sparkContext.parallelize(
        ['{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}']
    )
    spark.read.json(json_rdd).show()

json_dataset_example(spark)

In [26]:
'''
Connecting to a JDBC source

mysql-connector-java-8.0.21.jar must be placed in Spark's jars directory.

Running on a cluster (options go before the script):
    spark-submit --master spark://zgg:7077 test.py
'''

from pyspark.sql import SparkSession

# No master is configured here (unlike the first cell); presumably it is left
# to the spark-submit command line shown above.
spark = (
    SparkSession.builder
    .appName("datasource")
    .getOrCreate()
)

def jdbc_dataset_example(spark):
    """Copy the MySQL table mysql.dept_emp into mysql.dept_emp_bk over JDBC.

    The user name and password are taken from the MYSQL_USER and
    MYSQL_PASSWORD environment variables. When a variable is unset, the
    previous literal ("root" / "1234") is used so existing local setups keep
    working; set the variables rather than editing credentials into the code.

    Args:
        spark: the active SparkSession.
    """
    import os  # local import so the function does not depend on another cell

    # Settings shared by the read and the write, defined once so the two
    # cannot drift apart.
    connection = {
        "driver": "com.mysql.cj.jdbc.Driver",
        "url": "jdbc:mysql://localhost:3306",
        "user": os.environ.get("MYSQL_USER", "root"),
        "password": os.environ.get("MYSQL_PASSWORD", "1234"),
    }

    # Note: JDBC loading and saving can be achieved via either the load/save or jdbc methods
    # Loading data from a JDBC source
    jdbcDF = spark.read \
        .format("jdbc") \
        .options(dbtable="mysql.dept_emp", **connection) \
        .load()

    # Saving data to a JDBC source. No save mode is set here.
    # NOTE(review): presumably this fails when mysql.dept_emp_bk already
    # exists - confirm whether append or overwrite is wanted before adding one.
    jdbcDF.write \
        .format("jdbc") \
        .options(dbtable="mysql.dept_emp_bk", **connection) \
        .save()

jdbc_dataset_example(spark)  

# Reference snippets only: the triple-quoted string below is a plain string
# literal, so none of the code inside it is executed. It shows the .jdbc()
# shorthand for reading and writing, the "customSchema" option on read and the
# "createTableColumnTypes" option on write, using placeholder PostgreSQL
# connection settings.
'''
    jdbcDF2 = spark.read \
        .jdbc("jdbc:postgresql:dbserver", "schema.tablename",
              properties={"user": "username", "password": "password"})

    # Specifying dataframe column data types on read
    jdbcDF3 = spark.read \
        .format("jdbc") \
        .option("url", "jdbc:postgresql:dbserver") \
        .option("dbtable", "schema.tablename") \
        .option("user", "username") \
        .option("password", "password") \
        .option("customSchema", "id DECIMAL(38, 0), name STRING") \
        .load()
        
    jdbcDF2.write \
        .jdbc("jdbc:postgresql:dbserver", "schema.tablename",
              properties={"user": "username", "password": "password"})

    # Specifying create table column data types on write
    jdbcDF.write \
        .option("createTableColumnTypes", "name CHAR(64), comments VARCHAR(1024)") \
        .jdbc("jdbc:postgresql:dbserver", "schema.tablename",
              properties={"user": "username", "password": "password"})
'''

Py4JJavaError: An error occurred while calling o1019.load.
: java.lang.ClassNotFoundException: com.mysql.cj.jdbc.Driver
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:45)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$5.apply(JDBCOptions.scala:99)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions$$anonfun$5.apply(JDBCOptions.scala:99)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:99)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:35)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:32)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:318)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:167)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
