In [2]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName('my_first_app_name') \
    .getOrCreate()

In [9]:
# 从pandas dataframe创建spark dataframe
colors = ['white','green','yellow','red','brown','pink', 'white']
color_df = pd.DataFrame(colors,columns=['color'])
color_df['length'] = color_df['color'].apply(len)
print(color_df)

color_sdf = spark.createDataFrame(color_df)
color_sdf.show()

    color  length
0   white       5
1   green       5
2  yellow       6
3     red       3
4   brown       5
5    pink       4
6   white       5
+------+------+
| color|length|
+------+------+
| white|     5|
| green|     5|
|yellow|     6|
|   red|     3|
| brown|     5|
|  pink|     4|
| white|     5|
+------+------+



In [6]:
# 1、分组统计
# 分组计算1
color_sdf.groupBy('length').count().show()

# 分组计算2：应用多函数
import pyspark.sql.functions as func
color_sdf.groupBy("color").agg(func.max("length"), func.sum("length")).show()

+------+-----+
|length|count|
+------+-----+
|     6|    1|
|     5|    4|
|     3|    1|
|     4|    1|
+------+-----+

+------+-----------+-----------+
| color|max(length)|sum(length)|
+------+-----------+-----------+
| green|          5|          5|
|yellow|          6|          6|
| white|          5|         10|
|  pink|          4|          4|
|   red|          3|          3|
| brown|          5|          5|
+------+-----------+-----------+



In [4]:
# 2、join操作
# 1.生成测试数据
employees = [(1, "John", 25), (2, "Ray", 35), (3,"Mike", 24), (4, "Jane", 28), 
             (5, "Kevin", 26), 
             (6, "Vincent", 35), (7,"James", 38), (8, "Shane", 32), 
             (9, "Larry", 29), (10, "Kimberly", 29),
             (11, "Alex", 28), (12, "Garry", 25), (13, "Max",31)]
employees=spark.createDataFrame(employees, schema=["emp_id","name","age"])
# employees.show()

salary=[(1,1000),(2,2000),(3,3000),(4,4000)]
salary=spark.createDataFrame(salary, schema=["emp_id","salary"])
# salary.show()

department=[(1,1000),(2,2000),(3,3000),(4,4000)]
department=spark.createDataFrame(department, schema=["emp_id","departement"])
# department.show()

# 2.连接
# join默认是内连接，最终结果会存在重复列名
# 如果是pandas,重复列会用_x,_y等后缀标识出来，但spark不会
# join会在最后的dataframe中存在重复列
final_data = employees.join(salary, employees.emp_id == salary.emp_id, how='left')\
    .join(department, employees.emp_id==department.emp_id, how='left')
final_data.show()

# 3.如果两边的关联字段名相同，也可以省去很多麻烦
final_data2 = employees.join(salary, on='emp_id', how='left')\
    .join(department, on='emp_id', how='left')
final_data2.show()

+------+--------+---+------+------+------+-----------+
|emp_id|    name|age|emp_id|salary|emp_id|departement|
+------+--------+---+------+------+------+-----------+
|     7|   James| 38|  null|  null|  null|       null|
|     6| Vincent| 35|  null|  null|  null|       null|
|     9|   Larry| 29|  null|  null|  null|       null|
|     5|   Kevin| 26|  null|  null|  null|       null|
|     1|    John| 25|     1|  1000|     1|       1000|
|    10|Kimberly| 29|  null|  null|  null|       null|
|     3|    Mike| 24|     3|  3000|     3|       3000|
|    12|   Garry| 25|  null|  null|  null|       null|
|     8|   Shane| 32|  null|  null|  null|       null|
|    11|    Alex| 28|  null|  null|  null|       null|
|     2|     Ray| 35|     2|  2000|     2|       2000|
|     4|    Jane| 28|     4|  4000|     4|       4000|
|    13|     Max| 31|  null|  null|  null|       null|
+------+--------+---+------+------+------+-----------+

+------+--------+---+------+-----------+
|emp_id|    name|age|sa

In [33]:
# 3. 缺失值处理（缺失值类型为：<class 'NoneType'>）
# 1.删除有缺失值的行
clean_data = final_data2.na.drop()
clean_data.show()

# 2.用均值替换缺失值
import math
from pyspark.sql import functions as func  # 导入spark内置函数
# 计算缺失值，collect()函数将数据返回到driver端，为Row对象，[0]可以获取Row的值
mean_salary = final_data2.select(func.mean('salary'), func.mean('departement')).collect()
print(mean_salary)
clean_data = final_data2.na.fill({'salary':mean_salary[0][0],"departement":mean_salary[0][1]})
clean_data.show()

# 3.如果一行至少2个缺失值才删除该行
# final_data2.na.drop(thresh=1).show() # 不生效，不知原因

# 4.填充缺失值
# 对所有列用同一个值填充缺失值。注意：填充值必须和被填充列数据类型相同，否则不生效。
# final_data2.na.fill('unknown').show()

# 5.不同的列用不同的值填充。注意：填充值必须和被填充列数据类型相同，否则不生效。
# final_data2.na.fill({'salary':'--', 'departement':'unknown'}).show()

+------+----+---+------+-----------+
|emp_id|name|age|salary|departement|
+------+----+---+------+-----------+
|     1|John| 25|  1000|       1000|
|     3|Mike| 24|  3000|       3000|
|     2| Ray| 35|  2000|       2000|
|     4|Jane| 28|  4000|       4000|
+------+----+---+------+-----------+

+------+--------+---+------+-----------+
|emp_id|    name|age|salary|departement|
+------+--------+---+------+-----------+
|     7|   James| 38|  2500|       2500|
|     6| Vincent| 35|  2500|       2500|
|     9|   Larry| 29|  2500|       2500|
|     5|   Kevin| 26|  2500|       2500|
|     1|    John| 25|  1000|       1000|
|    10|Kimberly| 29|  2500|       2500|
|     3|    Mike| 24|  3000|       3000|
|    12|   Garry| 25|  2500|       2500|
|     8|   Shane| 32|  2500|       2500|
|    11|    Alex| 28|  2500|       2500|
|     2|     Ray| 35|  2000|       2000|
|     4|    Jane| 28|  4000|       4000|
|    13|     Max| 31|  2500|       2500|
+------+--------+---+------+-----------+



In [72]:
# 4、空值判断
# 有两种空值判断，一种是浮点型空值nan（就是Numpy、Pandas中的np.nan），另一种是Python中普通的None
from pyspark.sql.functions import isnull, isnan

# 1.None的空值判断：isnull只能判断None
df = spark.createDataFrame([(1, None), (None, 2)], ("a", "b"))
df.select(isnull("a").alias("r1"), isnull(df.b).alias("r2"), isnan("a").alias("r11"), isnan(df.b).alias("r22")).show()

# 2.浮点型nan的空值判断：isnan只能判断float('nan') 等价于 np.nan
df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
df.select(isnull("a").alias("r1"), isnull(df.b).alias("r2"), isnan("a").alias("r11"), isnan(df.b).alias("r22")).show()

+-----+-----+-----+-----+
|   r1|   r2|  r11|  r22|
+-----+-----+-----+-----+
|false| true|false|false|
| true|false|false|false|
+-----+-----+-----+-----+

+-----+-----+-----+-----+
|   r1|   r2|  r11|  r22|
+-----+-----+-----+-----+
|false|false|false| true|
|false|false| true|false|
+-----+-----+-----+-----+



In [15]:
# Pandas的series或Dataframe.isnull()方法都可以判断出来；np.isnan方法也都可以判断出来。
color_df.loc[0,"length"] = np.nan
color_df.loc[1,"length"] = None
print(color_df.isnull())
print(np.isnan(color_df.loc[0,"length"]), np.isnan(color_df.loc[1,"length"]))

   color  length
0  False    True
1  False    True
2  False   False
3  False   False
4  False   False
5  False   False
6  False   False
True True


In [26]:
'''
# pandas 
# where即if-else函数
np.where(isnull(a),b,a) # 不清楚

# combine_first方法
# 如果a中值为空，就用b中的值填补
a[:-2].combine_first(b[2:])

# combine_first函数即对数据打补丁，用df2的数据填充df1中的缺失值
df1.combine_first(df2)
'''

# pyspark
import pyspark.sql.functions as func
df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
df.select(func.nanvl("a", "b").alias("r1"), func.nanvl(df.a, df.b).alias("r2")).show()

+---+---+
| r1| r2|
+---+---+
|1.0|1.0|
|2.0|2.0|
+---+---+



In [33]:
# 5、离群点
import math
# 需要提醒的是，列的计算都是放在select里面的

# 5.1.先计算均值
mean_salary = final_data2.select(func.mean('salary')).collect()[0][0]
print(mean_salary)

# 5.2.再计算方差
devs = final_data2.select(((final_data2.salary-mean_salary)**2).alias('deviation'))
devs.show()

# 5.3.再计算标准差
stddev = math.floor(math.sqrt(devs.groupBy().avg('deviation').first()[0]))
print(stddev)

# 5.4.用均值的两倍标准差替代离群值
no_outlier = final_data2.select(
    final_data2.emp_id, final_data2.name, final_data2.age, final_data2.salary,
    # between：salary在区间中则返回True； when：当条件为True返回指定值，否则返回otherwise指定的值。
    func.when(final_data2.salary.between(mean_salary-2*stddev, mean_salary+2*stddev), final_data2.salary).otherwise(mean_salary).alias("updated_salary")
    )
no_outlier.show()

2500.0
+---------+
|deviation|
+---------+
|     null|
|     null|
|     null|
|     null|
|2250000.0|
|     null|
| 250000.0|
|     null|
|     null|
|     null|
| 250000.0|
|2250000.0|
|     null|
+---------+

1118
+------+--------+---+------+--------------+
|emp_id|    name|age|salary|updated_salary|
+------+--------+---+------+--------------+
|     7|   James| 38|  null|        2500.0|
|     6| Vincent| 35|  null|        2500.0|
|     9|   Larry| 29|  null|        2500.0|
|     5|   Kevin| 26|  null|        2500.0|
|     1|    John| 25|  1000|        1000.0|
|    10|Kimberly| 29|  null|        2500.0|
|     3|    Mike| 24|  3000|        3000.0|
|    12|   Garry| 25|  null|        2500.0|
|     8|   Shane| 32|  null|        2500.0|
|    11|    Alex| 28|  null|        2500.0|
|     2|     Ray| 35|  2000|        2000.0|
|     4|    Jane| 28|  4000|        4000.0|
|    13|     Max| 31|  null|        2500.0|
+------+--------+---+------+--------------+



In [34]:
# 5.5、func中有现成的常用统计函数，更加方便
# 1.计算均值
mean_salary = final_data2.select(func.mean('salary')).collect()[0][0]
# 2.计算标准差
stddev2 = final_data2.select(func.stddev('salary')).collect()[0][0]
# 离群值替代就和上面的一致了
no_outlier2 = final_data2.select(
    final_data2.emp_id, final_data2.name, final_data2.age, final_data2.salary,
    # between：salary在区间中则返回True； when：当条件为True返回指定值，否则返回otherwise指定的值。
    func.when(final_data2.salary.between(mean_salary-2*stddev2, mean_salary+2*stddev2), final_data2.salary).otherwise(mean_salary).alias("updated_salary")
    )
no_outlier2.show()

+------+--------+---+------+--------------+
|emp_id|    name|age|salary|updated_salary|
+------+--------+---+------+--------------+
|     7|   James| 38|  null|        2500.0|
|     6| Vincent| 35|  null|        2500.0|
|     9|   Larry| 29|  null|        2500.0|
|     5|   Kevin| 26|  null|        2500.0|
|     1|    John| 25|  1000|        1000.0|
|    10|Kimberly| 29|  null|        2500.0|
|     3|    Mike| 24|  3000|        3000.0|
|    12|   Garry| 25|  null|        2500.0|
|     8|   Shane| 32|  null|        2500.0|
|    11|    Alex| 28|  null|        2500.0|
|     2|     Ray| 35|  2000|        2000.0|
|     4|    Jane| 28|  4000|        4000.0|
|    13|     Max| 31|  null|        2500.0|
+------+--------+---+------+--------------+



In [6]:
# 6、重复值
# 重复值的处理，和pandas很像啊
authors = [['Thomas','Hardy','June 2,1840'],
            ['Thomas','Hardy','June 2,1840'],
            ['Thomas','H',None],
            ['Jane','Austen','16 December 1775'],
            ['Emily',None,None]]

sdf = spark.createDataFrame(authors, schema=["FirstName","LastName","Dob"])
sdf.show()

# 删除重复值行（整行数据都重复才去重）
sdf.dropDuplicates().show()

# 指定列有重复值，则去重
sdf.dropDuplicates(subset=['FirstName']).show()


# pandas的方法
df = pd.DataFrame(authors, columns=["FirstName","LastName","Dob"])
df.drop_duplicates(subset=['FirstName']) # 指定列有重复值，则去重

+---------+--------+----------------+
|FirstName|LastName|             Dob|
+---------+--------+----------------+
|   Thomas|   Hardy|     June 2,1840|
|   Thomas|   Hardy|     June 2,1840|
|   Thomas|       H|            null|
|     Jane|  Austen|16 December 1775|
|    Emily|    null|            null|
+---------+--------+----------------+

+---------+--------+----------------+
|FirstName|LastName|             Dob|
+---------+--------+----------------+
|     Jane|  Austen|16 December 1775|
|    Emily|    null|            null|
|   Thomas|   Hardy|     June 2,1840|
|   Thomas|       H|            null|
+---------+--------+----------------+

+---------+--------+----------------+
|FirstName|LastName|             Dob|
+---------+--------+----------------+
|    Emily|    null|            null|
|     Jane|  Austen|16 December 1775|
|   Thomas|   Hardy|     June 2,1840|
+---------+--------+----------------+



Unnamed: 0,FirstName,LastName,Dob
0,Thomas,Hardy,"June 2,1840"
3,Jane,Austen,16 December 1775
4,Emily,,


In [7]:
# 7、生成新列
# 数据转换，可以理解成列与列的运算
# 注意自定义函数的调用方式

# 0.创建udf自定义函数，对于简单的lambda函数不需要指定返回值类型
from pyspark.sql.functions import udf
concat_func = udf(lambda name ,age : name + '_' + str(age))

# 1.应用自定义函数
concat_df = final_data2.withColumn("name_age", 
                                   concat_func(final_data.name, final_data.age))
concat_df.show()

# 2.通过列生成另一列
data_new = concat_df.withColumn("age_incremented", concat_df.age+1)
data_new.show()

# 3.某些列是自带一些常用的方法的
sdf.withColumn('Initial', sdf.LastName.substr(1,1)).show()

# 4.顺便增加一新列
from pyspark.sql.functions import lit
sdf.withColumn('newCol', lit(0)).show()

+------+--------+---+------+-----------+-----------+
|emp_id|    name|age|salary|departement|   name_age|
+------+--------+---+------+-----------+-----------+
|     7|   James| 38|  null|       null|   James_38|
|     6| Vincent| 35|  null|       null| Vincent_35|
|     9|   Larry| 29|  null|       null|   Larry_29|
|     5|   Kevin| 26|  null|       null|   Kevin_26|
|     1|    John| 25|  1000|       1000|    John_25|
|    10|Kimberly| 29|  null|       null|Kimberly_29|
|     3|    Mike| 24|  3000|       3000|    Mike_24|
|    12|   Garry| 25|  null|       null|   Garry_25|
|     8|   Shane| 32|  null|       null|   Shane_32|
|    11|    Alex| 28|  null|       null|    Alex_28|
|     2|     Ray| 35|  2000|       2000|     Ray_35|
|     4|    Jane| 28|  4000|       4000|    Jane_28|
|    13|     Max| 31|  null|       null|     Max_31|
+------+--------+---+------+-----------+-----------+

+------+--------+---+------+-----------+-----------+---------------+
|emp_id|    name|age|salary|d

NameError: name 'df1' is not defined

In [10]:
# 8、类eval操作
# 传入一个操作字符串，然后转成python代码执行，就像python的eval一样。
from pyspark.sql.functions import expr
color_sdf.select(expr('length(color)')).show()

+-------------+
|length(color)|
+-------------+
|            5|
|            5|
|            6|
|            3|
|            5|
|            4|
|            5|
+-------------+



In [16]:
# 9、行的最大最小值
# 测试数据
sdf = [(1,1000),(2,2000),(3,3000),(4,4000)]
sdf = spark.createDataFrame(sdf, schema=["emp_id","salary"])
sdf.show()

# 求行的最大最小值：比较列之间的值大小（不是求一列的最大/最小值）
from pyspark.sql.functions import greatest, least
sdf.select(greatest('emp_id','salary').alias('greatest'),
          least('emp_id','salary').alias('least')
          ).show()

+------+------+
|emp_id|salary|
+------+------+
|     1|  1000|
|     2|  2000|
|     3|  3000|
|     4|  4000|
+------+------+

+--------+-----+
|greatest|least|
+--------+-----+
|    1000|    1|
|    2000|    2|
|    3000|    3|
|    4000|    4|
+--------+-----+



In [24]:
# 10、when操作
from pyspark.sql.functions import when

# 1.case when age=2 then 3 else 4
sdf.select(when(sdf['emp_id'] == 2, 3).otherwise(4).alias("emp_id")).show()

# # 2.case when age=2 when age=age+1 
sdf.select(when(sdf.emp_id == 2, sdf.emp_id + 1).otherwise(5).alias("emp_id")).show()

+------+
|emp_id|
+------+
|     4|
|     3|
|     4|
|     4|
+------+

+------+
|emp_id|
+------+
|     5|
|     3|
|     5|
|     5|
+------+



In [26]:
# 11、lag,lead平移
# 很好用的函数啊，特别是在处理时间序列的时候，和pandas的shift很像。
from pyspark.sql.functions import lag, lead
df = spark.createDataFrame([(1, 2, 3) if i % 2 == 0 else (i, 2 * i, i % 4) 
                               for i in range(5)],
                           ["a", "b", "c"])
df.show()

# 报错，不知原因
# df.select(lag('a', 1, 0).alias('lag')).show()
# df.select(lead('a', 1, 0).alias('lag')).show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  1|  2|  1|
|  1|  2|  3|
|  3|  6|  3|
|  1|  2|  3|
+---+---+---+



Py4JJavaError: An error occurred while calling o477.showString.
: java.lang.UnsupportedOperationException: Cannot evaluate expression: lag(input[0, bigint, true], 1, 0)
	at org.apache.spark.sql.catalyst.expressions.Unevaluable$class.doGenCode(Expression.scala:261)
	at org.apache.spark.sql.catalyst.expressions.OffsetWindowFunction.doGenCode(windowExpressions.scala:337)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:108)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:105)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:105)
	at org.apache.spark.sql.catalyst.expressions.Cast.doGenCode(Cast.scala:660)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:108)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:105)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:105)
	at org.apache.spark.sql.catalyst.expressions.Cast.genCode(Cast.scala:655)
	at org.apache.spark.sql.catalyst.expressions.Alias.genCode(namedExpressions.scala:155)
	at org.apache.spark.sql.execution.ProjectExec$$anonfun$6.apply(basicPhysicalOperators.scala:60)
	at org.apache.spark.sql.execution.ProjectExec$$anonfun$6.apply(basicPhysicalOperators.scala:60)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.AbstractTraversable.map(Traversable.scala:104)
	at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:60)
	at org.apache.spark.sql.execution.CodegenSupport$class.consume(WholeStageCodegenExec.scala:189)
	at org.apache.spark.sql.execution.InputAdapter.consume(WholeStageCodegenExec.scala:374)
	at org.apache.spark.sql.execution.InputAdapter.doProduce(WholeStageCodegenExec.scala:403)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.InputAdapter.produce(WholeStageCodegenExec.scala:374)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:45)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$produce$1.apply(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.CodegenSupport$class.produce(WholeStageCodegenExec.scala:85)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:35)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:544)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:598)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:339)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.GeneratedMethodAccessor67.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
