In [2]:
import numpy as np
import pandas as pd
import datetime
from pyspark.sql import SparkSession

# Obtain the notebook-wide Spark session: getOrCreate() reuses an existing
# session when there is one and otherwise starts a new one under this app name.
spark = (
    SparkSession.builder
    .appName('my_first_app_name')
    .getOrCreate()
)

In [16]:
# 1. Create a new map column from a key column and a value column.
from pyspark.sql.functions import create_map

df = spark.createDataFrame([(1, "John Doe", 21), (2, "mary", 23)], ("id", "name", "age"))
df.show()

# create_map pairs its inputs as (key, value): here name -> age,
# which yields a single column of type map<string,bigint>.
temp = df.select(create_map([df.name, df.age]).alias("map"))
print(temp, temp.dtypes, type(temp))
temp.show()

# Pull the first row's map to the driver ONCE and reuse it: every take() call
# is a separate Spark action, so repeating it inside print() reran the job
# four times for the same value.
first_map = temp.take(1)[0][0]
print(first_map, type(first_map), first_map["John Doe"], type(first_map["John Doe"]))

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|John Doe| 21|
|  2|    mary| 23|
+---+--------+---+

DataFrame[map: map<string,bigint>] [('map', 'map<string,bigint>')] <class 'pyspark.sql.dataframe.DataFrame'>
+----------------+
|             map|
+----------------+
|[John Doe -> 21]|
|    [mary -> 23]|
+----------------+

{'John Doe': 21} <class 'dict'> 21 <class 'int'>


In [19]:
# 2. Create an array column
# NOTE: `df` is the (id, name, age) frame built in the create_map cell; later
# cells rebind `df`, so this cell must run before them.
from pyspark.sql.functions import array

# Mixing a string column and an integer column in one array: the integer is
# automatically cast to string, so the result type is array<string>.
temp = df.select(array('name', 'age').alias("arr"))
print(temp, temp.dtypes, type(temp))
temp.show()

# Fetch the first row's array once and reuse it; each take() is a separate
# Spark action, so calling it four times inside print() was redundant work.
first_arr = temp.take(1)[0][0]
print(first_arr, type(first_arr), first_arr[1], type(first_arr[1]))

DataFrame[arr: array<string>] [('arr', 'array<string>')] <class 'pyspark.sql.dataframe.DataFrame'>
+--------------+
|           arr|
+--------------+
|[John Doe, 21]|
|    [mary, 23]|
+--------------+

['John Doe', '21'] <class 'list'> 21 <class 'str'>


In [23]:
# 3. Element membership test
# array_contains(col, value) reports, per row, whether the array in `col`
# holds `value`. It fills a role similar to pandas' `isin`; for a "not in"
# check, negate the resulting column with `~`.
from pyspark.sql.functions import array_contains

df = spark.createDataFrame(
    [
        (["a", "b", "c"],),
        ([],),               # an empty array contains nothing -> false
        (["aa", "bb"],),     # "aa" is not the element "a" -> false
    ],
    schema=['data'],
)
df.show()
df.select(array_contains(df["data"], "a")).show()

+---------+
|     data|
+---------+
|[a, b, c]|
|       []|
| [aa, bb]|
+---------+

+-----------------------+
|array_contains(data, a)|
+-----------------------+
|                   true|
|                  false|
|                  false|
+-----------------------+



In [26]:
# 4. Flatten ("explode") a collection column
# explode() returns a new row for each element in the given array or map.
from pyspark.sql import Row
from pyspark.sql.functions import explode

df = spark.createDataFrame(
    [
        (["a", "b", "c"], "mary", 23),
        ([], "join", 33),
        (["aa", "bb"], "jack", 4),
    ],
    schema=['data', "name", "age"],
)
df.show()
# The row whose array is empty contributes no rows to the exploded result.
df.select(explode(df.data).alias("str")).show()

eDF = spark.createDataFrame([Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})])
eDF.show()
eDF.select(explode(eDF.intlist).alias("anInt")).show()
# Exploding a map produces two columns (key and value), so alias() takes two names.
eDF.select(explode(eDF.mapfield).alias("key", "value")).show()

+---------+----+---+
|     data|name|age|
+---------+----+---+
|[a, b, c]|mary| 23|
|       []|join| 33|
| [aa, bb]|jack|  4|
+---------+----+---+

+---+
|str|
+---+
|  a|
|  b|
|  c|
| aa|
| bb|
+---+

+---+---------+--------+
|  a|  intlist|mapfield|
+---+---------+--------+
|  1|[1, 2, 3]|[a -> b]|
+---+---------+--------+

+-----+
|anInt|
+-----+
|    1|
|    2|
|    3|
+-----+

+---+-----+
|key|value|
+---+-----+
|  a|    b|
+---+-----+



In [36]:
# 5. posexplode
# Returns a new row for each element of the given array or map, together with
# the element's position. Positions start at 0 and restart for every input row.
from pyspark.sql import Row
from pyspark.sql.functions import posexplode

df = spark.createDataFrame(
    [
        (["a", "b", "c"], "mary", 23),
        ([], "join", 33),
        (["aa", "bb"], "jack", 4),
    ],
    schema=['data', "name", "age"],
)
df.show()
# Without an alias the output columns get the default names `pos` and `col`.
df.select(posexplode(df.data)).show()

eDF = spark.createDataFrame([Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b", "c": "d"})])
eDF.show()
eDF.select(posexplode(eDF.intlist)).show()
# For a map the default output columns are `pos`, `key` and `value`.
eDF.select(posexplode(eDF.mapfield)).show()

+---------+----+---+
|     data|name|age|
+---------+----+---+
|[a, b, c]|mary| 23|
|       []|join| 33|
| [aa, bb]|jack|  4|
+---------+----+---+

+---+---+
|pos|col|
+---+---+
|  0|  a|
|  1|  b|
|  2|  c|
|  0| aa|
|  1| bb|
+---+---+

+---+---------+----------------+
|  a|  intlist|        mapfield|
+---+---------+----------------+
|  1|[1, 2, 3]|[a -> b, c -> d]|
+---+---------+----------------+

+---+---+
|pos|col|
+---+---+
|  0|  1|
|  1|  2|
|  2|  3|
+---+---+

+---+---+-----+
|pos|key|value|
+---+---+-----+
|  0|  a|    b|
|  1|  c|    d|
+---+---+-----+



In [None]:
# 6. JSON operations (TODO: examples still to be written)
# 6.1. get_json_object
# 6.2. json_tuple
# 6.3. from_json
# 6.4. to_json

In [None]:
# 7. Sort the elements of an array column
from pyspark.sql.functions import sort_array

df = spark.createDataFrame(
    [([2, 1, 3],), ([1],), ([],)],
    schema=['data'],
)

# Show each array sorted ascending first (sort_array's default), then descending.
for ascending in (True, False):
    df.select(sort_array(df.data, asc=ascending).alias('r')).show()