In [2]:
import numpy as np
import pandas as pd
import datetime
from pyspark.sql import SparkSession

# Reuse the already-running SparkSession if there is one; otherwise start a
# new session registered under this application name.
spark = (
    SparkSession.builder
    .appName('my_first_app_name')
    .getOrCreate()
)

In [None]:
'''
自定义函数的重点在于定义返回值类型的数据格式，其数据类型基本都是通过 from pyspark.sql.types import * 导入，常用的包括：
StructType()：结构体
StructField()：结构体中的元素
LongType()：长整型
StringType()：字符串
IntegerType()：一般整型
FloatType()：浮点型
'''

In [3]:
from pyspark.sql.types import StructType, StructField, LongType, StringType
# Example row schema with four columns. The third StructField argument
# (nullable) is True for every column.
# NOTE(review): `schema` is not referenced by the other cells visible here.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("id", LongType()),
        ("name", StringType()),
        ("age", LongType()),
        ("eyeColor", StringType()),
    )
])

In [4]:
# Sample data: a list of color names ('white' appears twice) turned into a
# pandas frame with one 'color' column and a derived 'length' column holding
# the character count of each name.
colors = ['white', 'green', 'yellow', 'red', 'brown', 'pink', 'white']
color_df = pd.DataFrame({'color': colors}).assign(
    length=lambda frame: frame['color'].apply(len)
)
print(color_df)

# Hand the pandas frame to Spark (no explicit schema is passed) and print the
# resulting Spark DataFrame as a text table.
color_sdf = spark.createDataFrame(data=color_df)
color_sdf.show()

    color  length
0   white       5
1   green       5
2  yellow       6
3     red       3
4   brown       5
5    pink       4
6   white       5
+------+------+
| color|length|
+------+------+
| white|     5|
| green|     5|
|yellow|     6|
|   red|     3|
| brown|     5|
|  pink|     4|
| white|     5|
+------+------+



In [19]:
# 1. General workflow for a user-defined function (UDF)
# Step 1: write an ordinary Python function.
def toDate(s):
    """Return the string form of ``s`` with a trailing '-' appended.

    NOTE(review): despite the name, no date handling happens here; the
    function only converts its argument with ``str`` and adds a dash.
    """
    return f"{s!s}-"

# Step 2: register the Python function as a Spark UDF.
# The imports stay in this cell so it keeps working when run on its own;
# StringType is also imported by the earlier schema cell.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# The Spark return type has to be declared to match what the Python function
# returns: toDate returns a Python str, so the UDF is declared as StringType.
toDateUDF = udf(toDate, StringType())

# Step 3: use the UDF. This only displays a derived 'color1' column; the
# result is not assigned, so color_sdf itself is left unchanged.
color_sdf.withColumn('color1', toDateUDF('color')).show()

# The same pattern with a lambda expression instead of a named function.
# The result is assigned back, so from here on color_sdf carries 'color2'.
temp_fun = udf(lambda x: str(x) + "=", StringType())
color_sdf = color_sdf.withColumn('color2', temp_fun("color"))
color_sdf.show()

# Fetch the first row once and reuse it: every take() call is a separate
# Spark action, so the original four calls ran the job four times.
first_row = color_sdf.take(1)[0]
print(first_row, type(first_row))

# Look the value up by column name rather than by position so the result does
# not depend on where 'color2' sits in the row.
color2_value = first_row['color2']
print(color2_value, type(color2_value))

+------+------+-------+-------+
| color|length| color2| color1|
+------+------+-------+-------+
| white|     5| white=| white-|
| green|     5| green=| green-|
|yellow|     6|yellow=|yellow-|
|   red|     3|   red=|   red-|
| brown|     5| brown=| brown-|
|  pink|     4|  pink=|  pink-|
| white|     5| white=| white-|
+------+------+-------+-------+

+------+------+-------+
| color|length| color2|
+------+------+-------+
| white|     5| white=|
| green|     5| green=|
|yellow|     6|yellow=|
|   red|     3|   red=|
| brown|     5| brown=|
|  pink|     4|  pink=|
| white|     5| white=|
+------+------+-------+



AttributeError: dtypes