In [1]:
import numpy as np
import pandas as pd
import datetime
from pyspark.sql import SparkSession

# Build the Spark session used by every example below; getOrCreate() hands back
# an already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName('my_first_app_name')
    .getOrCreate()
)

In [2]:
# 1. String concatenation
from pyspark.sql.functions import concat, concat_ws

df = spark.createDataFrame([('abcd', '123')], ['s', 'd'])

# (1) Plain concatenation, no separator between the columns.
joined_plain = concat(df['s'], df['d']).alias('s')
df.select(joined_plain).show()
# 'abcd123'

# (2) Concatenation with an explicit separator (first argument of concat_ws).
joined_dashed = concat_ws('-', df['s'], df['d']).alias('s')
df.select(joined_dashed).show()
# 'abcd-123'

+-------+
|      s|
+-------+
|abcd123|
+-------+

+--------+
|       s|
+--------+
|abcd-123|
+--------+



In [3]:
# 2. String formatting (printf-style template applied to columns)
from pyspark.sql.functions import format_string

df = spark.createDataFrame([(5, "hello")], ['a', 'b'])

# '%d' is filled from column a and '%s' from column b.
formatted = format_string('%d %s', df['a'], df['b']).alias('v')
df.select(formatted).show()
# '5 hello'

+-------+
|      v|
+-------+
|5 hello|
+-------+



In [4]:
# 3. Locate a substring inside a string column
from pyspark.sql.functions import instr

df = spark.createDataFrame([('abcd',)], ['s'])

# The reported position is 1-based: 'b' in 'abcd' is found at position 2.
position = instr(df['s'], 'b').alias('s')
df.select(position).show()
# 2

+---+
|  s|
+---+
|  2|
+---+



In [5]:
# 4. Extract a substring
from pyspark.sql.functions import substring

df = spark.createDataFrame([('abcd',)], ['s'])

# Arguments are (column, start position, length); start 1 with length 2
# yields the first two characters.
prefix = substring(df['s'], 1, 2).alias('s')
df.select(prefix).show()
# 'ab'

+---+
|  s|
+---+
| ab|
+---+



In [6]:
# 5. Regular-expression extraction
from pyspark.sql.functions import regexp_extract

# The patterns are written as raw strings so the backslash in `\d` is passed to
# the regex engine as-is. In a normal string literal `\d` is an invalid escape
# sequence, which newer Python versions warn about.
df = spark.createDataFrame([('100-200',)], ['str'])
df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).show()
# '100'  (capture group 1)

# The input contains no digits, so nothing matches and the result is an
# empty string.
df = spark.createDataFrame([('foo',)], ['str'])
df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).show()
# ''

+---+
|  d|
+---+
|100|
+---+

+---+
|  d|
+---+
|   |
+---+



In [7]:
# 6. Regular-expression replacement
from pyspark.sql.functions import regexp_replace

df = spark.createDataFrame([('100-200',)], ['str'])

# Each run of digits is replaced by '--', so '100-200' becomes '-----'.
replaced = regexp_replace('str', '(\\d+)', '--').alias('d')
df.select(replaced).collect()

[Row(d='-----')]

In [8]:
# 7、字符串复制
from pyspark.sql.functions import repeat

df = spark.createDataFrame([('ab',)], ['s',])
df.select(repeat(df.s, 3).alias('s')).collect()

[Row(s='ababab')]

In [None]:
# 8. Split a string
from pyspark.sql.functions import split

df = spark.createDataFrame([('ab12cd',)], ['s'])

# The delimiter is a regular expression: here, any run of digits.
# Expected result per the PySpark docs: [Row(s=['ab', 'cd'])]
pieces = split(df['s'], '[0-9]+').alias('s')
df.select(pieces).collect()