In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.functions import sha1, rand
import time

In [2]:
# Get (or lazily create) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName('udf')
    .getOrCreate()
)

In [4]:
def manual_split(x):
    """Split a string on the letter "e".

    Args:
        x: The string to split, or None (Spark hands SQL NULL to a Python
            UDF as None).

    Returns:
        The list of substrings found between each "e", or None when x is None.
    """
    # Propagate NULL instead of failing with AttributeError on None.split.
    if x is None:
        return None
    return x.split("e")

In [6]:

# manual_split returns a list of strings, so the UDF is declared as an array of
# strings. Declaring it as a plain string makes Spark store the list's text
# representation instead of a real array column.
manualSplitPythonUDF = spark.udf.register("manualSplitSQLUDF", manual_split, "array<string>")

In [13]:
# Ids 100..9999, each with a seeded random number rendered as text ("random_v")
# and the SHA-1 digest of that text ("hash").
# "random_v" is kept in the result: an earlier trailing .drop("random_value")
# named a column that does not exist and was therefore a silent no-op.
randomDf = (
    spark.range(100, 1000 * 10)
    .withColumn("random_v", rand(seed=10).cast("string"))
    .withColumn("hash", sha1("random_v"))
)

In [14]:
# Print a preview of the generated rows (id, random_v, hash).
randomDf.show()

+---+-------------------+--------------------+
| id|           random_v|                hash|
+---+-------------------+--------------------+
|100|0.41371264720975787|6f65a37773e1a173a...|
|101| 0.7311719281896606|3c5f154011aa08151...|
|102| 0.9031701155118229|28fc878d1f1942bd7...|
|103|0.09430205113458567|9c54ebb1c11c92d4c...|
|104|0.38340505276222947|46500976da0a25029...|
|105| 0.5569246135523511|9d8e41374ef6637e2...|
|106| 0.4977441406613893|29d9fa515d557568a...|
|107| 0.2076666106201438|672da7735c57d6707...|
|108| 0.9571919406508957|e6fb1c43359cff2d7...|
|109| 0.7429395461204413|8e3f2b2dfae9a25aa...|
|110| 0.3383362304807752|9b6f99aeded8d7460...|
|111| 0.6701724731609291|455139cdd2f5c2459...|
|112| 0.6417696089901257|38ee7efbe6c15078b...|
|113| 0.7241109765059401|d81a35b183bf54307...|
|114|0.34089575652338666|1bd6062a7dec0b5a8...|
|115|0.24856531779931312|9b691311770dcc119...|
|116| 0.5334251467105187|fa17c59f6eaef857b...|
|117|0.06447333000037836|54d37471df5d8bbcf...|
|118|0.194264

In [17]:
# Keep every existing column and append the UDF result computed from "hash".
randomAugmentedDF = randomDf.select(
    "*",
    manualSplitPythonUDF("hash").alias("augmented_col"),
)

In [18]:
# Print a preview including the new "augmented_col" produced by the UDF.
randomAugmentedDF.show()

+---+-------------------+--------------------+--------------------+
| id|           random_v|                hash|       augmented_col|
+---+-------------------+--------------------+--------------------+
|100|0.41371264720975787|6f65a37773e1a173a...|[6f65a37773, 1a17...|
|101| 0.7311719281896606|3c5f154011aa08151...|[3c5f154011aa0815...|
|102| 0.9031701155118229|28fc878d1f1942bd7...|[28fc878d1f1942bd...|
|103|0.09430205113458567|9c54ebb1c11c92d4c...|[9c54, bb1c11c92d...|
|104|0.38340505276222947|46500976da0a25029...|[46500976da0a2502...|
|105| 0.5569246135523511|9d8e41374ef6637e2...|[9d8, 41374, f663...|
|106| 0.4977441406613893|29d9fa515d557568a...|[29d9fa515d557568...|
|107| 0.2076666106201438|672da7735c57d6707...|[672da7735c57d670...|
|108| 0.9571919406508957|e6fb1c43359cff2d7...|[, 6fb1c43359cff2...|
|109| 0.7429395461204413|8e3f2b2dfae9a25aa...|[8, 3f2b2dfa, 9a2...|
|110| 0.3383362304807752|9b6f99aeded8d7460...|[9b6f99a, d, d8d7...|
|111| 0.6701724731609291|455139cdd2f5c2459...|[4

In [19]:
# Task: convert dotted-quad IPv4 address strings to integers with a UDF

In [20]:
def ip(ipstring):
    """Convert a dotted-quad IPv4 string into its integer value.

    Args:
        ipstring: Address such as "1.2.3.4", or None (Spark hands SQL NULL
            to a Python UDF as None).

    Returns:
        The integer A*256**3 + B*256**2 + C*256 + D for the address
        "A.B.C.D", or None when ipstring is None.

    Raises:
        ValueError: If the string does not consist of exactly four integer
            octets, or if any octet lies outside 0..255.
    """
    # Propagate NULL instead of failing with AttributeError on None.split.
    if ipstring is None:
        return None

    octets = [int(part) for part in ipstring.split('.')]

    if len(octets) != 4:
        raise ValueError(
            "expected 4 octets in IPv4 address, got %d: %r" % (len(octets), ipstring)
        )

    # Without this check, values such as "256.0.0.1" or "-1.0.0.0" would
    # silently produce a number that is not a valid IPv4 address.
    for octet in octets:
        if not 0 <= octet <= 255:
            raise ValueError("IPv4 octet out of range 0..255: %r" % (ipstring,))

    # Accumulate base-256 digits, most significant octet first.
    value = 0
    for octet in octets:
        value = value * 256 + octet
    return value

In [22]:
from pyspark.sql.types import LongType

# Expose ip() to Spark as a UDF that yields 64-bit integers.
IPConvertUDF = spark.udf.register(
    name="IPConvertUDF",
    f=ip,
    returnType=LongType(),
)

In [23]:
# Three sample addresses in a single string column named "ip".
IPDF = spark.createDataFrame(
    [["123.123.123.123"], ["1.2.3.4"], ["127.0.0.0"]],
    ['ip'],
)

In [24]:
# show() prints the frame and returns None, so the DataFrame is bound to the
# variable first and displayed as a separate step.
IPDFWithParsedIP = IPDF.withColumn("parsedIP", IPConvertUDF("ip"))
IPDFWithParsedIP.show()

+---------------+----------+
|             ip|  parsedIP|
+---------------+----------+
|123.123.123.123|2071690107|
|        1.2.3.4|  16909060|
|      127.0.0.0|2130706432|
+---------------+----------+



In [25]:
# Advanced UDFs: multiple input columns, struct return types, and pandas UDFs

In [26]:
def manual_add(x, y):
    """Add two values.

    Args:
        x: Left operand, or None (Spark hands SQL NULL to a Python UDF as None).
        y: Right operand, or None.

    Returns:
        x + y, or None when either operand is None.
    """
    # Propagate NULL instead of raising TypeError on None + value.
    if x is None or y is None:
        return None
    return x + y

In [27]:
from pyspark.sql.types import IntegerType

# The columns this UDF is applied to are bigint, so the result is declared as a
# 64-bit LongType; a 32-bit IntegerType cannot represent every possible sum.
manualAddPythonUDF = spark.udf.register("manualAddSQLUDF", manual_add, LongType())

In [28]:
# Three rows of integer pairs in columns "col1" and "col2".
integerDF = spark.createDataFrame(
    [(1, 2), (3, 4), (5, 6)],
    ["col1", "col2"],
)

In [31]:
# Show both inputs next to the UDF-computed sum.
integerDF.select(
    '*',
    manualAddPythonUDF("col1", "col2").alias('sum'),
).show()

+----+----+---+
|col1|col2|sum|
+----+----+---+
|   1|   2|  3|
|   3|   4|  7|
|   5|   6| 11|
+----+----+---+



In [32]:
from pyspark.sql.types import FloatType,StringType,StructType,StructField

In [33]:
# Struct returned by the math UDF: three nullable float fields, in this order.
mathOperationsSchema = StructType([
    StructField(field_name, FloatType(), True)
    for field_name in ("sum", "multiplication", "division")
])

In [34]:
def manual_math(x, y):
    """Compute the sum, product and quotient of two numbers.

    Args:
        x: Left operand, or None (Spark hands SQL NULL to a Python UDF as None).
        y: Right operand (the divisor), or None.

    Returns:
        A tuple (x + y, x * y, x / y) of floats. The third element is None
        when y is zero. The whole result is None when either operand is None.
    """
    # Propagate NULL instead of raising TypeError on arithmetic with None.
    if x is None or y is None:
        return None

    # A zero divisor yields a missing division field rather than a
    # ZeroDivisionError that would abort the whole computation.
    quotient = x / float(y) if y != 0 else None
    return (float(x + y), float(x * y), quotient)

manual_math(1, 2)

(3.0, 2.0, 0.5)

In [35]:
# Register manual_math under the SQL name "manualMathSQLUDF", declaring
# mathOperationsSchema as its return type; the returned handle is what the
# DataFrame code below calls.
manualMathPythonUDF = spark.udf.register("manualMathSQLUDF", manual_math, mathOperationsSchema)

In [36]:
# Builds the projection without calling show(), so the cell displays only the
# resulting schema. The struct column is aliased "sum" even though it also
# carries the multiplication and division fields.
integerDF.select("*", manualMathPythonUDF("col1", "col2").alias("sum"))

DataFrame[col1: bigint, col2: bigint, sum: struct<sum:float,multiplication:float,division:float>]

In [37]:
from pyspark.sql.functions import pandas_udf,PandasUDFType

In [42]:
@pandas_udf('double', PandasUDFType.SCALAR)
def pandas_plus_one(v):
    """Add 1 to v and return the result; registered as a scalar pandas UDF declared to return double."""
    incremented = v + 1
    return incremented

In [47]:
# Ten million consecutive ids (0 .. 9,999,999) in a single "id" column.
df = spark.range(0, 10 * 1000 * 1000)
# Adds the pandas-UDF column; the result is not assigned, so the cell only displays its schema.
df.withColumn('id_transformed', pandas_plus_one("id"))

DataFrame[id: bigint, id_transformed: double]