Spark lets us define our own functions and apply them to DataFrame columns; such functions are called user-defined functions (UDFs).

In [0]:
import pyspark
from pyspark.sql import SparkSession
# Start a Spark session in local mode (or reuse one that is already running);
# "local[*]" asks for as many worker threads as there are cores on the machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
# Optional: render a DataFrame as a table when it is the last expression
# of a notebook cell.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

#### User-Defined Function (UDF)
Create a user-defined function (UDF) that increments a passed numerical value by 1.

In [0]:
from pyspark.sql.functions import udf

def add_one(x: float):
    """Return ``x`` incremented by 1.

    Spark hands SQL NULL values to Python UDFs as ``None``; those are passed
    through unchanged instead of raising ``TypeError`` on ``None + 1``.

    Args:
        x: The number to increment, or ``None`` for a missing value.

    Returns:
        ``x + 1``, or ``None`` when ``x`` is ``None``.
    """
    if x is None:
        return None
    return x + 1

# Create a Spark UDF from the Python function.
# udf() falls back to a StringType return type when none is given, which would
# turn the incremented number into a string column. Declare a double return
# type instead, and coerce the result to float so that integer inputs are not
# turned into NULL by a Python-int / DoubleType mismatch. NULL inputs stay NULL.
incremented_num = udf(
    lambda value: None if value is None else float(add_one(value)),
    "double",
)

# Sanity-check the plain Python function; the Spark UDF itself is only
# evaluated when it is applied to a DataFrame column.
print(add_one(1))

2


Sweet. The Python function **add_one** works as desired, and **incremented_num** wraps it as a Spark UDF that can be applied to DataFrame columns.

Similar UDFs can be created to achieve other results. Here, let us create a new UDF that takes a floating-point column of a **SQL DataFrame** and creates a new column containing the integer part of each value.<br><br>
Note that when a UDF should return integers, the integer data type (`IntegerType`) must be explicitly imported from `pyspark.sql.types` and passed as the UDF's return type.

In [0]:
from pyspark.sql.types import IntegerType

def convert_to_integer(x: float):
    """Return the integer part of ``x`` (truncated toward zero).

    Spark hands SQL NULL values to Python UDFs as ``None``; those are passed
    through unchanged instead of raising ``TypeError`` on ``int(None)``.

    Args:
        x: The number to convert, or ``None`` for a missing value.

    Returns:
        ``int(x)``, or ``None`` when ``x`` is ``None``.
    """
    if x is None:
        return None
    return int(x)

# Wrap the Python function as a Spark UDF whose declared return type is integer
convert_to_int_udf = udf(convert_to_integer, returnType=IntegerType())

# Load an existing CSV data source into a DataFrame to try the UDF on:
# the first row is the header, fields are comma-separated, and column
# types are inferred from the data.
path_to_filestore_tables = "/FileStore/tables/"
training_csv_filename = "train.csv"
titanic_df = spark.read.csv(
    path_to_filestore_tables + training_csv_filename,
    header=True,
    sep=",",
    inferSchema=True,
)
titanic_df


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Tim...",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. ...",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Osc...",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nich...",female,14.0,1,0,237736,30.0708,,C


In [0]:
# Apply the UDF to the "Fare" column and attach the result as a new
# "Int Fare" column; the resulting DataFrame is shown as the cell output.
titanic_df.withColumn(
    "Int Fare",
    convert_to_int_udf(titanic_df["Fare"]),
)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Int Fare
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S,7
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C,71
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S,7
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S,53
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S,8
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,8
7,0,1,"McCarthy, Mr. Tim...",male,54.0,0,0,17463,51.8625,E46,S,51
8,0,3,"Palsson, Master. ...",male,2.0,3,1,349909,21.075,,S,21
9,1,3,"Johnson, Mrs. Osc...",female,27.0,0,2,347742,11.1333,,S,11
10,1,2,"Nasser, Mrs. Nich...",female,14.0,1,0,237736,30.0708,,C,30
