In [1]:
import findspark

In [2]:
# Tell findspark which Spark installation to use before pyspark is imported.
findspark.init(spark_home="/opt/manual/spark/")

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [4]:
# Create the session for this notebook (or reuse one that already exists),
# running against the "local[2]" master under the application name "udf".
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("udf")
    .getOrCreate()
)

In [5]:
# Read the gzip-compressed hotel reviews CSV from the local filesystem,
# treating the first row as the header and letting Spark infer column types.
df = (
    spark.read
    .options(header=True, inferSchema=True, compression="gzip")
    .csv("file:///home/train/datasets/Hotel_Reviews.csv.gz")
)


In [8]:
# Replace two columns of the raw frame:
#   Tags        -> comma-split text, cast to an array of strings
#   Review_Date -> parsed into a date using the "M/d/yyyy" pattern
df2 = (
    df
    .withColumn("Tags", F.split("Tags", ",").cast(ArrayType(StringType())))
    .withColumn("Review_Date", F.to_date("Review_Date", "M/d/yyyy"))
)

In [9]:
# create function

def upper_case(x):
    """Return ``x`` converted to upper case, or ``None`` when ``x`` is ``None``.

    This function is registered as a Spark UDF later in the notebook. Spark
    hands SQL NULL values to Python UDFs as ``None``, and calling ``.upper()``
    on ``None`` would raise ``AttributeError`` and fail the whole job, so
    missing values are passed through unchanged.

    Args:
        x: A string, or ``None`` for a missing value.

    Returns:
        The upper-cased string, or ``None`` if ``x`` is ``None``.
    """
    if x is None:
        return None
    return x.upper()

In [10]:
# Quick check of the plain Python function on a literal string.
upper_case("hello")

'HELLO'

In [12]:
# Signature, for reference:
#   spark.udf.register(name, f, returnType=None)

from pyspark.sql.types import StringType

# Register the Python function under the name "upper_case" and keep the
# returned UDF object for use with the DataFrame API. StringType() is spelled
# out here; it is also the type register() falls back to when returnType is omitted.
upper_case_udf = spark.udf.register(
    name="upper_case",
    f=upper_case,
    returnType=StringType(),
)

In [13]:
# Apply the UDF to the hotel-name column and preview the first four rows.
(
    df2
    .select(upper_case_udf(F.col("Hotel_name")))
    .show(n=4)
)

+----------------------+
|upper_case(Hotel_name)|
+----------------------+
|           HOTEL ARENA|
|           HOTEL ARENA|
|           HOTEL ARENA|
|           HOTEL ARENA|
+----------------------+
only showing top 4 rows



In [14]:
# UDF in multiple cols

# Hotel_name, Reviewer_Nationality

def hotel_and_country(hotel, country):
    """Build a ``"<hotel> - <country>"`` label from two values.

    When either value is ``None`` the result is ``None`` rather than a label
    containing the literal text ``"None"``, so a missing input produces a
    missing output.

    Args:
        hotel: Hotel name, or ``None`` for a missing value.
        country: Country / nationality, or ``None`` for a missing value.

    Returns:
        The combined label, or ``None`` if either argument is ``None``.
        The inputs are inserted as-is; surrounding whitespace is not trimmed.
    """
    if hotel is None or country is None:
        return None
    return "{} - {}".format(hotel, country)

In [15]:
# Register the two-argument function as a string-returning UDF named
# "hotel_and_country" and keep the returned UDF object.
hotel_and_country_udf = spark.udf.register(
    name="hotel_and_country",
    f=hotel_and_country,
    returnType=StringType(),
)

In [16]:
# Feed two columns into the UDF and preview four rows without truncating the text.
(
    df2
    .select(hotel_and_country_udf(F.col("Hotel_name"), F.col("Reviewer_Nationality")))
    .show(n=4, truncate=False)
)

+---------------------------------------------------+
|hotel_and_country(Hotel_name, Reviewer_Nationality)|
+---------------------------------------------------+
|Hotel Arena -  Russia                              |
|Hotel Arena -  Ireland                             |
|Hotel Arena -  Australia                           |
|Hotel Arena -  United Kingdom                      |
+---------------------------------------------------+
only showing top 4 rows



In [17]:
# pandas UDF: the function receives and returns pandas Series
# F.pandas_udf(f, returnType)

import pandas as pd

  return f(*args, **kwds)


In [18]:
def pd_hotel_and_country(hotel: pd.Series, country: pd.Series) -> pd.Series:
    """Concatenate two Series element-wise with a single space in between.

    Args:
        hotel: Series of hotel names.
        country: Series of countries / nationalities.

    Returns:
        A Series holding ``hotel + ' ' + country`` for each element.
    """
    separator = ' '
    hotel_with_separator = hotel + separator
    return hotel_with_separator + country

In [23]:
# Wrap the Series-based function as a pandas UDF that returns strings.
pd_hotel_and_country_udf = F.pandas_udf(
    f=pd_hotel_and_country,
    returnType=StringType(),
)

In [24]:
# Apply the pandas UDF to the same two columns and preview four untruncated rows.
(
    df2
    .select(pd_hotel_and_country_udf(F.col("Hotel_name"), F.col("Reviewer_Nationality")))
    .show(n=4, truncate=False)
)

+------------------------------------------------------+
|pd_hotel_and_country(Hotel_name, Reviewer_Nationality)|
+------------------------------------------------------+
|Hotel Arena  Russia                                   |
|Hotel Arena  Ireland                                  |
|Hotel Arena  Australia                                |
|Hotel Arena  United Kingdom                           |
+------------------------------------------------------+
only showing top 4 rows



In [25]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()