In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, udf
from pyspark.sql.types import IntegerType

# Obtain the notebook's Spark session (getOrCreate reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName('Spark DF')
    .getOrCreate()
)

# Eager evaluation lets a bare DataFrame expression at the end of a cell
# render its rows; the second setting caps how many rows are shown.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
spark.conf.set('spark.sql.repl.eagerEval.maxNumRows', 10)

# Load the office CSV: first line is the header, column types are inferred.
fp = '../FileStore/tables/OfficeData.csv'
df = spark.read.csv(fp, header=True, inferSchema=True)

In [8]:
# Bare expression: with eager evaluation enabled in the setup cell, the
# notebook renders the DataFrame's rows (capped at the configured maximum).
df

employee_name,department,state,salary,age,bonus
James,Sales,NY,90000,34,10000
Michael,Sales,NY,86000,56,20000
Robert,Sales,CA,81000,30,23000
Maria,Finance,CA,90000,24,23000
Raman,Finance,CA,99000,40,24000
Scott,Finance,NY,83000,36,19000
Jen,Finance,NY,79000,53,15000
Jeff,Marketing,CA,80000,25,18000
Kumar,Marketing,NY,91000,50,21000


In [13]:
def get_salary(salary, bonus):
    """Return the total compensation, ``salary + bonus``.

    This function is wrapped as a Spark UDF. Spark hands SQL NULL values to
    Python UDFs as ``None``, so a missing salary or bonus would otherwise
    raise ``TypeError`` on ``None + int`` and fail the whole job.

    Args:
        salary: Base salary, or ``None`` when the cell is NULL.
        bonus: Bonus amount, or ``None`` when the cell is NULL.

    Returns:
        ``salary + bonus``, or ``None`` if either input is ``None`` (the
        result column is then NULL for that row).
    """
    # Propagate NULL instead of crashing on rows with missing values.
    if salary is None or bonus is None:
        return None
    return salary + bonus

# Register get_salary directly as a UDF (no lambda wrapper needed); passing the
# named function also gives the UDF a readable name in query plans.
# IntegerType() declares the type of the column the UDF produces.
total_salary_UDF = udf(get_salary, IntegerType())


# Add a 'total_salary' column computed row by row from 'salary' and 'bonus'.
df.withColumn('total_salary', total_salary_UDF(col('salary'), col('bonus')))


employee_name,department,state,salary,age,bonus,total_salary
James,Sales,NY,90000,34,10000,100000
Michael,Sales,NY,86000,56,20000,106000
Robert,Sales,CA,81000,30,23000,104000
Maria,Finance,CA,90000,24,23000,113000
Raman,Finance,CA,99000,40,24000,123000
Scott,Finance,NY,83000,36,19000,102000
Jen,Finance,NY,79000,53,15000,94000
Jeff,Marketing,CA,80000,25,18000,98000
Kumar,Marketing,NY,91000,50,21000,112000


In [15]:
@udf
def get_fullname(x, y):
    """Combine two strings as ``"<x lower-cased> <y upper-cased and reversed>"``.

    Declared with a bare ``@udf`` (no explicit return type), so the result is
    exposed as a string column.

    Args:
        x: String to lower-case, or ``None`` when the cell is NULL.
        y: String to upper-case and reverse, or ``None`` when the cell is NULL.

    Returns:
        The combined string, or ``None`` if either input is ``None``.
    """
    # Spark passes SQL NULL as None; calling .lower()/.upper() on it would
    # raise AttributeError and fail the job, so propagate NULL instead.
    if x is None or y is None:
        return None
    return f'{x.lower()} {y.upper()[::-1]}'

# The same source column feeds both UDF arguments, so 'full_name' holds the
# employee name lower-cased, followed by the same name upper-cased and reversed.
name_col = col('employee_name')
df.withColumn(
    'full_name',
    get_fullname(name_col, name_col),
)

employee_name,department,state,salary,age,bonus,full_name
James,Sales,NY,90000,34,10000,james SEMAJ
Michael,Sales,NY,86000,56,20000,michael LEAHCIM
Robert,Sales,CA,81000,30,23000,robert TREBOR
Maria,Finance,CA,90000,24,23000,maria AIRAM
Raman,Finance,CA,99000,40,24000,raman NAMAR
Scott,Finance,NY,83000,36,19000,scott TTOCS
Jen,Finance,NY,79000,53,15000,jen NEJ
Jeff,Marketing,CA,80000,25,18000,jeff FFEJ
Kumar,Marketing,NY,91000,50,21000,kumar RAMUK
