### Spark Session

In [None]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Build the Spark configuration for this notebook.
# Only the Ivy cache directory and the UI port are active; the commented-out
# entries are optional templates (Delta Lake, timeouts, S3A/MinIO, event log).
# Never paste real access/secret keys here - read them from the environment.
sparkConf = (
    SparkConf()
    .set("spark.jars.ivy","/home/brijeshdhaker/.ivy2")
    .set("spark.ui.port", "4042")
    #.set("spark.jars.packages","org.apache.hadoop:hadoop-aws:3.0.0,io.delta:delta-spark_2.12:3.3.2")
    #.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    #.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    #.set("spark.executor.heartbeatInterval", "300000")
    #.set("spark.network.timeout", "400000")
    #.set("spark.hadoop.fs.s3a.endpoint", "http://minio.sandbox.net:9010")
    #.set("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    #.set("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    #.set("spark.hadoop.fs.s3a.path.style.access", "true")
    #.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    #.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    #.set("spark.eventLog.enabled", "true")
    #.set("spark.eventLog.dir", "file:///apps/var/logs/spark-events")
)

# Create (or reuse) the session: local master using all available cores,
# named 'spark-sql-notebook', configured from sparkConf above.
spark = (
    SparkSession.builder.master("local[*]").
        appName('spark-sql-notebook').
        config(conf=sparkConf).
        getOrCreate()
)

# Only log errors from here on, then display the session object as the cell output.
spark.sparkContext.setLogLevel('ERROR')
spark

25/10/03 09:58:22 WARN Utils: Your hostname, vmware-ubuntu-24.04 resolves to a loopback address: 127.0.1.1; using 192.168.154.133 instead (on interface ens33)
25/10/03 09:58:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/03 09:58:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Spark Configurations

#### Set

In [None]:
# Disable AQE (including its partition coalescing) and automatic broadcast
# joins (threshold -1), applying the settings in the order listed.
plan_settings = (
    ("spark.sql.adaptive.enabled", False),
    ("spark.sql.adaptive.coalescePartitions.enabled", False),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
)

for conf_key, conf_value in plan_settings:
    spark.conf.set(conf_key, conf_value)

In [None]:
# Use 8 partitions for shuffles in this session.
spark.conf.set("spark.sql.shuffle.partitions", "8")

#### Get

In [None]:
# Read back the shuffle-partition setting currently in effect
shuffle_partitions = spark.conf.get("spark.sql.shuffle.partitions")
print(shuffle_partitions)

# Check Spark defaultParallelism
default_parallelism = spark.sparkContext.defaultParallelism
print(default_parallelism)

### Broadcast Variables

In [None]:

# Lookup table (state code -> display name) registered as a broadcast variable
# so tasks read it through broadcastStates.value.
states = {"NY":"New York", "CA":"California", "FL":"Florida", "TX":"Texas", "CH":"Chicago"}
broadcastStates = spark.sparkContext.broadcast(states)

print("{}".format(broadcastStates.value))

# Sample people: (firstname, lastname, country, state code)
data = [("James","Smith","USA","CA"),
    ("Michael","Rose","USA","NY"),
    ("Robert","Williams","USA","CA"),
    ("Maria","Jones","USA","FL")
  ]

columns = ["firstname","lastname","country","state"]
df = spark.createDataFrame(data = data, schema = columns)
df.printSchema()
df.show(truncate=False)

def state_convert(code):
    """Translate a state code into its display name via the broadcast table.

    Args:
        code: State code to look up (e.g. "CA").

    Returns:
        The mapped name, or ``code`` unchanged when it is not in the table,
        so one unexpected value does not fail the job with a KeyError.
    """
    return broadcastStates.value.get(code, code)

# Replace the state code (4th field) of every row with its display name
result = df.rdd.map(lambda x: (x[0],x[1],x[2],state_convert(x[3]))).toDF(columns)
result.show(truncate=False)

## Broadcast variable on filter
# Keep only the rows whose state code is a key of the broadcast table
filteDf= df.where((df['state'].isin(list(broadcastStates.value.keys()))))
filteDf.show(truncate=False)

### Accumulator Variables

In [None]:
# Create an accumulator that starts at 0 and report its starting value
sc_acc = spark.sparkContext.accumulator(0)
initial_value = sc_acc.value
print(f"Accumulator initial value: {initial_value}")


### Read Data

In [None]:
## By default, Spark's CSV reader treats empty strings ("") and blank values (e.g., ,, where nothing is between the commas) as null

# This code will read "N/A" as null: nullValue names the literal string that
# the reader should turn into null. (The previous nullValue=0 nulled out real
# zeros instead, and "treatEmptyValuesAsNulls" is not a built-in CSV option.)
dfFromCSV = (
    spark.read
    .option("header", True)
    .option("delimiter", ",")
    .option("inferSchema", True)
    .option("nullValue", "N/A")
    .csv("file:///apps/sandbox/defaultfs/employee.csv")
)

#dfFromCSV.printSchema()
dfFromCSV.show(truncate=False)


In [5]:
# Load cdrs.json with the JSON reader (no schema given, so it is inferred),
# then print the schema and the first rows untruncated.
dfFromJSON = spark.read.json("file:///apps/sandbox/defaultfs/cdrs.json")
dfFromJSON.printSchema()
dfFromJSON.show(truncate=False)

root
 |-- CallCharge: long (nullable = true)
 |-- DateTime: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- OriginatingNum: long (nullable = true)
 |-- TerminatingNum: long (nullable = true)

+----------+-------------------+--------------+--------------+--------------+--------------+
|CallCharge|DateTime           |Dest          |Origin        |OriginatingNum|TerminatingNum|
+----------+-------------------+--------------+--------------+--------------+--------------+
|549       |02/11/2016 01:51:41|Birmingham    |London        |797308107     |797131221     |
|2645      |05/02/2016 01:26:54|London        |Manchester    |777121117     |777440392     |
|1233      |01/12/2016 21:12:54|Manchester    |Victoria      |797009202     |784243404     |
|2651      |07/11/2016 01:07:34|Victoria      |Twickenham    |777557705     |798420467     |
|3162      |02/11/2016 22:22:26|Scotland      |Leeds         |785434022     |779086250     |
|2246   

In [4]:
# Load the yellow-taxi parquet file, then print its schema and the first rows
# untruncated.
dfParquet = spark.read.parquet("file:///apps/sandbox/defaultfs/taxi-data/yellow_tripdata_2021-04.parquet")
dfParquet.printSchema()
dfParquet.show(truncate=False)

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+----

In [None]:
# Explicit schema for employee.csv; every column is declared nullable.
employee_schema = StructType([
    StructField("emp_id", IntegerType(), True),
    StructField("emp_name", StringType(), True),
    StructField("emp_role", StringType(), True),
    StructField("emp_manager", StringType(), True),
    StructField("emp_hiredate", DateType(), True),
    StructField("emp_salary", IntegerType(), True),
    StructField("emp_comm", IntegerType(), True),
    StructField("emp_dept", IntegerType(), True),
])

# Read the CSV (first line is a header) using the schema above instead of inference.
df_with_schema = (
    spark.read.format("csv")
    .option("header", True)
    .schema(employee_schema)
    .load("file:///apps/sandbox/defaultfs/employee.csv")
)

#df_with_schema.printSchema()

df_with_schema.show(truncate=False)

### Handling Null & Empty Values

In [None]:
#
## Replacing null values with some value
#
# No subset: the integer 0 is applied across the frame (per the fillna docs,
# columns whose type does not match the value are left alone).
df_with_schema.fillna(value=0).show()
df_with_schema.fillna(value=0,subset=["emp_comm"]).show() # only for emp_comm column
# Chained: nulls in emp_comm become 10, then nulls in emp_manager become "---".
df_with_schema.fillna(10,["emp_comm"]) \
    .fillna("---",["emp_manager"]).show()

In [None]:
#
## Replacing null values with some value using na
#
# Same two replacements as the fillna() cell, written through the
# DataFrame.na accessor (na.fill is documented as an alias of fillna).
df_with_schema.na.fill(value=0).show()
df_with_schema.na.fill(value=0,subset=["emp_comm"]).show()

In [None]:

employee_columns = ['emp_id', 'emp_name', 'emp_role', 'emp_manager', 'emp_hiredate', 'emp_salary', 'emp_comm', 'emp_dept']

# (column name, Spark type) pairs; each becomes a nullable field of the schema.
employee_field_types = [
    ("emp_id", IntegerType()),
    ("emp_name", StringType()),
    ("emp_role", StringType()),
    ("emp_manager", StringType()),
    ("emp_hiredate", DateType()),
    ("emp_salary", IntegerType()),
    ("emp_comm", IntegerType()),
    ("emp_dept", IntegerType()),
]
employee_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in employee_field_types]
)

# Read the employee CSV (first line is a header) with the explicit schema.
employee_df = (
    spark.read.format("csv")
    .option("header", True)
    .schema(employee_schema)
    .load("file:///apps/sandbox/defaultfs/employee.csv")
)

# Optional checks, left disabled:
# employee_df.printSchema()
# print(employee_df.rdd.getNumPartitions())

# Display the loaded rows without truncating values
employee_df.show(truncate=False)

In [None]:

dept_columns = ['dept_id', 'dept_name', 'dept_location']

# Explicit schema for departments.csv; every column is declared nullable.
dept_schema = StructType([
    StructField("dept_id", IntegerType(), True),
    StructField("dept_name", StringType(), True),
    StructField("dept_location", StringType(), True),
])

# Read the departments CSV (first line is a header) with the schema above.
dept_df = spark.read.csv(
    "file:///apps/sandbox/defaultfs/departments.csv",
    header=True,
    schema=dept_schema,
)

#dept_df.printSchema()

dept_df.show(truncate=False)

### Data Repartition

In [None]:
# Redistribute the employee rows into 2 partitions (rebinds employee_df)
employee_df = employee_df.repartition(2)

# Confirm the partition count of the underlying RDD
print(employee_df.rdd.getNumPartitions())

#
# employee_df.show(truncate=False)

## Process Dataframe

### Map Dataframe Partitions

In [None]:

def applyMap(row):
    """Apply a role-based raise and bonus to one employee record.

    Args:
        row: Record exposing the attributes emp_id, emp_name, emp_role,
            emp_manager, emp_hiredate, emp_salary, emp_comm and emp_dept.

    Returns:
        Tuple ``(emp_id, emp_name, emp_role, emp_manager, emp_hiredate,
        salary, bonus, emp_dept)``. For ANALYST, CLERK, MANAGER and SALESMAN
        the salary is raised and the bonus is the commission (null counts as
        0) plus a role increment. Any other role keeps its salary and gets a
        bonus of 0. A null salary stays null instead of raising a TypeError.
    """
    # role -> (salary raise, bonus increment)
    adjustments = {
        'ANALYST': (1000, 100),
        'CLERK': (1500, 150),
        'MANAGER': (2000, 200),
        'SALESMAN': (2500, 250),
    }

    commission = 0 if row.emp_comm is None else row.emp_comm
    salary = row.emp_salary

    adjustment = adjustments.get(row.emp_role)
    if adjustment is None:
        # Unlisted role: salary untouched, bonus reset to 0.
        bonus = 0
    else:
        salary_raise, bonus_increment = adjustment
        if salary is not None:
            salary = salary + salary_raise
        bonus = commission + bonus_increment

    return (row.emp_id, row.emp_name, row.emp_role, row.emp_manager, row.emp_hiredate, salary, bonus, row.emp_dept)


# Column names for the tuples returned by applyMap (same order as the tuple)
df1_columns = ["emp_id","emp_name", "emp_role", "emp_manager", "emp_hiredate", "emp_salary", "emp_comm", "emp_dept"]

## Alternative kept for reference: inline lambda that sets the commission to a constant 225
#df1 = employee_df.rdd.map(lambda r: (r.emp_id, r.emp_name, r.emp_role, r.emp_manager, r.emp_hiredate, r.emp_salary, 225, r.emp_dept)).toDF(df1_columns)

## Map every row of employee_df through applyMap and turn the result back into a DataFrame
df1 = employee_df.rdd.map(applyMap).toDF(df1_columns)
df1.show(truncate=False)


### flatMap Dataframe Partitions

### Process Dataframe Partitions
Unfortunately, the PySpark DataFrame API doesn't have a flatMap() transformation; however, it provides the explode() SQL function, which flattens an array or map column into one row per element. Below is a complete example.

In [None]:
# Sample rows: (name, list of known languages, dict of properties).
# Includes None/empty entries on purpose to show how explode() treats them.
arrayData = [
        ('James',['Java','Scala'],{'hair':'black','eye':'brown'}),
        ('Michael',['Spark','Java',None],{'hair':'brown','eye':None}),
        ('Robert',['CSharp',''],{'hair':'red','eye':''}),
        ('Washington',None,None),
        ('Jefferson',['1','2'],{})]
df = spark.createDataFrame(data=arrayData, schema = ['name','knownLanguages','properties'])

from pyspark.sql.functions import explode
# One output row per element of knownLanguages, paired with the person's name.
# NOTE(review): per the explode() docs, rows whose array is null produce no
# output rows (explode_outer keeps them) - confirm against the shown result.
df2 = df.select(df.name, explode(df.knownLanguages))
df2.printSchema()
df2.show()

In [None]:
# Sample DataFrame: one sentence per row
data = [("Spark is great",), ("Map and FlatMap are useful",)]
df = spark.createDataFrame(data, ["sentence"])

# flatMap-like behavior: split each sentence on single spaces, then explode
# the resulting array so every word lands on its own row.
tokens = split(col("sentence"), " ")
df_words = df.withColumn("words", explode(tokens))

df_words.show()

In [None]:
# Using mapPartitions with yield
def formatWithYield(partition_data):
    """Transform the employee records of one partition lazily (generator).

    Args:
        partition_data: Iterator of records exposing emp_id, emp_name,
            emp_role, emp_manager, emp_hiredate, emp_salary, emp_comm and
            emp_dept.

    Yields:
        Tuple ``(emp_id, emp_name, role, emp_manager, emp_hiredate, salary,
        bonus, emp_dept)`` where role is 'ANALYST' for clerks and 'MANAGER'
        otherwise, a salary of at most 1000 is multiplied by 10 (a null salary
        stays null), and a null commission becomes a bonus of 50.
    """
    for record in partition_data:
        role = 'ANALYST' if record.emp_role == 'CLERK' else 'MANAGER'

        salary = record.emp_salary
        # Guard against null salaries: None <= 1000 raises TypeError.
        if salary is not None and salary <= 1000:
            salary = salary * 10

        bonus = 50 if record.emp_comm is None else record.emp_comm

        # Emit the derived role, matching formatWithMapPartitionsWithIndex.
        yield (record.emp_id, record.emp_name, role, record.emp_manager, record.emp_hiredate, salary, bonus, record.emp_dept)




# Column names for the tuples produced by formatWithYield (same order as the tuple)
df2_columns = ["emp_id","emp_name", "emp_role", "emp_manager", "emp_hiredate", "emp_salary", "emp_comm", "emp_dept"]
# Run formatWithYield once per partition of employee_df and rebuild a DataFrame
df2 = employee_df.rdd.mapPartitions(formatWithYield).toDF(df2_columns)
df2.show(truncate=False)



In [None]:
# Using mapPartitions with iterator
def formatWithIter(partition_data):
    """Transform the employee records of one partition, returning an iterator.

    Unlike the generator variant, this builds the whole partition's output in
    a list first and then returns ``iter()`` over it.

    Args:
        partition_data: Iterator of records exposing emp_id, emp_name,
            emp_role, emp_manager, emp_hiredate, emp_salary, emp_comm and
            emp_dept.

    Returns:
        Iterator of lists ``[emp_id, emp_name, role, emp_manager,
        emp_hiredate, salary, bonus, emp_dept]`` where role is 'ANALYST' for
        clerks and 'MANAGER' otherwise, a salary of at most 1000 is multiplied
        by 10 (a null salary stays null), and a null commission becomes a
        bonus of 50.
    """
    p_data = []
    for record in partition_data:
        role = 'ANALYST' if record.emp_role == 'CLERK' else 'MANAGER'

        salary = record.emp_salary
        # Guard against null salaries: None <= 1000 raises TypeError.
        if salary is not None and salary <= 1000:
            salary = salary * 10

        bonus = 50 if record.emp_comm is None else record.emp_comm

        # Emit the derived role, matching formatWithMapPartitionsWithIndex.
        p_data.append([record.emp_id, record.emp_name, role, record.emp_manager, record.emp_hiredate, salary, bonus, record.emp_dept])
    return iter(p_data)

# Column names for the rows produced by formatWithIter (same order as each row)
df3_columns = ["emp_id","emp_name", "emp_role", "emp_manager", "emp_hiredate", "emp_salary", "emp_comm", "emp_dept"]
# Run formatWithIter once per partition of employee_df and rebuild a DataFrame
df3 = employee_df.rdd.mapPartitions(formatWithIter).toDF(df3_columns)
df3.show(truncate=False)

### Process Dataframe with mapPartitionsWithIndex

In [None]:
#
#
#
def formatWithMapPartitionsWithIndex(partitionIndex, iterator):
    """Transform one partition's employee records, tagging each with its partition.

    Args:
        partitionIndex: Zero-based index of the partition being processed.
        iterator: Iterator of records exposing emp_id, emp_name, emp_role,
            emp_manager, emp_hiredate, emp_salary, emp_comm and emp_dept.

    Yields:
        Tuple ``(partitionIndex + 1, emp_id, emp_name, role, emp_manager,
        emp_hiredate, salary, bonus, emp_dept)`` where role is 'ANALYST' for
        clerks and 'MANAGER' otherwise, a salary of at most 1000 is multiplied
        by 10 (a null salary stays null), and a null commission becomes a
        bonus of 50.
    """
    for record in iterator:
        role = 'ANALYST' if record.emp_role == 'CLERK' else 'MANAGER'

        salary = record.emp_salary
        # Guard against null salaries: None <= 1000 raises TypeError.
        if salary is not None and salary <= 1000:
            salary = salary * 10

        bonus = 50 if record.emp_comm is None else record.emp_comm

        # Partition number is reported 1-based.
        yield (partitionIndex+1, record.emp_id, record.emp_name, role, record.emp_manager, record.emp_hiredate, salary, bonus, record.emp_dept)
    
    #yield (partitionIndex, len(list(iterator)))
    

# Column names for the tuples produced by formatWithMapPartitionsWithIndex;
# the leading "partition" column holds the 1-based partition number.
df4_columns = ["partition","emp_id","emp_name", "emp_role", "emp_manager", "emp_hiredate", "emp_salary", "emp_comm", "emp_dept"]
# Run the function once per partition (it receives the partition index) and rebuild a DataFrame
df4 = employee_df.rdd.mapPartitionsWithIndex(formatWithMapPartitionsWithIndex).toDF(df4_columns)
df4.show(truncate=False)

In [None]:
# Count, per department, the employees earning more than 2000.
# Note: this rebinds employee_df to the aggregated (emp_dept, count) result.
employee_df = (
    employee_df
    .where(col("emp_salary") > 2000)
    .select("emp_id", "emp_name", "emp_dept", "emp_salary")
    .groupBy("emp_dept")
    .count()
)

In [None]:
# Execute the query and bring the rows of employee_df back to the driver.
employee_df.collect()

In [None]:
# Build a DataFrame from spark.range(1, 6) and display it.
# Note: this rebinds the notebook-level name df.
df = spark.range(1,6)
df.show()

In [None]:
# Select a column by passing its name as a string.
df.select("id").show()

In [None]:
# Three ways to reference a column - attribute access, col(), and an
# arithmetic expression - each renamed with alias().
df.select((df.id).alias("##"), col("id").alias("#ID") ,(df.id + 10).alias('###COL')).show()

In [None]:
# selectExpr takes SQL expression strings instead of Column objects.
df.selectExpr("id * 5" , "id").show()

In [None]:
# Small (id, name) DataFrame built from in-memory tuples; the schema argument
# supplies only column names, so the types are inferred from the data.
names = spark.createDataFrame(data=[(1000, 'Nick'), (1001, 'John'), (1002, 'Frank')], schema=['id', 'name'])
names.printSchema()

In [None]:
# Select several columns by name.
names.select('id', 'name').show()

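#### Q-001. If `spark.sql.shuffle.partitions` is set to a number other than its default of 200, the number of tasks in the shuffle stage will reflect that new value. The points below explain why the task count of a shuffle stage can still differ from the configured value.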

`Adaptive Query Execution (AQE)`: In Spark 3.0 and later, Adaptive Query Execution (AQE) is an optimization that can dynamically adjust the number of shuffle partitions during runtime. AQE can coalesce small shuffle partitions into larger ones, effectively reducing the number of tasks if the data distribution allows for it. If AQE is enabled and determines that fewer partitions are optimal, you will see fewer than 200 tasks.

`Data Volume and Distribution`: If the amount of data being processed is very small, or if the data is highly skewed (meaning a few keys have a disproportionately large amount of data), Spark might not utilize all 200 partitions efficiently, or AQE might optimize the partition count.

`Coalesce or Repartition before GroupBy`: If a coalesce or repartition operation was performed on the DataFrame immediately before the groupBy, it could explicitly set the number of partitions, overriding the default shuffle partition setting for that specific operation.

`Specific Optimization Strategies`: Certain optimization strategies or custom partitioning schemes might be in place that influence the number of partitions used during the groupBy operation, leading to a task count different from 200.

In [None]:
# Select a column through a Column object created with col().
names.select(col("id")).show()

In [None]:
from pyspark.sql import SparkSession

# Spark session & context
# NOTE(review): getOrCreate() is documented to return an already-active
# session when one exists, so master("local") presumably only takes effect on
# a fresh kernel - confirm. This also rebinds the notebook-level name spark.
spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext

# Sum of the integers 0 through 100 (range(100 + 1) includes 100)
rdd = sc.parallelize(range(100 + 1))
rdd.sum()
# 5050

In [None]:
##
## Load data from csv
## Equivalent reader form:
## spark.read.format("csv").option("header", "true").load("file:///apps/sandbox/defaultfs/employee.csv")
##

# Header row present, "NA" parsed as null, column types inferred from the data.
employee_df = spark.read.csv(
    "file:///apps/sandbox/defaultfs/employee.csv",
    header=True,
    inferSchema=True,
    nullValue="NA",
)

#employee_df.printSchema()
# Partition count as loaded
print(employee_df.rdd.getNumPartitions())

# Partition count after repartitioning to 2
employee_df = employee_df.repartition(2)
print(employee_df.rdd.getNumPartitions())

# Count, per department, the employees earning more than 2000
# (rebinds employee_df to the aggregated result).
employee_df = (
    employee_df
    .where(col("emp_salary") > 2000)
    .select("emp_id", "emp_name", "emp_dept", "emp_salary")
    .groupBy("emp_dept")
    .count()
)


employee_df.collect()

#### Get highest salary of each group  

In [None]:
#Get highest salary of each group
# Runs on df_with_schema (the employee data read with an explicit schema):
# at this point `df` is spark.range(1, 6) and has no department/salary
# columns, so the original cell could not execute.
# w3 ranks the rows of each emp_dept by emp_salary, highest first; keeping
# row number 1 leaves the top-paid employee per department.
w3 = Window.partitionBy("emp_dept").orderBy(col("emp_salary").desc())
df_with_schema.withColumn("row",row_number().over(w3)) \
  .filter(col("row") == 1).drop("row") \
  .show()

#### Get max, min, avg, sum of each group

In [None]:
# Runs on df_with_schema (the employee data read with an explicit schema):
# at this point `df` is spark.range(1, 6) and has no department/salary columns.
# Both windows are defined here so the cell does not depend on another cell:
#   w3 - ordered by salary, used only to number rows so one row per
#        department can be kept;
#   w4 - unordered, so each aggregate spans the whole department.
# avg/sum/min/max are the pyspark.sql.functions versions (star import), not
# the Python builtins.
w3 = Window.partitionBy("emp_dept").orderBy(col("emp_salary").desc())
w4 = Window.partitionBy("emp_dept")
df_with_schema.withColumn("row",row_number().over(w3)) \
  .withColumn("avg", avg(col("emp_salary")).over(w4)) \
  .withColumn("sum", sum(col("emp_salary")).over(w4)) \
  .withColumn("min", min(col("emp_salary")).over(w4)) \
  .withColumn("max", max(col("emp_salary")).over(w4)) \
  .where(col("row")==1).select("emp_dept","avg","sum","min","max") \
  .show()