In [None]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *


# Spark configuration: turn on event logging and write the event files to a
# local directory.
sparkConf = SparkConf().setAll([
    ("spark.eventLog.enabled", "true"),
    ("spark.eventLog.dir", "file:///apps/var/logs/spark-events"),
])

# Build the session on a local master with 4 threads, or reuse a session that
# is already running in this kernel.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName('Sample Spark Application')
    .config(conf=sparkConf)
    .getOrCreate()
)

# Limit Spark's log output to warnings and errors.
spark.sparkContext.setLogLevel('WARN')

# Show the session summary as the cell output.
spark

In [None]:
# Set the number of partitions Spark SQL uses when shuffling data, then read
# the value back to confirm it was applied.
spark.conf.set("spark.sql.shuffle.partitions", "8")
print(spark.conf.get("spark.sql.shuffle.partitions"))

In [None]:
##
## Load the employee data from CSV.
## Alternative reader syntax (header option only):
## spark.read.format("csv").option("header", "true").load("file:///apps/sandbox/defaultfs/employee.csv")
##
employee_df = (
    spark.read
    .options(header=True, nullValue="NA", inferSchema=True)
    .csv("file:///apps/sandbox/defaultfs/employee.csv")
)
# Uncomment to inspect the inferred column types:
#employee_df.printSchema()

# Number of partitions of the freshly loaded DataFrame.
print(employee_df.rdd.getNumPartitions())

## Jobs triggered by this cell:
## 1 Job = 1 Stage = 1 Task      For Read Operation
## 1 Job = 1 Stage = 1 Task      For inferSchema Operation

In [None]:
# Redistribute the rows across exactly 2 partitions, then print the new
# partition count.
employee_df = employee_df.repartition(2)
print(employee_df.rdd.getNumPartitions())

In [None]:
# Keep employees earning more than 2000, then count how many remain in each
# department. Note: this rebinds employee_df to the per-department counts.
employee_df = (
    employee_df
    .where(col("emp_salary") > 2000)
    .select("emp_id", "emp_name", "emp_dept", "emp_salary")
    .groupBy("emp_dept")
    .count()
)

In [None]:
# Action: executes the plan built so far and returns the result rows to the
# driver as a list.
employee_df.collect()

In [1]:
##
## Single-cell version of the pipeline: load the CSV, repartition, filter,
## and count employees per department.
## Needs the `spark` session and `col` from the setup cell at the top; running
## it on a fresh kernel fails with NameError: name 'spark' is not defined.
##
## Alternative reader syntax (header option only):
## spark.read.format("csv").option("header", "true").load("file:///apps/sandbox/defaultfs/employee.csv")
##
employee_df = (
    spark.read
    .options(header=True, nullValue="NA", inferSchema=True)
    .csv("file:///apps/sandbox/defaultfs/employee.csv")
)

# Uncomment to inspect the inferred column types:
#employee_df.printSchema()

# Partition count right after the load.
print(employee_df.rdd.getNumPartitions())

# Redistribute the rows across 2 partitions and print the new count.
employee_df = employee_df.repartition(2)
print(employee_df.rdd.getNumPartitions())

# Employees earning more than 2000, counted per department.
employee_df = (
    employee_df
    .where(col("emp_salary") > 2000)
    .select("emp_id", "emp_name", "emp_dept", "emp_salary")
    .groupBy("emp_dept")
    .count()
)

# Action: run the plan and return the rows to the driver.
employee_df.collect()

NameError: name 'spark' is not defined

#### Q-001. If `spark.sql.shuffle.partitions` is set to a number other than 200 (its default), the number of tasks in the shuffle stage will reflect that new value.

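If `spark.sql.shuffle.partitions` is set to a number other than its default of 200 (this notebook sets it to 8), the number of tasks in the shuffle stage will reflect that new value. Other factors that can change the task count: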

`Adaptive Query Execution (AQE)`: In Spark 3.0 and later, Adaptive Query Execution (AQE) is an optimization that can dynamically adjust the number of shuffle partitions during runtime. AQE can coalesce small shuffle partitions into larger ones, effectively reducing the number of tasks if the data distribution allows for it. If AQE is enabled and determines that fewer partitions are optimal, you will see fewer than 200 tasks.

`Data Volume and Distribution`: If the amount of data being processed is very small, or if the data is highly skewed (meaning a few keys have a disproportionately large amount of data), many of the 200 shuffle partitions end up empty or nearly empty, so Spark does not use them efficiently. With AQE enabled, these small partitions may be coalesced, which lowers the task count.

`Coalesce or Repartition before GroupBy`: If a coalesce or repartition operation was performed on the DataFrame immediately before the groupBy, it could explicitly set the number of partitions, overriding the default shuffle partition setting for that specific operation.

`Specific Optimization Strategies`: Certain optimization strategies or custom partitioning schemes might be in place that influence the number of partitions used during the groupBy operation, leading to a task count different from 200.


In [None]:
# Print the first rows of employee_df as a text table. If the aggregation
# cells above were run, employee_df holds the per-department counts here.
employee_df.show()

In [None]:
# Single-column DataFrame (column "id") holding the values 1 through 5; the
# end bound 6 is exclusive.
df = spark.range(1,6)
df.show()

In [None]:
# Select a column by passing its name as a plain string.
df.select("id").show()

In [None]:
from pyspark.sql.functions import *

# Three ways to build a column for select(), each renamed with alias():
# attribute access, col(), and an arithmetic expression on the column.
df.select(
    df.id.alias("##"),
    col("id").alias("#ID"),
    (df.id + 10).alias('###COL'),
).show()

In [None]:
# expr() turns a SQL expression string into a Column, which can then be
# aliased and mixed with plain column names.
df.select(
    expr("id * 5").alias("##"),
    "id",
).show()

In [None]:
# selectExpr() accepts SQL expression strings directly, so no expr() wrapper
# is needed.
df.selectExpr("id * 5" , "id").show()

In [None]:
# Build a small DataFrame from in-memory tuples; the schema list supplies the
# column names and Spark infers the column types.
names = spark.createDataFrame(
    data=[
        (1000, 'Nick'),
        (1001, 'John'),
        (1002, 'Frank'),
    ],
    schema=['id', 'name'],
)
names.printSchema()

In [None]:
# Select several columns by name.
names.select('id', 'name').show()

## Hello
>This is good

In [None]:
# Same kind of selection, using a Column object built with col().
names.select(col("id")).show()

In [None]:
from pyspark.sql import SparkSession

# Obtain a Spark session (an already active one is reused) and its context
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)
sc = spark.sparkContext

# Add up the whole numbers 0 through 100 with an RDD
rdd = sc.parallelize(range(0, 101))
rdd.sum()
# Expected result: 5050

In [1]:
pip install ipython-sql pymysql

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Load the ipython-sql extension, which provides the %sql / %%sql magics.
%load_ext sql

In [2]:
%sql mysql+pymysql://mysqladmin:mysqladmin@mysqlserver.sandbox.net:3306/NEETASTUDIO

In [8]:
# Make NEETASTUDIO the default database for the current connection.
%sql USE NEETASTUDIO;

 * mysql+pymysql://mysqladmin:***@mysqlserver.sandbox.net:3306/NEETASTUDIO
0 rows affected.


[]

In [11]:
# NOTE(review): presumably a workaround for a result-table style error between
# ipython-sql and its table-rendering dependency — confirm it is still needed.
%config SqlMagic.style = '_DEPRECATED_DEFAULT'

In [12]:
# Preview up to 5 rows of the CUSTOMER_ENQUIRIES table.
# NOTE(review): the saved output contains customer names, e-mail addresses and
# phone numbers — clear the cell outputs before sharing this notebook.
%sql select * from CUSTOMER_ENQUIRIES LIMIT 5

 * mysql+pymysql://mysqladmin:***@mysqlserver.sandbox.net:3306/NEETASTUDIO
3 rows affected.


ID,NAME,EMAIL,PHONE,ENQUIRY_FOR,MESSAGE,ADD_TS
1,Deepika D.,deepikadhaker@gmail.com,+91 9820937445,maternity,I am looking for maternity photoshoot,2025-09-30 04:57:33
2,Swati R.,swatir@gmail.com,+91 9820937448,kids,I am looking for kid photography,2025-09-30 04:57:33
3,Tejas D.,tejasdhaker@gmail.com,+91 9820937445,maternity,I am looking for maternity photoshoot,2025-09-30 04:57:33


In [13]:
%%sql
 
select * from CUSTOMER_ENQUIRIES LIMIT 5

 * mysql+pymysql://mysqladmin:***@mysqlserver.sandbox.net:3306/NEETASTUDIO
3 rows affected.


ID,NAME,EMAIL,PHONE,ENQUIRY_FOR,MESSAGE,ADD_TS
1,Deepika D.,deepikadhaker@gmail.com,+91 9820937445,maternity,I am looking for maternity photoshoot,2025-09-30 04:57:33
2,Swati R.,swatir@gmail.com,+91 9820937448,kids,I am looking for kid photography,2025-09-30 04:57:33
3,Tejas D.,tejasdhaker@gmail.com,+91 9820937445,maternity,I am looking for maternity photoshoot,2025-09-30 04:57:33
