In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, IntegerType, DoubleType, StructType

In [2]:
# Environment variables consulted by Spark / PySpark.
# SPARK_HOME is only filled in when it is not already defined, so a Spark
# installation configured outside the notebook is not overridden by this
# machine-specific fallback path.
os.environ.setdefault('SPARK_HOME', r'C:\spark\spark-3.5.4-bin-hadoop3')
# Interpreter settings: driver launcher / its options, and the worker Python.
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [3]:
# Create the SparkSession used by every cell below, or reuse one that is
# already active, under the application name 'DataFrameSQL'.
spark = (
    SparkSession.builder
    .appName('DataFrameSQL')
    .getOrCreate()
)

In [5]:
# Load the persons CSV into a DataFrame. The first row provides the column
# names, and Spark infers each column's type from the data.
df_path = r'./data/persons.csv'
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(df_path)
)

In [9]:
# Preview the first five rows to sanity-check the load.
df.show(n=5)

+--------------+---+------+------+
|          name|age|gender|salary|
+--------------+---+------+------+
|      John Doe| 30|  Male| 50000|
|    Jane Smith| 25|Female| 45000|
| David Johnson| 35|  Male| 60000|
|   Emily Davis| 28|Female| 52000|
|Michael Wilson| 40|  Male| 75000|
+--------------+---+------+------+
only showing top 5 rows



In [10]:
# Expose the DataFrame to Spark SQL under the temporary view name 'my_table'.
df.createOrReplaceTempView(name='my_table')

In [11]:
# Select everyone strictly older than 25 from the 'my_table' view.
# The column is spelled `age`, exactly as it appears in the DataFrame schema,
# so the query still resolves if spark.sql.caseSensitive is enabled.
result = spark.sql('SELECT * FROM my_table WHERE age > 25')
result.show()

+------------------+---+------+------+
|              name|age|gender|salary|
+------------------+---+------+------+
|          John Doe| 30|  Male| 50000|
|     David Johnson| 35|  Male| 60000|
|       Emily Davis| 28|Female| 52000|
|    Michael Wilson| 40|  Male| 75000|
|       Sarah Brown| 32|Female| 58000|
|        Robert Lee| 29|  Male| 51000|
|       Lisa Garcia| 27|Female| 49000|
|    James Martinez| 38|  Male| 70000|
|Jennifer Rodriguez| 26|Female| 47000|
|  William Anderson| 33|  Male| 62000|
|   Karen Hernandez| 31|Female| 55000|
|Christopher Taylor| 37|  Male| 69000|
|     Matthew Davis| 36|  Male| 67000|
|    Patricia White| 29|Female| 50000|
|     Daniel Miller| 34|  Male| 64000|
| Elizabeth Jackson| 30|Female| 52000|
|     Joseph Harris| 28|  Male| 53000|
|      Linda Martin| 39|Female| 71000|
+------------------+---+------+------+



In [12]:
# Mean salary per gender. GROUP BY alone does not guarantee a row order, so an
# explicit ORDER BY keeps the displayed table stable across runs.
avg_salary_by_gender = spark.sql(
    'SELECT gender, AVG(salary) FROM my_table GROUP BY gender ORDER BY gender'
)
avg_salary_by_gender.show()

+------+-----------+
|gender|avg(salary)|
+------+-----------+
|Female|    52300.0|
|  Male|    62100.0|
+------+-----------+



In [13]:
# Register the same DataFrame a second time, now as the temporary view 'people'.
df.createOrReplaceTempView(name='people')

In [14]:
# Query the 'people' view for rows whose age exceeds 25 and display them.
older_than_25_query = 'SELECT * FROM people WHERE age > 25'
result = spark.sql(older_than_25_query)
result.show()

+------------------+---+------+------+
|              name|age|gender|salary|
+------------------+---+------+------+
|          John Doe| 30|  Male| 50000|
|     David Johnson| 35|  Male| 60000|
|       Emily Davis| 28|Female| 52000|
|    Michael Wilson| 40|  Male| 75000|
|       Sarah Brown| 32|Female| 58000|
|        Robert Lee| 29|  Male| 51000|
|       Lisa Garcia| 27|Female| 49000|
|    James Martinez| 38|  Male| 70000|
|Jennifer Rodriguez| 26|Female| 47000|
|  William Anderson| 33|  Male| 62000|
|   Karen Hernandez| 31|Female| 55000|
|Christopher Taylor| 37|  Male| 69000|
|     Matthew Davis| 36|  Male| 67000|
|    Patricia White| 29|Female| 50000|
|     Daniel Miller| 34|  Male| 64000|
| Elizabeth Jackson| 30|Female| 52000|
|     Joseph Harris| 28|  Male| 53000|
|      Linda Martin| 39|Female| 71000|
+------------------+---+------+------+



In [15]:
# Ask the session catalog whether something named 'people' is registered;
# the bare name on the last line makes the boolean the cell's output.
view_exists = spark.catalog.tableExists(tableName='people')
view_exists

True

In [16]:
# Remove the 'people' temporary view from the catalog; the call's return value
# is shown as the cell output.
spark.catalog.dropTempView(viewName='people')

True

In [17]:
# Repeat the catalog lookup after the drop to confirm 'people' is gone.
view_exists = spark.catalog.tableExists(tableName='people')
view_exists

False

In [18]:
# (id, name) records for the employees DataFrame.
employee_data = [
    (1, "John"),
    (2, "Alice"),
    (3, "Bob"),
    (4, "Emily"),
    (5, "David"),
    (6, "Sarah"),
    (7, "Michael"),
    (8, "Lisa"),
    (9, "William"),
]
employees = spark.createDataFrame(employee_data, schema=["id", "name"])

# (department, id, salary) records; `id` matches the ids in employee_data.
salary_data = [
    ("HR", 1, 60000),
    ("HR", 2, 55000),
    ("HR", 3, 58000),
    ("IT", 4, 70000),
    ("IT", 5, 72000),
    ("IT", 6, 68000),
    ("Sales", 7, 75000),
    ("Sales", 8, 78000),
    ("Sales", 9, 77000),
]
salaries = spark.createDataFrame(salary_data, schema=['department', 'id', 'salary'])

In [19]:
# Peek at the first five employee rows.
employees.show(n=5)

+---+-----+
| id| name|
+---+-----+
|  1| John|
|  2|Alice|
|  3|  Bob|
|  4|Emily|
|  5|David|
+---+-----+
only showing top 5 rows



In [20]:
# Peek at the first five salary rows.
salaries.show(n=5)

+----------+---+------+
|department| id|salary|
+----------+---+------+
|        HR|  1| 60000|
|        HR|  2| 55000|
|        HR|  3| 58000|
|        IT|  4| 70000|
|        IT|  5| 72000|
+----------+---+------+
only showing top 5 rows



In [21]:
# Make both DataFrames addressable from SQL; each view is named after its
# Python variable.
employees.createOrReplaceTempView(name='employees')
salaries.createOrReplaceTempView(name='salaries')

In [24]:
# Names of employees whose salary is above the mean of all salaries.
# The innermost subquery computes that mean, the middle one collects the
# matching ids, and the outer query maps those ids back to names.
above_average_query = """
    SELECT name
    FROM employees
    WHERE id IN (
        SELECT id
        FROM salaries
        WHERE salary > (SELECT AVG(salary) FROM salaries)
    )
"""
result = spark.sql(above_average_query)

In [25]:
# Display whatever DataFrame `result` currently refers to (assigned by the
# preceding query cell).
result.show()

+-------+
|   name|
+-------+
|  Emily|
|  David|
|Michael|
|   Lisa|
|William|
+-------+



In [26]:
# Attach each employee's name to their salary row. The LEFT JOIN keeps every
# row of `salaries`, even one whose id has no match in `employees`.
salary_with_name_query = """
    SELECT salaries.*, employees.name
    FROM salaries
    LEFT JOIN employees ON salaries.id = employees.id
"""
employee_salary = spark.sql(salary_with_name_query)

employee_salary.show()

+----------+---+------+-------+
|department| id|salary|   name|
+----------+---+------+-------+
|        HR|  1| 60000|   John|
|        HR|  2| 55000|  Alice|
|        HR|  3| 58000|    Bob|
|        IT|  4| 70000|  Emily|
|        IT|  5| 72000|  David|
|        IT|  6| 68000|  Sarah|
|     Sales|  7| 75000|Michael|
|     Sales|  8| 78000|   Lisa|
|     Sales|  9| 77000|William|
+----------+---+------+-------+



In [27]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

In [28]:
# One window per department, rows ordered from highest to lowest salary.
window_spec = (
    Window
    .partitionBy('department')
    .orderBy(F.col('salary').desc())
)

In [29]:
# Number the rows 1, 2, 3, ... inside each department window and show the
# result with that position stored in a 'rank' column.
row_position = F.row_number().over(window_spec)
employee_salary.withColumn('rank', row_position).show()

+----------+---+------+-------+----+
|department| id|salary|   name|rank|
+----------+---+------+-------+----+
|        HR|  1| 60000|   John|   1|
|        HR|  3| 58000|    Bob|   2|
|        HR|  2| 55000|  Alice|   3|
|        IT|  5| 72000|  David|   1|
|        IT|  4| 70000|  Emily|   2|
|        IT|  6| 68000|  Sarah|   3|
|     Sales|  8| 78000|   Lisa|   1|
|     Sales|  9| 77000|William|   2|
|     Sales|  7| 75000|Michael|   3|
+----------+---+------+-------+----+



In [30]:
# Shut down the SparkSession now that the notebook is finished; cells that use
# `spark` will not work after this until a new session is created.
spark.stop()