In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

# Reuse the already-running SparkSession if there is one; otherwise start one
# under the application name used for these exercises.
spark = (
    SparkSession.builder
    .appName("Spark sql exercises")
    .getOrCreate()
)


from pyspark.sql import Row

# Sample employee data.
# `Employee` is a Row factory carrying the column names, so each record below
# only has to list its values positionally, in column order.
Employee = Row("EmpID", "Name", "Department", "Project", "Salary", "HoursPerWeek")

employee_records = [
    (101, "Ravi", "Engineering", "AI Engine", 95000, 42),
    (102, "Sneha", "Engineering", "Data Platform", 87000, 45),
    (103, "Kabir", "Marketing", "Product Launch", 65000, 40),
    (104, "Anita", "Sales", "Client Outreach", 70000, 38),
    (105, "Divya", "Engineering", "AI Engine", 99000, 48),
    (106, "Amit", "Marketing", "Social Media", 62000, 35),
    (107, "Priya", "HR", "Policy Revamp", 58000, 37),
    (108, "Manav", "Sales", "Lead Gen", 73000, 41),
    (109, "Neha", "Engineering", "Security Suite", 91000, 46),
    (110, "Farah", "HR", "Onboarding", 60000, 36),
]
data = [Employee(*record) for record in employee_records]

# Build the DataFrame (schema inferred from the Rows) and print it without
# truncating long cell values.
df = spark.createDataFrame(data)
df.show(truncate=False)

+-----+-----+-----------+---------------+------+------------+
|EmpID|Name |Department |Project        |Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|101  |Ravi |Engineering|AI Engine      |95000 |42          |
|102  |Sneha|Engineering|Data Platform  |87000 |45          |
|103  |Kabir|Marketing  |Product Launch |65000 |40          |
|104  |Anita|Sales      |Client Outreach|70000 |38          |
|105  |Divya|Engineering|AI Engine      |99000 |48          |
|106  |Amit |Marketing  |Social Media   |62000 |35          |
|107  |Priya|HR         |Policy Revamp  |58000 |37          |
|108  |Manav|Sales      |Lead Gen       |73000 |41          |
|109  |Neha |Engineering|Security Suite |91000 |46          |
|110  |Farah|HR         |Onboarding     |60000 |36          |
+-----+-----+-----------+---------------+------+------------+



In [74]:
# Register df as a temp view so the Part A cell can query it by name
# (`employee_local`) through spark.sql; re-running replaces any existing view.
df.createOrReplaceTempView("employee_local")


In [75]:
# Register df as a global temp view; later cells query it through the
# `global_temp.` prefix (global_temp.employee_global).
df.createOrReplaceGlobalTempView("employee_global")

In [52]:
# Part A: exercises on the local temp view (employee_local).
# Every query is run, its result table printed, and a divider line follows.
part_a_divider = "------------------------------------------------------"

part_a_queries = (
    # Employees assigned to the 'AI Engine' project
    "select * from employee_local where project='AI Engine'",
    # Marketing employees with a salary above 60000
    "select*from employee_local where Department='Marketing'and Salary>60000",
    # Average salary per department
    "select Department,avg(Salary)as average_salary from employee_local group by Department",
    # The three highest salaries
    "select EmpID,name,salary from employee_local order by salary desc limit 3",
    # Employees working more than 40 hours per week
    "select empid,name,hoursperweek from employee_local where hoursperweek>40",
    # Number of employees per project
    "select project,count(empid)from employee_local group by project",
)

for query in part_a_queries:
    spark.sql(query).show()
    print(part_a_divider)







+-----+-----+-----------+---------+------+------------+
|EmpID| Name| Department|  Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------+------+------------+
|  101| Ravi|Engineering|AI Engine| 95000|          42|
|  105|Divya|Engineering|AI Engine| 99000|          48|
+-----+-----+-----------+---------+------+------------+

------------------------------------------------------
+-----+-----+----------+--------------+------+------------+
|EmpID| Name|Department|       Project|Salary|HoursPerWeek|
+-----+-----+----------+--------------+------+------------+
|  103|Kabir| Marketing|Product Launch| 65000|          40|
|  106| Amit| Marketing|  Social Media| 62000|          35|
+-----+-----+----------+--------------+------+------------+

------------------------------------------------------
+-----------+--------------+
| Department|average_salary|
+-----------+--------------+
|      Sales|       71500.0|
|Engineering|       93000.0|
|  Marketing|       63500.0|
|         HR|     

In [53]:
# Drop the local temp view, then demonstrate that it can no longer be queried.
spark.catalog.dropTempView("employee_local")

try:
    spark.sql("select*from employee_local")
except AnalysisException as exc:
    # This failure is the point of the cell: the view is gone. Catch and print
    # it so the demonstration stays visible without aborting a
    # Restart & Run All of the notebook.
    print(f"AnalysisException: {exc}")

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `employee_local` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 12;
'Project [*]
+- 'UnresolvedRelation [employee_local], [], false


In [62]:
# Part B: exercises on the global temp view (global_temp.employee_global).
def show_global_result(query):
    """Run `query` on the notebook's `spark` session, print the result table, then a divider line."""
    spark.sql(query).show()
    print("---------------------------------------")


# Employees working fewer than 38 hours per week
show_global_result("select*from global_temp.employee_global where hoursperweek<38")

# Total salary per department
show_global_result("select department,sum(salary)from global_temp.employee_global group by department")

# Every employee plus a `status` column: 'overworked' above 45 hours, else 'normal'
show_global_result("""select*,
              case
                 when hoursperweek>45 then 'overworked'
                 else'normal'
              end as status
              from global_temp.employee_global""")

# Number of employees per project
show_global_result("""
           select project,count(empid)from global_temp.employee_global group by project""")

# Employees whose salary is above the overall average salary
show_global_result(""" select empid,name,salary as above_average_salary from global_temp.employee_global where salary>(select avg(salary)from global_temp.employee_global)""")








+-----+-----+----------+-------------+------+------------+
|EmpID| Name|Department|      Project|Salary|HoursPerWeek|
+-----+-----+----------+-------------+------+------------+
|  106| Amit| Marketing| Social Media| 62000|          35|
|  107|Priya|        HR|Policy Revamp| 58000|          37|
|  110|Farah|        HR|   Onboarding| 60000|          36|
+-----+-----+----------+-------------+------+------------+

---------------------------------------
+-----------+-----------+
| department|sum(salary)|
+-----------+-----------+
|      Sales|     143000|
|Engineering|     372000|
|  Marketing|     127000|
|         HR|     118000|
+-----------+-----------+

---------------------------------------
+-----+-----+-----------+---------------+------+------------+----------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|    status|
+-----+-----+-----------+---------------+------+------------+----------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|    normal|
|  10

In [64]:
# Open a genuinely separate session to show that the global temp view is
# visible outside the session that registered it.
# SparkSession.builder...getOrCreate() would simply hand back the already
# running `spark` session, so the check would not cross a session boundary;
# spark.newSession() creates a new session on the same SparkContext.
new_session = spark.newSession()
new_session.sql("select*from global_temp.employee_global").show()

+-----+-----+-----------+---------------+------+------------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|
|  104|Anita|      Sales|Client Outreach| 70000|          38|
|  105|Divya|Engineering|      AI Engine| 99000|          48|
|  106| Amit|  Marketing|   Social Media| 62000|          35|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|
|  109| Neha|Engineering| Security Suite| 91000|          46|
|  110|Farah|         HR|     Onboarding| 60000|          36|
+-----+-----+-----------+---------------+------+------------+



In [84]:
# Bonus challenges

# 1) Rank employees by salary (highest first) within each department.
spark.sql("""select* ,
                   rank() over(partition by department
                   order by salary desc
                   )as depttRank
                   from global_temp.employee_global""").show()
print("---------------------------------------")


# 2) Temp view holding only the Engineering employees.
# An earlier cell drops `employee_local`, so on a top-to-bottom run the view no
# longer exists at this point. Re-register it first; createOrReplaceTempView is
# idempotent, so this is harmless when the view is still present.
df.createOrReplaceTempView("employee_local")

spark.sql("""
  CREATE OR REPLACE TEMP VIEW engineering_only AS
  SELECT *
  FROM employee_local
  WHERE Department = 'Engineering'
""")

spark.sql("SELECT * FROM engineering_only").show()

print("--------------------------------")

# 3) Temp view of employees working at least 38 hours per week, built on top of
# the global temp view.
spark.sql("""create or replace temp view active_employees as
            select* from global_temp.employee_global where hoursperweek>=38 """)

spark.sql("SELECT * FROM active_employees").show()

+-----+-----+-----------+---------------+------+------------+---------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|depttRank|
+-----+-----+-----------+---------------+------+------------+---------+
|  105|Divya|Engineering|      AI Engine| 99000|          48|        1|
|  101| Ravi|Engineering|      AI Engine| 95000|          42|        2|
|  109| Neha|Engineering| Security Suite| 91000|          46|        3|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|        4|
|  110|Farah|         HR|     Onboarding| 60000|          36|        1|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|        2|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|        1|
|  106| Amit|  Marketing|   Social Media| 62000|          35|        2|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|        1|
|  104|Anita|      Sales|Client Outreach| 70000|          38|        2|
+-----+-----+-----------+---------------+------+------------+---