In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Obtain the notebook's Spark session (an already-running one is reused).
spark = (
    SparkSession.builder
    .appName("Day3")
    .getOrCreate()
)

In [0]:
# Column names for the employee DataFrame; column types are left for Spark
# to infer from the tuples below.
schema = ["name", "dept", "salary", "city"]

# Employee sample rows, one tuple per employee: (name, dept, salary, city).
data = [
    ("Alice", "HR", 50000, "New York"),
    ("Bob", "Engineering", 60000, "San Francisco"),
    ("Charlie", "HR", 55000, "Los Angeles"),
    ("David", "Engineering", 62000, "Seattle"),
    ("Eva", "Finance", 70000, "Chicago"),
    ("Frank", "Finance", 75000, "Houston"),
    ("Grace", "Engineering", 65000, "Boston"),
    ("Hannah", "HR", 48000, "Miami"),
    ("Ian", "Finance", 68000, "Dallas"),
    ("Jessica", "Engineering", 63000, "Atlanta"),
]

# Build the employee DataFrame and print it.
df = spark.createDataFrame(data, schema=schema)
df.show()



# Column names for the employment-status DataFrame used in the join examples.
j_schema = ["name", "emp_status"]

# Status rows, one tuple per employee: (name, emp_status).
# Only six names are listed, so joins on "name" can match at most these rows.
j_data = [
    ("Alice", "Active"),
    ("Bob", "Inactive"),
    ("Charlie", "Inactive"),
    ("David", "Active"),
    ("Eva", "Active"),
    ("Frank", "Active"),
]

# Build the status DataFrame and print it.
j_df = spark.createDataFrame(j_data, schema=j_schema)
j_df.show()

+-------+-----------+------+-------------+
|   name|       dept|salary|         city|
+-------+-----------+------+-------------+
|  Alice|         HR| 50000|     New York|
|    Bob|Engineering| 60000|San Francisco|
|Charlie|         HR| 55000|  Los Angeles|
|  David|Engineering| 62000|      Seattle|
|    Eva|    Finance| 70000|      Chicago|
|  Frank|    Finance| 75000|      Houston|
|  Grace|Engineering| 65000|       Boston|
| Hannah|         HR| 48000|        Miami|
|    Ian|    Finance| 68000|       Dallas|
|Jessica|Engineering| 63000|      Atlanta|
+-------+-----------+------+-------------+

+-------+----------+
|   name|emp_status|
+-------+----------+
|  Alice|    Active|
|    Bob|  Inactive|
|Charlie|  Inactive|
|  David|    Active|
|    Eva|    Active|
|  Frank|    Active|
+-------+----------+



In [0]:
# Create the table "day_three" from the DataFrame, stored as parquet.
# mode("overwrite") replaces the table if it already exists; without it the
# default save mode raises an error on a second run, so the notebook would
# not survive Restart & Run All once the table is in the metastore.
df.write.format("parquet").mode("overwrite").saveAsTable("day_three")

In [0]:
%sql
-- Show every row of the table created from the DataFrame.
-- There is no ORDER BY, so rows may come back in any order.
SELECT * FROM day_three

name,dept,salary,city
Bob,Engineering,60000,San Francisco
Grace,Engineering,65000,Boston
Charlie,HR,55000,Los Angeles
Frank,Finance,75000,Houston
Ian,Finance,68000,Dallas
Jessica,Engineering,63000,Atlanta
David,Engineering,62000,Seattle
Eva,Finance,70000,Chicago
Alice,HR,50000,New York
Hannah,HR,48000,Miami


In [0]:
# Append the DataFrame's rows to the existing table "day_three".
# overwrite=False spells out the append behaviour: existing rows are kept,
# so each run of this cell adds another copy of the data.
df.write.insertInto("day_three", overwrite=False)

In [0]:
%sql
-- Re-read the table after the insert; the appended rows show up as duplicates.
SELECT * FROM day_three

name,dept,salary,city
Bob,Engineering,60000,San Francisco
Bob,Engineering,60000,San Francisco
Grace,Engineering,65000,Boston
Grace,Engineering,65000,Boston
Charlie,HR,55000,Los Angeles
Charlie,HR,55000,Los Angeles
Frank,Finance,75000,Houston
Frank,Finance,75000,Houston
Ian,Finance,68000,Dallas
Jessica,Engineering,63000,Atlanta


In [0]:
# Create the table "day_three_spec_col" from only the name and city columns,
# stored as parquet.
# mode("overwrite") replaces the table if it already exists; without it the
# default save mode raises an error on a second run, so the notebook would
# not survive Restart & Run All once the table is in the metastore.
(
    df.select("name", "city")
    .write.format("parquet")
    .mode("overwrite")
    .saveAsTable("day_three_spec_col")
)

In [0]:
%sql
-- Show the two-column table (name, city) created from the DataFrame.
SELECT * FROM day_three_spec_col

name,city
Charlie,Los Angeles
Bob,San Francisco
Alice,New York
Frank,Houston
Grace,Boston
Hannah,Miami
Ian,Dallas
Jessica,Atlanta
David,Seattle
Eva,Chicago


In [0]:
# Count employees per department; alias() names the aggregate column
# "count_emp" in the result.
(
    df.groupBy("dept")
    .agg(F.count("*").alias("count_emp"))
    .show()
)

+-----------+---------+
|       dept|count_emp|
+-----------+---------+
|         HR|        3|
|Engineering|        4|
|    Finance|        3|
+-----------+---------+



In [0]:
# DataFrame equivalent of a nested subquery: the inner filter keeps the HR
# rows and is aliased "fil_hr", so the outer SQL-string filter can refer to
# its columns by that name.
(
    df.filter(F.col("dept") == "HR")
    .alias("fil_hr")
    .filter("fil_hr.city=='Miami'")
    .show()
)

+------+----+------+-----+
|  name|dept|salary| city|
+------+----+------+-----+
|Hannah|  HR| 48000|Miami|
+------+----+------+-----+



In [0]:
# Cartesian product: every employee row paired with every status row
# (10 x 6 = 60 combinations). Both inputs have a "name" column, so the
# result contains two columns called "name".
(
    df
    .crossJoin(j_df)
    .show()
)

+-------+-----------+------+-------------+-------+----------+
|   name|       dept|salary|         city|   name|emp_status|
+-------+-----------+------+-------------+-------+----------+
|  Alice|         HR| 50000|     New York|  Alice|    Active|
|  Alice|         HR| 50000|     New York|    Bob|  Inactive|
|  Alice|         HR| 50000|     New York|Charlie|  Inactive|
|  Alice|         HR| 50000|     New York|  David|    Active|
|  Alice|         HR| 50000|     New York|    Eva|    Active|
|  Alice|         HR| 50000|     New York|  Frank|    Active|
|    Bob|Engineering| 60000|San Francisco|  Alice|    Active|
|    Bob|Engineering| 60000|San Francisco|    Bob|  Inactive|
|    Bob|Engineering| 60000|San Francisco|Charlie|  Inactive|
|    Bob|Engineering| 60000|San Francisco|  David|    Active|
|    Bob|Engineering| 60000|San Francisco|    Eva|    Active|
|    Bob|Engineering| 60000|San Francisco|  Frank|    Active|
|Charlie|         HR| 55000|  Los Angeles|  Alice|    Active|
|Charlie

In [0]:
# GROUP BY ... HAVING equivalent: count employees per department, then keep
# only the departments whose count is greater than 3. The aggregated result
# is aliased "cnt" so the filter string can reference cnt.cnt_emp.
(
    df.groupBy("dept")
    .agg(F.count("*").alias("cnt_emp"))
    .alias("cnt")
    .filter("cnt.cnt_emp>3")
    .show()
)

+-----------+-------+
|       dept|cnt_emp|
+-----------+-------+
|Engineering|      4|
+-----------+-------+



In [0]:
# Table aliases in a join: "a" is the employee DataFrame, "b" the status
# DataFrame. No join type is passed, so the default (inner) is used and only
# names present on both sides appear in the result.
(
    df.alias("a")
    .join(
        j_df.alias("b"),
        F.col("a.name") == F.col("b.name"),
    )
    .show()
)

+-------+-----------+------+-------------+-------+----------+
|   name|       dept|salary|         city|   name|emp_status|
+-------+-----------+------+-------------+-------+----------+
|  Alice|         HR| 50000|     New York|  Alice|    Active|
|    Bob|Engineering| 60000|San Francisco|    Bob|  Inactive|
|Charlie|         HR| 55000|  Los Angeles|Charlie|  Inactive|
|  David|Engineering| 62000|      Seattle|  David|    Active|
|    Eva|    Finance| 70000|      Chicago|    Eva|    Active|
|  Frank|    Finance| 75000|      Houston|  Frank|    Active|
+-------+-----------+------+-------------+-------+----------+



In [0]:
# Select columns from both sides of a join.
# Form 1: alias each DataFrame and refer to columns as "<alias>.<column>".
(
    df.alias("a")
    .join(
        j_df.alias("b"),
        F.col("a.name") == F.col("b.name"),
    )
    .select("a.dept", "b.emp_status")
    .show()
)

# Form 2: same result, referencing columns through the DataFrame objects.
(
    df.join(j_df, df.name == j_df.name)
    .select(df.dept, j_df.emp_status)
    .show()
)

+-----------+----------+
|       dept|emp_status|
+-----------+----------+
|         HR|    Active|
|Engineering|  Inactive|
|         HR|  Inactive|
|Engineering|    Active|
|    Finance|    Active|
|    Finance|    Active|
+-----------+----------+

+-----------+----------+
|       dept|emp_status|
+-----------+----------+
|         HR|    Active|
|Engineering|  Inactive|
|         HR|  Inactive|
|Engineering|    Active|
|    Finance|    Active|
|    Finance|    Active|
+-----------+----------+

