In [0]:
# Imports and Spark session setup
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,avg,when,countDistinct,substring,concat,lit,concat_ws
# Obtain the SparkSession (application name "Day1") used by every later cell.
spark = (
    SparkSession.builder
    .appName("Day1")
    .getOrCreate()
)

In [0]:
# --- Main sample: one row per person -------------------------------------
# Column names, in the same order as the values in each tuple below.
schema = ["Name", "Age", "City", "Salary"]
data = [
    ("Alice", 34, "New York", 120000),
    ("Bob", 28, "Chicago", 95000),
    ("Charlie", 40, "Los Angeles", 150000),
    ("David", 23, "Houston", 80000),
    ("Eve", 36, "Boston", 135000),
]

# --- Lookup sample used by the join cell: person -> department ------------
# David and Eve are absent here on purpose-of-demo: they have no Dept row.
schema_j = ["Name", "Dept"]
data_j = [
    ("Alice", "IT"),
    ("Bob", "HR"),
    ("Charlie", "HR"),
]

# Build both DataFrames from the in-memory rows and column-name lists.
df = spark.createDataFrame(data, schema)
df_j = spark.createDataFrame(data_j, schema_j)

# Display the people table first, then the department lookup.
df.show()
df_j.show()

+-------+---+-----------+------+
|   Name|Age|       City|Salary|
+-------+---+-----------+------+
|  Alice| 34|   New York|120000|
|    Bob| 28|    Chicago| 95000|
|Charlie| 40|Los Angeles|150000|
|  David| 23|    Houston| 80000|
|    Eve| 36|     Boston|135000|
+-------+---+-----------+------+

+-------+----+
|   Name|Dept|
+-------+----+
|  Alice|  IT|
|    Bob|  HR|
|Charlie|  HR|
+-------+----+



In [0]:
# count: return the number of rows in df (shown as the cell's Out value)
df.count()

Out[133]: 5

In [0]:
# Select columns: project df down to just Name and City
(
    df
    .select("Name", "City")
    .show()
)

+-------+-----------+
|   Name|       City|
+-------+-----------+
|  Alice|   New York|
|    Bob|    Chicago|
|Charlie|Los Angeles|
|  David|    Houston|
|    Eve|     Boston|
+-------+-----------+



In [0]:
# Filter rows: keep only the people whose Age is greater than 30
(
    df
    .filter(col("Age") > 30)
    .show()
)

+-------+---+-----------+------+
|   Name|Age|       City|Salary|
+-------+---+-----------+------+
|  Alice| 34|   New York|120000|
|Charlie| 40|Los Angeles|150000|
|    Eve| 36|     Boston|135000|
+-------+---+-----------+------+



In [0]:
# Average of a column, renamed with alias().
# The column is referenced by its declared name "Salary" (not "salary") so the
# lookup does not depend on Spark's case-insensitive column resolution
# (spark.sql.caseSensitive=false, the default).
df.select(avg("Salary").alias("Avg_sal")).show()

+--------+
| Avg_sal|
+--------+
|116000.0|
+--------+



In [0]:
# Group by: number of rows per City
(
    df
    .groupBy("City")
    .count()
    .show()
)

+-----------+-----+
|       City|count|
+-----------+-----+
|   New York|    1|
|    Chicago|    1|
|Los Angeles|    1|
|    Houston|    1|
|     Boston|    1|
+-----------+-----+



In [0]:
# Order by Salary, ascending (the default sort direction).
# "Salary" matches the declared column name exactly, so the cell also works
# when spark.sql.caseSensitive is enabled.
df.orderBy(col("Salary")).show()

# Order by Salary, descending.
df.orderBy(col("Salary").desc()).show()

+-------+---+-----------+------+
|   Name|Age|       City|Salary|
+-------+---+-----------+------+
|  David| 23|    Houston| 80000|
|    Bob| 28|    Chicago| 95000|
|  Alice| 34|   New York|120000|
|    Eve| 36|     Boston|135000|
|Charlie| 40|Los Angeles|150000|
+-------+---+-----------+------+

+-------+---+-----------+------+
|   Name|Age|       City|Salary|
+-------+---+-----------+------+
|Charlie| 40|Los Angeles|150000|
|    Eve| 36|     Boston|135000|
|  Alice| 34|   New York|120000|
|    Bob| 28|    Chicago| 95000|
|  David| 23|    Houston| 80000|
+-------+---+-----------+------+



In [0]:
# Join df with df_j on Name. With no `how` argument this is an inner join, so
# only names present in both DataFrames appear, and both Name columns are kept.
df.join(df_j, df.Name == df_j.Name).show()

# Same join using aliases. The join condition is written against the aliases
# ("A.Name" / "B.Name") rather than the original DataFrame objects, so the
# aliases are what disambiguate the two Name columns in both the condition
# and the final select.
(
    df.alias("A")
    .join(df_j.alias("B"), col("A.Name") == col("B.Name"))
    .select("A.Name", "A.Age", "B.Dept")
    .show()
)

+-------+---+-----------+------+-------+----+
|   Name|Age|       City|Salary|   Name|Dept|
+-------+---+-----------+------+-------+----+
|  Alice| 34|   New York|120000|  Alice|  IT|
|    Bob| 28|    Chicago| 95000|    Bob|  HR|
|Charlie| 40|Los Angeles|150000|Charlie|  HR|
+-------+---+-----------+------+-------+----+

+-------+---+----+
|   Name|Age|Dept|
+-------+---+----+
|  Alice| 34|  IT|
|    Bob| 28|  HR|
|Charlie| 40|  HR|
+-------+---+----+



In [0]:
# unionAll: append df's rows to itself (every row appears twice in the output)
(
    df
    .unionAll(df)
    .show()
)

+-------+---+-----------+------+
|   Name|Age|       City|Salary|
+-------+---+-----------+------+
|  Alice| 34|   New York|120000|
|    Bob| 28|    Chicago| 95000|
|Charlie| 40|Los Angeles|150000|
|  David| 23|    Houston| 80000|
|    Eve| 36|     Boston|135000|
|  Alice| 34|   New York|120000|
|    Bob| 28|    Chicago| 95000|
|Charlie| 40|Los Angeles|150000|
|  David| 23|    Houston| 80000|
|    Eve| 36|     Boston|135000|
+-------+---+-----------+------+



In [0]:
# union: append df's rows to itself (duplicates are kept in the output)
(
    df
    .union(df)
    .show()
)

+-------+---+-----------+------+
|   Name|Age|       City|Salary|
+-------+---+-----------+------+
|  Alice| 34|   New York|120000|
|    Bob| 28|    Chicago| 95000|
|Charlie| 40|Los Angeles|150000|
|  David| 23|    Houston| 80000|
|    Eve| 36|     Boston|135000|
|  Alice| 34|   New York|120000|
|    Bob| 28|    Chicago| 95000|
|Charlie| 40|Los Angeles|150000|
|  David| 23|    Houston| 80000|
|    Eve| 36|     Boston|135000|
+-------+---+-----------+------+



In [0]:
# union and unionAll produce the same result: neither removes duplicate rows.
# To remove duplicates, apply distinct() or dropDuplicates() afterwards.
union_df = df.union(df)
# Built with unionAll (not union) so the comparison cells that follow really
# exercise both methods.
unionAll_df = df.unionAll(df)

In [0]:
# distinct: collapse the doubled rows back to one copy each,
# first for the union result, then for the unionAll result
(
    union_df
    .distinct()
    .show()
)
(
    unionAll_df
    .distinct()
    .show()
)

+-------+---+-----------+------+
|   Name|Age|       City|Salary|
+-------+---+-----------+------+
|  Alice| 34|   New York|120000|
|    Bob| 28|    Chicago| 95000|
|Charlie| 40|Los Angeles|150000|
|  David| 23|    Houston| 80000|
|    Eve| 36|     Boston|135000|
+-------+---+-----------+------+

+-------+---+-----------+------+
|   Name|Age|       City|Salary|
+-------+---+-----------+------+
|  Alice| 34|   New York|120000|
|    Bob| 28|    Chicago| 95000|
|Charlie| 40|Los Angeles|150000|
|  David| 23|    Houston| 80000|
|    Eve| 36|     Boston|135000|
+-------+---+-----------+------+



In [0]:
# dropDuplicates: same de-duplication, via dropDuplicates() with no column subset,
# first for the union result, then for the unionAll result
(
    union_df
    .dropDuplicates()
    .show()
)
(
    unionAll_df
    .dropDuplicates()
    .show()
)

+-------+---+-----------+------+
|   Name|Age|       City|Salary|
+-------+---+-----------+------+
|  Alice| 34|   New York|120000|
|    Bob| 28|    Chicago| 95000|
|Charlie| 40|Los Angeles|150000|
|  David| 23|    Houston| 80000|
|    Eve| 36|     Boston|135000|
+-------+---+-----------+------+

+-------+---+-----------+------+
|   Name|Age|       City|Salary|
+-------+---+-----------+------+
|  Alice| 34|   New York|120000|
|    Bob| 28|    Chicago| 95000|
|Charlie| 40|Los Angeles|150000|
|  David| 23|    Houston| 80000|
|    Eve| 36|     Boston|135000|
+-------+---+-----------+------+



In [0]:
# limit: show at most 2 rows of df
(
    df
    .limit(2)
    .show()
)

+-----+---+--------+------+
| Name|Age|    City|Salary|
+-----+---+--------+------+
|Alice| 34|New York|120000|
|  Bob| 28| Chicago| 95000|
+-----+---+--------+------+



In [0]:
# New column: Salary plus a 10% hike, stored as revised_Salary
revised_Salary_df = df.withColumn(
    "revised_Salary",
    col("Salary") + col("Salary") * 10 / 100,
)
revised_Salary_df.show()

+-------+---+-----------+------+--------------+
|   Name|Age|       City|Salary|revised_Salary|
+-------+---+-----------+------+--------------+
|  Alice| 34|   New York|120000|      132000.0|
|    Bob| 28|    Chicago| 95000|      104500.0|
|Charlie| 40|Los Angeles|150000|      165000.0|
|  David| 23|    Houston| 80000|       88000.0|
|    Eve| 36|     Boston|135000|      148500.0|
+-------+---+-----------+------+--------------+



In [0]:
# Filter on multiple columns: both conditions must hold (combined with &);
# each comparison is parenthesised because & binds tighter than >
(
    df
    .filter((col("Age") > 30) & (col("Salary") > 130000))
    .show()
)

+-------+---+-----------+------+
|   Name|Age|       City|Salary|
+-------+---+-----------+------+
|Charlie| 40|Los Angeles|150000|
|    Eve| 36|     Boston|135000|
+-------+---+-----------+------+



In [0]:
# Subquery-style: filter first, name the filtered result "new_df",
# then select a column through that alias
(
    df
    .filter(col("Age") > 30)
    .alias("new_df")
    .select("new_df.Name")
    .show()
)

+-------+
|   Name|
+-------+
|  Alice|
|Charlie|
|    Eve|
+-------+



In [0]:
# Between: keep rows whose Age lies in the range 25..30
(
    df
    .filter(col("Age").between(25, 30))
    .show()
)

+----+---+-------+------+
|Name|Age|   City|Salary|
+----+---+-------+------+
| Bob| 28|Chicago| 95000|
+----+---+-------+------+



In [0]:
# like: SQL-style pattern match, here City values ending in "on"
(
    df
    .filter(col("City").like("%on"))
    .show()
)

+-----+---+-------+------+
| Name|Age|   City|Salary|
+-----+---+-------+------+
|David| 23|Houston| 80000|
|  Eve| 36| Boston|135000|
+-----+---+-------+------+



In [0]:
# Case: map each Dept code to its full name with when/otherwise;
# any code that matches neither branch gets None
df_j.withColumn(
    "dept_full_name",
    when(col("Dept") == "IT", "Information technology")
    .when(col("Dept") == "HR", "Human resources")
    .otherwise(None),
).show(truncate=False)

+-------+----+----------------------+
|Name   |Dept|dept_full_name        |
+-------+----+----------------------+
|Alice  |IT  |Information technology|
|Bob    |HR  |Human resources       |
|Charlie|HR  |Human resources       |
+-------+----+----------------------+



In [0]:
# Cast data type: show Salary converted to float
(
    df
    .select(col("Salary").cast("float"))
    .show()
)

+--------+
|  Salary|
+--------+
|120000.0|
| 95000.0|
|150000.0|
| 80000.0|
|135000.0|
+--------+



In [0]:
# countDistinct: number of different Name values in the doubled (union) data
(
    union_df
    .select(countDistinct("Name"))
    .show()
)

+--------------------+
|count(DISTINCT Name)|
+--------------------+
|                   5|
+--------------------+



In [0]:
# substring: first three characters of Name (start position 1, length 3)
(
    df
    .select(substring("Name", 1, 3).alias("substring_name"))
    .show()
)

+--------------+
|substring_name|
+--------------+
|           Ali|
|           Bob|
|           Cha|
|           Dav|
|           Eve|
+--------------+



In [0]:
# lit: add a column holding the same constant value on every row
(
    df
    .withColumn("new_dummy_col", lit("dummy"))
    .show()
)

+-------+---+-----------+------+-------------+
|   Name|Age|       City|Salary|new_dummy_col|
+-------+---+-----------+------+-------------+
|  Alice| 34|   New York|120000|        dummy|
|    Bob| 28|    Chicago| 95000|        dummy|
|Charlie| 40|Los Angeles|150000|        dummy|
|  David| 23|    Houston| 80000|        dummy|
|    Eve| 36|     Boston|135000|        dummy|
+-------+---+-----------+------+-------------+



In [0]:
# concat: join Name and City into one string, with a literal " : " between them
(
    df
    .select(concat("Name", lit(" : "), "City").alias("concat_col"))
    .show(truncate=False)
)

+---------------------+
|concat_col           |
+---------------------+
|Alice : New York     |
|Bob : Chicago        |
|Charlie : Los Angeles|
|David : Houston      |
|Eve : Boston         |
+---------------------+



In [0]:
# String concatenation with a separator: concat_ws takes the separator first,
# then the columns to join.
# "Salary" is spelled exactly as declared so the lookup does not depend on
# Spark's case-insensitive column resolution (spark.sql.caseSensitive=false).
df.withColumn(
    "concat_ws",
    concat_ws(" : ", "Name", "City", "Salary"),
).show(truncate=False)

+-------+---+-----------+------+------------------------------+
|Name   |Age|City       |Salary|concat_ws                     |
+-------+---+-----------+------+------------------------------+
|Alice  |34 |New York   |120000|Alice : New York : 120000     |
|Bob    |28 |Chicago    |95000 |Bob : Chicago : 95000         |
|Charlie|40 |Los Angeles|150000|Charlie : Los Angeles : 150000|
|David  |23 |Houston    |80000 |David : Houston : 80000       |
|Eve    |36 |Boston     |135000|Eve : Boston : 135000         |
+-------+---+-----------+------+------------------------------+

