In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local[*]")\
        .appName("Spark Demo")\
        .getOrCreate()
        
        

In [None]:
A PySpark DataFrame is a distributed, schema-based table that allows you to process 
large-scale data efficiently using Sparkâ€™s optimized engine.

Why DataFrame in PySpark?
Before DataFrames, Spark mainly used RDD (Resilient Distributed Dataset).
DataFrames were introduced to:
    Improve performance (via Catalyst Optimizer)
    Provide SQL-like querying
    Simplify big data processing
    Reduce code complexity

When to Use DataFrame?
    Use PySpark DataFrame when:
    Working with structured data
    Writing SQL-like queries
    Performance matters
    Handling large datasets (GBs/TBs/PBs)

In [4]:
df =spark.range(0,11,2)
df.show(10)

+---+
| id|
+---+
|  0|
|  2|
|  4|
|  6|
|  8|
| 10|
+---+



In [5]:
df.printSchema()

root
 |-- id: long (nullable = false)



In [6]:
df.describe()

DataFrame[summary: string, id: string]

In [7]:
df.count()

6

In [5]:
df.describe()

DataFrame[summary: string, id: string]

In [8]:
df.dtypes

[('id', 'bigint')]

In [12]:
# creating dataframe using list of tuples
data = [
    (1, "Ravi", 50000),
    (2, "Priya", 60000),
    (3, "John", 45000)
]

columns = 'id int,name string,salary int'

df = spark.createDataFrame(data, columns)
df.show()
df.printSchema()

+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1| Ravi| 50000|
|  2|Priya| 60000|
|  3| John| 45000|
+---+-----+------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)



In [21]:
df.dtypes

[('id int', 'bigint'), ('name string', 'string'), ('salary float', 'bigint')]

In [None]:
In PySpark, StructType is used to define the schema (structure) of a DataFrame explicitly.
It tells Spark:
    Column names
    Data types
    Whether a column can contain null values
    Nested structure (complex columns)

In [13]:
# creating data frame wit schema
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True)
])
data = [
    (1, "Kumar", 28),
    (2, "Anita", 32),
    (3, "George", 40)
]
df2 = spark.createDataFrame(data, schema)
df2.show()


+---+------+---+
| id|  name|age|
+---+------+---+
|  1| Kumar| 28|
|  2| Anita| 32|
|  3|George| 40|
+---+------+---+



In [16]:
df.head(2)
df.tail(2)

[Row(id=2, name='Priya', salary=60000), Row(id=3, name='John', salary=45000)]

In [14]:
df2.dtypes

[('id', 'int'), ('name', 'string'), ('age', 'int')]

In [23]:
# creating dataframe using dictionary
data = [
    {"id": 1, "name": "Ravi", "salary": 50000},
    {"id": 2, "name": "Priya", "salary": 60000},
    {"id": 3, "name": "John", "salary": 45000}
]

df = spark.createDataFrame(data)
df.show()

+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1| Ravi| 50000|
|  2|Priya| 60000|
|  3| John| 45000|
+---+-----+------+



In [2]:

df = spark.read.csv('c:/data/Orders1', header=True,inferSchema=True)

In [33]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- status: string (nullable = true)



In [26]:
df = spark.read.csv('c:/data/Orders1', header=True, inferSchema=True)
df.show()

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|         status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|
|       6|2013-07-25 00:00:00|       7130|       COMPLETE|
|       7|2013-07-25 00:00:00|       4530|       COMPLETE|
|       8|2013-07-25 00:00:00|       2911|     PROCESSING|
|       9|2013-07-25 00:00:00|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|       1837|         CLOSED|
|      13|2013-07-25 00:00:00|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:00|       9842|     PROCESSIN

In [27]:
df.describe()

DataFrame[summary: string, order_id: string, customer_id: string, status: string]

In [28]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- status: string (nullable = true)



In [44]:
df.select(df["order_id"].alias("orderId"), df["status"]).show()

+-------+---------------+
|orderId|         status|
+-------+---------------+
|      1|         CLOSED|
|      2|PENDING_PAYMENT|
|      3|       COMPLETE|
|      4|         CLOSED|
|      5|       COMPLETE|
|      6|       COMPLETE|
|      7|       COMPLETE|
|      8|     PROCESSING|
|      9|PENDING_PAYMENT|
|     10|PENDING_PAYMENT|
|     11| PAYMENT_REVIEW|
|     12|         CLOSED|
|     13|PENDING_PAYMENT|
|     14|     PROCESSING|
|     15|       COMPLETE|
|     16|PENDING_PAYMENT|
|     17|       COMPLETE|
|     18|         CLOSED|
|     19|PENDING_PAYMENT|
|     20|     PROCESSING|
+-------+---------------+
only showing top 20 rows


In [47]:
from pyspark.sql.functions import col

df.select(col("order_id").alias("orderId"), col("status").alias("orderStatus")).show()

+-------+---------------+
|orderId|    orderStatus|
+-------+---------------+
|      1|         CLOSED|
|      2|PENDING_PAYMENT|
|      3|       COMPLETE|
|      4|         CLOSED|
|      5|       COMPLETE|
|      6|       COMPLETE|
|      7|       COMPLETE|
|      8|     PROCESSING|
|      9|PENDING_PAYMENT|
|     10|PENDING_PAYMENT|
|     11| PAYMENT_REVIEW|
|     12|         CLOSED|
|     13|PENDING_PAYMENT|
|     14|     PROCESSING|
|     15|       COMPLETE|
|     16|PENDING_PAYMENT|
|     17|       COMPLETE|
|     18|         CLOSED|
|     19|PENDING_PAYMENT|
|     20|     PROCESSING|
+-------+---------------+
only showing top 20 rows


In [53]:
from pyspark.sql.functions import lit

df.select("order_id",lit("hello"),"status").show()

+--------+-----+---------------+
|order_id|hello|         status|
+--------+-----+---------------+
|       1|hello|         CLOSED|
|       2|hello|PENDING_PAYMENT|
|       3|hello|       COMPLETE|
|       4|hello|         CLOSED|
|       5|hello|       COMPLETE|
|       6|hello|       COMPLETE|
|       7|hello|       COMPLETE|
|       8|hello|     PROCESSING|
|       9|hello|PENDING_PAYMENT|
|      10|hello|PENDING_PAYMENT|
|      11|hello| PAYMENT_REVIEW|
|      12|hello|         CLOSED|
|      13|hello|PENDING_PAYMENT|
|      14|hello|     PROCESSING|
|      15|hello|       COMPLETE|
|      16|hello|PENDING_PAYMENT|
|      17|hello|       COMPLETE|
|      18|hello|         CLOSED|
|      19|hello|PENDING_PAYMENT|
|      20|hello|     PROCESSING|
+--------+-----+---------------+
only showing top 20 rows


In [59]:
df1=df.withColumnRenamed("status","OrderStatus")\
  .withColumnRenamed("customer_id","CustomerId")

In [60]:
df1.show(4)

+--------+-------------------+----------+---------------+
|order_id|         order_date|CustomerId|    OrderStatus|
+--------+-------------------+----------+---------------+
|       1|2013-07-25 00:00:00|     11599|         CLOSED|
|       2|2013-07-25 00:00:00|       256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|     12111|       COMPLETE|
|       4|2013-07-25 00:00:00|      8827|         CLOSED|
+--------+-------------------+----------+---------------+
only showing top 4 rows


In [61]:
col='order_item_id int, order_id int, product_id int, quantity int, subtotal float, price float'
ordItems_df = spark.read.csv('c:/data/OrderItems', col)

In [62]:
ordItems_df.show(4)

+-------------+--------+----------+--------+--------+------+
|order_item_id|order_id|product_id|quantity|subtotal| price|
+-------------+--------+----------+--------+--------+------+
|            1|       1|       957|       1|  299.98|299.98|
|            2|       2|      1073|       1|  199.99|199.99|
|            3|       2|       502|       5|   250.0|  50.0|
|            4|       2|       403|       1|  129.99|129.99|
+-------------+--------+----------+--------+--------+------+
only showing top 4 rows


In [70]:
ordItems_df.printSchema()

root
 |-- order_item_id: integer (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- subtotal: float (nullable = true)
 |-- price: float (nullable = true)



In [75]:
from pyspark.sql.functions import col,round

In [82]:
ordItems_df.withColumn('discount', col("subtotal") * 0.1).show()

+-------------+--------+----------+--------+--------+------+------------------+
|order_item_id|order_id|product_id|quantity|subtotal| price|          discount|
+-------------+--------+----------+--------+--------+------+------------------+
|            1|       1|       957|       1|  299.98|299.98|29.998001098632812|
|            2|       2|      1073|       1|  199.99|199.99|19.999000549316406|
|            3|       2|       502|       5|   250.0|  50.0|              25.0|
|            4|       2|       403|       1|  129.99|129.99|12.999000549316406|
|            5|       4|       897|       2|   49.98| 24.99|4.9979999542236335|
|            6|       4|       365|       5|  299.95| 59.99|29.995001220703127|
|            7|       4|       502|       3|   150.0|  50.0|              15.0|
|            8|       4|      1014|       4|  199.92| 49.98|19.991999816894534|
|            9|       5|       957|       1|  299.98|299.98|29.998001098632812|
|           10|       5|       365|     

In [84]:
ordItems_df.drop("quantity")\
          .drop("price").show()

+-------------+--------+----------+--------+
|order_item_id|order_id|product_id|subtotal|
+-------------+--------+----------+--------+
|            1|       1|       957|  299.98|
|            2|       2|      1073|  199.99|
|            3|       2|       502|   250.0|
|            4|       2|       403|  129.99|
|            5|       4|       897|   49.98|
|            6|       4|       365|  299.95|
|            7|       4|       502|   150.0|
|            8|       4|      1014|  199.92|
|            9|       5|       957|  299.98|
|           10|       5|       365|  299.95|
|           11|       5|      1014|   99.96|
|           12|       5|       957|  299.98|
|           13|       5|       403|  129.99|
|           14|       7|      1073|  199.99|
|           15|       7|       957|  299.98|
|           16|       7|       926|   79.95|
|           17|       8|       365|  179.97|
|           18|       8|       365|  299.95|
|           19|       8|      1014|  199.92|
|         

In [87]:
from pyspark.sql.functions import *
ordItems_df.withColumn("Status",
                      when(col("quantity") >5 , "HIGH")\
                      .when(col("quantity") >3 , "Mediam").otherwise("LOW")).show()

+-------------+--------+----------+--------+--------+------+------+
|order_item_id|order_id|product_id|quantity|subtotal| price|Status|
+-------------+--------+----------+--------+--------+------+------+
|            1|       1|       957|       1|  299.98|299.98|   LOW|
|            2|       2|      1073|       1|  199.99|199.99|   LOW|
|            3|       2|       502|       5|   250.0|  50.0|Mediam|
|            4|       2|       403|       1|  129.99|129.99|   LOW|
|            5|       4|       897|       2|   49.98| 24.99|   LOW|
|            6|       4|       365|       5|  299.95| 59.99|Mediam|
|            7|       4|       502|       3|   150.0|  50.0|   LOW|
|            8|       4|      1014|       4|  199.92| 49.98|Mediam|
|            9|       5|       957|       1|  299.98|299.98|   LOW|
|           10|       5|       365|       5|  299.95| 59.99|Mediam|
|           11|       5|      1014|       2|   99.96| 49.98|   LOW|
|           12|       5|       957|       1|  29

In [89]:
ordItems_df.filter(col("quantity") >=5).show()

+-------------+--------+----------+--------+--------+-----+
|order_item_id|order_id|product_id|quantity|subtotal|price|
+-------------+--------+----------+--------+--------+-----+
|            3|       2|       502|       5|   250.0| 50.0|
|            6|       4|       365|       5|  299.95|59.99|
|           10|       5|       365|       5|  299.95|59.99|
|           16|       7|       926|       5|   79.95|15.99|
|           18|       8|       365|       5|  299.95|59.99|
|           33|      11|      1014|       5|   249.9|49.98|
|           37|      12|       191|       5|  499.95|99.99|
|           38|      12|       502|       5|   250.0| 50.0|
|           49|      16|       365|       5|  299.95|59.99|
|           60|      20|       502|       5|   250.0| 50.0|
|           63|      20|       365|       5|  299.95|59.99|
|           71|      24|       502|       5|   250.0| 50.0|
|           84|      29|      1014|       5|   249.9|49.98|
|           89|      31|       191|     

In [103]:
ordItems_df.where( (col("quantity") >4) & (col("product_id") ==502) ).show()

+-------------+--------+----------+--------+--------+-----+
|order_item_id|order_id|product_id|quantity|subtotal|price|
+-------------+--------+----------+--------+--------+-----+
|            3|       2|       502|       5|   250.0| 50.0|
|           38|      12|       502|       5|   250.0| 50.0|
|           60|      20|       502|       5|   250.0| 50.0|
|           71|      24|       502|       5|   250.0| 50.0|
|          244|     107|       502|       5|   250.0| 50.0|
|          282|     120|       502|       5|   250.0| 50.0|
|          288|     121|       502|       5|   250.0| 50.0|
|          311|     132|       502|       5|   250.0| 50.0|
|          349|     148|       502|       5|   250.0| 50.0|
|          482|     197|       502|       5|   250.0| 50.0|
|          485|     200|       502|       5|   250.0| 50.0|
|          529|     219|       502|       5|   250.0| 50.0|
|          565|     229|       502|       5|   250.0| 50.0|
|          640|     254|       502|     

In [29]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("order_date", TimestampType(), True),
    StructField("customer_id", IntegerType(), True),
    StructField("order_status", StringType(), True),
        
])
df = spark.read.csv('c:/data/Orders',schema=schema)
df.show()

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|
|       6|2013-07-25 00:00:00|       7130|       COMPLETE|
|       7|2013-07-25 00:00:00|       4530|       COMPLETE|
|       8|2013-07-25 00:00:00|       2911|     PROCESSING|
|       9|2013-07-25 00:00:00|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|       1837|         CLOSED|
|      13|2013-07-25 00:00:00|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:00|       9842|     PROCESSIN

In [16]:
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [30]:
df.select("order_id","order_date").show()

+--------+-------------------+
|order_id|         order_date|
+--------+-------------------+
|       1|2013-07-25 00:00:00|
|       2|2013-07-25 00:00:00|
|       3|2013-07-25 00:00:00|
|       4|2013-07-25 00:00:00|
|       5|2013-07-25 00:00:00|
|       6|2013-07-25 00:00:00|
|       7|2013-07-25 00:00:00|
|       8|2013-07-25 00:00:00|
|       9|2013-07-25 00:00:00|
|      10|2013-07-25 00:00:00|
|      11|2013-07-25 00:00:00|
|      12|2013-07-25 00:00:00|
|      13|2013-07-25 00:00:00|
|      14|2013-07-25 00:00:00|
|      15|2013-07-25 00:00:00|
|      16|2013-07-25 00:00:00|
|      17|2013-07-25 00:00:00|
|      18|2013-07-25 00:00:00|
|      19|2013-07-25 00:00:00|
|      20|2013-07-25 00:00:00|
+--------+-------------------+
only showing top 20 rows


In [31]:
df.select(df.order_id,df.order_date,df.order_status).show()

+--------+-------------------+---------------+
|order_id|         order_date|   order_status|
+--------+-------------------+---------------+
|       1|2013-07-25 00:00:00|         CLOSED|
|       2|2013-07-25 00:00:00|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|       COMPLETE|
|       4|2013-07-25 00:00:00|         CLOSED|
|       5|2013-07-25 00:00:00|       COMPLETE|
|       6|2013-07-25 00:00:00|       COMPLETE|
|       7|2013-07-25 00:00:00|       COMPLETE|
|       8|2013-07-25 00:00:00|     PROCESSING|
|       9|2013-07-25 00:00:00|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|         CLOSED|
|      13|2013-07-25 00:00:00|PENDING_PAYMENT|
|      14|2013-07-25 00:00:00|     PROCESSING|
|      15|2013-07-25 00:00:00|       COMPLETE|
|      16|2013-07-25 00:00:00|PENDING_PAYMENT|
|      17|2013-07-25 00:00:00|       COMPLETE|
|      18|2013-07-25 00:00:00|         CLOSED|
|      19|201

In [32]:
from pyspark.sql.functions import col
df.select(col("order_id").alias("id"),
          (col("order_id") * 100).alias("order_id"),
          col("order_date").alias("date")).show()

+---+--------+-------------------+
| id|order_id|               date|
+---+--------+-------------------+
|  1|     100|2013-07-25 00:00:00|
|  2|     200|2013-07-25 00:00:00|
|  3|     300|2013-07-25 00:00:00|
|  4|     400|2013-07-25 00:00:00|
|  5|     500|2013-07-25 00:00:00|
|  6|     600|2013-07-25 00:00:00|
|  7|     700|2013-07-25 00:00:00|
|  8|     800|2013-07-25 00:00:00|
|  9|     900|2013-07-25 00:00:00|
| 10|    1000|2013-07-25 00:00:00|
| 11|    1100|2013-07-25 00:00:00|
| 12|    1200|2013-07-25 00:00:00|
| 13|    1300|2013-07-25 00:00:00|
| 14|    1400|2013-07-25 00:00:00|
| 15|    1500|2013-07-25 00:00:00|
| 16|    1600|2013-07-25 00:00:00|
| 17|    1700|2013-07-25 00:00:00|
| 18|    1800|2013-07-25 00:00:00|
| 19|    1900|2013-07-25 00:00:00|
| 20|    2000|2013-07-25 00:00:00|
+---+--------+-------------------+
only showing top 20 rows


In [33]:
df.select( [x for x in df.columns if x !='order_status']).show()

+--------+-------------------+-----------+
|order_id|         order_date|customer_id|
+--------+-------------------+-----------+
|       1|2013-07-25 00:00:00|      11599|
|       2|2013-07-25 00:00:00|        256|
|       3|2013-07-25 00:00:00|      12111|
|       4|2013-07-25 00:00:00|       8827|
|       5|2013-07-25 00:00:00|      11318|
|       6|2013-07-25 00:00:00|       7130|
|       7|2013-07-25 00:00:00|       4530|
|       8|2013-07-25 00:00:00|       2911|
|       9|2013-07-25 00:00:00|       5657|
|      10|2013-07-25 00:00:00|       5648|
|      11|2013-07-25 00:00:00|        918|
|      12|2013-07-25 00:00:00|       1837|
|      13|2013-07-25 00:00:00|       9149|
|      14|2013-07-25 00:00:00|       9842|
|      15|2013-07-25 00:00:00|       2568|
|      16|2013-07-25 00:00:00|       7276|
|      17|2013-07-25 00:00:00|       2667|
|      18|2013-07-25 00:00:00|       1205|
|      19|2013-07-25 00:00:00|       9488|
|      20|2013-07-25 00:00:00|       9198|
+--------+-

In [35]:
df.selectExpr("order_id","order_date","upper(order_status) as order_status" ).show(truncate=False)

+--------+-------------------+---------------+
|order_id|order_date         |order_status   |
+--------+-------------------+---------------+
|1       |2013-07-25 00:00:00|CLOSED         |
|2       |2013-07-25 00:00:00|PENDING_PAYMENT|
|3       |2013-07-25 00:00:00|COMPLETE       |
|4       |2013-07-25 00:00:00|CLOSED         |
|5       |2013-07-25 00:00:00|COMPLETE       |
|6       |2013-07-25 00:00:00|COMPLETE       |
|7       |2013-07-25 00:00:00|COMPLETE       |
|8       |2013-07-25 00:00:00|PROCESSING     |
|9       |2013-07-25 00:00:00|PENDING_PAYMENT|
|10      |2013-07-25 00:00:00|PENDING_PAYMENT|
|11      |2013-07-25 00:00:00|PAYMENT_REVIEW |
|12      |2013-07-25 00:00:00|CLOSED         |
|13      |2013-07-25 00:00:00|PENDING_PAYMENT|
|14      |2013-07-25 00:00:00|PROCESSING     |
|15      |2013-07-25 00:00:00|COMPLETE       |
|16      |2013-07-25 00:00:00|PENDING_PAYMENT|
|17      |2013-07-25 00:00:00|COMPLETE       |
|18      |2013-07-25 00:00:00|CLOSED         |
|19      |201

In [36]:
from pyspark.sql.functions import *
df.withColumn("order_status_upper", upper(col( "order_status"))).show() 

+--------+-------------------+-----------+---------------+------------------+
|order_id|         order_date|customer_id|   order_status|order_status_upper|
+--------+-------------------+-----------+---------------+------------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|            CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|   PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|          COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|            CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|          COMPLETE|
|       6|2013-07-25 00:00:00|       7130|       COMPLETE|          COMPLETE|
|       7|2013-07-25 00:00:00|       4530|       COMPLETE|          COMPLETE|
|       8|2013-07-25 00:00:00|       2911|     PROCESSING|        PROCESSING|
|       9|2013-07-25 00:00:00|       5657|PENDING_PAYMENT|   PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|       5648|PENDING_PAYMENT|   PEN

In [37]:
df.withColumn("order_id * 100", col("order_id") * 100).show()

+--------+-------------------+-----------+---------------+--------------+
|order_id|         order_date|customer_id|   order_status|order_id * 100|
+--------+-------------------+-----------+---------------+--------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|           100|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|           200|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|           300|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|           400|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|           500|
|       6|2013-07-25 00:00:00|       7130|       COMPLETE|           600|
|       7|2013-07-25 00:00:00|       4530|       COMPLETE|           700|
|       8|2013-07-25 00:00:00|       2911|     PROCESSING|           800|
|       9|2013-07-25 00:00:00|       5657|PENDING_PAYMENT|           900|
|      10|2013-07-25 00:00:00|       5648|PENDING_PAYMENT|          1000|
|      11|2013-07-25 00:00:00|        

In [58]:
df.withColumnRenamed("order_id", "orderId").show()

+-------+-------------------+-----------+---------------+
|orderId|         order_date|customer_id|   order_status|
+-------+-------------------+-----------+---------------+
|      1|2013-07-25 00:00:00|      11599|         CLOSED|
|      2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|      3|2013-07-25 00:00:00|      12111|       COMPLETE|
|      4|2013-07-25 00:00:00|       8827|         CLOSED|
|      5|2013-07-25 00:00:00|      11318|       COMPLETE|
|      6|2013-07-25 00:00:00|       7130|       COMPLETE|
|      7|2013-07-25 00:00:00|       4530|       COMPLETE|
|      8|2013-07-25 00:00:00|       2911|     PROCESSING|
|      9|2013-07-25 00:00:00|       5657|PENDING_PAYMENT|
|     10|2013-07-25 00:00:00|       5648|PENDING_PAYMENT|
|     11|2013-07-25 00:00:00|        918| PAYMENT_REVIEW|
|     12|2013-07-25 00:00:00|       1837|         CLOSED|
|     13|2013-07-25 00:00:00|       9149|PENDING_PAYMENT|
|     14|2013-07-25 00:00:00|       9842|     PROCESSING|
|     15|2013-

In [None]:
ordItems_df.withColumn("Type", "quan

In [35]:
data = [
    {"id": 1, "name": "Ravi", "salary": 50000, "deptno":10},
    {"id": 2, "name": "Priya", "salary": 60000, "deptno":10},
    {"id": 3, "name": "John", "salary": 45000, "deptno":20},
    {"id": 4, "name": "Rashmi", "salary": 80000, "deptno":20},
    {"id": 5, "name": "Prem", "salary": 25000, "deptno":30},
    {"id": 6, "name": "Raheem", "salary": 50000, "deptno":30},
    {"id": 7, "name": "Sunil", "salary": 30000, "deptno":10},
    {"id": 8, "name": "John", "salary": 35000, "deptno":10},
    {"id": 9, "name": "Susil", "salary": 45000, "deptno":20},
    {"id": 10, "name": "Jothi", "salary": 75000, "deptno":10}    
]

df = spark.createDataFrame(data)


In [36]:
df.show()

+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    10|  1|  Ravi| 50000|
|    10|  2| Priya| 60000|
|    20|  3|  John| 45000|
|    20|  4|Rashmi| 80000|
|    30|  5|  Prem| 25000|
|    30|  6|Raheem| 50000|
|    10|  7| Sunil| 30000|
|    10|  8|  John| 35000|
|    20|  9| Susil| 45000|
|    10| 10| Jothi| 75000|
+------+---+------+------+



In [67]:
df.withColumn("Status",
              when(col("salary") > 50000, "HIGH").otherwise("LOW")).show()

+------+---+------+------+------+
|deptno| id|  name|salary|Status|
+------+---+------+------+------+
|    10|  1|  Ravi| 50000|   LOW|
|    10|  2| Priya| 60000|  HIGH|
|    20|  3|  John| 45000|   LOW|
|    20|  4|Rashmi| 80000|  HIGH|
|    30|  5|  Prem| 25000|   LOW|
|    30|  6|Raheem| 50000|   LOW|
|    10|  7| Sunil| 30000|   LOW|
|    10|  8|  John| 35000|   LOW|
|    20|  9| Susil| 45000|   LOW|
|    10| 10| Jothi| 75000|  HIGH|
+------+---+------+------+------+



In [38]:
from pyspark.sql.functions import *
df2 = df.withColumns({
    "name_upper": upper(col("name")),
    "salary_plus_10": col("salary") * 1.10,
    "salary_flag": when(col("salary") > 50000, "HIGH").otherwise("LOW")
})
df2.show()


+------+---+------+------+----------+------------------+-----------+
|deptno| id|  name|salary|name_upper|    salary_plus_10|salary_flag|
+------+---+------+------+----------+------------------+-----------+
|    10|  1|  Ravi| 50000|      RAVI| 55000.00000000001|        LOW|
|    10|  2| Priya| 60000|     PRIYA|           66000.0|       HIGH|
|    20|  3|  John| 45000|      JOHN| 49500.00000000001|        LOW|
|    20|  4|Rashmi| 80000|    RASHMI|           88000.0|       HIGH|
|    30|  5|  Prem| 25000|      PREM|27500.000000000004|        LOW|
|    30|  6|Raheem| 50000|    RAHEEM| 55000.00000000001|        LOW|
|    10|  7| Sunil| 30000|     SUNIL|           33000.0|        LOW|
|    10|  8|  John| 35000|      JOHN|           38500.0|        LOW|
|    20|  9| Susil| 45000|     SUSIL| 49500.00000000001|        LOW|
|    10| 10| Jothi| 75000|     JOTHI|           82500.0|       HIGH|
+------+---+------+------+----------+------------------+-----------+



In [41]:
df.drop("order_status").show()

+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    10|  1|  Ravi| 50000|
|    10|  2| Priya| 60000|
|    20|  3|  John| 45000|
|    20|  4|Rashmi| 80000|
|    30|  5|  Prem| 25000|
|    30|  6|Raheem| 50000|
|    10|  7| Sunil| 30000|
|    10|  8|  John| 35000|
|    20|  9| Susil| 45000|
|    10| 10| Jothi| 75000|
+------+---+------+------+



In [3]:
data = [
    (1, "Shiva", "HR"),
    (2, "Reddy", "Finance"),
    (3, "Shiva", "HR"),      # duplicate row
    (4, "Shiva", "Marketing")
]

df1 = spark.createDataFrame(data, ["id", "name", "department"])
df1.show()


+---+-----+----------+
| id| name|department|
+---+-----+----------+
|  1|Shiva|        HR|
|  2|Reddy|   Finance|
|  3|Shiva|        HR|
|  4|Shiva| Marketing|
+---+-----+----------+



In [4]:
df1.show()

+---+-----+----------+
| id| name|department|
+---+-----+----------+
|  1|Shiva|        HR|
|  2|Reddy|   Finance|
|  3|Shiva|        HR|
|  4|Shiva| Marketing|
+---+-----+----------+



In [5]:
df1.drop_duplicates().show()

+---+-----+----------+
| id| name|department|
+---+-----+----------+
|  1|Shiva|        HR|
|  2|Reddy|   Finance|
|  3|Shiva|        HR|
|  4|Shiva| Marketing|
+---+-----+----------+



In [6]:
df1.drop_duplicates(subset=["department","name"]).show()

+---+-----+----------+
| id| name|department|
+---+-----+----------+
|  1|Shiva|        HR|
|  2|Reddy|   Finance|
|  4|Shiva| Marketing|
+---+-----+----------+



In [8]:
df1.drop_duplicates(subset=["name"]).show()

+---+-----+----------+
| id| name|department|
+---+-----+----------+
|  2|Reddy|   Finance|
|  1|Shiva|        HR|
+---+-----+----------+



In [9]:
data = [
    (1, "Shiva",    60000),
    (2, None,       45000),
    (3, "Reddy",    None),
    (None, "Kumar", 30000),
    (5, None,       None)
]

df2 = spark.createDataFrame(data, ["id", "name", "salary"])
df2.show()


+----+-----+------+
|  id| name|salary|
+----+-----+------+
|   1|Shiva| 60000|
|   2| NULL| 45000|
|   3|Reddy|  NULL|
|NULL|Kumar| 30000|
|   5| NULL|  NULL|
+----+-----+------+



In [11]:
df2.dropna(thresh=2).show()

+----+-----+------+
|  id| name|salary|
+----+-----+------+
|   1|Shiva| 60000|
|   2| NULL| 45000|
|   3|Reddy|  NULL|
|NULL|Kumar| 30000|
+----+-----+------+



In [12]:
df2.dropna(subset=["salary"]).show()

+----+-----+------+
|  id| name|salary|
+----+-----+------+
|   1|Shiva| 60000|
|   2| NULL| 45000|
|NULL|Kumar| 30000|
+----+-----+------+



In [13]:
df2.dropna(subset=["salary","id"]).show()

+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1|Shiva| 60000|
|  2| NULL| 45000|
+---+-----+------+



In [15]:
# fill string  columns
df2.fillna(0).show()

+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1|Shiva| 60000|
|  2| NULL| 45000|
|  3|Reddy|     0|
|  0|Kumar| 30000|
|  5| NULL|     0|
+---+-----+------+



In [16]:
df2.fillna(0, subset=["salary"]).show()


+----+-----+------+
|  id| name|salary|
+----+-----+------+
|   1|Shiva| 60000|
|   2| NULL| 45000|
|   3|Reddy|     0|
|NULL|Kumar| 30000|
|   5| NULL|     0|
+----+-----+------+



In [126]:
# add constant column
from pyspark.sql.functions import lit
df.withColumn("country", lit("India")).show()

+------+---+------+------+-------+
|deptno| id|  name|salary|country|
+------+---+------+------+-------+
|    10|  1|  Ravi| 50000|  India|
|    10|  2| Priya| 60000|  India|
|    20|  3|  John| 45000|  India|
|    20|  4|Rashmi| 80000|  India|
|    30|  5|  Prem| 25000|  India|
|    30|  6|Raheem| 50000|  India|
|    10|  7| Sunil| 30000|  India|
|    10|  8|  John| 35000|  India|
|    20|  9| Susil| 45000|  India|
|    10| 10| Jothi| 75000|  India|
+------+---+------+------+-------+



In [53]:
df2.fillna({
    "name": "Not Available",
    "salary": 0,
    "id": -1
}).show()


+---+-------------+------+
| id|         name|salary|
+---+-------------+------+
|  1|        Shiva| 60000|
|  2|Not Available| 45000|
|  3|        Reddy|     0|
| -1|        Kumar| 30000|
|  5|Not Available|     0|
+---+-------------+------+



In [59]:
from pyspark.sql.functions import lit,concat
df.withColumn("Name_Country", concat(col("name") , lit(" belongs  India")  )).show(truncate=False)

+------+---+------+------+---------------------+
|deptno|id |name  |salary|Name_Country         |
+------+---+------+------+---------------------+
|10    |1  |Ravi  |50000 |Ravi belongs  India  |
|10    |2  |Priya |60000 |Priya belongs  India |
|20    |3  |John  |45000 |John belongs  India  |
|20    |4  |Rashmi|80000 |Rashmi belongs  India|
|30    |5  |Prem  |25000 |Prem belongs  India  |
|30    |6  |Raheem|50000 |Raheem belongs  India|
|10    |7  |Sunil |30000 |Sunil belongs  India |
|10    |8  |John  |35000 |John belongs  India  |
|20    |9  |Susil |45000 |Susil belongs  India |
|10    |10 |Jothi |75000 |Jothi belongs  India |
+------+---+------+------+---------------------+



In [18]:
df1.show(2)

+---+-----+----------+
| id| name|department|
+---+-----+----------+
|  1|Shiva|        HR|
|  2|Reddy|   Finance|
+---+-----+----------+
only showing top 2 rows


In [25]:
from pyspark.sql.functions import col, lit, concat

df2 = df1.withColumn(
    "id_name_dept",
    concat(col("id").cast("string"),lit("|"), col("name"),lit('|'),col("department"))
)

df2.show(truncate=False)


+---+-----+----------+-----------------+
|id |name |department|id_name_dept     |
+---+-----+----------+-----------------+
|1  |Shiva|HR        |1|Shiva|HR       |
|2  |Reddy|Finance   |2|Reddy|Finance  |
|3  |Shiva|HR        |3|Shiva|HR       |
|4  |Shiva|Marketing |4|Shiva|Marketing|
+---+-----+----------+-----------------+



In [31]:
from pyspark.sql.functions import col, lit, concat_ws

df2 = df1.withColumn(
    "name_dept",
    concat_ws(' ', col("name"), lit("belongs to"), col("department"))
)

In [34]:
df1.select("id","name",concat_ws(' ', col("name"), lit("belongs to"), col("department")).alias("dept")).show()

+---+-----+--------------------+
| id| name|                dept|
+---+-----+--------------------+
|  1|Shiva| Shiva belongs to HR|
|  2|Reddy|Reddy belongs to ...|
|  3|Shiva| Shiva belongs to HR|
|  4|Shiva|Shiva belongs to ...|
+---+-----+--------------------+



In [32]:
df2.show()

+---+-----+----------+--------------------+
| id| name|department|           name_dept|
+---+-----+----------+--------------------+
|  1|Shiva|        HR| Shiva belongs to HR|
|  2|Reddy|   Finance|Reddy belongs to ...|
|  3|Shiva|        HR| Shiva belongs to HR|
|  4|Shiva| Marketing|Shiva belongs to ...|
+---+-----+----------+--------------------+



In [61]:
df.select("id","name", lit("India").alias("country")).show()

+---+------+-------+
| id|  name|country|
+---+------+-------+
|  1|  Ravi|  India|
|  2| Priya|  India|
|  3|  John|  India|
|  4|Rashmi|  India|
|  5|  Prem|  India|
|  6|Raheem|  India|
|  7| Sunil|  India|
|  8|  John|  India|
|  9| Susil|  India|
| 10| Jothi|  India|
+---+------+-------+



In [40]:
df.filter(col("salary") > 50000).show()


+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    10|  2| Priya| 60000|
|    20|  4|Rashmi| 80000|
|    10| 10| Jothi| 75000|
+------+---+------+------+



In [72]:
df.filter(col("salary") > 50000).show()

+------+---+------+------+--------------------+
|deptno| id|  name|salary|           ename_sal|
+------+---+------+------+--------------------+
|    10|  2| Priya| 60000|Priya Salary is 6...|
|    20|  4|Rashmi| 80000|Rashmi Salary is ...|
|    10| 10| Jothi| 75000|Jothi Salary is 7...|
+------+---+------+------+--------------------+



In [73]:
df = df.drop("ename_sal")

In [47]:
df.filter( (( col("deptno") ==10)  & ( col("salary") > 25000))
           |
           (( col("deptno") ==20)  & ( col("salary") >50000))
         ).select("id","salary","deptno").show()


+---+------+------+
| id|salary|deptno|
+---+------+------+
|  1| 50000|    10|
|  2| 60000|    10|
|  4| 80000|    20|
|  7| 30000|    10|
|  8| 35000|    10|
| 10| 75000|    10|
+---+------+------+



In [49]:
# Filter using SQL-like string condition
df.filter("salary > 40000 and deptno ==10").show()

+------+---+-----+------+
|deptno| id| name|salary|
+------+---+-----+------+
|    10|  1| Ravi| 50000|
|    10|  2|Priya| 60000|
|    10| 10|Jothi| 75000|
+------+---+-----+------+



In [55]:
df.filter(~col("deptno").isin(10,20)).orderBy("deptno",ascending=False).show()


+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    30|  5|  Prem| 25000|
|    30|  6|Raheem| 50000|
+------+---+------+------+



In [78]:
df.filter(~col("deptno").isin(10,20)).show()


+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    30|  5|  Prem| 25000|
|    30|  6|Raheem| 50000|
+------+---+------+------+



In [58]:
df.filter(col("name").like("%em")).show()

+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    30|  5|  Prem| 25000|
|    30|  6|Raheem| 50000|
+------+---+------+------+



In [80]:
df.filter( ~col("name").like("Rav%")).show()

+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    10|  2| Priya| 60000|
|    20|  3|  John| 45000|
|    20|  4|Rashmi| 80000|
|    30|  5|  Prem| 25000|
|    30|  6|Raheem| 50000|
|    10|  7| Sunil| 30000|
|    10|  8|  John| 35000|
|    20|  9| Susil| 45000|
|    10| 10| Jothi| 75000|
+------+---+------+------+



In [82]:
df.filter(col("name").like("%em")).show()

+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    30|  5|  Prem| 25000|
|    30|  6|Raheem| 50000|
+------+---+------+------+



In [84]:
df.filter(col("name").like("%Kumar%")).show()

+------+---+----+------+
|deptno| id|name|salary|
+------+---+----+------+
+------+---+----+------+



In [59]:
df.filter(col("salary").between(20000, 50000)).show()


+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    10|  1|  Ravi| 50000|
|    20|  3|  John| 45000|
|    30|  5|  Prem| 25000|
|    30|  6|Raheem| 50000|
|    10|  7| Sunil| 30000|
|    10|  8|  John| 35000|
|    20|  9| Susil| 45000|
+------+---+------+------+



In [86]:
df.filter(~col("salary").between(20000, 50000)).show()


+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    10|  2| Priya| 60000|
|    20|  4|Rashmi| 80000|
|    10| 10| Jothi| 75000|
+------+---+------+------+



In [60]:
df.select("id","name","salary").filter(col("salary").isNull()).show()

+---+----+------+
| id|name|salary|
+---+----+------+
+---+----+------+



In [61]:
df.select("id","name","salary").filter(~col("salary").isNull()).show()

+---+------+------+
| id|  name|salary|
+---+------+------+
|  1|  Ravi| 50000|
|  2| Priya| 60000|
|  3|  John| 45000|
|  4|Rashmi| 80000|
|  5|  Prem| 25000|
|  6|Raheem| 50000|
|  7| Sunil| 30000|
|  8|  John| 35000|
|  9| Susil| 45000|
| 10| Jothi| 75000|
+---+------+------+



In [87]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

data = [
    (1, "Ravi", 50000),
    (2, None, 60000),
    (3, "John", None),
    (4, None, None),
    (5, "Priya", 45000)
]

df2 = spark.createDataFrame(data, ["id", "name", "salary"])
df2.show()


+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1| Ravi| 50000|
|  2| NULL| 60000|
|  3| John|  NULL|
|  4| NULL|  NULL|
|  5|Priya| 45000|
+---+-----+------+



In [89]:
df2.filter(col("salary").isNull()).show()


+---+----+------+
| id|name|salary|
+---+----+------+
|  3|John|  NULL|
|  4|NULL|  NULL|
+---+----+------+



In [92]:
df2.filter(col("name").isNotNull()).show()

+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1| Ravi| 50000|
|  3| John|  NULL|
|  5|Priya| 45000|
+---+-----+------+



In [97]:
df.where( (col("salary") >50000) | (col("deptno") ==10)   ).show()

+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    10|  1|  Ravi| 50000|
|    10|  2| Priya| 60000|
|    20|  4|Rashmi| 80000|
|    10|  7| Sunil| 30000|
|    10|  8|  John| 35000|
|    10| 10| Jothi| 75000|
+------+---+------+------+



In [63]:
df.orderBy(col("salary").desc()).show()


+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    20|  4|Rashmi| 80000|
|    10| 10| Jothi| 75000|
|    10|  2| Priya| 60000|
|    10|  1|  Ravi| 50000|
|    30|  6|Raheem| 50000|
|    20|  9| Susil| 45000|
|    20|  3|  John| 45000|
|    10|  8|  John| 35000|
|    10|  7| Sunil| 30000|
|    30|  5|  Prem| 25000|
+------+---+------+------+



In [99]:
df.orderBy(col("salary").asc()).show()


+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    30|  5|  Prem| 25000|
|    10|  7| Sunil| 30000|
|    10|  8|  John| 35000|
|    20|  3|  John| 45000|
|    20|  9| Susil| 45000|
|    10|  1|  Ravi| 50000|
|    30|  6|Raheem| 50000|
|    10|  2| Priya| 60000|
|    10| 10| Jothi| 75000|
|    20|  4|Rashmi| 80000|
+------+---+------+------+



In [100]:
df.orderBy(
    col("deptno").asc(),
    col("salary").desc()
).show()


+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    10| 10| Jothi| 75000|
|    10|  2| Priya| 60000|
|    10|  1|  Ravi| 50000|
|    10|  8|  John| 35000|
|    10|  7| Sunil| 30000|
|    20|  4|Rashmi| 80000|
|    20|  3|  John| 45000|
|    20|  9| Susil| 45000|
|    30|  6|Raheem| 50000|
|    30|  5|  Prem| 25000|
+------+---+------+------+



In [65]:
df.rdd.getNumPartitions()

10

In [66]:
df =df.coalesce(2)

In [67]:
df.rdd.getNumPartitions()

2

In [68]:
df.sortWithinPartitions( "salary").show()


+------+---+------+------+
|deptno| id|  name|salary|
+------+---+------+------+
|    30|  5|  Prem| 25000|
|    20|  3|  John| 45000|
|    10|  1|  Ravi| 50000|
|    10|  2| Priya| 60000|
|    20|  4|Rashmi| 80000|
|    10|  7| Sunil| 30000|
|    10|  8|  John| 35000|
|    20|  9| Susil| 45000|
|    30|  6|Raheem| 50000|
|    10| 10| Jothi| 75000|
+------+---+------+------+



In [69]:
df.head(4)

[Row(deptno=10, id=1, name='Ravi', salary=50000),
 Row(deptno=10, id=2, name='Priya', salary=60000),
 Row(deptno=20, id=3, name='John', salary=45000),
 Row(deptno=20, id=4, name='Rashmi', salary=80000)]

In [70]:
df.tail(4)

[Row(deptno=10, id=7, name='Sunil', salary=30000),
 Row(deptno=10, id=8, name='John', salary=35000),
 Row(deptno=20, id=9, name='Susil', salary=45000),
 Row(deptno=10, id=10, name='Jothi', salary=75000)]

In [71]:
# set operation
df1 = spark.createDataFrame(
    [(1, "A"), (2, "B"), (3, "C")],
    ["id", "val"]
)

df2 = spark.createDataFrame(
    [(3, "C"), (4, "D")],
    ["id", "val"]
)


In [72]:
df1.unionAll(df2).show()

+---+---+
| id|val|
+---+---+
|  1|  A|
|  2|  B|
|  3|  C|
|  3|  C|
|  4|  D|
+---+---+



In [73]:
df1.union(df2).show()

+---+---+
| id|val|
+---+---+
|  1|  A|
|  2|  B|
|  3|  C|
|  3|  C|
|  4|  D|
+---+---+



In [109]:
df1.union(df2).distinct().show()

+---+---+
| id|val|
+---+---+
|  1|  A|
|  2|  B|
|  3|  C|
|  4|  D|
+---+---+



In [76]:
df3 = spark.createDataFrame(data=(('a',1),('b',2)),schema=('col1 string,col2 int'))
df4 = spark.createDataFrame(data=((2,'b'),(3,'c')),schema=('col2 int,col1 string'))
df3.unionByName(df4).show()

+----+----+
|col1|col2|
+----+----+
|   a|   1|
|   b|   2|
|   b|   2|
|   c|   3|
+----+----+



In [80]:
df1 = spark.createDataFrame(data=(('a',1),('a',1),('b',2)),schema=('col1 string,col2 int')) 
df2 = spark.createDataFrame(data=(('a',1),('a',1),('c',2)),schema=('col1 string,col2 int'))


In [81]:
df1.show()


+----+----+
|col1|col2|
+----+----+
|   a|   1|
|   a|   1|
|   b|   2|
+----+----+



In [82]:
df2.show()

+----+----+
|col1|col2|
+----+----+
|   a|   1|
|   a|   1|
|   c|   2|
+----+----+



In [83]:
df1.intersect(df2).show()

+----+----+
|col1|col2|
+----+----+
|   a|   1|
+----+----+



In [84]:
df1.intersectAll(df2).show()

+----+----+
|col1|col2|
+----+----+
|   a|   1|
|   a|   1|
+----+----+



In [86]:
df1.exceptAll(df2).show()

+----+----+
|col1|col2|
+----+----+
|   b|   2|
+----+----+



In [88]:

from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# EMP schema
emp_schema = StructType([
    StructField("emp_id", IntegerType(), True),
    StructField("ename", StringType(), True),
    StructField("dept_id", IntegerType(), True)
])

emp_data = [
    (1, "Smith", 10),
    (2, "Allen", 20),
    (3, "Ward", 10),
    (4, "Jones", 30),
    (5, "Martin", None)
]

emp = spark.createDataFrame(emp_data, emp_schema)

emp.show()

+------+------+-------+
|emp_id| ename|dept_id|
+------+------+-------+
|     1| Smith|     10|
|     2| Allen|     20|
|     3|  Ward|     10|
|     4| Jones|     30|
|     5|Martin|   NULL|
+------+------+-------+



In [89]:
# DEPT schema
dept_schema = StructType([
    StructField("dept_id", IntegerType(), True),
    StructField("dname", StringType(), True)
])

dept_data = [
    (10, "ACCOUNTING"),
    (20, "RESEARCH"),
    (30, "SALES"),
    (40, "OPERATIONS")
]

dept = spark.createDataFrame(dept_data, dept_schema)
dept.show()


+-------+----------+
|dept_id|     dname|
+-------+----------+
|     10|ACCOUNTING|
|     20|  RESEARCH|
|     30|     SALES|
|     40|OPERATIONS|
+-------+----------+



In [96]:

emp.join(dept, "dept_id", how= "inner").\
    select("emp_id","ename","dept_id","dname").show()


+------+-----+-------+----------+
|emp_id|ename|dept_id|     dname|
+------+-----+-------+----------+
|     1|Smith|     10|ACCOUNTING|
|     3| Ward|     10|ACCOUNTING|
|     2|Allen|     20|  RESEARCH|
|     4|Jones|     30|     SALES|
+------+-----+-------+----------+



In [97]:
emp.join(dept, "dept_id", "left").show()


+-------+------+------+----------+
|dept_id|emp_id| ename|     dname|
+-------+------+------+----------+
|     10|     1| Smith|ACCOUNTING|
|     20|     2| Allen|  RESEARCH|
|     10|     3|  Ward|ACCOUNTING|
|     30|     4| Jones|     SALES|
|   NULL|     5|Martin|      NULL|
+-------+------+------+----------+



In [98]:
emp.join(dept, "dept_id", "right").show()


+-------+------+-----+----------+
|dept_id|emp_id|ename|     dname|
+-------+------+-----+----------+
|     10|     3| Ward|ACCOUNTING|
|     10|     1|Smith|ACCOUNTING|
|     20|     2|Allen|  RESEARCH|
|     30|     4|Jones|     SALES|
|     40|  NULL| NULL|OPERATIONS|
+-------+------+-----+----------+



In [154]:
emp.join(dept, "dept_id", "full").show()


+-------+------+------+----------+
|dept_id|emp_id| ename|     dname|
+-------+------+------+----------+
|   NULL|     5|Martin|      NULL|
|     10|     3|  Ward|ACCOUNTING|
|     10|     1| Smith|ACCOUNTING|
|     20|     2| Allen|  RESEARCH|
|     30|     4| Jones|     SALES|
|     40|  NULL|  NULL|OPERATIONS|
+-------+------+------+----------+



In [101]:
# Returns only employees whose dept exists.
emp.join(dept, "dept_id", "leftsemi").show()


+-------+------+-----+
|dept_id|emp_id|ename|
+-------+------+-----+
|     10|     1|Smith|
|     10|     3| Ward|
|     20|     2|Allen|
|     30|     4|Jones|
+-------+------+-----+



In [102]:
# Employees who do not have a matching department.
emp.join(dept, "dept_id", "leftanti").show()


+-------+------+------+
|dept_id|emp_id| ename|
+-------+------+------+
|   NULL|     5|Martin|
+-------+------+------+



In [103]:
emp.crossJoin(dept).orderBy("dname").show()

+------+------+-------+-------+----------+
|emp_id| ename|dept_id|dept_id|     dname|
+------+------+-------+-------+----------+
|     1| Smith|     10|     10|ACCOUNTING|
|     5|Martin|   NULL|     10|ACCOUNTING|
|     4| Jones|     30|     10|ACCOUNTING|
|     3|  Ward|     10|     10|ACCOUNTING|
|     2| Allen|     20|     10|ACCOUNTING|
|     5|Martin|   NULL|     40|OPERATIONS|
|     4| Jones|     30|     40|OPERATIONS|
|     3|  Ward|     10|     40|OPERATIONS|
|     2| Allen|     20|     40|OPERATIONS|
|     1| Smith|     10|     40|OPERATIONS|
|     5|Martin|   NULL|     20|  RESEARCH|
|     4| Jones|     30|     20|  RESEARCH|
|     3|  Ward|     10|     20|  RESEARCH|
|     2| Allen|     20|     20|  RESEARCH|
|     1| Smith|     10|     20|  RESEARCH|
|     5|Martin|   NULL|     30|     SALES|
|     4| Jones|     30|     30|     SALES|
|     3|  Ward|     10|     30|     SALES|
|     2| Allen|     20|     30|     SALES|
|     1| Smith|     10|     30|     SALES|
+------+---