## Create an empty DataFrame

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Get (or create) a SparkSession named 'sample1' that runs against the 'local' master.
spark = (
    SparkSession.builder
    .appName('sample1')
    .master('local')
    .getOrCreate()
)

In [11]:
# No rows plus a schema string: the DataFrame has the columns `id` and `name` but is empty.
df = spark.createDataFrame(data=[], schema="id int, name String")

## Convert RDD to DataFrame

In [12]:
# Build an RDD from a small in-memory list of (id, name) tuples.
rdd = spark.sparkContext.parallelize([
    (1, 'Alice'),
    (2, 'Bob'),
])

In [13]:
# Column names to assign to the tuple fields when converting the RDD.
columns = ['id', 'name']

# Convert the RDD into a DataFrame using those names, then print it.
df = rdd.toDF(columns)
df.show()

25/02/07 16:02:59 INFO SparkContext: Starting job: runJob at PythonRDD.scala:166
25/02/07 16:02:59 INFO DAGScheduler: Got job 4 (runJob at PythonRDD.scala:166) with 1 output partitions
25/02/07 16:02:59 INFO DAGScheduler: Final stage: ResultStage 4 (runJob at PythonRDD.scala:166)
25/02/07 16:02:59 INFO DAGScheduler: Parents of final stage: List()
25/02/07 16:02:59 INFO DAGScheduler: Missing parents: List()
25/02/07 16:02:59 INFO DAGScheduler: Submitting ResultStage 4 (PythonRDD[23] at RDD at PythonRDD.scala:53), which has no missing parents
25/02/07 16:02:59 INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 6.7 KiB, free 6.2 GiB)
25/02/07 16:02:59 INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 4.2 KiB, free 6.2 GiB)
25/02/07 16:02:59 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on 1cdc7ec21abb:36743 (size: 4.2 KiB, free: 6.2 GiB)
25/02/07 16:02:59 INFO SparkContext: Created broadcast 4 from broadcast at DAG

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+



25/02/07 16:03:00 INFO PythonRunner: Times: total = 90, boot = 4, init = 86, finish = 0
25/02/07 16:03:00 INFO Executor: Finished task 0.0 in stage 5.0 (TID 5). 1850 bytes result sent to driver
25/02/07 16:03:00 INFO TaskSetManager: Finished task 0.0 in stage 5.0 (TID 5) in 101 ms on 1cdc7ec21abb (executor driver) (1/1)
25/02/07 16:03:00 INFO TaskSchedulerImpl: Removed TaskSet 5.0, whose tasks have all completed, from pool 
25/02/07 16:03:00 INFO DAGScheduler: ResultStage 5 (showString at NativeMethodAccessorImpl.java:0) finished in 0.110 s
25/02/07 16:03:00 INFO DAGScheduler: Job 5 is finished. Cancelling potential speculative or zombie tasks for this job
25/02/07 16:03:00 INFO TaskSchedulerImpl: Killing all running tasks in stage 5: Stage finished
25/02/07 16:03:00 INFO DAGScheduler: Job 5 finished: showString at NativeMethodAccessorImpl.java:0, took 0.115673 s


## Convert DataFrame to Pandas

In [None]:
# Convert the Spark DataFrame `df` to a pandas DataFrame; as the last
# expression of the cell, the result is what the notebook displays.
df.toPandas()

## StructType & StructField

In [None]:
# StructType and StructField are used to define the schema for DataFrames:
# a StructType is built from a list of StructFields, each giving a column
# name and its data type.
# StructType/StructField must be imported alongside the data types; without
# them this cell raises NameError on a fresh kernel.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

schema = StructType([
    StructField(name='id', dataType=IntegerType()),
    StructField(name='name', dataType=StringType()),
])

# Create the DataFrame with the explicit schema instead of relying on inference.
df = spark.createDataFrame([(1, "Alice"), (2, "Bob")], schema)
df.show()

## Column Class
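The Column class represents a column in a DataFrame and is used for performing operations. Note that `df.columns` (shown below) only returns the column names as a list of strings; Column objects are obtained with expressions such as `df['id']` or `df.id`.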

In [3]:
# List the DataFrame's column names (displayed as the cell's result).
df.columns

['id', 'name']

## Select

In [4]:
# Select the 'id' and 'name' columns and print the resulting DataFrame.
# NOTE(review): the saved output of this cell shows only the header and no
# rows — presumably it was last run while `df` was still the empty DataFrame;
# re-run the notebook top to bottom to confirm.
df.select('id', 'name').show()

25/02/07 15:56:00 INFO CodeGenerator: Code generated in 129.527671 ms
25/02/07 15:56:00 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
25/02/07 15:56:00 INFO DAGScheduler: Got job 0 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
25/02/07 15:56:00 INFO DAGScheduler: Final stage: ResultStage 0 (showString at NativeMethodAccessorImpl.java:0)
25/02/07 15:56:00 INFO DAGScheduler: Parents of final stage: List()
25/02/07 15:56:00 INFO DAGScheduler: Missing parents: List()
25/02/07 15:56:00 INFO DAGScheduler: Submitting ResultStage 0 (MapPartitionsRDD[6] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
25/02/07 15:56:00 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 12.6 KiB, free 6.2 GiB)
25/02/07 15:56:00 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 6.6 KiB, free 6.2 GiB)
25/02/07 15:56:00 INFO BlockManagerInfo: Added broadcast_0_pie

+---+----+
| id|name|
+---+----+
+---+----+



25/02/07 15:56:01 INFO PythonRunner: Times: total = 707, boot = 630, init = 77, finish = 0
25/02/07 15:56:01 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 1789 bytes result sent to driver
25/02/07 15:56:01 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 1087 ms on 1cdc7ec21abb (executor driver) (1/1)
25/02/07 15:56:01 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
25/02/07 15:56:01 INFO PythonAccumulatorV2: Connected to AccumulatorServer at host: 127.0.0.1 port: 35311
25/02/07 15:56:02 INFO DAGScheduler: ResultStage 0 (showString at NativeMethodAccessorImpl.java:0) finished in 1.271 s
25/02/07 15:56:02 INFO DAGScheduler: Job 0 is finished. Cancelling potential speculative or zombie tasks for this job
25/02/07 15:56:02 INFO TaskSchedulerImpl: Killing all running tasks in stage 0: Stage finished
25/02/07 15:56:02 INFO DAGScheduler: Job 0 finished: showString at NativeMethodAccessorImpl.java:0, took 1.327157 s
25/02/07 15:56

## collect()
collect() returns all the rows as a list of Row objects.

In [7]:
# Retrieve all rows of `df` as a Python list of Row objects and print it.
rows = df.collect()
print(rows)

[Row(id=1, name='Alice'), Row(id=2, name='Bob')]


25/02/07 15:56:49 INFO CodeGenerator: Code generated in 11.782167 ms
25/02/07 15:56:49 INFO SparkContext: Starting job: collect at /tmp/ipykernel_2986/1121679628.py:1
25/02/07 15:56:49 INFO DAGScheduler: Got job 3 (collect at /tmp/ipykernel_2986/1121679628.py:1) with 1 output partitions
25/02/07 15:56:49 INFO DAGScheduler: Final stage: ResultStage 3 (collect at /tmp/ipykernel_2986/1121679628.py:1)
25/02/07 15:56:49 INFO DAGScheduler: Parents of final stage: List()
25/02/07 15:56:49 INFO DAGScheduler: Missing parents: List()
25/02/07 15:56:49 INFO DAGScheduler: Submitting ResultStage 3 (MapPartitionsRDD[16] at collect at /tmp/ipykernel_2986/1121679628.py:1), which has no missing parents
25/02/07 15:56:49 INFO MemoryStore: Block broadcast_3 stored as values in memory (estimated size 14.7 KiB, free 6.2 GiB)
25/02/07 15:56:49 INFO MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 7.8 KiB, free 6.2 GiB)
25/02/07 15:56:49 INFO BlockManagerInfo: Added broadcast_3

## withColumn()
This method is used to add or modify a column.

In [14]:
# Add a 'surname' column holding the same literal value for every row.
df_b = df.withColumn(
    'surname',
    F.lit('García'),
)

In [15]:
# Print df_b to check the newly added 'surname' column.
df_b.show()

+---+-----+-------+
| id| name|surname|
+---+-----+-------+
|  1|Alice| García|
|  2|  Bob| García|
+---+-----+-------+



25/02/07 16:03:15 INFO CodeGenerator: Code generated in 13.745858 ms
25/02/07 16:03:15 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
25/02/07 16:03:15 INFO DAGScheduler: Got job 6 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
25/02/07 16:03:15 INFO DAGScheduler: Final stage: ResultStage 6 (showString at NativeMethodAccessorImpl.java:0)
25/02/07 16:03:15 INFO DAGScheduler: Parents of final stage: List()
25/02/07 16:03:15 INFO DAGScheduler: Missing parents: List()
25/02/07 16:03:15 INFO DAGScheduler: Submitting ResultStage 6 (MapPartitionsRDD[31] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
25/02/07 16:03:15 INFO MemoryStore: Block broadcast_6 stored as values in memory (estimated size 15.1 KiB, free 6.2 GiB)
25/02/07 16:03:15 INFO MemoryStore: Block broadcast_6_piece0 stored as bytes in memory (estimated size 8.0 KiB, free 6.2 GiB)
25/02/07 16:03:15 INFO BlockManagerInfo: Added broadcast_6_pie

## withColumnRenamed()
Renames an existing column in the DataFrame.

In [22]:
# Rename the 'surname' column to 'apellidos'; the result is bound to a new name.
df_renamed = df_b.withColumnRenamed(existing='surname', new='apellidos')

In [23]:
# Print the DataFrame to check that 'surname' now appears as 'apellidos'.
df_renamed.show()

+---+-----+---------+
| id| name|apellidos|
+---+-----+---------+
|  1|Alice|   García|
|  2|  Bob|   García|
+---+-----+---------+



25/02/07 16:05:42 INFO SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
25/02/07 16:05:42 INFO DAGScheduler: Got job 9 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
25/02/07 16:05:42 INFO DAGScheduler: Final stage: ResultStage 9 (showString at NativeMethodAccessorImpl.java:0)
25/02/07 16:05:42 INFO DAGScheduler: Parents of final stage: List()
25/02/07 16:05:42 INFO DAGScheduler: Missing parents: List()
25/02/07 16:05:42 INFO DAGScheduler: Submitting ResultStage 9 (MapPartitionsRDD[37] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
25/02/07 16:05:42 INFO MemoryStore: Block broadcast_9 stored as values in memory (estimated size 15.1 KiB, free 6.2 GiB)
25/02/07 16:05:42 INFO MemoryStore: Block broadcast_9_piece0 stored as bytes in memory (estimated size 8.0 KiB, free 6.2 GiB)
25/02/07 16:05:42 INFO BlockManagerInfo: Added broadcast_9_piece0 in memory on 1cdc7ec21abb:36743 (size: 8.0 KiB, free: 6.2 GiB)
25

## where() & filter()
Both methods are used to filter rows based on conditions.

In [None]:
# Keep only the rows whose 'name' column equals 'Bob'.
df.filter(df['name'] == 'Bob').show()

In [None]:
# Same kind of row filtering, written with where(): keep rows with id greater than 1.
df.where(df['id'] > 1).show()

## drop() & dropDuplicates()
Used to drop a column or remove duplicate rows.

In [None]:
# Drop the 'apellidos' column from df_renamed and print the remaining columns.
df_renamed_dropped = df_renamed.drop('apellidos')
df_renamed_dropped.show()

In [None]:
from pyspark.sql import Row

# Sample data: the first two rows are identical, and the third differs only in 'age'.
df_with_duplicates = spark.createDataFrame([
    Row(name='Alice', age=5, height=80),
    Row(name='Alice', age=5, height=80),
    Row(name='Alice', age=10, height=80),
])

# Drop duplicates considering all columns: only fully identical rows collapse.
df_without_duplicates = df_with_duplicates.dropDuplicates()
df_without_duplicates.show()

# Drop duplicates considering only the 'age' column: one row is kept per
# distinct age. Show the result so this second example is actually visible.
df_without_duplicates = df_with_duplicates.dropDuplicates(subset=['age'])
df_without_duplicates.show()

25/02/07 17:21:03 INFO Executor: Told to re-register on heartbeat
25/02/07 17:21:03 INFO BlockManager: BlockManager BlockManagerId(driver, 1cdc7ec21abb, 36743, None) re-registering with master
25/02/07 17:21:03 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, 1cdc7ec21abb, 36743, None)
25/02/07 17:21:03 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, 1cdc7ec21abb, 36743, None)
25/02/07 17:21:03 INFO BlockManager: Reporting 8 blocks to the master.
25/02/07 17:21:03 INFO BlockManagerInfo: Updated broadcast_18_piece0 in memory on 1cdc7ec21abb:36743 (current size: 17.8 KiB, original size: 17.8 KiB, free: 6.2 GiB)
25/02/07 17:21:03 INFO BlockManagerInfo: Updated broadcast_17_piece0 in memory on 1cdc7ec21abb:36743 (current size: 11.7 KiB, original size: 11.7 KiB, free: 6.2 GiB)
25/02/07 17:21:03 INFO BlockManagerInfo: Updated broadcast_19_piece0 in memory on 1cdc7ec21abb:36743 (current size: 11.7 KiB, original size: 11.7 KiB, free: 6.2 GiB)
25/

In [None]:
# NOTE(review): this expression looks truncated and the cell has no execution
# count; as written it will presumably raise AttributeError when run.
# Complete it or delete the cell so that Restart & Run All succeeds.
spark.se

## orderBy() and sort()

In [52]:
# Sample (name, age) rows used by both ordering examples.
data = [
    ("Alice", 30),
    ("Manu", 37),
    ("Charlie", 51),
]

# sort(): order the rows by 'age' (default direction) and print them.
df_sort = spark.createDataFrame(data, schema=["name", "age"])
df_sort.sort('age').show()

# orderBy(): order the rows by 'age' in descending order and print them.
df_order_by = spark.createDataFrame(data, schema=["name", "age"])
df_order_by.orderBy("age", ascending=False).show()



25/02/07 17:36:28 INFO SparkContext: Starting job: showString at <unknown>:0
25/02/07 17:36:28 INFO DAGScheduler: Got job 29 (showString at <unknown>:0) with 1 output partitions
25/02/07 17:36:28 INFO DAGScheduler: Final stage: ResultStage 31 (showString at <unknown>:0)
25/02/07 17:36:28 INFO DAGScheduler: Parents of final stage: List()
25/02/07 17:36:28 INFO DAGScheduler: Missing parents: List()
25/02/07 17:36:28 INFO DAGScheduler: Submitting ResultStage 31 (MapPartitionsRDD[182] at showString at <unknown>:0), which has no missing parents
25/02/07 17:36:28 INFO MemoryStore: Block broadcast_29 stored as values in memory (estimated size 14.7 KiB, free 6.2 GiB)
25/02/07 17:36:28 INFO MemoryStore: Block broadcast_29_piece0 stored as bytes in memory (estimated size 7.5 KiB, free 6.2 GiB)
25/02/07 17:36:28 INFO BlockManagerInfo: Added broadcast_29_piece0 in memory on 1cdc7ec21abb:36743 (size: 7.5 KiB, free: 6.2 GiB)
25/02/07 17:36:28 INFO SparkContext: Created broadcast 29 from broadcast at

+-------+---+
|   name|age|
+-------+---+
|  Alice| 30|
|   Manu| 37|
|Charlie| 51|
+-------+---+

+-------+---+
|   name|age|
+-------+---+
|Charlie| 51|
|   Manu| 37|
|  Alice| 30|
+-------+---+



25/02/07 17:36:28 INFO PythonRunner: Times: total = 62, boot = -74, init = 136, finish = 0
25/02/07 17:36:28 INFO Executor: Finished task 0.0 in stage 32.0 (TID 30). 4365 bytes result sent to driver
25/02/07 17:36:28 INFO TaskSetManager: Finished task 0.0 in stage 32.0 (TID 30) in 69 ms on 1cdc7ec21abb (executor driver) (1/1)
25/02/07 17:36:28 INFO TaskSchedulerImpl: Removed TaskSet 32.0, whose tasks have all completed, from pool 
25/02/07 17:36:28 INFO DAGScheduler: ResultStage 32 (showString at <unknown>:0) finished in 0.075 s
25/02/07 17:36:28 INFO DAGScheduler: Job 30 is finished. Cancelling potential speculative or zombie tasks for this job
25/02/07 17:36:28 INFO TaskSchedulerImpl: Killing all running tasks in stage 32: Stage finished
25/02/07 17:36:28 INFO DAGScheduler: Job 30 finished: showString at <unknown>:0, took 0.078482 s
25/02/07 17:36:33 INFO Executor: Told to re-register on heartbeat
25/02/07 17:36:33 INFO BlockManager: BlockManager BlockManagerId(driver, 1cdc7ec21abb, 