In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Obtain the notebook's Spark session (getOrCreate reuses one that is already running)
spark = (
    SparkSession.builder
    .appName('dataFrameOps')
    .getOrCreate()
)

# Three sample rows, each laid out as (id, name, age)
data = [(0, 'Alice', 25), (1, 'Bob', 28), (2, 'Charlie', 35)]

# Explicit schema assembled field by field; every column is declared nullable
schema = (
    StructType()
    .add('id', IntegerType(), True)
    .add('name', StringType(), True)
    .add('age', IntegerType(), True)
)

df = spark.createDataFrame(data=data, schema=schema)

df.show()

                                                                                

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  0|  Alice| 25|
|  1|    Bob| 28|
|  2|Charlie| 35|
+---+-------+---+



In [3]:
# Print the schema as a tree: each column's name, data type, and nullability.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [5]:
# Column names as a plain Python list; as the cell's last expression it is displayed directly.
df.columns

['id', 'name', 'age']

In [7]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# The string column `name` has no mean/stddev (shown as NULL) but still reports count, min and max.
df.describe().show()

[Stage 3:>                                                        (0 + 14) / 14]

+-------+---+-------+------------------+
|summary| id|   name|               age|
+-------+---+-------+------------------+
|  count|  3|      3|                 3|
|   mean|1.0|   NULL|29.333333333333332|
| stddev|1.0|   NULL| 5.131601439446884|
|    min|  0|  Alice|                25|
|    max|  2|Charlie|                35|
+-------+---+-------+------------------+



                                                                                

In [8]:
# Selecting columns and filtering rows

In [10]:
# Project the frame down to the id and name columns only
df.select(df['id'], df['name']).show()

+---+-------+
| id|   name|
+---+-------+
|  0|  Alice|
|  1|    Bob|
|  2|Charlie|
+---+-------+



In [11]:
# Keep only the rows whose age is above 30
df.filter(df['age'] > 30).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|Charlie| 35|
+---+-------+---+



In [13]:
# Keep only the row(s) whose name is exactly 'Alice'
df.where(df['name'] == 'Alice').show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  0|Alice| 25|
+---+-----+---+



In [14]:
# ordering

In [17]:
# Sort by age, youngest first (ascending is the default direction)
df.orderBy('age').show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  0|  Alice| 25|
|  1|    Bob| 28|
|  2|Charlie| 35|
+---+-------+---+



                                                                                

In [18]:
# Sort by age, oldest first
df.orderBy('age', ascending=False).show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|Charlie| 35|
|  1|    Bob| 28|
|  0|  Alice| 25|
+---+-------+---+



In [19]:
# group by and aggregation

In [20]:
# Count how many rows share each name
df.groupBy('name').count().show()

+-------+-----+
|   name|count|
+-------+-----+
|  Alice|    1|
|    Bob|    1|
|Charlie|    1|
+-------+-----+



In [25]:
# Largest age over the whole frame (an empty groupBy() aggregates all rows as one group)
df.groupBy().max('age').show()

+--------+
|max(age)|
+--------+
|      35|
+--------+



In [26]:
# Mean age over the whole frame (an empty groupBy() aggregates all rows as one group)
df.groupBy().avg('age').show()

+------------------+
|          avg(age)|
+------------------+
|29.333333333333332|
+------------------+



In [27]:
# Joining two DataFrames on a shared key column

In [29]:
# Lookup rows laid out as (id, country); the ids match those used in `data`
data2 = [(0, 'India'), (1, 'England'), (2, 'Australia')]

# Explicit schema assembled field by field; both columns are declared nullable
schema2 = (
    StructType()
    .add('id', IntegerType(), True)
    .add('country', StringType(), True)
)

df2 = spark.createDataFrame(data=data2, schema=schema2)

df2.show()

+---+---------+
| id|  country|
+---+---------+
|  0|    India|
|  1|  England|
|  2|Australia|
+---+---------+



In [30]:
# Re-display df so both join inputs (df and df2) are visible before they are combined.
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  0|  Alice| 25|
|  1|    Bob| 28|
|  2|Charlie| 35|
+---+-------+---+



In [31]:
# Inner join on the shared id column; joining by column name keeps a single id column in the result
df.join(df2, on='id', how='inner').show()



+---+-------+---+---------+
| id|   name|age|  country|
+---+-------+---+---------+
|  0|  Alice| 25|    India|
|  1|    Bob| 28|  England|
|  2|Charlie| 35|Australia|
+---+-------+---+---------+



                                                                                

In [32]:
# Join on id, then keep just the person's name and their country
df.join(df2, on='id').select('name', 'country').show()



+-------+---------+
|   name|  country|
+-------+---------+
|  Alice|    India|
|    Bob|  England|
|Charlie|Australia|
+-------+---------+



                                                                                

In [33]:
# Shut down the Spark session now that the walkthrough is finished, releasing its resources.
spark.stop()