Spark DataFrame Operations

In [1]:
import os

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Spark Connect endpoint used by this notebook. A non-empty SPARK_REMOTE
# environment variable takes precedence, so the notebook can be pointed at a
# different server without editing code; otherwise the local default is used.
SPARK_REMOTE_URL = os.environ.get("SPARK_REMOTE") or "sc://localhost:15002"

# Reuse an existing session for this endpoint if there is one, else create it.
spark = SparkSession.builder.remote(SPARK_REMOTE_URL).getOrCreate()

In [2]:
# --- Section 1: creating DataFrames ---
print("=== 1. Creating DataFrames ===")

# Column names are given separately from the row data; each tuple below
# supplies one value per column, in this order.
employee_columns = ["name", "age", "department", "salary"]
employee_rows = [
    ("Alice", 34, "Engineering", 75000),
    ("Bob", 28, "Marketing", 60000),
    ("Charlie", 45, "Engineering", 95000),
    ("Diana", 31, "Sales", 70000),
    ("Eve", 29, "Marketing", 62000),
]

# Build the DataFrame that every later cell in this notebook works from.
employees = spark.createDataFrame(employee_rows, employee_columns)

print("Employee DataFrame:")
employees.show()

=== 1. Creating DataFrames ===
Employee DataFrame:
+-------+---+-----------+------+
|   name|age| department|salary|
+-------+---+-----------+------+
|  Alice| 34|Engineering| 75000|
|    Bob| 28|  Marketing| 60000|
|Charlie| 45|Engineering| 95000|
|  Diana| 31|      Sales| 70000|
|    Eve| 29|  Marketing| 62000|
+-------+---+-----------+------+



In [3]:
# --- Section 2: basic DataFrame operations ---
print("\n=== 2. Basic Operations ===")

# Projection: keep only the listed columns.
print("Select name and salary:")
name_and_salary = employees.select("name", "salary")
name_and_salary.show()


=== 2. Basic Operations ===
Select name and salary:
+-------+------+
|   name|salary|
+-------+------+
|  Alice| 75000|
|    Bob| 60000|
|Charlie| 95000|
|  Diana| 70000|
|    Eve| 62000|
+-------+------+



In [4]:

# Row filter: keep employees whose age is strictly greater than 30.
print("Employees older than 30:")
is_older_than_30 = F.col("age") > 30
employees.filter(is_older_than_30).show()

Employees older than 30:
+-------+---+-----------+------+
|   name|age| department|salary|
+-------+---+-----------+------+
|  Alice| 34|Engineering| 75000|
|Charlie| 45|Engineering| 95000|
|  Diana| 31|      Sales| 70000|
+-------+---+-----------+------+



In [5]:
# --- Section 3: column operations ---
print("\n=== 3. Column Operations ===")

# Derive a new column from an existing one. `employees` itself is not
# modified; withColumn returns a new DataFrame.
print("Add bonus column (10% of salary):")
bonus_expr = F.col("salary") * 0.10
employees_with_bonus = employees.withColumn("bonus", bonus_expr)
employees_with_bonus.show()

# Rename a single column; all other columns are carried over unchanged.
print("Rename 'name' to 'employee_name':")
employees_renamed = employees.withColumnRenamed("name", "employee_name")
employees_renamed.show()


=== 3. Column Operations ===
Add bonus column (10% of salary):
+-------+---+-----------+------+------+
|   name|age| department|salary| bonus|
+-------+---+-----------+------+------+
|  Alice| 34|Engineering| 75000|7500.0|
|    Bob| 28|  Marketing| 60000|6000.0|
|Charlie| 45|Engineering| 95000|9500.0|
|  Diana| 31|      Sales| 70000|7000.0|
|    Eve| 29|  Marketing| 62000|6200.0|
+-------+---+-----------+------+------+

Rename 'name' to 'employee_name':
+-------------+---+-----------+------+
|employee_name|age| department|salary|
+-------------+---+-----------+------+
|        Alice| 34|Engineering| 75000|
|          Bob| 28|  Marketing| 60000|
|      Charlie| 45|Engineering| 95000|
|        Diana| 31|      Sales| 70000|
|          Eve| 29|  Marketing| 62000|
+-------------+---+-----------+------+



In [6]:
# --- Section 4: aggregations ---
print("\n=== 4. Aggregations ===")

# One output row per department, with the mean salary and the row count.
# No ordering is requested here, so the row order of the result is whatever
# Spark happens to produce.
print("Average salary by department:")
avg_salary = F.avg("salary").alias("avg_salary")
employee_count = F.count("*").alias("employee_count")
department_summary = employees.groupBy("department").agg(avg_salary, employee_count)
department_summary.show()


=== 4. Aggregations ===
Average salary by department:
+-----------+----------+--------------+
| department|avg_salary|employee_count|
+-----------+----------+--------------+
|  Marketing|   61000.0|             2|
|      Sales|   70000.0|             1|
|Engineering|   85000.0|             2|
+-----------+----------+--------------+



In [7]:
# --- Section 5: sorting ---
print("\n=== 5. Sorting ===")

# Highest salary first.
print("Sort by salary (descending):")
salary_descending = F.col("salary").desc()
employees.orderBy(salary_descending).show()


=== 5. Sorting ===
Sort by salary (descending):
+-------+---+-----------+------+
|   name|age| department|salary|
+-------+---+-----------+------+
|Charlie| 45|Engineering| 95000|
|  Alice| 34|Engineering| 75000|
|  Diana| 31|      Sales| 70000|
|    Eve| 29|  Marketing| 62000|
|    Bob| 28|  Marketing| 60000|
+-------+---+-----------+------+



In [8]:
# --- Section 6: useful functions ---
print("\n=== 6. Useful Functions ===")

# Number of rows in the DataFrame.
print(f"Total employees: {employees.count()}")

# Unique values of a single column.
print("Distinct departments:")
departments = employees.select("department").distinct()
departments.show()

# count / mean / stddev / min / max for the chosen columns.
print("Statistical summary:")
numeric_summary = employees.describe("age", "salary")
numeric_summary.show()


=== 6. Useful Functions ===
Total employees: 5
Distinct departments:
+-----------+
| department|
+-----------+
|  Marketing|
|      Sales|
|Engineering|
+-----------+

Statistical summary:
+-------+-----------------+------------------+
|summary|              age|            salary|
+-------+-----------------+------------------+
|  count|                5|                 5|
|   mean|             33.4|           72400.0|
| stddev|6.877499545619759|14010.710188994706|
|    min|               28|             60000|
|    max|               45|             95000|
+-------+-----------------+------------------+

