In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession; the builder chain is wrapped in
# parentheses so no line-continuation backslashes are needed.
spark = (
    SparkSession.builder
    .appName("Week2_Day2_RDDs_DataFrames")
    .getOrCreate()
)

# The SparkContext hanging off the session is the handle used for the RDD API.
sc = spark.sparkContext

print("✅ Spark Session & Context Created")

✅ Spark Session & Context Created


In [2]:
# Turn a small in-memory Python list into an RDD.
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# Run the two actions first, then report their results.
element_total = rdd.count()
element_list = rdd.collect()
print("RDD count:", element_total)
print("RDD elements:", element_list)

RDD count: 5
RDD elements: [1, 2, 3, 4, 5]


In [3]:
# Transformation example: map every element of `rdd` to its square.
squared_rdd = rdd.map(lambda value: value * value)

# collect() brings the mapped values back to the driver for printing.
squared_values = squared_rdd.collect()
print("Squared RDD:", squared_values)

Squared RDD: [1, 4, 9, 16, 25]


In [4]:
# Build a tiny DataFrame from in-memory tuples and an explicit list of column names.
people_rows = [(1, "Alice"), (2, "Bob"), (3, "Charlie")]
people_columns = ["id", "name"]
df = spark.createDataFrame(people_rows, people_columns)
df.show()

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+



In [5]:
# Location of the sales extract; kept in one named constant so it is easy to
# repoint when the notebook runs outside this container layout.
SALES_CSV_PATH = "/home/jovyan/work/data/sales.csv"

# Explicit schema (DDL string) matching the types Spark previously inferred for
# this file. Declaring it avoids the extra pass over the data that
# inferSchema=True needs and keeps column types stable if the file changes.
SALES_SCHEMA = (
    "order_id INT, customer_id INT, order_date DATE, "
    "product STRING, quantity INT, unit_price DOUBLE"
)

# header=True skips the first line of the file (the column names).
sales_df = spark.read.csv(SALES_CSV_PATH, header=True, schema=SALES_SCHEMA)
sales_df.printSchema()
sales_df.show(5)

root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- product: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- unit_price: double (nullable = true)

+--------+-----------+----------+--------+--------+----------+
|order_id|customer_id|order_date| product|quantity|unit_price|
+--------+-----------+----------+--------+--------+----------+
|       1|       1001|2024-01-01|Widget A|       2|      9.99|
|       2|       1002|2024-01-03|Widget B|       1|     19.99|
|       3|       1001|2024-01-07|Widget C|       5|       4.5|
|       4|       1003|2024-02-10|Widget A|       3|      9.99|
|       5|       1004|2024-02-15|Widget B|       2|     19.99|
+--------+-----------+----------+--------+--------+----------+
only showing top 5 rows



In [6]:
# RDD approach: go through the DataFrame's underlying RDD of Row objects and
# pull the order_id field out of each one.
order_ids_rdd = sales_df.rdd.map(lambda record: record["order_id"])
first_order_ids = order_ids_rdd.take(5)
print("First 5 order_ids (RDD):", first_order_ids)

# DataFrame approach: select the column by name.
order_id_column_df = sales_df.select("order_id")
order_id_column_df.show(5)

First 5 order_ids (RDD): [1, 2, 3, 4, 5]
+--------+
|order_id|
+--------+
|       1|
|       2|
|       3|
|       4|
|       5|
+--------+
only showing top 5 rows

