In [1]:
import os
import shutil

from pyspark.sql import SparkSession

In [2]:
# Environment for PySpark, set before the SparkSession is created.
# NOTE(review): SPARK_HOME is a machine-specific absolute path, and the two
# PYSPARK_DRIVER_PYTHON* values look like settings for the `pyspark` launcher
# script rather than for an already-running notebook kernel — confirm they are
# still needed.
os.environ.update({
    'SPARK_HOME': r'C:\spark\spark-3.5.4-bin-hadoop3',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

In [3]:
# Obtain the notebook's SparkSession (an existing one is reused, otherwise a
# new one is created) under the application name 'RDD-Demo'.
spark = (
    SparkSession.builder
    .appName('RDD-Demo')
    .getOrCreate()
)

In [4]:
# Distribute a small local list (1 through 5) as an RDD.
numbers = list(range(1, 6))
rdd = spark.sparkContext.parallelize(numbers)

In [5]:
# collect() returns every element of the RDD to the driver as a Python list.
rdd.collect()

[1, 2, 3, 4, 5]

In [6]:
# (name, number) pairs; 'Alice' appears twice on purpose so the by-key
# aggregation further down has something to merge.
# Note: this rebinds `rdd`, replacing the RDD of plain integers built earlier.
data = [
    ('Alice', 25),
    ('Bob', 30),
    ('Charlie', 35),
    ('Alice', 40),
]
rdd = spark.sparkContext.parallelize(data)

In [7]:
# Show the full contents of the pair RDD (small enough to collect safely).
rdd.collect()

[('Alice', 25), ('Bob', 30), ('Charlie', 35), ('Alice', 40)]

In [8]:
# count() returns the number of elements in the RDD.
count = rdd.count()
count

4

In [9]:
# first() returns only the first element of the RDD.
first_element = rdd.first()
first_element

('Alice', 25)

In [10]:
# take(2) returns the first two elements as a list.
take_element = rdd.take(2)
take_element

[('Alice', 25), ('Bob', 30)]

In [11]:
# rdd.foreach(print) runs print() on the executors, so its output goes to the
# executor's stdout and never shows up in this notebook cell. Bring the
# records back to the driver and print them here instead.
# collect() is fine for this four-element demo; for a large RDD prefer
# rdd.take(n) or rdd.toLocalIterator() to avoid exhausting driver memory.
for record in rdd.collect():
    print(record)

In [12]:
# Upper-case the first field of every pair; the second field is passed
# through unchanged.
mapped_rdd = rdd.map(lambda pair: (pair[0].upper(), pair[1]))

In [13]:
# Materialise the mapped RDD on the driver and display it.
result = mapped_rdd.collect()
result

[('ALICE', 25), ('BOB', 30), ('CHARLIE', 35), ('ALICE', 40)]

In [15]:
# Keep only the pairs whose second field is strictly greater than 30.
filtered_rdd = rdd.filter(lambda record: record[1] > 30)
filtered_rdd.collect()

[('Charlie', 35), ('Alice', 40)]

In [16]:
# Sum the values of all pairs that share a key (the two 'Alice' entries are
# merged into one). The result does not come back in input order.
reduced_rdd = rdd.reduceByKey(lambda total, value: total + value)
reduced_rdd.collect()

[('Alice', 65), ('Charlie', 35), ('Bob', 30)]

In [17]:
# Order the pairs by their second field (ascending is sortBy's default).
sorted_rdd = rdd.sortBy(lambda record: record[1])
sorted_rdd.collect()

[('Alice', 25), ('Bob', 30), ('Charlie', 35), ('Alice', 40)]

In [19]:
# saveAsTextFile() refuses to write to a path that already exists, so
# re-running this cell (or Restart & Run All) would fail on the second run.
# Remove any previous output first to keep the cell idempotent.
# NOTE(review): despite the '.txt' suffix Spark writes a *directory* of part
# files here. shutil only touches the local filesystem — this assumes Spark's
# default filesystem is local (no master/HDFS is configured above); confirm.
shutil.rmtree('output.txt', ignore_errors=True)
rdd.saveAsTextFile('output.txt')

In [20]:
# Read the saved text back. Despite its name, `rdd_text` is a DataFrame, not
# an RDD: each line of the files becomes a Row with a single `value` column
# holding the tuple's string form, and the rows are not in the original order.
rdd_text = spark.read.text('output.txt')
rdd_text.collect()

[Row(value="('Charlie', 35)"),
 Row(value="('Alice', 25)"),
 Row(value="('Alice', 40)"),
 Row(value="('Bob', 30)")]