In [1]:
# Setup
# 1. JDK
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# 2. Spark
!wget -q https://downloads.apache.org/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
!tar xf spark-2.4.7-bin-hadoop2.7.tgz

# 3. Envs
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.7-bin-hadoop2.7"

# 4. PySpark
!pip install -q findspark
import findspark
findspark.init()

In [4]:
from pyspark import SparkConf, SparkContext

# Local SparkContext using N threads (N = number of logical processors).
# getOrCreate() returns the context already running in this kernel if there is
# one, so re-running this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local[*]").setAppName("range-RDD-stdout")
)

# 1. Input data: list of integers (unstructured batch)
data = range(1, 10001)
# data[-1] tracks the real last element even if the range bounds are edited.
print(f"Input data: {len(data)} integers from {data[0]} to {data[-1]}")

# 2. Data processing
# Parallelize input data into an RDD (lazy evaluation). No partition count is
# passed, so Spark picks its default for the local[*] master.
rangeRDD = sc.parallelize(data)
print(f"RDD has been created using {rangeRDD.getNumPartitions()} partitions")

# Process data using lambda functions and collect results:
# 1) subtract 1 from every element. 2) keep those lower than 10.
# map/filter are lazy transformations; collect() is the action that runs the
# job and brings the (small) filtered result back to the driver as a list.
out = (rangeRDD
       .map(lambda value: value - 1)
       .filter(lambda value: value < 10)
       .collect())

# 3. Output data: show result in the standard output (console)
print(f"The output is {out}")

Input data: 10000 integers from 1 to 10000
RDD has been created using 2 partitions
The output is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
