In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import os
# Tell the Spark tooling where the Java runtime and the unpacked Spark
# distribution live.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
    }
)

In [3]:
import findspark

# findspark.init() has to run before the pyspark imports below; it is what
# makes the Spark distribution pointed to by SPARK_HOME importable.
findspark.init()

from pyspark import SparkConf
from pyspark.context import SparkContext

# Reuse a running context when there is one, otherwise start a local one
# with master "local[*]".
local_conf = SparkConf().setMaster("local[*]")
sc = SparkContext.getOrCreate(local_conf)
sc

In [4]:
# Number of partitions Spark uses when none is requested explicitly
print(f"Default parallelism: {sc.defaultParallelism}")

Default parallelism: 2


In [5]:
# No partition count given: the RDD is split using the context's default
rng_nums = sc.parallelize(range(10**6))
print(rng_nums.getNumPartitions())

2


In [6]:
# Same data, but explicitly request ten times the default number of partitions
rng_nums = sc.parallelize(range(10**6), numSlices=10 * sc.defaultParallelism)
print(rng_nums.getNumPartitions())

20


In [7]:
# Square every element of a small list with an RDD map

# Distribute 1..10, square each value on the workers, then bring the
# results back to the driver as a plain Python list.
nums = (
    sc.parallelize(list(range(1, 11)))
    .map(lambda value: value * value)
    .collect()
)
print(nums)

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]


In [8]:
# Fun with counting prime numbers

def is_prime(n):
    """Return True when ``n`` is prime, False otherwise (trial division).

    Values below 2 are never prime. For anything else, every candidate
    divisor from 2 up to and including ``int(n**0.5)`` is tried.
    """
    if n < 2:
        return False
    limit = int(n**0.5)
    # all() stops at the first divisor that leaves no remainder.
    return all(n % div != 0 for div in range(2, limit + 1))

# Candidates to test for primality: the integers 1 through 1,999,999
# (the stop value of range() is excluded).
nums = range(1, 2_000_000)

In [9]:
# vanilla python (not Spark)

%time n = sum(map(is_prime, nums))
print(n)

CPU times: user 15.2 s, sys: 24.5 ms, total: 15.2 s
Wall time: 15.4 s
148933


In [10]:
# parallelized on Spark (2 partitions)

%time res = sc.parallelize( nums) # this creates an RDD 
%time res = res.filter( is_prime) # we get rid of all non-prime numbers
%time res = res.count()           # count the number of items in the filter
print(res)

CPU times: user 1.57 ms, sys: 0 ns, total: 1.57 ms
Wall time: 8.45 ms
CPU times: user 19 µs, sys: 0 ns, total: 19 µs
Wall time: 21.2 µs
CPU times: user 71.4 ms, sys: 8.34 ms, total: 79.7 ms
Wall time: 13.9 s
148933


In [11]:
# Sequential on Spark (1 partition)
%time res = sc.parallelize(nums, 1)
%time res = res.filter( is_prime)
%time res = res.count()
print(res)

CPU times: user 2.22 ms, sys: 6 µs, total: 2.22 ms
Wall time: 9.47 ms
CPU times: user 21 µs, sys: 0 ns, total: 21 µs
Wall time: 24.6 µs
CPU times: user 74 ms, sys: 8.26 ms, total: 82.2 ms
Wall time: 14.2 s
148933


In [12]:
# many partitions (10x default)
%time res = sc.parallelize( nums, sc.defaultParallelism * 10) 
%time res = res.filter( is_prime)
%time res = res.count()          
print(res)

CPU times: user 2.46 ms, sys: 13 µs, total: 2.47 ms
Wall time: 11.2 ms
CPU times: user 22 µs, sys: 0 ns, total: 22 µs
Wall time: 25.7 µs
CPU times: user 67.5 ms, sys: 14.3 ms, total: 81.8 ms
Wall time: 13.7 s
148933
