In [None]:
!pip install pyspark



In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

import random


In [4]:
# Build a SparkSession against the standalone cluster master; getOrCreate()
# hands back the existing session when one is already active in this kernel.
# NOTE(review): the master URL is hard-coded to one specific host -- confirm
# it is reachable from wherever this notebook is run.
spark = (
    SparkSession.builder
    .master("spark://10.32.45.215:7077")
    .getOrCreate()
)

# Bare expression so the notebook displays the session object.
spark


In [5]:
# Obtain the SparkContext used by the RDD examples below.
# Alternative spelling kept for reference: sc = spark.sparkContext
# (presumably the same context once the session above exists -- confirm).

sc = SparkContext.getOrCreate()
# Bare expression so the notebook displays the context object.
sc


# API

In [6]:
# Create an RDD holding the integers 0..999, split across 10 partitions.
count = sc.parallelize(range(1000), 10)

# Report how many partitions the RDD actually has.
print(count.getNumPartitions())

# 



10


# Examples


In [7]:
%%time

# Monte Carlo estimate of pi: draw random points in the unit square and
# measure the fraction that lands inside the quarter circle of radius 1.
# Source: https://spark.apache.org/examples.html
NUM_SAMPLES = 10_000_000  # total number of random points to draw

def inside(p):
    """Draw one random point in the unit square and test it against the unit circle.

    Args:
        p: Ignored. The function is used as a ``filter`` predicate, so it
            receives an element, but the point is drawn fresh on every call.

    Returns:
        bool: True when the drawn point (x, y) satisfies x^2 + y^2 < 1.
    """
    px = random.random()
    py = random.random()
    squared_radius = px * px + py * py
    return squared_radius < 1

# Spread the sample indices over 10 partitions, keep those for which the
# randomly drawn point fell inside the quarter circle, and count them.
# filter: https://spark.apache.org/docs/latest/api/python/pyspark.html
count = (
    sc.parallelize(range(NUM_SAMPLES), 10)
    .filter(inside)
    .count()
)

# The hit fraction approximates pi/4 (quarter-circle area over unit-square area).
print("Pi is roughly %f" % (4.0 * count / NUM_SAMPLES))


[Stage 0:>                                                        (0 + 10) / 10]

Pi is roughly 3.142372
CPU times: user 9.56 ms, sys: 937 µs, total: 10.5 ms
Wall time: 1.61 s


                                                                                

In [None]:
%%time

from multiprocessing import Pool

import multiprocessing
import random

# Reuse NUM_SAMPLES (defined in an earlier cell) as the total sample count,
# so this cell must run after the Spark example cell.
N = NUM_SAMPLES
# CPU count reported by multiprocessing; used below as the worker-pool size.
process_num = multiprocessing.cpu_count()
print('You have {0:1d} CPUs'.format(process_num))

def make_pi(end):
    """Count random points that land inside the unit quarter circle.

    Draws ``end`` points uniformly from the unit square (x first, then y,
    using the ``random`` module) and counts those with x^2 + y^2 <= 1.

    Args:
        end: Number of points to draw. Zero or a negative value draws nothing.

    Returns:
        int: Number of drawn points inside (or on) the circle.
    """
    # Operands evaluate left to right, so the first draw is x and the second is y.
    return sum(
        1
        for _ in range(end)
        if (random.random()**2 + random.random()**2) <= 1
    )

if __name__ == "__main__":

    # Split N into one chunk per worker so the chunks add up to exactly N.
    # Plain int(N / process_num) drops the remainder when N is not divisible
    # by the CPU count, which biases the estimate low because the final
    # division still uses N.
    base, extra = divmod(N, process_num)
    chunk_sizes = [base + 1 if i < extra else base for i in range(process_num)]

    # multiprocessing code
    # The context manager shuts the worker processes down once map() has
    # returned, instead of leaving the pool alive for the rest of the session.
    with Pool(processes=process_num) as p:
        count_in = p.map(make_pi, chunk_sizes)
    print(4*sum(count_in)/N)

    # normal code
    # print(4*make_pi(N)/N)

You have 2 CPUs
3.1410612
CPU times: user 58.3 ms, sys: 31.6 ms, total: 89.9 ms
Wall time: 4.41 s
