In [1]:
import findspark

In [2]:
# Let findspark locate the Spark installation so that the `import pyspark`
# in the following cell can succeed; this call has to run before that import.
findspark.init()

In [3]:
import pyspark

In [4]:
# Create the SparkContext that every later cell uses as `sc`;
# 'myAppName' is the application name handed to Spark.
sc = pyspark.SparkContext(appName='myAppName')

In [5]:
def isprime(n):
    """
    Check whether integer n is a prime.

    The argument is first coerced with ``abs(int(n))``: non-integer numerics
    are truncated and negative values are judged by their magnitude (so
    ``isprime(-7)`` is True).

    Trial division is done with exact integer arithmetic only
    (``x * x <= n``), so the result never depends on a floating-point
    square root, which can be off by one for very large n and raises
    OverflowError once n no longer fits in a float.

    :param n: any value accepted by ``int()``.
    :return: True if ``abs(int(n))`` is a prime number, otherwise False.
    :raises TypeError, ValueError: propagated from ``int(n)`` when n cannot
        be converted to an integer.
    """
    # normalise the input to a non-negative integer
    n = abs(int(n))
    # 0 and 1 are not primes
    if n < 2:
        return False
    # 2 is the only even prime number
    if n == 2:
        return True
    # all other even numbers are not primes
    if not n & 1:
        return False
    # try odd divisors only; a composite n must have a divisor x with
    # x * x <= n, and comparing squares keeps that bound exact for any size
    x = 3
    while x * x <= n:
        if n % x == 0:
            return False
        x += 2
    return True

# Create an RDD of the integers 0 through 999,999.
# range() replaces the Python-2-only xrange() so the cell runs on Python 2 and 3.
nums = sc.parallelize(range(1000000))

# Count the primes in the RDD; count() is the action that starts the computation.
# print(...) with a single argument produces the same output on Python 2 and 3.
print(nums.filter(isprime).count())

78498


In [6]:
# A Spark program typically follows three steps:
# 1. Define one or more RDDs by reading data stored on disk (HDFS, Cassandra, HBase, local disk), parallelizing an in-memory collection, transforming an existing RDD, or caching or saving one.
# 2. Invoke operations on the RDD by passing closures (functions) that are applied to each element of the RDD. Spark offers over 80 high-level operators beyond Map and Reduce.
# 3. Use the resulting RDDs with actions (e.g. count, collect, save). Actions kick off the computation on the cluster.

In [8]:
# Build an RDD named `text` from shakespeare.txt. The path is relative to the
# current working directory, so adjust it to where the file lives on your system.
text = sc.textFile('shakespeare.txt')

In [9]:
# Show the RDD's string representation. The parenthesised call replaces the
# Python-2-only print statement and prints the same text on Python 2 and 3.
print(text)

MapPartitionsRDD[3] at textFile at NativeMethodAccessorImpl.java:-2


In [10]:
# The textFile method loads the complete works of Shakespeare into an RDD named text. If you inspect the RDD you
# can see that it is a MapPartitionsRDD and that the path to the file is a relative path from the current working directory
# (pass in the correct path to the shakespeare.txt file on your system).
# Let's start to transform this RDD in order to compute the "hello world" of distributed computing: "word count."

In [11]:
from operator import add

In [12]:
def tokenize(text):
    """
    Break a piece of text into its whitespace-separated tokens.

    :param text: the string to split (one line of the input file when used
        with ``flatMap``).
    :return: list of tokens; empty when the text is empty or only whitespace.
    """
    # split() with no separator splits on runs of whitespace and drops empties
    tokens = text.split()
    return tokens

In [13]:
# Apply tokenize to every element of `text` and flatten the resulting lists
# into a single RDD of individual words.
words = text.flatMap(tokenize)

In [14]:
# Show the RDD's string representation. The parenthesised call replaces the
# Python-2-only print statement and prints the same text on Python 2 and 3.
print(words)

PythonRDD[4] at RDD at PythonRDD.scala:43


In [15]:
# Pair every word with an initial count of 1, producing (word, 1) tuples.
wc = words.map(lambda word: (word, 1))

In [16]:
# Print the RDD's lineage description. The parenthesised call replaces the
# Python-2-only print statement so the cell also parses on Python 3.
print(wc.toDebugString())

(2) PythonRDD[5] at RDD at PythonRDD.scala:43 []
 |  MapPartitionsRDD[3] at textFile at NativeMethodAccessorImpl.java:-2 []
 |  shakespeare.txt HadoopRDD[2] at textFile at NativeMethodAccessorImpl.java:-2 []


In [17]:
# Combine all pairs that share the same key (word) by adding their values with
# operator.add, leaving one (word, total) pair per distinct word.
counts = wc.reduceByKey(add)

In [18]:
# Write the word counts out as text under the relative path "wc".
# NOTE(review): presumably this fails when "wc" already exists, which would
# break a re-run of the notebook; confirm and clear the path beforehand if so.
counts.saveAsTextFile("wc")

In [23]:
# Shut down the SparkContext created at the start of the notebook.
sc.stop()