In [1]:
# This file demonstrates basic RDD operations.

from pyspark import SparkConf, SparkContext

# Instantiate the Spark context (local master, app name "rdd-sample").
# getOrCreate() returns the already-running context if there is one, so
# re-executing this cell does not fail with
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("rdd-sample")
)

# Sample RDD of strings built from an in-memory Python list.
words = sc.parallelize(["scala", "java", "C#", "vb.net", "F#", "objective-c", "python", "javascript", "react", "hadoop", "hive"])



In [2]:
#count() returns the number of elements in the RDD.

counts = words.count()
print("Number of elements in RDD: %i" % (counts))

Number of elements in RDD: 11


In [3]:
#collect() returns all the elements of the RDD as a list.
coll = words.collect()
print("Elements in RDD: %s" % coll)

Elements in RDD: ['scala', 'java', 'C#', 'vb.net', 'F#', 'objective-c', 'python', 'javascript', 'react', 'hadoop', 'hive']


In [4]:
#foreach(f) applies the function f to every element of the RDD. It does not filter anything and it returns no value.

#define function
def f(x):
    """Print a single RDD element, prefixed with "element: "."""
    print("element: %s" % x)

# foreach() returns None, so `fore` is None after this line.
# NOTE(review): no output was captured for this cell; presumably the prints go to the
# Spark worker's stdout rather than to the notebook -- confirm.
fore = words.foreach(f)

In [6]:
#filter(f) returns a new RDD holding only the elements for which f returns True

# Keep only the words that contain the letter 'a'.
words_filter = words.filter(lambda word: 'a' in word)
print("Fitered RDD: %s" % words_filter.collect())

Fitered RDD: ['scala', 'java', 'javascript', 'react', 'hadoop']


In [7]:
#map(f) builds a new RDD by applying f to every element of the RDD
# Here each word is turned into a (word, 1) key/value pair.
words_map = words.map(lambda word: (word, 1))
mapping = words_map.collect()
print("Key value pair: %s" % mapping)

Key value pair: [('scala', 1), ('java', 1), ('C#', 1), ('vb.net', 1), ('F#', 1), ('objective-c', 1), ('python', 1), ('javascript', 1), ('react', 1), ('hadoop', 1), ('hive', 1)]


In [8]:
#reduce(f)
# Combines all the elements of the RDD using the specified commutative and associative
# binary operation and returns the single resulting value.

from operator import add
numbers = sc.parallelize([1,2,3,4,5,6,7,8,9,10])
adding = numbers.reduce(add)  # add is applied pairwise, yielding the sum of all elements
print("Adding all the elements: %i" % adding)

Adding all the elements: 55


In [9]:
#join(other)
# Returns an RDD of (key, (value_from_x, value_from_y)) pairs, one for each pair of
# elements whose keys match in both RDDs.
# Note: the collected result is not in input order (see the recorded output,
# where 'hadoop' precedes 'spark').

x = sc.parallelize([("spark",1), ("hadoop",4)])
y = sc.parallelize([("spark", 3), ("hadoop", 6)])

joined = x.join(y)
print("Join RDD: %s" % joined.collect())

Join RDD: [('hadoop', (4, 6)), ('spark', (1, 3))]


In [10]:
#cache()
# Persist the RDD with the default storage level (MEMORY_ONLY).
# NOTE: this rebinds `words`, replacing the RDD of language names created in
# the first cell; cells re-run after this one will see this new RDD.

words = sc.parallelize(["lampsplus", "amazon", "microsoft", "google"])
words.cache()
# Read the flag directly. Calling persist() here would itself mark the RDD as
# cached, so the check would print True even if cache() had never been called.
print("Words got cached: %s" % words.is_cached)

Words got cached: True
