In [3]:

from pyspark import SparkContext

# Every later cell uses `sc`, but no cell defines it. getOrCreate() reuses the
# context a PySpark kernel may already provide, or starts one otherwise, so the
# notebook also works after Restart Kernel -> Run All.
sc = SparkContext.getOrCreate()

In [5]:
# Distribute three single-letter strings as an RDD.
x = sc.parallelize(["a", "e", "f"])

In [6]:
# Turn every element into an (element, 1) pair.
y = x.map(lambda letter: (letter, 1))

In [8]:
# Printing the RDD object shows its descriptor, not its elements.
print(y)

PythonRDD[1] at RDD at PythonRDD.scala:53


In [10]:
# collect() returns the elements as a Python list, which is what gets printed.
print(y.collect())

[('a', 1), ('e', 1), ('f', 1)]


In [11]:
# RDD of integers; the bare name on the last line displays the RDD's repr.
x1 = sc.parallelize([1, 4, 7, 8])
x1

ParallelCollectionRDD[2] at parallelize at PythonRDD.scala:195

In [13]:
# filter: keep only the even numbers.
y1 = x1.filter(lambda n: n % 2 == 0)
print(y1.collect())

[4, 8]


In [15]:
# flatMap: each input produces two outputs (itself and ten times itself),
# and the results are flattened into a single sequence.
y2 = x1.flatMap(lambda n: (n, n * 10))
print(y2.collect())

[1, 10, 4, 40, 7, 70, 8, 80]


In [17]:
#GroupBy

# Names to be grouped by their first letter.
x2 = sc.parallelize(["Anil", "Laddu", "Anukush", "Beru"])

In [18]:
# Group the names using their first character as the key.
y2 = x2.groupBy(lambda name: name[0])

In [19]:
# Each group comes back as a ResultIterable object, so the members are not
# visible in the printed output until the iterable is converted to a list.
print(y2.collect())

[('B', <pyspark.resultiterable.ResultIterable object at 0x7fdae0c19b38>), ('L', <pyspark.resultiterable.ResultIterable object at 0x7fdae0c19ba8>), ('A', <pyspark.resultiterable.ResultIterable object at 0x7fdae0c19cf8>)]


In [26]:
# Convert each group's iterable to a list so the grouped names are shown.
print([(initial, list(names)) for initial, names in y2.collect()])

[('B', ['Beru']), ('L', ['Laddu']), ('A', ['Anil', 'Anukush'])]


In [25]:
# Sort the groups by key and the names inside each group for a stable display.
sorted((initial, sorted(names)) for initial, names in y2.collect())

[('A', ['Anil', 'Anukush']), ('B', ['Beru']), ('L', ['Laddu'])]

In [27]:
#groupByKey

In [28]:
# Pair RDD with repeated keys ('A' and 'C' each appear twice).
g1 = sc.parallelize(
    [
        ("A", 3),
        ("B", 3),
        ("C", 4),
        ("A", 6),
        ("C", 8),
        ("D", 8),
    ]
)

In [30]:
# Collect all values that share a key into one iterable per key.
h1 = g1.groupByKey()

In [31]:
# Displays (key, ResultIterable) pairs; the grouped values are not expanded here.
h1.collect()

[('B', <pyspark.resultiterable.ResultIterable at 0x7fdae0c157f0>),
 ('C', <pyspark.resultiterable.ResultIterable at 0x7fdae0c15f60>),
 ('A', <pyspark.resultiterable.ResultIterable at 0x7fdae0c15dd8>),
 ('D', <pyspark.resultiterable.ResultIterable at 0x7fdae0c15ef0>)]

In [36]:
# Turn each key's grouped values into a list so they can be printed.
print(h1.mapValues(list).collect())

[('B', [3]), ('C', [4, 8]), ('A', [3, 6]), ('D', [8])]


In [39]:
# Same display as a list comprehension, unpacking each (key, values) pair.
print([(key, list(values)) for key, values in h1.collect()])

[('B', [3]), ('C', [4, 8]), ('A', [3, 6]), ('D', [8])]


In [42]:
# Sum the grouped values of each key. mapValues applies the function to the
# value side of every pair only: the keys pass through untouched, the RDD's
# partitioning is retained, and no positional tuple indexing is needed.
h2 = h1.mapValues(sum)
h2.collect()

[('B', 3), ('C', 12), ('A', 9), ('D', 8)]

In [46]:
#reduceByKey

from operator import add

# Combine the values of each key with operator.add, i.e. sum them per key.
rbk1 = g1.reduceByKey(add)

In [47]:
# Per-key totals; these match the groupByKey-then-sum result shown earlier.
rbk1.collect()

[('B', 3), ('C', 12), ('A', 9), ('D', 8)]

In [52]:
def simpleGeneratorFun():
    """Generator that yields the integers 1, 2 and 3, in that order."""
    yield from (1, 2, 3)

In [53]:
# Print every value the generator produces, one per line.
print(*simpleGeneratorFun(), sep="\n")



1
2
3


In [58]:
# mapPartitions
# Return a new RDD by applying a function to each partition of this RDD
# mapPartitions(f, preservesPartitioning=False)

# Five integers split across two partitions; glom() shows the per-partition lists.
x = sc.parallelize([1, 2, 3, 4, 5], numSlices=2)
x.glom().collect()

[[1, 2], [3, 4, 5]]

In [63]:
def func1(iterator):
    """Yield the sum of the items in ``iterator``, then the constant 45.

    Used as a mapPartitions function, so ``iterator`` is the content of one
    partition and each partition contributes two output elements.
    """
    partition_total = sum(iterator)
    yield partition_total
    yield 45
    
# Apply func1 once per partition of x.
y = x.mapPartitions(func1)

In [64]:
# Each partition now holds its own sum followed by the constant 45.
print(y.glom().collect())

[[3, 45], [12, 45]]


In [66]:
def func2(iterator):
    """Yield a single value: the sum of the items in ``iterator``.

    Used as a mapPartitions function, so each partition is reduced to one
    output element.
    """
    partition_total = sum(iterator)
    yield partition_total
    
# Apply func2 once per partition of x; each partition collapses to its sum.
y = x.mapPartitions(func2)
print(y.glom().collect())

[[3], [12]]


In [67]:
# mapPartitionsWithIndex
# Return a new RDD by applying a function to each partition of this RDD, while tracking the index of the original partition
# mapPartitionsWithIndex(f, preservesPartitioning=False)
def func3(partitionIndex, Iterator1):
    """Yield one ``(partitionIndex, total)`` tuple for a partition.

    ``partitionIndex`` is passed through unchanged and ``total`` is the sum of
    the items in ``Iterator1``.
    """
    partition_total = sum(Iterator1)
    yield partitionIndex, partition_total
    

# Apply func3 once per partition; each result carries the partition's index and sum.
yi = x.mapPartitionsWithIndex(func3)
print(yi.glom().collect())

[[(0, 3)], [(1, 12)]]


In [72]:
# Sample
# Return a new RDD containing a statistical sample of the original RDD
# sample(withReplacement, fraction, seed=None)

xs = sc.parallelize([1, 2, 3, 4, 5])
# Sample without replacement at a fraction of 0.8; the fixed seed makes the
# sample repeatable.
ys = xs.sample(withReplacement=False, fraction=0.8, seed=40)
ys.collect()

[1, 2, 3, 5]

In [73]:
# glom()
# Return an RDD created by coalescing all elements within each partition into a list.
# Four integers in two partitions; glom() yields one list per partition.
rdd = sc.parallelize([1, 2, 3, 4], numSlices=2)
sorted(rdd.glom().collect())

[[1, 2], [3, 4]]

In [78]:
# Union
# Return a new RDD containing all items from two original RDDs. Duplicates are not culled.
# union(otherRDD)

# Two RDDs with two partitions each; the value 5 appears in both.
x0 = sc.parallelize([1, 2, 5, 4], numSlices=2)
y0 = sc.parallelize([3, 5, 6], numSlices=2)

In [79]:
# The union keeps the partitions of both inputs (2 + 2 = 4 partitions here).
z0 = x0.union(y0)
z0.glom().collect()

[[1, 2], [5, 4], [3], [5, 6]]

In [80]:
# Flattened view of the union; the duplicate 5 is kept.
z0.collect()

[1, 2, 5, 4, 3, 5, 6]

In [81]:
# Join
# Return a new RDD containing all pairs of elements having the same key in the original RDDs
# join(otherRDD, numPartitions=None)


In [85]:
# Left side of the join: one value per key.
xj = sc.parallelize([("a", 1), ("b", 5), ("c", 7)])

In [86]:
# Right side of the join: key 'a' appears twice and key 'b' is absent.
yj = sc.parallelize([("a", 6), ("a", 9), ("c", 8)])

In [87]:
# Only keys present in both RDDs survive: 'b' is dropped, and 'a' yields one
# output pair for each of its two matches in yj.
zj = xj.join(yj)
zj.collect()

[('c', (7, 8)), ('a', (1, 6)), ('a', (1, 9))]

In [88]:
# Distinct
# The value 3 appears twice; distinct() keeps a single copy of each element.
xd = sc.parallelize([1, 2, 3, 3, 4])
xd.distinct().collect()

[1, 2, 3, 4]

In [89]:
# COALESCE
# Return a new RDD which is reduced to a smaller number of partitions
# coalesce(numPartitions, shuffle=False)



In [91]:
# Ten integers spread over four partitions.
xc = sc.parallelize([1, 2, 35, 4, 6, 7, 8, 7, 8, 9], numSlices=4)

In [94]:
# Show how the elements are laid out across the four partitions.
xc.glom().collect()

[[1, 2], [35, 4], [6, 7], [8, 7, 8, 9]]

In [95]:
# Reduce the partition count from four to three and show the new layout.
yc = xc.coalesce(numPartitions=3)
yc.glom().collect()


[[1, 2], [35, 4], [6, 7, 8, 7, 8, 9]]

In [96]:
# KEYBY
# Create a Pair RDD, forming one pair for each item in the original RDD. The pair’s key is calculated from the value via a user-supplied function.
# keyBy(f)

xk = sc.parallelize(["Anil", "Ankush", "Dhruvi", "Himu"])
# Key every name by its first character, producing (initial, name) pairs.
yk = xk.keyBy(lambda name: name[0])
yk.collect()

[('A', 'Anil'), ('A', 'Ankush'), ('D', 'Dhruvi'), ('H', 'Himu')]

In [97]:
# partitionBy
# Return a new RDD with the specified number of partitions, placing original items into the partition returned by a user-supplied function
# partitionBy(numPartitions, partitionFunc=portable_hash)



In [98]:
# (initial, name) pairs spread over three partitions.
xx = sc.parallelize(
    [("A", "Anil"), ("A", "Ankush"), ("D", "Dhruvi"), ("H", "Himu")],
    numSlices=3,
)

In [110]:
# Repartition into two partitions. The function receives each pair's key:
# keys whose first character sorts before 'E' go to partition 0, the rest to 1.
yx = xx.partitionBy(2, lambda key: 0 if key[0] < "E" else 1)

In [111]:
# Layout of the original RDD: the three partitions requested in parallelize.
print(xx.glom().collect())

[[('A', 'Anil')], [('A', 'Ankush')], [('D', 'Dhruvi'), ('H', 'Himu')]]


In [112]:
# Layout after partitionBy: keys 'A' and 'D' in partition 0, 'H' in partition 1.
print(yx.glom().collect())

[[('A', 'Anil'), ('A', 'Ankush'), ('D', 'Dhruvi')], [('H', 'Himu')]]


In [113]:
# zip
# Return a new RDD containing pairs whose key is the item in the original RDD, and whose value is that item’s corresponding element (same partition, same index) in a second RDD
# zip(otherRDD)

In [114]:
# Source values for the zip example.
x = sc.parallelize([1, 3, 5])

In [115]:
# Square every element of x.
y = x.map(lambda n: n * n)

In [116]:
# Pair each element of x with the element at the same position in y.
z = x.zip(y)

In [117]:
# Each value ends up paired with its square.
z.collect()

[(1, 1), (3, 9), (5, 25)]