In [1]:
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Environment for the local Spark runtime. These are set before the context is
# created so that the JVM / Python worker launch can pick them up.
# NOTE(review): both values are machine-specific -- adjust them for your install.
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_261.jdk/Contents/Home"
os.environ["PYSPARK_PYTHON"] = "python3.7"

# getOrCreate() reuses an already-running context, so re-executing this cell in
# a live kernel no longer fails with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("abc"))

# NOTE(review): SQLContext is deprecated in Spark 3.x in favour of SparkSession;
# kept here so the `sql` name stays available to any later cell.
sql = SQLContext(sc)

# Creating RDD 
1. Parallelize
2. Passing a list of values 
3. Reading from a text file 
4. Using range operation


In [9]:
# parallelize(): turn a local Python list into an RDD; collect() brings the elements back as a list
rdd1 = sc.parallelize([1,2,3,40,50,60,70,80,90])
rdd1.collect()

[1, 2, 3, 40, 50, 60, 70, 80, 90]

In [4]:
# Passing a list of values: build the list first, then hand the variable to parallelize()
ls = [1,2,3,4,5,6,7,8,9,10,11]
rdd2=sc.parallelize(ls)
rdd2.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [6]:
# Reading from a text file: textFile() yields an RDD of strings (one element per line of the file)
# NOTE(review): absolute, machine-specific path -- point this at your own copy of the file
rdd3=sc.textFile('/Users/ansu/Spark_Luminar/sample_spark_text.txt')
rdd3.collect()

['1', '2', 'Df', 'Rf', '3', '5', '6', '7', 'Ty', 'Hu', 'B', 'T', 'G']

In [7]:
# Using the range operation: sc.range(start, end) creates an RDD of integers from start up to, but excluding, end
rdd4 = sc.range(1,15)
rdd4.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

# TRANSFORMATIONS on RDD


In [10]:
# map(): apply a function to every element, producing exactly one output per input (here: the square)
rdd_map = rdd1.map(lambda n: n ** 2)
rdd_map.collect()

[1, 4, 9, 1600, 2500, 3600, 4900, 6400, 8100]

In [12]:
# filter(): keep only the elements for which the predicate is true (here: the value 50)
rdd_filter = rdd1.filter(lambda value: value == 50)
rdd_filter.collect()

[50]

In [18]:
# flatMap(): split every sentence into words and flatten the pieces into a single RDD of words
rdd_text = sc.parallelize([
    'I am sad',
    'I am happy',
    'I am angry',
])
rdd_flatmap = rdd_text.flatMap(lambda sentence: sentence.split())
rdd_flatmap.collect()

['I', 'am', 'sad', 'I', 'am', 'happy', 'I', 'am', 'angry']

In [19]:
# map() vs flatMap(): map() keeps one output per input, so each sentence becomes its own list of words
rdd_map1 = rdd_text.map(lambda sentence: sentence.split())
rdd_map1.collect()

[['I', 'am', 'sad'], ['I', 'am', 'happy'], ['I', 'am', 'angry']]

In [22]:
# union(): concatenate two RDDs; duplicates are kept (5 and 6 appear twice in the result)
rdd5 = sc.parallelize([1,2,3,4,5,6])
rdd6 = sc.parallelize([5,6,7,8,9,10])
rdd_union = rdd5.union(rdd6)
rdd_union.collect()

[1, 2, 3, 4, 5, 6, 5, 6, 7, 8, 9, 10]

In [23]:
# intersection(): keep only the elements present in both RDDs; the result is not sorted (this run returned [6, 5])
rdd_intersection = rdd5.intersection(rdd6)
rdd_intersection.collect()

[6, 5]

In [25]:
# distinct(): drop duplicate elements; it operates on a single RDD at a time.
# rdd5 contains no duplicates, so the result here equals the input.
rdd_distinct = rdd5.distinct()
rdd_distinct.collect()

[1, 2, 3, 4, 5, 6]

In [34]:
# groupByKey(): gather all values that share a key into a single iterable per key.
x = sc.parallelize([
    ("USA", 1), ("USA", 2), ("India", 1),
    ("UK", 1), ("India", 4), ("India", 9),
    ("USA", 8), ("USA", 3), ("India", 4),
    ("UK", 6), ("UK", 9), ("UK", 5)])

rdd_gpbykey = x.groupByKey()
# groupByKey() yields (key, iterable) pairs whose values do not display as a
# plain list, so materialize each group with list() before collecting.
rdd_result = rdd_gpbykey.map(lambda pair: [pair[0], list(pair[1])])
rdd_result.collect()

[['USA', [1, 2, 8, 3]], ['UK', [1, 6, 9, 5]], ['India', [1, 4, 9, 4]]]

In [30]:
# Alternative display: collect the grouped pairs and print each key next to its values as a list
for key, values in rdd_gpbykey.collect():
    print(key, list(values))

USA [1, 2, 8, 3]
UK [1, 6, 9, 5]
India [1, 4, 9, 4]


In [38]:
# reduceByKey(): merge the values of each key with the given function (here: addition).
x = sc.parallelize([("a", 1), ("b", 1), ("a", 1), ("a", 1),
                    ("b", 1), ("b", 1), ("b", 1), ("b", 1)])
rdd_rbykey = x.reduceByKey(lambda p, q: p + q)
rdd_rbykey.collect()

[('a', 3), ('b', 5)]

# ACTIONS

In [39]:
# collect(): return every element of the RDD to the driver as a Python list
rdd10 = sc.range(1,21)
rdd10.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [41]:
# take(n): return the first n elements of the RDD as a list
rdd10.take(5)

[1, 2, 3, 4, 5]

In [43]:
# first(): return the first element of the RDD
rdd10.first()

1

In [45]:
# count(): return the number of elements in the RDD
rdd10.count()

20

In [49]:
# reduce(): fold all elements into a single value with a binary function (here: addition).
# The result is a plain Python value, not an RDD.
rdd11 = sc.parallelize([1, 2, 3])
rdd_reduce = rdd11.reduce(lambda left, right: left + right)
print(rdd_reduce)

6


# WORD COUNT USING TRANSFORMATION AND ACTION    

In [52]:
rdd_text = sc.parallelize(['I am sad', 'I am happy', 'I am angry'])

# Word count with groupByKey: emit (word, 1) pairs, group them by word, then sum each group's ones
rdd_wc_gbk = (
    rdd_text
    .flatMap(lambda line: line.split())
    .map(lambda word: (word.lower(), 1))
    .groupByKey()
    .map(lambda group: (group[0], sum(group[1])))
)
rdd_wc_gbk.collect()


[('i', 3), ('am', 3), ('sad', 1), ('happy', 1), ('angry', 1)]

In [None]:
# Word count with reduceByKey: emit (word, 1) pairs and add up the counts per word
rdd_wc = (
    rdd_text
    .flatMap(lambda line: line.split())
    .map(lambda word: (word.lower(), 1))
    .reduceByKey(lambda total, count: total + count)
)
rdd_wc.collect()