In [None]:
# Mount the user's Google Drive inside the Colab VM at /content/drive.
# NOTE(review): none of the visible cells read from /content/drive (the text
# file is loaded from /content directly) — confirm whether this mount is needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install a headless Java 8 JDK for Spark; -qq and the /dev/null redirect
# silence the installer's normal output.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null


In [None]:
# Download the Spark 3.0.0 (Hadoop 3.2 build) tarball into the current directory.
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz



In [None]:
# Unpack the downloaded Spark tarball; this creates the directory that
# SPARK_HOME is pointed at in a later cell.
!tar xf spark-3.0.0-bin-hadoop3.2.tgz


In [None]:
!pip install -q findspark


In [None]:
import os

# Tell Spark where the JDK and the unpacked Spark distribution live.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.0.0-bin-hadoop3.2",
)

In [None]:
# Make the pyspark package importable; must run after SPARK_HOME is set and
# before the first `import pyspark`.
import findspark
findspark.init()

In [None]:
# Sanity check: display the Spark location that findspark resolved.
findspark.find()

'/content/spark-3.0.0-bin-hadoop3.2/python/pyspark'

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's SparkSession: local master, application name
# "Colab", and the Spark UI pinned to port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

 Creating an RDD using the parallelize() method

In [None]:
# Distribute the integers 1 through 12 as an RDD.
data = list(range(1, 13))
rdd = spark.sparkContext.parallelize(data)


In [None]:
# Bring every element of the RDD back to the driver as a Python list and display it.
rdd.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [None]:

# Create an RDD from an external data source: each line of the text file
# becomes one record. The path includes the ".txt" extension, matching the
# later textFile() cell that loads the same file.
rdd2 = spark.sparkContext.textFile("/content/textFile.txt")


In [None]:
# Materialise the file-backed RDD on the driver. textFile() is lazy, so a
# missing or wrong path only surfaces here, when the action runs.
rdd2.collect()

In [None]:
# An RDD with no elements that is nevertheless split into 4 partitions.
rdd = spark.sparkContext.parallelize([], numSlices=4)

In [None]:
# Collecting the empty RDD yields an empty list.
rdd.collect()

[]

In [None]:
data = [1,2,3,4,5,6,7,8,9,10,11,12]
# Pass the list itself so that each number becomes its own record, spread over
# 4 partitions. Wrapping it as [data] would instead create an RDD whose only
# element is the whole list.
rdd = spark.sparkContext.parallelize(data, 4)

In [None]:
# Display the contents of the 4-partition RDD created in the previous cell.
rdd.collect()

In [None]:

# Report how many partitions the current `rdd` is split into.
print("initial partition count:"+str(rdd.getNumPartitions()))
# Recorded output for this run: initial partition count:4


initial partition count:4


# Repartition & Coalesce

In [None]:
# Redistribute `rdd` into 4 partitions and report the resulting partition count.
reparRdd = rdd.repartition(4)
print("re-partition count", reparRdd.getNumPartitions(), sep="")

re-partition count4


In [None]:
# Parallelize without an explicit partition count and report how many
# partitions were chosen by default.
# NOTE(review): (0,20) is a two-element tuple, so this RDD holds just 0 and 20;
# if the numbers 0..19 were intended, range(0, 20) is needed — TODO confirm.
rdd2 = spark.sparkContext.parallelize((0,20))
# NOTE(review): the label says "local[5]", but the session in this notebook is
# built with master("local"); the recorded output shows 1 partition.
print("From local[5]"+str(rdd2.getNumPartitions()))

From local[5]1


In [None]:
# Show the elements of the tuple-backed RDD (recorded output: [0, 20]).
rdd2.collect()

[0, 20]

In [None]:
# Load the text file as an RDD of lines; this is the input to the word-count
# pipeline in the cells that follow.
rdd1 = spark.sparkContext.textFile("/content/textFile.txt")

In [None]:
# Display every line of the loaded file.
rdd1.collect()

# RDD TRANSFORMATIONS

flatMap – The flatMap() transformation applies a function to every record and then flattens the results into a new RDD. In the example below, it first splits each record on spaces and then flattens the resulting lists, so the resulting RDD holds a single word in each record.



In [None]:
# Split every line on single spaces and flatten the pieces into one RDD of tokens.
rdd2 = rdd1.flatMap(lambda line: line.split(" "))

In [None]:
# Display all tokens produced by the split.
# NOTE(review): collect() pulls the entire RDD to the driver and prints a very
# long list; take(n) would show a short sample instead.
rdd2.collect()

['Project',
 'Gutenberg’s',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland',
 'by',
 'Lewis',
 'Carroll',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland',
 'by',
 'Lewis',
 'Carroll',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'Project',
 'Gutenberg’s',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland',
 'by',
 'Lewis',
 'Carroll',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland',
 'by',
 'Lewis',
 'Carroll',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'This',
 'eBook',
 'is',
 

map – The map() transformation is used to apply any complex operation, such as adding a column or updating a column; the output of a map transformation always has the same number of records as its input.

In our word count example, we add a new value of 1 for each word. The result is an RDD of key-value pairs (Python tuples), with the word (a str) as the key and 1 (an int) as the value.



In [None]:
# Pair every token with an initial count of 1.
rdd3 = rdd2.map(lambda word: (word, 1))

In [None]:
# Display all (word, 1) pairs.
rdd3.collect()

[('Project', 1),
 ('Gutenberg’s', 1),
 ('Alice’s', 1),
 ('Adventures', 1),
 ('in', 1),
 ('Wonderland', 1),
 ('by', 1),
 ('Lewis', 1),
 ('Carroll', 1),
 ('This', 1),
 ('eBook', 1),
 ('is', 1),
 ('for', 1),
 ('the', 1),
 ('use', 1),
 ('of', 1),
 ('anyone', 1),
 ('anywhere', 1),
 ('at', 1),
 ('no', 1),
 ('cost', 1),
 ('and', 1),
 ('with', 1),
 ('Alice’s', 1),
 ('Adventures', 1),
 ('in', 1),
 ('Wonderland', 1),
 ('by', 1),
 ('Lewis', 1),
 ('Carroll', 1),
 ('This', 1),
 ('eBook', 1),
 ('is', 1),
 ('for', 1),
 ('the', 1),
 ('use', 1),
 ('of', 1),
 ('anyone', 1),
 ('anywhere', 1),
 ('at', 1),
 ('no', 1),
 ('cost', 1),
 ('and', 1),
 ('with', 1),
 ('This', 1),
 ('eBook', 1),
 ('is', 1),
 ('for', 1),
 ('the', 1),
 ('use', 1),
 ('of', 1),
 ('anyone', 1),
 ('anywhere', 1),
 ('at', 1),
 ('no', 1),
 ('cost', 1),
 ('and', 1),
 ('with', 1),
 ('Project', 1),
 ('Gutenberg’s', 1),
 ('Alice’s', 1),
 ('Adventures', 1),
 ('in', 1),
 ('Wonderland', 1),
 ('by', 1),
 ('Lewis', 1),
 ('Carroll', 1),
 ('This', 1)

reduceByKey – The reduceByKey() transformation merges the values for each key using the specified function. In our example, it adds up the 1s for each word, so the resulting RDD contains each unique word together with its count.



In [None]:
# Add up the per-occurrence 1s for each distinct word.
rdd4 = rdd3.reduceByKey(lambda left, right: left + right)

In [None]:
# Display each distinct word with its total count.
rdd4.collect()

[('Project', 9),
 ('Gutenberg’s', 9),
 ('Alice’s', 18),
 ('Adventures', 18),
 ('in', 18),
 ('Wonderland', 18),
 ('by', 18),
 ('Lewis', 18),
 ('Carroll', 18),
 ('This', 27),
 ('eBook', 27),
 ('is', 27),
 ('for', 27),
 ('the', 27),
 ('use', 27),
 ('of', 27),
 ('anyone', 27),
 ('anywhere', 27),
 ('at', 27),
 ('no', 27),
 ('cost', 27),
 ('and', 27),
 ('with', 27),
 ('', 1)]

sortByKey – The sortByKey() transformation sorts the RDD elements by key. In our example, we first convert the RDD of (word, count) pairs into (count, word) pairs using a map transformation and then apply sortByKey, which sorts on the integer count. Finally, collect() returns all the words in the RDD with their counts as key-value pairs.

In [None]:
# Flip each (word, count) pair to (count, word) so the count becomes the key,
# then sort ascending by that key and display the result.
rdd5 = (
    rdd4
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey()
)
rdd5.collect()

[(1, ''),
 (9, 'Project'),
 (9, 'Gutenberg’s'),
 (18, 'Alice’s'),
 (18, 'Adventures'),
 (18, 'in'),
 (18, 'Wonderland'),
 (18, 'by'),
 (18, 'Lewis'),
 (18, 'Carroll'),
 (27, 'This'),
 (27, 'eBook'),
 (27, 'is'),
 (27, 'for'),
 (27, 'the'),
 (27, 'use'),
 (27, 'of'),
 (27, 'anyone'),
 (27, 'anywhere'),
 (27, 'at'),
 (27, 'no'),
 (27, 'cost'),
 (27, 'and'),
 (27, 'with')]

filter – The filter() transformation is used to filter the records in an RDD. In our example, we keep only the words that contain the substring “an”.



In [None]:
# Keep only the (count, word) records whose word contains the substring 'an',
# then display them.
rdd6 = rdd5.filter(lambda record: 'an' in record[1])
rdd6.collect()

[(18, 'Wonderland'), (27, 'anyone'), (27, 'anywhere'), (27, 'and')]

# RDD ACTIONS

count() – Returns the number of records in an RDD



In [None]:
# Number of records that survived the filter.
print(f"count :{rdd6.count()}")

count :4


first() – Returns the first record.



In [None]:
# Fetch the first (count, word) record and print it as "count,word".
firstRec = rdd6.first()
print(f"first record:{firstRec[0]},{firstRec[1]}")

first record:18,Wonderland


max() – Returns max record.



In [None]:
# The records are (count, word) tuples, so max() compares the count first and
# the word second.
datMax = rdd6.max()
print(f"Max Record:{datMax[0]},{datMax[1]}")

Max Record:27,anywhere


reduce() – Reduces the records to a single record; we can use this to count or sum.



In [None]:
# Fold all records into one: the counts are summed, and the word carried along
# is whichever one ends up on the left-hand side of the fold.
totalWordCount = rdd6.reduce(lambda acc, rec: (acc[0] + rec[0], acc[1]))
print(f"Reduce Record:{totalWordCount[0]}")

Reduce Record:99


take(n) – Returns the first n records, where n is the number passed as the argument.

saveAsTextFile() – Using the saveAsTextFile action, we can write the RDD to a text file.





---



In [None]:
import shutil

# saveAsTextFile() writes a directory of part files and raises if that
# directory already exists, so remove any output left by a previous run.
# This keeps the cell re-runnable; ignore_errors covers the first run, when
# there is nothing to delete.
shutil.rmtree("/content/WordCount", ignore_errors=True)
rdd6.saveAsTextFile("/content/WordCount")