### Import the required libraries then Create SparkContext

In [1]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 45 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 54.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=9968b94edc669a5e3b61c93c3d6c36f1ab1ddb6706b7bd6b972b2855db4db752
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
from pyspark import SparkContext


In [4]:
scontext=SparkContext()

### Create and display an RDD from the following list

In [5]:
list = [('JK', 22), ('V', 24), ('Jimin',24), ('RM', 25), ('J-Hope', 25), ('Suga', 26), ('Jin', 27)]

In [6]:
RDD=scontext.parallelize(list)
RDD.collect()

[('JK', 22),
 ('V', 24),
 ('Jimin', 24),
 ('RM', 25),
 ('J-Hope', 25),
 ('Suga', 26),
 ('Jin', 27)]

### Read sample1.txt file into RDD and displaying the first 4 elements

In [7]:
sample1=scontext.textFile('/content/sample1.txt')
sample1.take(4)

['Utilitatis causa amicitia est quaesita.',
 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. ',
 'Collatio igitur ista te nihil iuvat. Honesta oratio, Socratica, Platonis etiam. Primum in nostrane potestate est, quid meminerimus? ',
 'Duo Reges: constructio interrete. ']

### Count the total number of rows in RDD

In [8]:
sample1.count()

7

### Create a function to convert the data into lower case and splitting it

In [9]:
def l_split(input):
  return input.lower().split()

In [15]:
s1=sample1.map(l_split)
s1.collect()


[['utilitatis', 'causa', 'amicitia', 'est', 'quaesita.'],
 ['lorem',
  'ipsum',
  'dolor',
  'sit',
  'amet,',
  'consectetur',
  'adipiscing',
  'elit.'],
 ['collatio',
  'igitur',
  'ista',
  'te',
  'nihil',
  'iuvat.',
  'honesta',
  'oratio,',
  'socratica,',
  'platonis',
  'etiam.',
  'primum',
  'in',
  'nostrane',
  'potestate',
  'est,',
  'quid',
  'meminerimus?'],
 ['duo', 'reges:', 'constructio', 'interrete.'],
 ['quid,',
  'si',
  'etiam',
  'iucunda',
  'memoria',
  'est',
  'praeteritorum',
  'malorum?',
  'si',
  'quidem,',
  'inquit,',
  'tollerem,',
  'sed',
  'relinquo.',
  'an',
  'nisi',
  'populari',
  'fama?'],
 [],
 ['quamquam',
  'id',
  'quidem',
  'licebit',
  'iis',
  'existimare,',
  'qui',
  'legerint.',
  'summum',
  'a',
  'vobis',
  'bonum',
  'voluptas',
  'dicitur.',
  'at',
  'hoc',
  'in',
  'eo',
  'm.',
  'refert',
  'tamen,',
  'quo',
  'modo.',
  'quid',
  'sequatur,',
  'quid',
  'repugnet,',
  'vident.',
  'iam',
  'id',
  'ipsum',
  'absurdu

[['utilitatis', 'causa', 'amicitia', 'est', 'quaesita.'],
 ['lorem',
  'ipsum',
  'dolor',
  'sit',
  'amet,',
  'consectetur',
  'adipiscing',
  'elit.'],
 ['collatio',
  'igitur',
  'ista',
  'te',
  'nihil',
  'iuvat.',
  'honesta',
  'oratio,',
  'socratica,',
  'platonis',
  'etiam.',
  'primum',
  'in',
  'nostrane',
  'potestate',
  'est,',
  'quid',
  'meminerimus?'],
 ['duo', 'reges:', 'constructio', 'interrete.'],
 ['quid,',
  'si',
  'etiam',
  'iucunda',
  'memoria',
  'est',
  'praeteritorum',
  'malorum?',
  'si',
  'quidem,',
  'inquit,',
  'tollerem,',
  'sed',
  'relinquo.',
  'an',
  'nisi',
  'populari',
  'fama?']]

### Filter the stopwords from the previous text

In [12]:
stopwords = ['a','all','the','as','is','am','an','and',
             'be','been','from','had','I','I’d','why','with']
# Hint: you may need use flatMap

In [16]:
s2=s1.filter(lambda x: x not in stopwords)

In [17]:
s2.collect()

[['utilitatis', 'causa', 'amicitia', 'est', 'quaesita.'],
 ['lorem',
  'ipsum',
  'dolor',
  'sit',
  'amet,',
  'consectetur',
  'adipiscing',
  'elit.'],
 ['collatio',
  'igitur',
  'ista',
  'te',
  'nihil',
  'iuvat.',
  'honesta',
  'oratio,',
  'socratica,',
  'platonis',
  'etiam.',
  'primum',
  'in',
  'nostrane',
  'potestate',
  'est,',
  'quid',
  'meminerimus?'],
 ['duo', 'reges:', 'constructio', 'interrete.'],
 ['quid,',
  'si',
  'etiam',
  'iucunda',
  'memoria',
  'est',
  'praeteritorum',
  'malorum?',
  'si',
  'quidem,',
  'inquit,',
  'tollerem,',
  'sed',
  'relinquo.',
  'an',
  'nisi',
  'populari',
  'fama?'],
 [],
 ['quamquam',
  'id',
  'quidem',
  'licebit',
  'iis',
  'existimare,',
  'qui',
  'legerint.',
  'summum',
  'a',
  'vobis',
  'bonum',
  'voluptas',
  'dicitur.',
  'at',
  'hoc',
  'in',
  'eo',
  'm.',
  'refert',
  'tamen,',
  'quo',
  'modo.',
  'quid',
  'sequatur,',
  'quid',
  'repugnet,',
  'vident.',
  'iam',
  'id',
  'ipsum',
  'absurdu

['utilitatis',
 'causa',
 'amicitia',
 'est',
 'quaesita.',
 'lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet,']

### Filter the words starting with ‘c’

In [27]:
s3=s2.flatMap(lambda x: x) 
s4=s3.filter(lambda x:x[0]=='c')

In [29]:
s4.collect()

['causa', 'consectetur', 'collatio', 'constructio']

['causa', 'consectetur', 'collatio', 'constructio']

### Reduce the data by key and sum it (use the data from the following list)

In [30]:
list = [('JK', 22), ('V', 24), ('Jimin',24), ('RM', 25)
        , ('J-Hope', 25), ('Suga', 26), ('Jin', 27)
       , ('J-Hope', 12), ('Suga', 25), ('Jin', 34)
       , ('JK', 32), ('V', 44), ('Jimin',14), ('RM', 35)]
# Hint: use reduceByKey

In [31]:
rlst=scontext.parallelize(list)

In [33]:
rlst1=rlst.reduceByKey(lambda x,y : x+y)
rlst1.collect()

[('Suga', 51),
 ('Jin', 61),
 ('JK', 54),
 ('V', 68),
 ('Jimin', 38),
 ('RM', 60),
 ('J-Hope', 37)]

[('Suga', 51),
 ('Jin', 61),
 ('JK', 54),
 ('V', 68),
 ('Jimin', 38),
 ('RM', 60),
 ('J-Hope', 37)]

### Creat some key value pairs RDDs

In [36]:
rdd1 =scontext.parallelize([('a',2),('b',3)])
rdd2 =scontext.parallelize([('a',9),('b',7),('c',10)])

### Perform Join operation on the RDDs (rdd1,rdd2)

In [38]:
rddf=rdd1.join(rdd2)
rddf.collect()

[('b', (3, 7)), ('a', (2, 9))]

[('b', (3, 7)), ('a', (2, 9))]