In [1]:
# Creating the Spark Session

from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession for this notebook, running on the local master
# with dynamic allocation switched off.
spark = (
    SparkSession.builder
    .appName("spark_core_APIs_1")
    .config("spark.dynamicAllocation.enabled", "False")
    .master("local[*]")
    .getOrCreate()
)

# Last expression of the cell: show the session summary as the cell output.
spark

In [2]:
# Load the orders file as an RDD of text lines.
# NOTE(review): the absolute path is machine-specific; consider a configurable data directory.
orders_rdd = spark.sparkContext.textFile("/home/jupyter/module_wise_notebooks/module_4/part-00000")

In [7]:
# Number of distinct lines in the orders file: distinct() drops duplicates, count() runs the job.
orders_rdd.distinct().count()

68883

In [8]:
words = ("big","Data","Is","SUPER","Interesting","BIG","data","IS","A","Trending","technology") # a local Python tuple (not a file and not yet an RDD)

In [9]:
# Turn the local tuple into an RDD; no partition count is passed, so Spark picks one.
words_rdd = spark.sparkContext.parallelize(words)

In [10]:
# Peek at the first two elements instead of collecting the whole RDD.
words_rdd.take(2)

['big', 'Data']

In [11]:
# How many partitions did parallelize() create for the words RDD?
words_rdd.getNumPartitions()

4

In [12]:
# The context's default parallelism; compare it with the partition count of words_rdd.
spark.sparkContext.defaultParallelism

4

In [13]:
# Re-reads the same orders file that was already loaded into orders_rdd in an earlier cell.
orders_rdd = spark.sparkContext.textFile("/home/jupyter/module_wise_notebooks/module_4/part-00000") # file is about 3 MB (per the original note)

In [14]:
# How many partitions did textFile() create for the orders file?
orders_rdd.getNumPartitions()

2

In [16]:
# The context's default minimum partition count; presumably why the file above was split in two (confirm).
spark.sparkContext.defaultMinPartitions

2

In [17]:
# Look at the first five raw words before any normalisation.
words_rdd.take(5)

['big', 'Data', 'Is', 'SUPER', 'Interesting']

### We want to count the frequency of each word

In [18]:
# Lower-case every word so that variants such as "BIG" and "big" end up under the same key.
normalized_words_rdd = words_rdd.map(lambda raw_word: raw_word.lower())

In [19]:
# collect() returns every element to the notebook; acceptable here because the RDD holds only 11 words.
normalized_words_rdd.collect()

['big',
 'data',
 'is',
 'super',
 'interesting',
 'big',
 'data',
 'is',
 'a',
 'trending',
 'technology']

In [20]:
 mapped_words_rdd = normalized_words_rdd.map(lambda word: (word,1))

In [21]:
# Inspect the (word, 1) pairs produced by the map step.
mapped_words_rdd.collect()

[('big', 1),
 ('data', 1),
 ('is', 1),
 ('super', 1),
 ('interesting', 1),
 ('big', 1),
 ('data', 1),
 ('is', 1),
 ('a', 1),
 ('trending', 1),
 ('technology', 1)]

In [22]:
# Add up the 1s for each word. The lambda receives two partial counts for the
# same key (not two words) and returns their sum.
reduced_words_rdd = mapped_words_rdd.reduceByKey(
    lambda count_a, count_b: count_a + count_b
)

In [24]:
# Final (word, frequency) pairs; the order of the keys is not the input order.
reduced_words_rdd.collect()

[('super', 1),
 ('interesting', 1),
 ('trending', 1),
 ('technology', 1),
 ('a', 1),
 ('is', 2),
 ('big', 2),
 ('data', 2)]

In [25]:
# The source RDD still holds the original mixed-case words: the map/reduce steps built new RDDs.
words_rdd.collect()

['big',
 'Data',
 'Is',
 'SUPER',
 'Interesting',
 'BIG',
 'data',
 'IS',
 'A',
 'Trending',
 'technology']

## Chaining all the operations

In [26]:
# The same word count as the step-by-step cells, written as one chained expression.
(
    spark.sparkContext
    .parallelize(words)
    .map(lambda w: w.lower())                        # normalise case
    .map(lambda w: (w, 1))                           # (word, 1) pairs
    .reduceByKey(lambda left, right: left + right)   # sum the counts per word
    .collect()
)

[('super', 1),
 ('interesting', 1),
 ('trending', 1),
 ('technology', 1),
 ('a', 1),
 ('is', 2),
 ('big', 2),
 ('data', 2)]

In [27]:
# Trailing-dot layout of the same chain; the enclosing parentheses take the place
# of the backslash line continuations.
(
    spark.
    sparkContext.
    parallelize(words).
    map(lambda token: token.lower()).
    map(lambda token: (token, 1)).
    reduceByKey(lambda total, count: total + count).
    collect()
)

[('super', 1),
 ('interesting', 1),
 ('trending', 1),
 ('technology', 1),
 ('a', 1),
 ('is', 2),
 ('big', 2),
 ('data', 2)]

In [30]:
# Leading-dot layout of the same chain: one transformation per continued line.
spark.sparkContext \
    .parallelize(words) \
    .map(lambda term: term.lower()) \
    .map(lambda term: (term, 1)) \
    .reduceByKey(lambda acc, n: acc + n) \
    .collect()

[('super', 1),
 ('interesting', 1),
 ('trending', 1),
 ('technology', 1),
 ('a', 1),
 ('is', 2),
 ('big', 2),
 ('data', 2)]

In [31]:
# The same pipeline pointed at the orders file. Each element is a whole line of
# text, so every key is an entire lower-cased record rather than a single word.
(
    spark.sparkContext
    .textFile("/home/jupyter/module_wise_notebooks/module_4/part-00000")
    .map(lambda line: line.lower())
    .map(lambda line: (line, 1))
    .reduceByKey(lambda left, right: left + right)
    .take(5)
)

[('1,2013-07-25 00:00:00.0,11599,closed', 1),
 ('2,2013-07-25 00:00:00.0,256,pending_payment', 1),
 ('3,2013-07-25 00:00:00.0,12111,complete', 1),
 ('4,2013-07-25 00:00:00.0,8827,closed', 1),
 ('9,2013-07-25 00:00:00.0,5657,pending_payment', 1)]

In [32]:
# Keep the word-count RDD in a variable instead of collecting it straight away.
resultant_rdd = (
    spark.sparkContext
    .parallelize(words)
    .map(lambda w: w.lower())
    .map(lambda w: (w, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [33]:
# Materialise the stored word-count RDD and show the (word, frequency) pairs.
resultant_rdd.collect()

[('super', 1),
 ('interesting', 1),
 ('trending', 1),
 ('technology', 1),
 ('a', 1),
 ('is', 2),
 ('big', 2),
 ('data', 2)]