In [None]:
# installing open java development kit
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# getting spark package
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz

# unzipping
!tar xf spark-3.2.0-bin-hadoop3.2.tgz

# Installing findspark
!pip install -q findspark

In [None]:
# importing OS
import os

# setting java environment
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# setting spark environment; the archive is unpacked into the current working
# directory, so the folder name is resolved to an absolute path here and
# SPARK_HOME stays valid even if the working directory changes later
os.environ["SPARK_HOME"] = os.path.abspath("spark-3.2.0-bin-hadoop3.2")

In [None]:
# importing findspark (installed in the first cell)
import findspark

# initializing findspark; this runs before pyspark is imported in the next cell
findspark.init()

In [None]:
# SparkSession is the entry point used to create the session below
from pyspark.sql import SparkSession

# configuring a builder with the "local[*]" master, then getting (or reusing) the session
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()

# the SparkContext of the session is what the RDD cells below work with
sc = spark.sparkContext

# showing the context as the cell output
sc

In [None]:
# importing numpy
import numpy as np

# importing numpy.random
from numpy.random import uniform as u

# importing json
import json

In [None]:
# importing drive from google colab
from google.colab import drive

# mounting Google Drive at /content/gdrive so the data folder can be read through a local path
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# path to the folder that holds the input files (inside the Drive mounted at /content/gdrive)
path = '/content/gdrive/MyDrive/data/assgn2_data/data'

Question 1

In [None]:
# reading all the files present in the path into one RDD of text lines
textR = sc.textFile(path)

In [None]:
# defining a function called "process_rdd" which extracts the text part from json, lower-cases it and splits it into words
def process_rdd(rdd):
    """Turn an RDD of JSON lines into an RDD of lower-cased words.

    Args:
        rdd: RDD whose elements are strings, each holding one JSON object
            with a "text" field.

    Returns:
        An RDD with one element per whitespace-separated, lower-cased word
        of every record's "text" value. Blank lines and records whose
        "text" is missing, null or empty contribute no words.

    Raises:
        json.JSONDecodeError: raised while the RDD is evaluated if a
            non-blank line is not valid JSON.
    """

    def words_of(line):
        # blank lines carry no record; skipping them avoids a JSONDecodeError
        if not line.strip():
            return []

        # getting the "text" from the json record
        text = json.loads(line).get("text")

        # a missing or null "text" yields no words instead of failing on
        # None.lower() and aborting the whole job
        if not text:
            return []

        # lower casing the text and splitting the words
        return text.lower().split()

    # return
    return rdd.flatMap(words_of)

In [None]:
# calling process_rdd on textR to get an RDD of lower-cased words
text_split = process_rdd(textR)

In [None]:
# pairing each word with the count 1, giving (word, 1) tuples
mapped_textR = text_split.map(lambda x:(x,1))

In [None]:
# counting all the words with reduceByKey: the per-word totals stay in an RDD
# of (word, count) pairs instead of being collected to the driver as a dict
# and parallelized again, and the cell can be re-run safely because count_rdd
# is only ever bound to an RDD
count_rdd = mapped_textR.reduceByKey(lambda a, b: a + b)

In [None]:
# keeping only the words whose frequency is at least 10 (words seen fewer than 10 times are dropped)
filtered_rdd = count_rdd.filter(lambda x:x[1]>=10)

In [None]:
# number of (word, count) entries left in the rdd after filtering
Total_size = filtered_rdd.count()

Total size of the output data after filtering

In [None]:
# printing the number of entries that passed the frequency filter
print("The total size of the output data", Total_size)

The total size of the output data 110620


Question 2

In [None]:
# list of the given words whose frequency is looked up below
words = ["congress","london","washington","football"]

In [None]:
# keeping the (word, count) pairs whose word is in the above list
# NOTE(review): the lookup runs on filtered_rdd, so a listed word that occurs
# fewer than 10 times would be missing from the result - confirm this is intended
words_frequency = filtered_rdd.filter(lambda x:x[0] in words)

Frequency of the following words: congress, london, washington, football

In [None]:
# bringing the matching (word, count) pairs to the driver
collected_frequencies = words_frequency.collect()

# printing one line per word with its frequency
for x, y in collected_frequencies:
    print(f"Frequency of '{x}': {y}")

Frequency of 'congress': 28093
Frequency of 'washington': 546
Frequency of 'football': 1387
Frequency of 'london': 2389


Question 3

In [None]:
# Building the list of RDDs in one pass: entry i holds the lines of every file
# whose name starts with "2012-<month i+1, zero padded>"
rdd_list = [
    sc.textFile(path + '/' + '2012-' + str(month).zfill(2) + '*')
    for month in range(1, 13)
]

The word with maximum frequency for each month

In [None]:
# Dictionary mapping "month_XX" to the (word, count) pair of its most frequent word
most_frequent_words = {}

# Walking through the monthly RDDs in calendar order
for idx, rdd in enumerate(rdd_list):

    # key under which this month's result is stored
    month_variable_name = f'month_{str(idx + 1).zfill(2)}'

    # turning the month's records into (word, 1) pairs
    word_pairs = process_rdd(rdd).map(lambda token: (token, 1))

    # summing the ones per word
    word_counts = word_pairs.reduceByKey(lambda left, right: left + right)

    # keeping the pair with the largest count for the current month
    most_frequent_words[month_variable_name] = word_counts.max(key=lambda pair: pair[1])

# Printing the most frequent word for each month
for name, word_count in most_frequent_words.items():
    print(f"Most frequent word in {name}: {word_count[0]} (Count: {word_count[1]})")


Most frequent word in month_01: the (Count: 245413)
Most frequent word in month_02: the (Count: 266950)
Most frequent word in month_03: the (Count: 283107)
Most frequent word in month_04: the (Count: 234165)
Most frequent word in month_05: the (Count: 309021)
Most frequent word in month_06: the (Count: 267383)
Most frequent word in month_07: the (Count: 294923)
Most frequent word in month_08: the (Count: 282393)
Most frequent word in month_09: the (Count: 264449)
Most frequent word in month_10: the (Count: 279876)
Most frequent word in month_11: the (Count: 305296)
Most frequent word in month_12: the (Count: 305414)


Question 4

In [None]:
# reading the file "2012-08-01" as an RDD of text lines
text8 = sc.textFile(path + '/' + '2012-08-01')

# reading the file "2012-09-01" as an RDD of text lines
text9 = sc.textFile(path + '/' + '2012-09-01')

In [None]:
# calling the previously defined process_rdd function on both files to get their lower-cased words
processed_text8 = process_rdd(text8)
processed_text9 = process_rdd(text9)

In [None]:
# keeping the words of "2012-09-01" that never occur in file "2012-08-01";
# distinct() removes repeated words on both sides first, so fewer elements go
# through subtract() and are later collected to the driver
result = processed_text9.distinct().subtract(processed_text8.distinct())

List of words appeared on ‘2012-09-01’ but not on ‘2012-08-01’

In [None]:
# heading for the list printed below
print("the words present in 2012-09-01 and not in 2012-08-01:")

# collecting the words to the driver and removing duplicates with a set
unique_words = set(result.collect())

# printing one word per line
for unique_word in unique_words:
  print(unique_word)

Streaming output truncated to the last 5000 lines.
"deliberate
krida
carnage.
accomplices,
laptops.
sivaswaroop
signage
rameshwar
numbers,
pipes
2,758
20%,
samastipur
ps-34,
7-5
ima,
inasmuch
plaintiffs'
arupathu
0.32
finishes
debut,
future..
mechanized
designing,
wounds.
valsad.maharashtra
1.65
511
hit,
besieged
sholapur,
gupta.
antiquated
father.the
teens
interrogate
present.he
meeran
nitte
keynote
doll
shinde,
gorshkov
surjeet
younger,
vidyashilp
pollutant,"
leap
wec,
cunning
mobin,
indianorigin
jankipurm
hotelier
jet
onsichuk
foresaw
multigrain
rackets
hurt,
services.the
incomplete.
khatija
included.
dossier,
iskcon,
james.
mohalla.
suganya
mahakalapada
(pwd)
vishwa
used.in
mujeeb-ur-rehman
samastipur,
equipment,"
doorsteps
dabhade
pond
deducted
hod
government-in-exile
cd
akash
"transporting
punjab.
harshvir
justice."justice
ud.the
d.s.
sneezes,
convener.
"coveted"
1,285
iit-bbs,
fortunes
locations.he
colorado,
expunged,"
yes.
jaskirat
thakurli
race"
administrations,


Question 5: Frequency of the word ‘monsoon’ for all months

In [None]:
# the word whose occurrences are counted
word = "monsoon"

# Dictionary mapping "month_XX" to the number of times the word occurs in that month
monsoon_counts = {}

# Walking through the monthly RDDs in calendar order
for idx, rdd in enumerate(rdd_list):

    # key under which this month's count is stored
    month_variable_name = f'month_{str(idx + 1).zfill(2)}'

    # keeping only the tokens that are exactly the target word
    monsoon_hits = process_rdd(rdd).filter(lambda token: token == word)

    # storing how many such tokens the current month has
    monsoon_counts[month_variable_name] = monsoon_hits.count()

# Printing the counts of "monsoon" for each month
for name, count in monsoon_counts.items():
    print(f"Occurrences of 'monsoon' in {name}: {count}")

Occurrences of 'monsoon' in month_01: 42
Occurrences of 'monsoon' in month_02: 52
Occurrences of 'monsoon' in month_03: 81
Occurrences of 'monsoon' in month_04: 121
Occurrences of 'monsoon' in month_05: 334
Occurrences of 'monsoon' in month_06: 934
Occurrences of 'monsoon' in month_07: 904
Occurrences of 'monsoon' in month_08: 505
Occurrences of 'monsoon' in month_09: 410
Occurrences of 'monsoon' in month_10: 252
Occurrences of 'monsoon' in month_11: 113
Occurrences of 'monsoon' in month_12: 59
