**Installation of Java and Spark**



In [45]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
!tar xf spark-3.0.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [46]:
# List the installed JVMs to confirm the exact directory name to use for JAVA_HOME.
!ls /usr/lib/jvm/

default-java		   java-11-openjdk-amd64     java-8-openjdk-amd64
java-1.11.0-openjdk-amd64  java-1.8.0-openjdk-amd64


**Defining system environment variables**

In [47]:
import os

# Tell the Spark launcher where the Java 8 JDK and the unpacked Spark 3.0.0
# distribution live. Both must be set before a SparkSession is created.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.0.0-bin-hadoop3.2",
)

**Installation of PySpark**

In [48]:
!pip install pyspark



**Importing libraries and creating the SparkSession and SparkContext**

In [49]:
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) a session with master "local[*]" named "ICP_2".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ICP_2")
    .getOrCreate()
)

# The RDD API used below is reached through the session's SparkContext.
sc = spark.sparkContext

In [50]:
def lower_clean_str(x):
  '''
  Lowercase ``x`` and delete every ASCII punctuation character from it.

  Whitespace and all other characters are left untouched, so word boundaries
  in the input are preserved.

  Source: https://stackoverflow.com/questions/53218312/pyspark-how-to-remove-punctuation-marks-and-make-lowercase-letters-in-rdd

  :param x: the text to clean (a ``str``)
  :return: the lowercased text with punctuation removed
  '''
  punc='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
  # A translation table whose third argument lists the characters to drop.
  strip_table = str.maketrans('', '', punc)
  return x.lower().translate(strip_table)

In [51]:
def make_it_titlecase(x):
  '''
  Return ``x`` with its first character uppercased and the rest lowercased.

  Empty and whitespace-only strings are returned unchanged.

  :param x: the word to convert (a ``str``)
  :return: the converted word
  '''
  # Guard clause: nothing to capitalise in a blank string.
  if not x.strip():
    return x
  head, tail = x[:1], x[1:]
  return head.upper() + tail.lower()

**Write a Spark program to group the words in a given text file based on their starting letters.**

In [52]:
# Loading the file
# The input text and both output directories live in the same Drive folder.
icp_dir = '/content/drive/My Drive/BigDataAnalyticsAndApplications/Classroom/icp-2-acikgozmehmet/ICP_Materials'
file_path = icp_dir + '/icp2.txt'
rdd = sc.textFile(file_path)

# Cleaning the punctuation marks and some special characters from the text
cleaned_rdd = rdd.map(lower_clean_str)

# Creating (Letter, word) tuples for each word.
# split() with no argument splits on runs of whitespace and never yields empty strings,
# so blank lines, repeated spaces and punctuation-only tokens (emptied by lower_clean_str)
# can no longer reach word[0] and raise IndexError, as they did with split(" ").
tuples_rdd = (
    cleaned_rdd
    .flatMap(lambda line: line.split())
    .map(lambda word: (word[0].upper(), make_it_titlecase(word)))
)

# Group the words by first letter and sort by letter once; both outputs below reuse
# this RDD, so cache it instead of repeating the shuffle for each save.
grouped_rdd = tuples_rdd.groupByKey().sortByKey().cache()

# Creating (Letter, set of words that start with the letter).
# The de-duplicated words are sorted so the saved output is the same on every run.
resultAsSet = grouped_rdd.map(lambda word: ([word[0]] + sorted(set(word[1]))))

# Creating (Letter, List of words that start with the letter); duplicates are kept.
resultAsList = grouped_rdd.map(lambda word: ([word[0]] + list(word[1])))

# coalesce(1) collapses each result to one partition so each output directory holds a single part file.
# NOTE: saveAsTextFile raises if the target directory already exists; delete the previous
# output directories before re-running this cell.

# Saving the output as Set
resultAsSet.coalesce(1).saveAsTextFile(icp_dir + '/outputAsSet')

# Saving the output as List
resultAsList.coalesce(1).saveAsTextFile(icp_dir + '/outputAsList')
