In [1]:
!pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.9/316.9 MB 2.6 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=63f82f40ca33499f1359b5fa01de4e2eff5b13c2a12c517c0020b653363e0cb4
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
from pyspark.sql import SparkSession

# Create the session (or reuse one that already exists) with a local master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("NewsDataAnalysis")
    .getOrCreate()
)


In [3]:
# Read the input as an RDD of lines; each line is treated as one news item.
news_rdd = spark.sparkContext.textFile("news.txt")

total_news = news_rdd.count()

# Split every line on whitespace into individual tokens. The result is cached
# because it feeds several actions (count and take here, and the word-frequency
# pipeline in the next cell); without caching each action would re-read and
# re-split the file.
words_rdd = news_rdd.flatMap(lambda line: line.split()).cache()
total_words = words_rdd.count()
first_ten_words = words_rdd.take(10)

print("Total news items:", total_news)
print("Total words:", total_words)
print("First ten words:", first_ten_words)


Total news items: 12
Total words: 2787
First ten words: ['JAPAN', 'TO', 'REVISE', 'LONG', '-', 'TERM', 'ENERGY', 'DEMAND', 'DOWNWARDS', 'The']


In [4]:
# Fold case so that e.g. "The" and "the" are counted as the same word.
lower_words_rdd = words_rdd.map(lambda token: token.lower())

# Classic word count: emit (word, 1) for every token, then sum per word.
word_counts = (
    lower_words_rdd
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)

# Order the (word, count) pairs from most to least frequent.
sorted_word_counts = word_counts.sortBy(lambda pair: pair[1], ascending=False)
top_ten_words = sorted_word_counts.take(10)
print("Top ten words:", top_ten_words)


Top ten words: [('.', 130), ('the', 123), (',', 102), ('to', 84), ('of', 64), ('said', 55), ('and', 55), ('in', 54), ('a', 45), ('s', 33)]


In [5]:
def is_alpha(word):
    """Return True when ``word`` is non-empty and every character is alphabetic.

    Thin wrapper around ``word.isalpha()``; punctuation, digits and the empty
    string all yield False.
    """
    only_letters = word.isalpha()
    return only_letters
# Keep only the (word, count) pairs whose word passes is_alpha, which drops
# punctuation tokens such as "." and ",".
filtered_words = sorted_word_counts.filter(lambda pair: is_alpha(pair[0]))
top_ten_filtered_words = filtered_words.take(10)
print("Top ten filtered words:", top_ten_filtered_words)


Top ten filtered words: [('the', 123), ('to', 84), ('of', 64), ('said', 55), ('and', 55), ('in', 54), ('a', 45), ('s', 33), ('on', 28), ('for', 22)]


In [6]:
# filtered_words holds one (word, count) pair per distinct word, so emitting 1
# per pair tallies how many distinct words begin with each letter; the
# per-word occurrence counts are not used here.
first_letter_counts = (
    filtered_words
    .map(lambda pair: (pair[0][0], 1))
    .reduceByKey(lambda left, right: left + right)
)

# Rank letters from most to fewest words and keep the top five.
letters_by_count = first_letter_counts.sortBy(lambda pair: pair[1], ascending=False)
top_five_letters = letters_by_count.take(5)
print("Top five letters:", top_five_letters)


Top five letters: [('c', 76), ('s', 74), ('p', 68), ('a', 57), ('r', 54)]
