In [1]:
from pyspark.sql import SparkSession

# Configure the session builder for this lab: fixed application name,
# Spark master set to "local[*]".
builder = SparkSession.builder.appName("lab02_Spark_RDD_Core").master("local[*]")

# getOrCreate() hands back an already-running session when there is one,
# so re-running this cell does not start a second session.
spark = builder.getOrCreate()

# Bare last expression: the notebook renders the session summary.
spark

26/02/10 18:44:54 WARN Utils: Your hostname, MacBook-Pro-de-ABIDHIAF.local resolves to a loopback address: 127.0.0.1; using 172.17.12.243 instead (on interface en0)
26/02/10 18:44:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/10 18:44:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 50638)
Traceback (most recent call last):
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socketserver.py", line 747, in __init__
    self.handle()
  File "/Users/abidhiafahmed/projets/realtime-spark/.

In [2]:
# Small in-memory sample: the integers 1 through 9.
data = list(range(1, 10))

# Turn the Python list into an RDD through the session's SparkContext.
rdd = spark.sparkContext.parallelize(data)

# Bare last expression: display the RDD's repr.
rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:289

In [3]:
# Fetch every element (be careful with collect() on large RDDs).
all_elements = rdd.collect()
print("RDD content:", all_elements)

# Size of the RDD.
element_count = rdd.count()
print("Number of elements:", element_count)

# Only the first 3 elements.
first_three = rdd.take(3)
print("First three elements:", first_three)

RDD content: [1, 2, 3, 4, 5, 6, 7, 8, 9]
Number of elements: 9
First three elements: [1, 2, 3]


In [4]:
# Derive a new RDD holding the square of every element of `rdd`.
rdd_square = rdd.map(lambda value: value * value)

# Print the source and the derived RDD one after the other for comparison.
for label, dataset in (("Original RDD:", rdd), ("Squared RDD:", rdd_square)):
    print(label, dataset.collect())

Original RDD: [1, 2, 3, 4, 5, 6, 7, 8, 9]
Squared RDD: [1, 4, 9, 16, 25, 36, 49, 64, 81]


In [5]:
# Derive a new RDD that keeps only the even elements of `rdd`.
rdd_even = rdd.filter(lambda value: value % 2 == 0)

original_values = rdd.collect()
print("Original RDD:", original_values)

even_values = rdd_even.collect()
print("Even numbers:", even_values)

Original RDD: [1, 2, 3, 4, 5, 6, 7, 8, 9]
Even numbers: [2, 4, 6, 8]


In [6]:
# Three short sample sentences.
phrases = [
    "Hello Spark",
    "RDDs are powerful",
    "Distributed processing is useful",
]
rdd_phrases = spark.sparkContext.parallelize(phrases)

# flatMap: each phrase is split on single spaces and the resulting words
# end up in one flat RDD (one element per word, not one list per phrase).
rdd_words = rdd_phrases.flatMap(lambda phrase: phrase.split(" "))

collected_phrases = rdd_phrases.collect()
print("Phrases:", collected_phrases)

collected_words = rdd_words.collect()
print("Words:", collected_words)

Phrases: ['Hello Spark', 'RDDs are powerful', 'Distributed processing is useful']
Words: ['Hello', 'Spark', 'RDDs', 'are', 'powerful', 'Distributed', 'processing', 'is', 'useful']


In [7]:
# How many words the flattened RDD contains.
word_total = rdd_words.count()
print("Total number of words:", word_total)

# Peek at the first five words.
leading_words = rdd_words.take(5)
print("A few words:", leading_words)

Total number of words: 9
A few words: ['Hello', 'Spark', 'RDDs', 'are', 'powerful']


In [8]:
# Fold all elements of `rdd` into a single value by pairwise addition.
total = rdd.reduce(lambda accumulated, value: accumulated + value)

print("Sum of RDD elements:", total)

Sum of RDD elements: 45


In [9]:
# Tally how many times each distinct word occurs in `rdd_words`;
# the tallies come back as a dictionary-like object keyed by word.
word_counts = rdd_words.countByValue()

print("Count by word:", word_counts)

Count by word: defaultdict(<class 'int'>, {'Hello': 1, 'Spark': 1, 'RDDs': 1, 'are': 1, 'powerful': 1, 'Distributed': 1, 'processing': 1, 'is': 1, 'useful': 1})


In [20]:
import os

# Directory that contains Reviews.csv. Export REALTIME_SPARK_DATA_DIR to run
# the notebook on another machine without editing this cell; when the variable
# is unset, the original hard-coded location is used.
DATA_DIR = os.environ.get(
    "REALTIME_SPARK_DATA_DIR",
    "/Users/abidhiafahmed/projets/realtime-spark/data",
)
file_path = os.path.join(DATA_DIR, "Reviews.csv")

# Load the file as an RDD of text lines (one element per line).
rdd_lines = spark.sparkContext.textFile(file_path)

print("Number of lines in the RDD:", rdd_lines.count())
print("Sample lines:", rdd_lines.take(5))

Number of lines in the RDD: 568455
Sample lines: ['Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text', '1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.', '2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,"Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as ""Jumbo""."', '3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all","This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Fil

In [21]:
import re

def clean_line(line: str) -> list[str]:
    """Lowercase a line of text and break it into word tokens.

    Every run of characters that is not an ASCII letter, a digit, or one of
    the accented Latin letters listed in the pattern is collapsed into a
    single space; the result is then split on whitespace.

    Args:
        line: Raw text line.

    Returns:
        The tokens in order of appearance; an empty list when the line
        contains no letter or digit.
    """
    normalized = re.sub(r"[^a-zA-ZÀ-ÖØ-öø-ÿ0-9]+", " ", line.lower())
    return normalized.split()

# Word-count pipeline over the lines of the reviews file:
#   tokenize -> drop short tokens -> (token, 1) pairs -> sum per token.
# NOTE(review): clean_line is applied to whole lines of the CSV file, so
# column names, product/user IDs and timestamps are tokenized and counted
# as "words" alongside the review text.

# Tokenize every line into lowercase words.
rdd_words_reviews = rdd_lines.flatMap(clean_line)

# Optional step: discard tokens of one or two characters.
rdd_words_filtered = rdd_words_reviews.filter(lambda token: len(token) >= 3)

# Pair each remaining token with an initial count of 1.
rdd_pairs = rdd_words_filtered.map(lambda token: (token, 1))

# Sum the counts of identical tokens.
rdd_word_count = rdd_pairs.reduceByKey(lambda left, right: left + right)

# Show the vocabulary size and a handful of (word, count) pairs.
distinct_word_total = rdd_word_count.count()
print("Number of distinct words:", distinct_word_total)

sample_pairs = rdd_word_count.take(10)
print("Sample pairs (word, count):", sample_pairs)



Number of distinct words: 1137224
Sample pairs (word, count): [('profilename', 1), ('score', 419), ('b001e4kfg0', 1), ('several', 20635), ('vitality', 527), ('most', 45513), ('not', 369149), ('sure', 32471), ('b000lqoch0', 1), ('1219017600', 82)]


                                                                                

In [22]:
# Flip every (word, count) pair into (count, word) so that the count
# becomes the key used for sorting.
rdd_count_word = rdd_word_count.map(lambda entry: (entry[1], entry[0]))

# Order by count, largest first, then keep the first 20 entries.
rdd_sorted = rdd_count_word.sortByKey(ascending=False)
top_20 = rdd_sorted.take(20)

# One "word : count" line per entry.
print("Top 20 most frequent words:")
for count, word in top_20:
    print(f"{word} : {count}")

Top 20 most frequent words:
the : 1935633
and : 1327521
this : 699417
for : 593750
that : 458916
but : 401762
you : 382248
not : 369149
with : 363724
have : 350521
are : 326860
was : 319146
they : 318681
like : 269778
good : 253054
these : 247360
great : 240632
them : 218206
coffee : 192863
taste : 190810


In [23]:
# Report how many partitions the file-backed RDD currently has.
default_partition_count = rdd_lines.getNumPartitions()
print("Number of partitions in rdd_lines:", default_partition_count)

Number of partitions in rdd_lines: 9


In [24]:
# Build a copy of the lines RDD spread over 8 partitions.
rdd_repart_8 = rdd_lines.repartition(8)

repartitioned_count = rdd_repart_8.getNumPartitions()
print("Partitions after repartition(8):", repartitioned_count)

Partitions after repartition(8): 8


In [25]:
# Build a copy of the lines RDD reduced to 2 partitions.
rdd_coalesce_2 = rdd_lines.coalesce(2)

coalesced_count = rdd_coalesce_2.getNumPartitions()
print("Partitions after coalesce(2):", coalesced_count)

Partitions after coalesce(2): 2


In [26]:
import time

def measure_time(action_fn, description: str) -> None:
    """Run ``action_fn`` once and print its result with the elapsed time.

    Args:
        action_fn: Zero-argument callable to time (for example a lambda
            wrapping an RDD action).
        description: Label printed in front of the result.

    Returns:
        None. The result and the timing are only printed, so a call placed
        on the last line of a cell produces no extra ``Out[...]`` value.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() follows the system wall clock, which
    # can be adjusted while the action is running.
    start = time.perf_counter()
    result = action_fn()
    elapsed = time.perf_counter() - start
    print(f"{description} -> result = {result}, time = {elapsed:.4f} s")

# Time the same count() action on each partitioning variant of the lines RDD.
timed_counts = [
    (rdd_lines.count, "count with default partitioning"),
    (rdd_repart_8.count, "count after repartition(8)"),
    (rdd_coalesce_2.count, "count after coalesce(2)"),
]
for count_action, run_label in timed_counts:
    measure_time(count_action, run_label)

count with default partitioning -> result = 568455, time = 0.3383 s
count after repartition(8) -> result = 568455, time = 0.7419 s
count after coalesce(2) -> result = 568455, time = 0.4048 s


In [28]:
# Intermediate RDD: tokens longer than two characters.
# NOTE(review): this rebuilds the same filter that the word-count cell
# already assigned to rdd_words_filtered.
rdd_words_filtered = rdd_words_reviews.filter(lambda token: len(token) >= 3)

# Mark the filtered RDD for caching so that later actions can reuse it.
rdd_words_filtered_cache = rdd_words_filtered.cache()

In [29]:
# First action on the cached RDD: count all filtered words.
filtered_word_total = rdd_words_filtered_cache.count()
print("Number of filtered words:", filtered_word_total)

# Second action on the same RDD: look at the first ten words.
filtered_word_sample = rdd_words_filtered_cache.take(10)
print("Sample filtered words:", filtered_word_sample)

                                                                                

Number of filtered words: 40380539
Sample filtered words: ['productid', 'userid', 'profilename', 'helpfulnessnumerator', 'helpfulnessdenominator', 'score', 'time', 'summary', 'text', 'b001e4kfg0']


26/02/10 19:46:21 WARN BlockManager: Task 229 already completed, not releasing lock for rdd_51_0
26/02/10 21:46:36 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1896742 ms exceeds timeout 120000 ms
26/02/10 21:46:36 WARN SparkContext: Killing executors is not supported by current scheduler.
26/02/10 21:46:38 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockMa